diff --git a/.gitmodules b/.gitmodules index a024019b14..9eb6c53c34 100644 --- a/.gitmodules +++ b/.gitmodules @@ -10,6 +10,9 @@ [submodule "third_party/protobuf"] path = third_party/protobuf url = https://github.com/protocolbuffers/protobuf.git +[submodule "akg"] + path = akg + url = https://gitee.com/mindspore/akg.git [submodule "graphengine"] path = graphengine url = https://gitee.com/ms-incubator/graphengine.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 6b69c510d5..37c3288f12 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,7 +7,7 @@ endif () include(${CMAKE_SOURCE_DIR}/cmake/options.cmake) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/modules/") -if (ENABLE_GE) +if (NOT CMAKE_SYSTEM_NAME MATCHES "Windows") add_compile_definitions(_GLIBCXX_USE_CXX11_ABI=0) endif () @@ -86,10 +86,18 @@ if (ENABLE_GE OR ENABLE_D OR ENABLE_TESTCASES) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/graphengine/third_party/fwkacllib/inc/toolchain) endif() +if (ENABLE_AKG AND ENABLE_D) + add_subdirectory("${CMAKE_SOURCE_DIR}/akg") +endif() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden") add_subdirectory(mindspore/ccsrc) if (ENABLE_TESTCASES) add_subdirectory(tests) endif() -include(cmake/package.cmake) \ No newline at end of file +if (ENABLE_SERVING) + add_subdirectory(serving) +endif() + +include(cmake/package.cmake) diff --git a/RELEASE.md b/RELEASE.md index f919bd7a2f..9824f803f0 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -7,6 +7,7 @@ * DeepFM: a factorization-machine based neural network for CTR prediction on Criteo dataset. * DeepLabV3: significantly improves over our previous DeepLab versions without DenseCRF post-processing and attains comparable performance with other state-of-art models on the PASCAL VOC 2007 semantic image segmentation benchmark. * Faster-RCNN: towards real-time object detection with region proposal networks on COCO 2017 dataset. + * SSD: a single stage object detection method on COCO 2017 dataset. 
* GoogLeNet: a deep convolutional neural network architecture codenamed Inception V1 for classification and detection on CIFAR-10 dataset. * Wide&Deep: jointly trained wide linear models and deep neural networks for recommender systems on Criteo dataset. * Frontend and User Interface @@ -62,7 +63,7 @@ ## Contributors Thanks goes to these wonderful people: -Alexey Shevlyakov, Amir Lashkari, anthony, baihuawei, biffex, buxue, caifubi, candanzg, caojian05, Cathy Wong, changzherui, chenfei, chengxianbin, chenhaozhe, chenzomi, chujinjin, cristoval, dengwentao, eric, etone-chan, fary86, gaojing, gengdongjie, gongchen, guohongzilong, guozhijian, heleiwang, hesham, He Wei, Hoai Linh Tran h00472437, hongxing, huangdongrun, huanghui, Jamie Nisbet, Jesse Lee, jiangjinsheng, jiangzhiwen, jinyaohui, jjfeing, jonwe, jonyguo, Junhan Hu, Kang, kingfo, kswang, laiyongqiang, leopz, lichenever, lihongkang, limingqi107, liubuyu, liuliyan2, liuwenhao4, liuxiao, liuxiao, liyong, lizhenyu, lvliang, Margaret_wangrui, meixiaowei, ms_yan, Nat Sutyanyong, ougongchang, panfengfeng, panyifeng, Peilin Wang, peixu_ren, qianlong, rick_sanchez, seatea, sheng, shijianning, simson, sunsuodong, Tinazhang, VectorSL, wandongdong, wangcong, wanghua, wangnan39, Wei Luning, wenchunjiang, wilfChen, WilliamLian, wsc, wukesong, wuxuejian, Xiaoda Zhang, xiefangqi, xulei2020, Yang, yangjie159, yangruoqi713, yangyongjie, yangzhenzhang, Yanjun Peng, yanzhenxiang2020, yao_yf, Yi Huaijie, yoonlee666, yujianfeng, YuJianfeng, yvetteliu, z00478463, zhangdengcheng, Zhang Qinghua, zhangz0911gm, zhaojichen, zhaoting, zhaozhenlong, zhoufeng, zhouneng, zhousiyi, zhouyuanshen, Zirui Wu, Ziyan, zjun, ZPaC, lihongzhang +Alexey Shevlyakov, Amir Lashkari, anthony, baihuawei, biffex, buxue, caifubi, candanzg, caojian05, Cathy Wong, changzherui, chenfei, chengxianbin, chenhaozhe, chenzomi, chujinjin, cristoval, dengwentao, eric, etone-chan, fary86, gaojing, gengdongjie, gongchen, guohongzilong, guozhijian, heleiwang, hesham, He 
Wei, Hoai Linh Tran, hongxing, huangdongrun, huanghui, Jamie Nisbet, Jesse Lee, jiangjinsheng, jiangzhiwen, jinyaohui, jjfeing, jonwe, jonyguo, Junhan Hu, Kang, kingfo, kswang, laiyongqiang, leopz, lichenever, lihongkang, limingqi107, liubuyu, liuliyan2, liuwenhao4, liuxiao, liuxiao, liyong, lizhenyu, lvliang, Margaret_wangrui, meixiaowei, ms_yan, Nat Sutyanyong, ougongchang, panfengfeng, panyifeng, Peilin Wang, peixu_ren, qianlong, rick_sanchez, seatea, sheng, shijianning, simson, sunsuodong, Tinazhang, VectorSL, wandongdong, wangcong, wanghua, wangnan39, Wei Luning, wenchunjiang, wilfChen, WilliamLian, wsc, wukesong, wuxuejian, Xiaoda Zhang, xiefangqi, xulei2020, Yang, yangjie159, yangruoqi713, yangyongjie, yangzhenzhang, Yanjun Peng, yanzhenxiang2020, yao_yf, Yi Huaijie, yoonlee666, yujianfeng, YuJianfeng, yvetteliu, zhangdengcheng, Zhang Qinghua, zhangz0911gm, zhaojichen, zhaoting, zhaozhenlong, zhoufeng, zhouneng, zhousiyi, zhouyuanshen, Zirui Wu, Ziyan, zjun, ZPaC, lihongzhang Contributions of any kind are welcome! diff --git a/Third_Party_Open_Source_Software_Notice b/Third_Party_Open_Source_Software_Notice index 3a5c5403eb..3c29cb49e8 100644 --- a/Third_Party_Open_Source_Software_Notice +++ b/Third_Party_Open_Source_Software_Notice @@ -2245,14 +2245,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Please also refer to the file CONTRIBUTING.md, which clarifies licensing of external contributions to this project including patches, pull requests, etc. -Software: SQLite 3.31.1 +Software: SQLite 3.32.2 Copyright notice: -Copyright 2008 D. Richard Hipp and Hipp, Wyrick & Company, Inc. -Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2003, 2004, 2005, 2006, 2007 2008 Free Software Foundation, Inc. -(c) The page number is greater than the largest page that existed in Copyright (c) 1991-2011 Unicode, Inc. +Copyright 2008 D. Richard Hipp and Hipp, Wyrick & Company, Inc. Copyright (c) 2002 by David Gravereaux. 
Copyright (c) 2006 by Pat Thoyts +(c) The page number is greater than the largest page that existed in +Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2003, 2004, 2005, 2006, 2007 2008 Free Software Foundation, Inc. License: Public Domain Anyone is free to copy, modify, publish, use, compile, sell, or distribute this software, either in source code form or as a compiled binary, for any purpose, commercial or non-commercial, and by any means. @@ -3053,6 +3053,646 @@ Copyright 2003 Google Inc. Copyright 2009 Google Inc. Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All +Software: tinyxml2 8.0.0 +Copyright 2011, John Resig. +Copyright 2011, The Dojo Foundation. + +Software: icu 67.1 +Copyright (C) 2000-2004, International Business Machines Corporation +Copyright (C) 2002-2014, International Business Machines(C) Copyright IBM Corp. 1998-2011 - All Rights Reserved +Copyright (C) 2003-2008, International Business Machines +Copyright (C) 2005-2006, International Business Machines +Copyright (C) 2016 and later: Unicode, Inc. and others. +Copyright (c) 2001-2010 International Business Machines +Copyright (C) 2009, International Business Machines +Copyright (c) 2010-2015 International Business Machines Corporation and others. All rights reserved. +Copyright (C) 2002-2015, International Business Machines verbatim (minus copyright and #include) and copied together into this file. +Copyright (c) 1997-2014, International Business Machines Corporation and others. All Rights Reserved. 
+Copyright (c) 1997-2008, International Business Machines Corporation and +Copyright (c) 1997-2003, International Business Machines Corporation and +Copyright (c) 1996-2012, International Business Machines Corporation and +Copyright (c) 1997-2016, International Business Machines +Copyright (c) 1997-2013 International Business Machines +Copyright (c) 1997-2016, International Business Machines Corporation and +Copyright (c) 1997-2001, International Business Machines Corporation and +Copyright (c) 1997-2012, International Business Machines Corporation and +Copyright (c) 1997-2005, International Business Machines Corporation and +Copyright (c) 1997-2010, International Business Machines Corporation and +Copyright (c) 2011-2016, International Business Machines Corporation +Copyright (c) 1997-2009, International Business Machines Corporation and +Copyright (c) 1997-2002,2008, International Business Machines Corporation and +Copyright (c) 1997-2009,2014, International Business Machines +Copyright (C) 2000-2009, International Business Machines +Copyright (c) 1997-2015, International Business Machines Corporation and +Copyright (c) 1997-2013, International Business Machines Corporation and +Copyright (c) 2001-2016, International Business Machines Corporation and +Copyright (c) 1997-2016, International Business Machines Corporation +Copyright (c) 1997-2003, 2007-2009 International Business Machines Corporation and +Copyright (c) 2011-2014, International Business Machines Corporation +Copyright (c) 2003-2009, International Business Machines +Copyright (c) 2016, International Business Machines Corporation +Copyright (c) 1997-2004, International Business Machines Corporation and +Copyright (C) 2002-2016, International Business Machines +Copyright (C) 1998-2014, International Business Machines Corporation +Copyright (c) 2003-2013, International Business Machines Corporation and +Copyright (c) 2005-2016, International Business Machines Corporation and +Copyright (c) 1999-2013, 
International Business Machines Corporation and +Copyright (c) 2003-2015, International Business Machines Corporation and +Copyright (C) 2003-2016, International Business Machines +Copyright (C) 2003-2014, International Business Machines +Copyright (C) 2003, International Business Machines +Copyright (c) 1998-2016, International Business Machines Corporation and +Copyright (c) 2004-2015, International Business Machines Corporation and +Copyright (c) 2009-2016, International Business Machines Corporation and +Copyright (C) 2003-2012, International Business Machines +Copyright (c) 2000-2016, International Business Machines Corporation and +Copyright (C) 2001-2014, International Business Machines +Copyright (C) 2001-2016, International Business Machines +Copyright (c) 1997-2014, International Business Machines © 2017 and later: Unicode, Inc. and others. +Copyright (C) 2007-2016, International Business Machines © 2018 and later: Unicode, Inc. and others. +Copyright (c) 2015, International Business Machines Corporation +Copyright (c) 2014-2016, International Business Machines Corporation +Copyright (c) 2002-2016, International Business Machines +Copyright (c) 2001-2011,2015 International Business Machines +Copyright (c) 2001-2016 International Business Machines +Copyright (c) 2005-2013, International Business Machines Corporation and +Copyright (c) 1998-2014, International Business Machines Corporation and +Copyright (C) 1997-2016 International Business Machines +Copyright (C) 2009-2014, International Business Machines Corporation and +Copyright (c) 2002-2014, International Business Machines Corporation +Copyright (c) 2002-2007, International Business Machines Corporation +Copyright (C) 1996-2012, International Business Machines Corporation +Copyright (C) 1996-2008, International Business Machines Corporation +Copyright (C) 2007-2013, International Business Machines Corporation and +Copyright (C) 2008-2015, International Business Machines +Copyright (C) 2003-2013, 
International Business Machines Corporation and +Copyright (C) 2003-2013, International Business Machines Corporation +Copyright (C) 1997-2016, International Business Machines Corporation and +Copyright (C) 2001-2011, International Business Machines +Copyright (C) 2001-2008, International Business Machines +Copyright (C) 2003 - 2009, International Business Machines Corporation and +Copyright (C) 2003 - 2008, International Business Machines Corporation and +Copyright (C) 2007-2014, International Business Machines Corporation +Copyright (C) 2007-2013, International Business Machines Corporation +Copyright (C) 1997-2013, International Business Machines Corporation and +Copyright (C) 1996-2014, International Business Machines Corporation and +Copyright (C) 2010-2014, International Business Machines +Copyright (C) 2010-2015, International Business Machines +Copyright (C) 2013-2014, International Business Machines +Copyright (C) 1996-2015, International Business Machines +Copyright (C) 1996-2014, International Business Machines +Copyright (C) 2012-2015, International Business Machines +Copyright (C) 2012-2014, International Business Machines +Copyright (C) 2013-2015, International Business Machines +Copyright (C) 2013-2016, International Business Machines +Copyright (C) 1999-2016, International Business Machines +Copyright (C) 1999-2015, International Business Machines +Copyright (C) 1999-2014, International Business Machines +Copyright (C) 2015-2016, International Business Machines Corporation and others. 
+Copyright (C) 2003 - 2013, International Business Machines Corporation and +Copyright (C) 1999-2011, International Business Machines +Copyright (C) 2005-2016, International Business Machines +Copyright (C) 2005-2012, International Business Machines +Copyright (C) 2005-2015, International Business Machines +Copyright (C) 2005-2013, International Business Machines +Copyright (C) 2005-2014, International Business Machines +Copyright (c) 2004, International Business Machines +Copyright (c) 2004-2014 International Business Machines +Copyright (c) 2004-2014, International Business Machines +Copyright (C) 2013, International Business Machines Corporation +Copyright (C) 1997-2015, International Business Machines Corporation and +Copyright (C) 2016, International Business Machines +Copyright (c) IBM Corporation, 2000-2012. All rights reserved. +Copyright (c) IBM Corporation, 2000-2011. All rights reserved. +Copyright (c) IBM Corporation, 2000-2014. All rights reserved. +Copyright (c) IBM Corporation, 2000-2010. All rights reserved. +Copyright (c) IBM Corporation, 2000-2016. All rights reserved. +Copyright 2010 the V8 project authors. All rights reserved. +Copyright 2006-2008 the V8 project authors. All rights reserved. +Copyright 2012 the V8 project authors. All rights reserved. 
+Copyright (C) 2008-2016, International Business Machines Corporation and +Copyright (C) 2007-2016, International Business Machines Corporation and +Copyright (C) 2007-2012, International Business Machines Corporation and +Copyright (c) 2001-2011, International Business Machines +Copyright (c) 2001-2007, International Business Machines +Copyright (C) 2010-2014, International Business Machines Corporation and +Copyright (C) 1997-2010, International Business Machines Corporation and +Copyright (C) 1997-2012, International Business Machines Corporation and +Copyright (C) 2009-2015, International Business Machines Corporation and +Copyright (C) 2009-2012, International Business Machines Corporation and +Copyright (c) 2002-2012, International Business Machines Corporation +Copyright (c) 2002-2011, International Business Machines Corporation +Copyright (C) 2008-2013, International Business Machines Corporation and +Copyright (c) 2003-2008, International Business Machines +Copyright (C) 2003-2016, International Business Machines Corporation +Copyright (C) 2003-2014, International Business Machines Corporation +Copyright (C) 2003-2008, International Business Machines Corporation +Copyright (C) 2005-2008, International Business Machines +Copyright (C) 2003-2015, International Business Machines Corporation +Copyright (C) 2003-2009,2012,2016 International Business Machines Corporation and +Copyright (c) 2004-2016, International Business Machines © 2020 and later: Unicode, Inc. and others. +Copyright (C) 2007-2008, International Business Machines Corporation and +Copyright (C) 2001-2007, International Business Machines +Copyright (C) 1997-2012, International Business Machines +Copyright (C) 1997-2015, International Business Machines +Copyright (C) 2001-2010, International Business Machines +Copyright (c) 2000-2005, International Business Machines +Copyright (c) 2000-2007, International Business Machines © 2019 and later: Unicode, Inc. and others. 
+Copyright (C) 2010-2015, International Business Machines Corporation and +Copyright (C) 2015, International Business Machines Corporation and +Copyright (c) 2003-2013, International Business Machines +Copyright (C) 2001-2012, International Business Machines +Copyright (C) 2001-2011, International Business Machines Corporation +Copyright (C) 2014-2016, International Business Machines +Copyright (C) 1997-2015, International Business Machines Corporation +Copyright (C) 1999-2007, International Business Machines +Copyright (C) 1999-2007, International Business Machines Corporation +Copyright (C) 1999-2011, International Business Machines Corporation +Copyright (C) {1999-2001}, International Business Machines Corporation and others. All Rights Reserved. +Copyright (C) 2002-2016 International Business Machines Corporation and others. +Copyright (C) 2002-2016, International Business Machines Corporation and others. +Copyright (C) 2002-2016 International Business Machines Corporation +Copyright (C) 2002-2015, International Business Machines Corporation and others. +Copyright (C) 2012 International Business Machines Corporation +Copyright (C) 2002-2015 International Business Machines Corporation +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright (C) 2003-2010, International Business Machines Corporation and others. +Copyright (c) 2008-2011, International Business Machines Corporation and +Copyright (c) 2008-2010, International Business Machines Corporation and +Copyright (C) 2014-2016, International Business Machines Corporation and +Copyright (C) 2013, International Business Machines Corporation and +Copyright (c) 2014, International Business Machines +Copyright (C) 2014, International Business Machines +Copyright (C) 2013, International Business Machines +Copyright (C) 2001-2008,2010 IBM and others. All rights reserved. +Copyright (C) 2010 , Yahoo! Inc. 
+Copyright (c) 1997-2011, International Business Machines Corporation and +Copyright (C) 2013-2014, International Business Machines Corporation and +Copyright (C) 2009-2013, International Business Machines Corporation and +Copyright (C) 1996-2012, International Business Machines Corporation and +Copyright (C) 2015, International Business Machines Corporation +Copyright (c) 2001-2012, International Business Machines Corporation +Copyright (C) 2001-2014 IBM and others. All rights reserved. +Copyright (C) 2008-2014, Google, International Business Machines Corporation and +Copyright (C) 2008, Google, International Business Machines Corporation and +Copyright (C) 2008-2015, Google, International Business Machines Corporation +Copyright (c) 2001-2014, International Business Machines +Copyright (c) 2002-2010, International Business Machines Corporation +Copyright (C) 2011-2015, International Business Machines Corporation and +Copyright (C) 2011-2016, International Business Machines Corporation and +Copyright (C) 2011-2012, International Business Machines Corporation and +Copyright (C) 1996-2016, International Business Machines +Copyright (C) 1998-2014, International Business Machines +Copyright (C) 2004-2016, International Business Machines +Copyright (C) 2010-2011, International Business Machines +Copyright (C) 2009-2015, International Business Machines +Copyright (C) 2015, International Business Machines +Copyright (C) 2012-2016, International Business Machines +Copyright (C) 1999-2012, International Business Machines +Copyright (C) 2001, International Business Machines +Copyright (C) 2013, International Business Machines Corporation and others. +Copyright (C) 2010-2012, International Business Machines +Copyright (C) 2004-2015, International Business Machines +Copyright (C) 2003-2006, International Business Machines +Copyright (C) 2013-2015, International Business Machines Corporation and others. +Copyright (C) 2001-2015 IBM and others. All rights reserved. 
+Copyright (C) 2008-2015, International Business Machines Corporation +Copyright (C) 2008-2016, International Business Machines +Copyright (C) 2008-2013, International Business Machines Corporation +Copyright (C) 2004-2012, International Business Machines Corporation and +Copyright (C) 1997-2009,2014 International Business Machines +Copyright (C) 2009-2011, International Business Machines Corporation and +Copyright (C) 2009-2016, International Business Machines Corporation and +Copyright (C) 2009-2013, International Business Machines +Copyright (C) 2008-2011, International Business Machines +Copyright (C) 2007-2014, International Business Machines Corporation and +Copyright (C) 2009-2010, International Business Machines Corporation and +Copyright (C) 2001-2016 International Business Machines Corporation +Copyright (c) 2002-2011, International Business Machines +Copyright (C) 2001-2012 IBM, Inc. All Rights Reserved. +Copyright (c) 2013-2016 International Business Machines Corporation and others. All rights reserved. +Copyright (c) 2013-2015 International Business Machines Corporation and others. All rights reserved. 
+Copyright (c) 2007-2012, International Business Machines Corporation and +Copyright (c) 2007-2012, International Business Machines +Copyright (C) 2010, International Business Machines +Copyright (C) 1997-2011, International Business Machines +Copyright (C) 1997-2005, International Business Machines +Copyright (C) 2009-2011, International Business Machines +Copyright (C) 2003-2015, International Business Machines +Copyright (C) 2009-2016, International Business Machines +Copyright (C) 2008-2012, International Business Machines +Copyright (C) 2008, International Business Machines +Copyright (C) 2011-2014, International Business Machines +Copyright (C) 2011-2013, International Business Machines +Copyright (C) 2005, International Business Machines +Copyright (C) 1999-2013, International Business Machines +Copyright (C) 1998-2016, International Business Machines +Copyright (c) 2007-2014, International Business Machines Corporation and +Copyright (C) 2003-2013, International Business Machines +Copyright (c) 2007-2016, International Business Machines Corporation and +Copyright (c) 2008-2015, International Business Machines +Copyright (C) 1999-2010, International Business Machines +Copyright (C) 2000-2015, International Business Machines +Copyright (C) 2000-2011, International Business Machines +Copyright (C) 2000-2012, International Business Machines +Copyright (C) 2000-2010, International Business Machines +Copyright (C) 2004-2010, International Business Machines +Copyright (C) 2004-2005, International Business Machines +Copyright (c) 2013-2014, International Business Machines +Copyright (c) 1991-2013 Unicode, Inc. © 2019 Unicode®, Inc. +Copyright (C) 2018 and later: Unicode, Inc. and others. +Copyright (c) 2008-2013 International Business Machines +Copyright (C) 2002-2010, International Business Machines +Copyright (c) 2012-2015 International Business Machines © 2020 Unicode®, Inc. +Copyright (c) 2005-2013 IBM Corporation and others. 
All rights reserved +Copyright (c) 2011-2012, International Business Machines Corporation and +Copyright (C) 1998-2000, International Business Machines © 2017 Unicode®, Inc. +Copyright (c) 2007-2015 International Business Machines +Copyright (C) 2004-2006, International Business Machines +Copyright (C) 2003-2005, International Business Machines +Copyright (c) 1999-2014 International Business Machines +Copyright (c) 2003, International Business Machines +Copyright (C) 2014 International Business Machines +Copyright (c) 2001-2003 International Business Machines +Copyright (c) 2004-2011 International Business Machines +Copyright (C) 2015-2016, International Business Machines +Copyright (c) 2001-2015 International Business Machines +Copyright (C) 2003-2012, International Business Machines Corporation and COPYRIGHT AND PERMISSION NOTICE +Copyright (c) 2003 National Electronics and Computer Technology Center and others +Copyright (C) 2005-2010, International Business Machines +Copyright (c) 2007-2009 IBM Corporation and others. All rights reserved +Copyright (C) 2004-2016 International Business Machines +Copyright (C) 1998-2013, International Business Machines +Copyright (C) 1998-2010, International Business Machines +Copyright (c) 1999-2004, International Business Machines +Copyright (C) 2002-2006 International Business Machines Corporation +Copyright (C) 1999-2006, International Business Machines +Copyright (C) 2002-2016 IBM, Inc. All Rights Reserved. +Copyright (c) 2002-2006, International Business Machines(C) Copyright IBM Corp. 
1998-2007 - All Rights Reserved +Copyright (C) 1999-2003, International Business Machines +Copyright (C) 1998-2006, International Business Machines Corporation and +Copyright (C) 1998-2003, International Business Machines Corporation and +Copyright (C) 2003 - 2008, International Business Machines +Copyright (C) 1999-2008, International Business Machines +Copyright (C) 1999-2001, International Business Machines +Copyright (C) 1999-2005, International Business Machines +Copyright (C) 2016 and later: Unicode, Inc. and others. +Copyright (c) 2001-2010 IBM Corporation and others. All Rights Reserved. +Copyright (C) 1998-2005, International Business Machines Corporation and +Copyright (C) 1998-2001, International Business Machines Corporation and +Copyright (c) 2002-2005, International Business Machines Corporation and others. All Rights Reserved. +Copyright (C) 2000-2014, International Business Machines +Copyright (C) 1996-2013, International Business Machines +Copyright (c) 2002-2006, International Business Machines Corporation and +Copyright (c) 2004-2010, International Business Machines Corporation and +Copyright (C) 2004-2011, International Business Machines +Copyright (c) 2002-2005, International Business Machines Corporation and +Copyright (c) 2002-2014, International Business Machines +Copyright (c) 1997-2012, International Business Machines +Copyright (c) 2002-2008, International Business Machines Corporation and others. All Rights Reserved. +Copyright (C) 2011-2013, Apple Inc.; Unicode, Inc.; and others. All Rights Reserved. +Copyright (C) 2011-2013, Apple Inc. and others. All Rights Reserved. +Copyright (c) 2005-2007,2010 Apple Inc., Unicode Inc.,and others. All Rights Reserved. +Copyright (c) 1999-2003, International Business Machines Corporation and +Copyright (c) 2003-2014, International Business Machines +Copyright (c) 2002-2010, International Business Machines Corporation and others. All Rights Reserved. 
+Copyright (c) 1999-2010, International Business Machines Corporation and +Copyright (c) 1999-2002, International Business Machines Corporation and +Copyright (C) 2002-2003, International Business Machines +Copyright (C) 2002, International Business Machines +Copyright (c) 2007, International Business Machines Corporation and +Copyright (C) 2007, International Business Machines +Copyright (C) 2001-2006, International Business Machines +Copyright (C) 2010-2014, International Business Machines Corporation and others. +Copyright (C) 2005-2016, International Business Machines Corporation and +Copyright (C) 2015-2016, International Business Machines Corporation and +Copyright (C) 2008-2012, International Business Machines Corporation +Copyright (c) 2006-2015 International Business Machines Corporation and others. All rights reserved. +Copyright (c) 2014-2015 International Business Machines Corporation and others. All rights reserved. +Copyright (C) 2002-2011, International Business Machines +Copyright (c) 2003-2010, International Business Machines Corporation and others. All Rights Reserved. +Copyright (C) 2012 IBM Corporation and Others. All Rights Reserved. +Copyright (C) 1998-2012, International Business Machines Corporation +Copyright (c) 2009, International Business Machines Corporation and +Copyright (C) The Internet Society (2002). All Rights Reserved. +Copyright (c) 2015, International Business Machines Corporation and +Copyright (c) 2002, International Business Machines Corporation and others. All Rights Reserved. +Copyright (C) 1998-2016, International Business Machines Corporation +Copyright (c) 2011-2016,International Business Machines +Copyright (C) 2012 International Business Machines Corporation and Others. All Rights Reserved. +Copyright (C) 2011, International Business Machines Corporation and others. All Rights Reserved. +Copyright (C) 2011, International Business Machines Corporation and others. All Rights Reserved. 
+Copyright (c) 2011-2012,International Business Machines +Copyright (c) 2007, International Business Machines Corporation and others. All Rights Reserved. +Copyright (C) 2007-2007, International Business Machines(C) Copyright IBM Corp. 1998-2014 - All Rights Reserved +Copyright (C) 1998-2002, International Business Machines +Copyright (c) 2001-2007, International Business Machines Corporation and others. All Rights Reserved.(C) Copyright IBM Corp. 1998-2013 - All Rights Reserved +Copyright (C) 1998-2015, International Business Machines +Copyright (C) 2001-2014 International Business Machines +Copyright (C) 2011-2016, International Business Machines +Copyright (C) 2011-2015, International Business Machines +Copyright (c) 1999-2014, International Business Machines Corporation and +Copyright (c) 1999-2009, International Business Machines Corporation and +Copyright (c) 2010,International Business Machines +Copyright (c) 2010-2016,International Business Machines +Copyright (c) 2002-2005, International Business Machines +Copyright (C) 2000-2003, International Business Machines +Copyright (c) 2008-2014, International Business Machines Corporation and +Copyright (C) 2001 - 2005, International Business Machines +Copyright (C) 2001-2005, International Business Machines +Copyright (C) 1995-2014, International Business Machines +Copyright (c) 2000-2004 IBM, Inc. and Others. +Copyright (c) 2002-2014, International Business Machines Corporation and +Copyright (c) 2007-2013, International Business Machines Corporation and +Copyright (c) 2002-2012, International Business Machines Corporation and +Copyright (C) 2002-2012, International Business Machines +Copyright (C) 2009-2011, International Business Machines Corporation, Google and Others. +Copyright (c) 2002, International Business Machines Corporation and others. All Rights Reserved. +Copyright (C) 2009-2014, International Business Machines +Copyright (C) 2008, International Business Machines Corporation and others. 
+Copyright (C) 2000-2016, International Business Machines +Copyright (C) 2011-2014 International Business Machines +Copyright (C) 1997-2014, International Business Machines +Copyright (C) 1997-2013, International Business Machines +Copyright (c) 2004-2006, International Business Machines +Copyright (C) 1997-2016, International Business Machines +Copyright (C) 1997-2006, International Business Machines +Copyright (C) 1997-2011, International Business Machines Corporation and others. +Copyright (C) 1997-2013, International Business Machines Corporation and others. +Copyright (c) 2004-2015, International Business Machines +Copyright (C) 2009-2017, International Business Machines Corporation,Google, and others. All Rights Reserved. +Copyright (C) 1997-2016, International Business Machines Corporation and others. +Copyright (C) 2008-2015, International Business Machines Corporation and +Copyright (C) 1997-2015, International Business Machines Corporation and others. +Copyright (C) 2014-2016, International Business Machines Corporation and others. +Copyright (c) 2014-2016, International Business Machines +Copyright (C) 2001-2011 IBM and others. All rights reserved. +Copyright (C) 1996-2014, International Business Machines Corporation and others. +Copyright (C) 1996-2016, International Business Machines Corporation and +Copyright (C) 2009-2016, International Business Machines Corporation, +Copyright (C) 2009-2010, Google, International Business Machines Corporation and +Copyright (C) 2008-2014, Google, International Business Machines Corporation +Copyright (C) 1996-2015, International Business Machines Corporation and +Copyright (c) 1996-2015, International Business Machines Corporation and others. +Copyright (C) 2010-2012,2015 International Business Machines +Copyright (C) 2007-2015, International Business Machines +Copyright (C) 2013-2014, International Business Machines Corporation and others. 
+Copyright (C) 2010-2013, International Business Machines +Copyright (c) 2002-2005, International Business Machines Corporation +Copyright (C) 2001-2011,2014 IBM and others. All rights reserved. +Copyright (C) 2008-2016, International Business Machines Corporation +Copyright (C) 2004 - 2008, International Business Machines Corporation and +Copyright (C) 1997-2011,2014-2015 International Business Machines +Copyright (C) 2001-2003, International Business Machines +Copyright (C) 1999-2009, International Business Machines +Copyright (C) 2020 and later: Unicode, Inc. and others. +Copyright (c) 2002, International Business Machines Corporation and +Copyright (C) 2000-2008, International Business Machines +Copyright (C) 1998-2006, International Business Machines +Copyright (C) 1998-2001, International Business Machines Corporation +Copyright (C) 1998-2004, International Business Machines Corporation +Copyright (C) 2000, International Business Machines +Copyright (c) 1999-2016, International Business Machines Corporation and +Copyright (c) 2015, International Business Machines Corporation and others. All Rights Reserved. +Copyright (c) 1999-2012, International Business Machines Corporation and +Copyright (C) 1998-2011, International Business Machines +Copyright (C) 2008-2014, International Business Machines Corporation and +Copyright (C) 2003-2004, International Business Machines +Copyright (c) 2003-2005, International Business Machines Corporation and others. All Rights Reserved. +Copyright (C) 2002-2006 IBM, Inc. All Rights Reserved. 
+Copyright (C) 2004-2008, International Business Machines +Copyright (c) 2002-2016 International Business Machines Corporation and +Copyright (c) 2002-2015, International Business Machines Corporation and +Copyright (C) 2002-2016, International Business Machines Corporation +Copyright (c) 2002-2010,International Business Machines +Copyright (c) 2002-2014,International Business Machines +Copyright (c) 2002-2016,International Business Machines +Copyright (C) 2016 International Business Machines Corporation +Copyright © 2019 and later: Unicode, Inc. and others. +Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved. +Copyright (c) 2016 International Business Machines Corporation and others. All Rights Reserved. +Copyright (c) 2015-2016, International Business Machines Corporation and others. All Rights Reserved. +Copyright (c) 2005-2006, International Business Machines Corporation and +Copyright (c) 1997-2004, International Business Machines Corporation +Copyright (c) 2012-2016, International Business Machines Corporation +Copyright (c) 2012-2014, International Business Machines Corporation and +Copyright (c) 1997-2014, International Business Machines Corporation +Copyright (c) 1996-2016, International Business Machines Corporation and +Copyright (c) 2003-2013, International Business Machines Corporation +Copyright (c) 2003-2008, International Business Machines Corporation +Copyright (c) 1997-2015, International Business Machines Corporation +Copyright (c) 2002-2016, International Business Machines Corporation and +Copyright (c) 1997-2002, International Business Machines Corporation and +Copyright (C) 1996-2012, International Business Machines +Copyright (c) 1997-2013 International Business Machines Corporation and +Copyright (c) 2010-2012, International Business Machines Corporation and +Copyright (c) 1997-2011, International Business Machines Corporation +Copyright (c) 1997-2006, International Business Machines Corporation 
and +Copyright (c) 2008-2016 International Business Machines Corporation and +Copyright (c) 2008-2016, International Business Machines Corporation and +Copyright (c) 1997-2016 International Business Machines Corporation and +Copyright (c) 2007-2011, International Business Machines +Copyright (c) 2007-2010, International Business Machines +Copyright (C) 2001-2016, International Business Machines Corporation and +Copyright (C) 2001-2003, International Business Machines Corporation and +Copyright (C) 2003-2011, International Business Machines +Copyright (c) 1997-2007, International Business Machines Corporation and +Copyright (c) 1997-2015, International Business Machines +Copyright (C) 2004-2009, International Business Machines Corporation and +Copyright (C) 2004, International Business Machines Corporation and +Copyright (C) 1996-2009, International Business Machines Corporation and +Copyright (C) 1996-2006, International Business Machines Corporation and +Copyright (C) 2011-2013, International Business Machines Corporation +Copyright (C) 2000-2007, International Business Machines +Copyright (c) 2001, International Business Machines Corporation and +Copyright (C) 2012-2013, International Business Machines +Copyright (c) 2010-2016, International Business Machines Corporation and +Copyright (c) 2010-2016, International Business Machines Corporation +Copyright (c) 1997-2010, International Business Machines Corporation +Copyright (c) 1997-2003, International Business Machines +Copyright (C) 2014-2015, International Business Machines Corporation and +Copyright (c) 1997-2013, International Business Machines Corporation +Copyright (c) 1999-2016, International Business Machines +Copyright (c) 1999-2016 International Business Machines Corporation and +Copyright (c) 2016, International Business Machines Corporation and +Copyright (c) 2016, International Business Machines +Copyright (c) 2013-2016, International Business Machines Corporation +Copyright (c) 2013, International 
Business Machines Corporation +Copyright (C) 2013-2016, International Business Machines Corporation and +Copyright (c) 2001-2010, International Business Machines Corporation and +Copyright (C) 2014, International Business Machines Corporation and +Copyright (c) 1999-2015, International Business Machines Corporation and +Copyright (C) 2001-2016, International Business Machines orporation +Copyright (c) 2001-2008, International Business Machines Corporation and others +Copyright (C) 2003-2016, International Business Machines Corporation and +Copyright (c) 2004, International Business Machines Corporation +Copyright (C) 2001-2009, International Business Machines +Copyright (c) 2004,2011 International Business Machines +Copyright (c) 2004-2011, International Business Machines +Copyright (c) 2000-2016, International Business Machines Corporation +Copyright (c) 2001-2005, International Business Machines Corporation and +Copyright (C) 2001-2004, International Business Machines +Copyright (c) 2001-2009, International Business Machines +Copyright (c) 1997-2009, International Business Machines Corporation +Copyright (c) 1997-2013, International Business Machines +Copyright (c) 1997-2012, International Business Machines Corporation +Copyright (C) 2007-2015, International Business Machines Corporation and +Copyright (C) 2007-2011, International Business Machines Corporation and +Copyright (C) 2007, International Business Machines Corporation and +Copyright (c) 1998-2005, International Business Machines Corporation and +Copyright (c) 2002-2010, International Business Machines Corporation and +Copyright (C) 1999-2016 International Business Machines Corporation and +Copyright (c) 2004-2011, International Business Machines Corporation and +Copyright (c) 2002-2007, International Business Machines Corporation and +Copyright (C) 2003, International Business Machines Corporation and +Copyright (C) 2005-2011, International Business Machines +Copyright (C) 2011-2012, International 
Business Machines +Copyright (C) 2007-2012, International Business Machines +Copyright (C) 2006-2016, International Business Machines Corporation +Copyright (C) 2006-2012, International Business Machines Corporation and others. +Copyright 2007 Google Inc. All Rights Reserved. +Copyright (c) 2001-2015, International Business Machines +Copyright (C) 2006-2014, International Business Machines Corporation +Copyright (C) 2008, International Business Machines Corporation and +Copyright (C) 2009-2012, International Business Machines +Copyright (C) 2006 International Business Machines Corporation +Copyright (C) 2010-2016, International Business Machines Corporation and +Copyright (C) 2002-2014, International Business Machines Corporation and +Copyright (C) 2002-2005, International Business Machines Corporation and +Copyright (C) 2011, International Business Machines +Copyright (c) 2003-2010 International Business Machines +Copyright (C) 2003-2003, International Business Machines +Copyright (C) 1999-2016 International Business Machines Corporation +Copyright (C) 1999-2014 International Business Machines Corporation +Copyright (C) 1999-2014 International Business Machines +Copyright (C) 2002-2011, International Business Machines Corporation and others. +Copyright (C) 2002-2008, International Business Machines Corporation and others. +Copyright (C) 2002-2008 International Business Machines Corporation +Copyright (c) 2001-2005, International Business Machines +Copyright (C) 2002-2014 International Business Machines Corporation +Copyright (c) 2003-2011, International Business Machines +Copyright (C) 1998-2012, International Business Machines Corporation and +Copyright (C) 2001-2014, International Business Machines Corporation. +Copyright (C) 2001-2011, International Business Machines Corporation. 
+Copyright (C) 2001-2014, International Business Machines Corporation and +Copyright (C) 2001-2011, International Business Machines Corporation and +Copyright (C) 2001-2012, International Business Machines Corporation and +Copyright 2004 and onwards Google Inc. +Copyright (C) 2004-2014, International Business Machines +Copyright (C) 2006, International Business Machines +Copyright (C) 2004-2012, International Business Machines +Copyright (C) 2001-2013, International Business Machines +Copyright (C) 1998-2004, International Business Machines +Copyright (C) 2000-2013, International Business Machines +Copyright (C) 1999-2015 International Business Machines +Copyright (C) 2000-2006, International Business Machines +Copyright (C) 1999-2004, International Business Machines +Copyright (C) 2003-2007, International Business Machines +Copyright (C) 2002-2006, International Business Machines +Copyright (C) 2001-2015, International Business Machines +Copyright (c) 2001-2012, International Business Machines +Copyright (c) 2002-2004, International Business Machines +Copyright (C) 1999-2016, International Business Machines Corporation and +Copyright (c) 1996-2014, International Business Machines +Copyright (C) 1999-2016, International Business Machines Corporation +Copyright (C) 2009-2014 International Business Machines +Copyright (C) 2004-2007, International Business Machines +Copyright (c) 2001-2016, International Business Machines +Copyright (C) 2003-2009, International Business Machines +Copyright (C) 1999-2013, International Business Machines Corporation and +Copyright (C) 1999-2015, International Business Machines Corporation and +Copyright (c) 2002-2011, International Business Machines Corporation and others. All Rights Reserved. +Copyright (C) 2001-2016 IBM, Inc. All Rights Reserved. +Copyright (C) 1999-2016 International Business Machines +Copyright (C) 2009-2010 IBM Corporation and Others. All Rights Reserved. 
+Copyright (C) 1998-2012, International Business Machines +Copyright (C) 1991 and later: Unicode, Inc. and others. +Copyright (C) 1997-2000, International Business Machines +Copyright (c) 1999-2007, International Business Machines Corporation and +Copyright (c) 2000 IBM, Inc. and Others. +Copyright (C) 2008-2013, International Business Machines +Copyright (C) 1998-2003, 2006, International Business Machines Corporation +Copyright (c) 2002-2003,International Business Machines +Copyright (C) 2009 International Business Machines +Copyright (C) 2010-2016 International Business Machines +Copyright (C) 2008-2012 IBM, Inc. All Rights Reserved. +Copyright (C) 1998-2008, International Business Machines +Copyright (C) 2010-2016, International Business Machines +Copyright (C) 1999-2006,2013 IBM Corp. All rights reserved. +Copyright (C) 2008-2009, International Business Machines Corporation and +Copyright (C) 2012,2014 International Business Machines +Copyright (c) 1996-2015, International Business Machines Corporation and +Copyright (C) 1997-2005, International Business Machines Corporation and others. All Rights Reserved. +Copyright (C) 1999-2012, International Business Machines Corporation and +Copyright (C) 1996-2013, International Business Machines Corporation +Copyright (C) 1998-2005, International Business Machines +Copyright 2001 and onwards Google Inc. +Copyright (C) 2010-2012,2014, International Business Machines +Copyright (C) 1996-2015, International Business Machines Corporation and others. +Copyright (c) 2003-2004, International Business Machines +Copyright (C) 2000-2004, International Business Machines +Copyright (C) 2002-2013, International Business Machines +Copyright (C) 2002-2011 International Business Machines Corporation and others. All Rights Reserved. +Copyright (C) 1999-2010, International Business Machines Corporation and others. +Copyright (C) 2001-2005, International Business Machines Corporation and others. All Rights Reserved. 
+Copyright (c) 1996-2016, International Business Machines Corporation +Copyright (C) 1997-2010, International Business Machines + +Software: libtiff 4.1.0 +Copyright notice: +Copyright © 2015 Open Microscopy Environment / University of Dundee +Copyright (c) 2004, Andrey Kiselev +Copyright (c) 1990-1997 Sam Leffler +Copyright (c) 1991-1997 Silicon Graphics, Inc. +Copyright (c) 1988-1997 Sam Leffler +Copyright (c) 1991-1997 Sam Leffler +Use and Copyright +Copyright (C) 1990, 1995 Frank D. Cringle. +Copyright (c) 1994-1997 Sam Leffler +Copyright (c) 1994-1997 Silicon Graphics, Inc. +Copyright (c) 1997 Greg Ward Larson +Copyright (c) 1997 Silicon Graphics, Inc. +Copyright (c) 2010, Andrey Kiselev +Copyright (c) Joris Van Damme +Copyright (c) AWare Systems +Copyright (c) 1996-1997 Sam Leffler +Copyright (c) 1996 Pixar +Copyright (c) 1995-1997 Sam Leffler +Copyright (c) 1995-1997 Silicon Graphics, Inc. +Copyright (c) 1988-1996 Sam Leffler +Copyright (c) 1991-1996 Silicon Graphics, Inc. +Copyright (c) 1992-1997 Sam Leffler +Copyright (c) 1992-1997 Silicon Graphics, Inc. +Copyright (c) 2018, Mapbox +Copyright (c) 2017, Planet Labs +Copyright (c) 1990 by Sun Microsystems, Inc. +Copyright 1990 by Digital Equipment Corporation, Maynard, Massachusetts. +Copyright 1991 by Digital Equipment Corporation, Maynard, Massachusetts. +Copyright (c) 2002, Andrey Kiselev +Copyright (c) 2003 Ross Finlayson +Additions (c) Richard Nolde 2006-2010 +Copyright (c) 2003, Andrey Kiselev +Copyright (c) 2000, Frank Warmerdam +Copyright (c) 1987, 1993, 1994 +Copyright (c) 1989, 1993 +Copyright (c) 2009 Frank Warmerdam +Copyright (c) 1987, 1993 +Copyright (c) 2005 The DragonFly Project. All rights reserved. +Copyright (c) 2003 Citrus Project, +All rights reserved. 
+Copyright (c) 1990, 1993 +Copyright (c) 1996 Mike Johnson +Copyright (c) 1996 BancTec AB +Copyright (c) 2004, Andrey Kiselev +Copyright (c) 2012, Frank Warmerdam +Copyright (c) 2019, Even Rouault +Copyright (c) 2007, Frank Warmerdam +Copyright (c) 2019, Thomas Bernard +Copyright (c) 2008, Andrey Kiselev +Copyright (c) 1999, Frank Warmerdam +Copyright (c) 1991-1996 Sam Leffler +Copyright (c) 1996 USAF Phillips Laboratory + Software: opencv 4.2.0 Copyright notice: Copyright (C) 2016, NVIDIA Corporation, all rights reserved. diff --git a/akg b/akg new file mode 160000 index 0000000000..c460176523 --- /dev/null +++ b/akg @@ -0,0 +1 @@ +Subproject commit c460176523d039c8995f1d71089753725ebc0792 diff --git a/build.sh b/build.sh index dd909e9f51..70718bf89b 100755 --- a/build.sh +++ b/build.sh @@ -49,10 +49,11 @@ usage() echo " -Q Enable dump memory, default off" echo " -D Enable dumping of function graph ir, default on" echo " -z Compile dataset & mindrecord, default on" - echo " -M Enable MPI and NCCL for GPU training, default on" + echo " -M Enable MPI and NCCL for GPU training, gpu default on" echo " -V Specify the minimum required cuda version, default CUDA 9.2" echo " -I Compile predict, default off" echo " -K Compile with AKG, default off" + echo " -s Enable serving module, default off" } # check value of input is 'on' or 'off' @@ -86,15 +87,15 @@ checkopts() ENABLE_DUMPE2E="off" ENABLE_DUMP_IR="on" COMPILE_MINDDATA="on" - ENABLE_MPI="on" + ENABLE_MPI="off" CUDA_VERSION="9.2" COMPILE_PREDICT="off" USE_GLOG="on" PREDICT_PLATFORM="" - ENABLE_AKG="off" - + ENABLE_AKG="on" + ENABLE_SERVING="off" # Process the options - while getopts 'drvj:c:t:hsb:a:g:p:ie:m:I:LRP:Q:D:zM:V:K' opt + while getopts 'drvj:c:t:hsb:a:g:p:ie:m:I:LRP:Q:D:zM:V:K:s' opt do OPTARG=$(echo ${OPTARG} | tr '[A-Z]' '[a-z]') case "${opt}" in @@ -168,6 +169,7 @@ checkopts() if [[ "X$OPTARG" == "Xgpu" ]]; then ENABLE_GPU="on" ENABLE_CPU="on" + ENABLE_MPI="on" elif [[ "X$OPTARG" == "Xd" || "X$OPTARG" == 
"Xascend" ]]; then ENABLE_D="on" ENABLE_CPU="on" @@ -234,6 +236,10 @@ checkopts() ENABLE_AKG="on" echo "enable compile with akg" ;; + s) + ENABLE_SERVING="on" + echo "enable serving" + ;; *) echo "Unknown option ${opt}!" usage @@ -242,9 +248,12 @@ checkopts() done } checkopts "$@" -echo "---------------- mindspore: build start ----------------" +echo "---------------- MindSpore: build start ----------------" mkdir -pv "${BUILD_PATH}/package/mindspore/lib" git submodule update --init graphengine +if [[ "X$ENABLE_AKG" = "Xon" ]] && [[ "X$ENABLE_D" = "Xon" ]]; then + git submodule update --init --recursive akg +fi build_exit() { @@ -307,9 +316,13 @@ build_mindspore() if [[ "X$USE_GLOG" = "Xon" ]]; then CMAKE_ARGS="${CMAKE_ARGS} -DUSE_GLOG=ON" fi - if [[ "X$ENABLE_AKG" = "Xon" ]]; then + if [[ "X$ENABLE_AKG" = "Xon" ]] && [[ "X$ENABLE_D" = "Xon" ]]; then CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_AKG=ON" fi + if [[ "X$ENABLE_SERVING" = "Xon" ]]; then + CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_SERVING=ON" + fi + echo "${CMAKE_ARGS}" if [[ "X$INC_BUILD" = "Xoff" ]]; then cmake ${CMAKE_ARGS} ../.. 
diff --git a/cmake/dependency_graphengine.cmake b/cmake/dependency_graphengine.cmake index 991eb2a24a..91a471d1f2 100644 --- a/cmake/dependency_graphengine.cmake +++ b/cmake/dependency_graphengine.cmake @@ -36,6 +36,7 @@ elseif (DEFINED ENV{D_LINK_PATH}) find_library(hccl libhccl.so ${GE_LIB_PATH}) find_library(cce libcce.so ${GE_LIB_PATH}) find_library(resource libresource.so ${GE_LIB_PATH}) + find_library(error_manager liberror_manager.so ${GE_LIB_PATH}) else() # Ascend mode if(DEFINED ENV{ASCEND_CUSTOM_PATH}) @@ -54,6 +55,7 @@ else() find_library(msprof libmsprof.so ${ASCEND_RUNTIME_PATH}) find_library(register libregister.so ${ASCEND_RUNTIME_PATH}) find_library(resource libresource.so ${ASCEND_RUNTIME_PATH}) + find_library(error_manager liberror_manager.so ${ASCEND_RUNTIME_PATH}) endif() # compile libraries from following directories diff --git a/cmake/external_libs/gtest.cmake b/cmake/external_libs/gtest.cmake index df2eaec2cc..eb64655a86 100644 --- a/cmake/external_libs/gtest.cmake +++ b/cmake/external_libs/gtest.cmake @@ -1,4 +1,4 @@ -set(gtest_CXXFLAGS "-D_FORTIFY_SOURCE=2 -O2") +set(gtest_CXXFLAGS "-D_FORTIFY_SOURCE=2 -D_GLIBCXX_USE_CXX11_ABI=0 -O2") set(gtest_CFLAGS "-D_FORTIFY_SOURCE=2 -O2") mindspore_add_pkg(gtest VER 1.8.0 diff --git a/cmake/external_libs/icu4c.cmake b/cmake/external_libs/icu4c.cmake new file mode 100644 index 0000000000..7d13e4fd2a --- /dev/null +++ b/cmake/external_libs/icu4c.cmake @@ -0,0 +1,19 @@ +set(LIB_ICU_COMMON icuuc) +set(LIB_ICU_DATA icudata) +set(LIB_ICU_I18N icui18n) +if (CMAKE_SYSTEM_NAME MATCHES "Windows") + message("icu4c thirdparty do not support windows currently.") +else() + mindspore_add_pkg(icu4c + VER 67.1 + LIBS ${LIB_ICU_COMMON} ${LIB_ICU_DATA} ${LIB_ICU_I18N} + URL https://github.com/unicode-org/icu/archive/release-67-1.tar.gz + MD5 0c2662a2b0bc80b0eb56495205247c8f + CONFIGURE_COMMAND ./icu4c/source/runConfigureICU Linux --enable-rpath --disable-tests --disable-samples --disable-icuio --disable-extras 
ICU_DATA_FILTER_FILE=${CMAKE_SOURCE_DIR}/third_party/icu4c/filter.json + ) + include_directories(${icu4c_INC}) + add_library(mindspore::icuuc ALIAS icu4c::${LIB_ICU_COMMON}) + add_library(mindspore::icudata ALIAS icu4c::${LIB_ICU_DATA}) + add_library(mindspore::icui18n ALIAS icu4c::${LIB_ICU_I18N}) + add_definitions(-D ENABLE_ICU4C) +endif() \ No newline at end of file diff --git a/cmake/external_libs/opencv.cmake b/cmake/external_libs/opencv.cmake index b4f8d55a9e..4c7db821f4 100644 --- a/cmake/external_libs/opencv.cmake +++ b/cmake/external_libs/opencv.cmake @@ -8,7 +8,7 @@ elseif (${CMAKE_SYSTEM_NAME} MATCHES "Windows") set(opencv_CXXFLAGS "${opencv_CXXFLAGS} -Wno-attributes -Wno-unknown-pragmas") set(opencv_CXXFLAGS "${opencv_CXXFLAGS} -Wno-unused-value -Wno-implicit-fallthrough") else() - set(opencv_CXXFLAGS "-fstack-protector-all -Wno-maybe-uninitialized -Wno-unused-parameter -D_FORTIFY_SOURCE=2 -O2") + set(opencv_CXXFLAGS "-fstack-protector-all -Wno-maybe-uninitialized -Wno-unused-parameter -D_FORTIFY_SOURCE=2 -D_GLIBCXX_USE_CXX11_ABI=0 -O2") set(opencv_CFLAGS "-fstack-protector-all -Wno-maybe-uninitialized -Wno-unused-parameter -D_FORTIFY_SOURCE=2 -O2") set(opencv_LDFLAGS "-Wl,-z,relro,-z,now,-z,noexecstack") endif() diff --git a/cmake/external_libs/protobuf.cmake b/cmake/external_libs/protobuf.cmake index 6fe34577af..53cbebfcb9 100644 --- a/cmake/external_libs/protobuf.cmake +++ b/cmake/external_libs/protobuf.cmake @@ -1,9 +1,12 @@ set(protobuf_USE_STATIC_LIBS ON) if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin") set(protobuf_CXXFLAGS "-fstack-protector-all -Wno-uninitialized -Wno-unused-parameter -fPIC -fvisibility=hidden -D_FORTIFY_SOURCE=2 -O2") -else() +elseif (${CMAKE_SYSTEM_NAME} MATCHES "Windows") set(protobuf_CXXFLAGS "-fstack-protector-all -Wno-maybe-uninitialized -Wno-unused-parameter -fPIC -fvisibility=hidden -D_FORTIFY_SOURCE=2 -O2") +else() + set(protobuf_CXXFLAGS "-fstack-protector-all -Wno-maybe-uninitialized -Wno-unused-parameter -fPIC 
-fvisibility=hidden -D_FORTIFY_SOURCE=2 -D_GLIBCXX_USE_CXX11_ABI=0 -O2") endif() + set(protobuf_LDFLAGS "-Wl,-z,relro,-z,now,-z,noexecstack") set(_ms_tmp_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) set(CMAKE_CXX_FLAGS ${_ms_tmp_CMAKE_CXX_FLAGS}) diff --git a/cmake/external_libs/sqlite.cmake b/cmake/external_libs/sqlite.cmake index 1d280cef4b..6b7a5e24d4 100644 --- a/cmake/external_libs/sqlite.cmake +++ b/cmake/external_libs/sqlite.cmake @@ -1,10 +1,10 @@ if (WIN32) mindspore_add_pkg(sqlite - VER 3.31.1 + VER 3.32.2 LIBS sqlite3 - URL https://sqlite.org/2020/sqlite-amalgamation-3310100.zip - MD5 2b7bfcdd97dc281903a9aee966213fe4 - PATCHES ${CMAKE_SOURCE_DIR}/third_party/patch/sqlite/sqlite.windows.patch001 ${CMAKE_SOURCE_DIR}/third_party/patch/sqlite/sqlite.windows.patch002 ${CMAKE_SOURCE_DIR}/third_party/patch/sqlite/sqlite.windows.patch003 + URL https://sqlite.org/2020/sqlite-amalgamation-3320200.zip + MD5 1eccea18d248eb34c7378b2b3f63f1db + PATCHES ${CMAKE_SOURCE_DIR}/third_party/patch/sqlite/sqlite.windows.patch001 CMAKE_OPTION " " ) @@ -18,11 +18,11 @@ else () endif() set(sqlite_LDFLAGS "-Wl,-z,relro,-z,now,-z,noexecstack") mindspore_add_pkg(sqlite - VER 3.31.1 + VER 3.32.2 LIBS sqlite3 - URL https://github.com/sqlite/sqlite/archive/version-3.31.1.tar.gz - MD5 5f4e7b4016c15f4fb5855615279819da - PATCHES ${CMAKE_SOURCE_DIR}/third_party/patch/sqlite/sqlite.patch001 ${CMAKE_SOURCE_DIR}/third_party/patch/sqlite/sqlite.patch002 ${CMAKE_SOURCE_DIR}/third_party/patch/sqlite/sqlite.patch003 + URL https://github.com/sqlite/sqlite/archive/version-3.32.2.tar.gz + MD5 ea6d3b3289b4ac216fb06081a01ef101 + PATCHES ${CMAKE_SOURCE_DIR}/third_party/patch/sqlite/sqlite.patch001 CONFIGURE_COMMAND ./configure --enable-shared=no --disable-tcl --disable-editline --enable-json1) endif () diff --git a/cmake/mind_expression.cmake b/cmake/mind_expression.cmake index f20683a2d8..86337c1dd2 100644 --- a/cmake/mind_expression.cmake +++ b/cmake/mind_expression.cmake @@ -26,6 +26,9 @@ 
include_directories(${Python3_INCLUDE_DIRS}) include_directories(${CMAKE_SOURCE_DIR}/third_party) if (ENABLE_CPU) include(${CMAKE_SOURCE_DIR}/cmake/external_libs/mkl_dnn.cmake) + if (ENABLE_MPI) + include(${CMAKE_SOURCE_DIR}/cmake/external_libs/ompi.cmake) + endif() endif() if (ENABLE_GPU) @@ -36,7 +39,6 @@ if (ENABLE_GPU) if (ENABLE_MPI) include(${CMAKE_SOURCE_DIR}/cmake/external_libs/nccl.cmake) - include(${CMAKE_SOURCE_DIR}/cmake/external_libs/ompi.cmake) endif() endif() @@ -52,6 +54,7 @@ elseif(ENABLE_D OR ENABLE_TESTCASES) endif() if (ENABLE_MINDDATA) + include(${CMAKE_SOURCE_DIR}/cmake/external_libs/icu4c.cmake) include(${CMAKE_SOURCE_DIR}/cmake/external_libs/jpeg_turbo.cmake) include(${CMAKE_SOURCE_DIR}/cmake/external_libs/libtiff.cmake) include(${CMAKE_SOURCE_DIR}/cmake/external_libs/opencv.cmake) diff --git a/cmake/package.cmake b/cmake/package.cmake index 875ba5217d..1cff396ef1 100644 --- a/cmake/package.cmake +++ b/cmake/package.cmake @@ -91,7 +91,20 @@ if (ENABLE_MINDDATA) DESTINATION ${INSTALL_LIB_DIR} COMPONENT mindspore ) - + if (CMAKE_SYSTEM_NAME MATCHES "Windows") + message("icu4c does not support windows system temporarily") + else() + file(GLOB_RECURSE ICU4C_LIB_LIST + ${icu4c_LIBPATH}/libicuuc* + ${icu4c_LIBPATH}/libicudata* + ${icu4c_LIBPATH}/libicui18n* + ) + install( + FILES ${ICU4C_LIB_LIST} + DESTINATION ${INSTALL_LIB_DIR} + COMPONENT mindspore + ) + endif() endif () if (ENABLE_CPU) @@ -109,19 +122,20 @@ if (ENABLE_CPU) ) endif () +if (ENABLE_MPI) + install( + TARGETS _ms_mpi + DESTINATION ${INSTALL_BASE_DIR} + COMPONENT mindspore + ) +endif () + if (ENABLE_GPU) - if (ENABLE_MPI) - install( - TARGETS _ms_mpi - DESTINATION ${INSTALL_BASE_DIR} - COMPONENT mindspore - ) install( TARGETS gpu_collective DESTINATION ${INSTALL_LIB_DIR} COMPONENT mindspore ) - endif () install( TARGETS gpu_queue DESTINATION ${INSTALL_LIB_DIR} @@ -222,6 +236,16 @@ if (ENABLE_GPU) endif () endif () +if (ENABLE_D AND ENABLE_AKG) + set (AKG_PATH 
${CMAKE_SOURCE_DIR}/build/mindspore/akg) + install( + DIRECTORY + ${AKG_PATH}/akg + DESTINATION ${INSTALL_PY_DIR}/.. + COMPONENT mindspore + ) +endif () + if (EXISTS ${CMAKE_SOURCE_DIR}/mindspore/dataset) install( DIRECTORY ${CMAKE_SOURCE_DIR}/mindspore/dataset diff --git a/cmake/package_script.cmake b/cmake/package_script.cmake index dcc8ee0ad0..94ffc71b49 100644 --- a/cmake/package_script.cmake +++ b/cmake/package_script.cmake @@ -51,7 +51,7 @@ endif () # get git commit id set(GIT_COMMIT_ID "") execute_process( - COMMAND ${GIT} log --format='[sha1]:%h,[branch]:%d' -1 + COMMAND ${GIT} log --format='[sha1]:%h,[branch]:%d' --abbrev=8 -1 OUTPUT_VARIABLE GIT_COMMIT_ID WORKING_DIRECTORY ${MS_ROOT_DIR} ERROR_QUIET) diff --git a/example/googlenet_cifar10/README.md b/example/googlenet_cifar10/README.md deleted file mode 100755 index 1acc7d1e1e..0000000000 --- a/example/googlenet_cifar10/README.md +++ /dev/null @@ -1,106 +0,0 @@ -# Googlenet Example - -## Description - -This example is for Googlenet model training and evaluation. - -## Requirements - -- Install [MindSpore](https://www.mindspore.cn/install/en). - -- Download the CIFAR-10 binary version dataset. - -> Unzip the CIFAR-10 dataset to any path you want and the folder structure should be as follows: -> ``` -> . -> ├── cifar-10-batches-bin # train dataset -> └── cifar-10-verify-bin # infer dataset -> ``` - -## Running the Example - -### Training - -``` -python train.py --data_path=your_data_path --device_id=6 > out.train.log 2>&1 & -``` -The python command above will run in the background, you can view the results through the file `out.train.log`. - -After training, you'll get some checkpoint files under the script folder by default. - -You will get the loss value as following: -``` -# grep "loss is " out.train.log -epoch: 1 step: 390, loss is 1.4842823 -epcoh: 2 step: 390, loss is 1.0897788 -... 
-``` - -### Evaluation - -``` -python eval.py --data_path=your_data_path --device_id=6 --checkpoint_path=./train_googlenet_cifar10-125-390.ckpt > out.eval.log 2>&1 & -``` -The above python command will run in the background, you can view the results through the file `out.eval.log`. - -You will get the accuracy as following: -``` -# grep "result: " out.eval.log -result: {'acc': 0.934} -``` - -### Distribute Training -``` -sh run_distribute_train.sh rank_table.json your_data_path -``` -The above shell script will run distribute training in the background, you can view the results through the file `train_parallel[X]/log`. - -You will get the loss value as following: -``` -# grep "result: " train_parallel*/log -train_parallel0/log:epoch: 1 step: 48, loss is 1.4302931 -train_parallel0/log:epcoh: 2 step: 48, loss is 1.4023874 -... -train_parallel1/log:epoch: 1 step: 48, loss is 1.3458025 -train_parallel1/log:epcoh: 2 step: 48, loss is 1.3729336 -... -... -``` -> About rank_table.json, you can refer to the [distributed training tutorial](https://www.mindspore.cn/tutorial/en/master/advanced_use/distributed_training.html). - -## Usage: - -### Training -``` -usage: train.py [--device_target TARGET][--data_path DATA_PATH] - [--device_id DEVICE_ID] - -parameters/options: - --device_target the training backend type, default is Ascend. - --data_path the storage path of dataset - --device_id the device which used to train model. - -``` - -### Evaluation - -``` -usage: eval.py [--device_target TARGET][--data_path DATA_PATH] - [--device_id DEVICE_ID][--checkpoint_path CKPT_PATH] - -parameters/options: - --device_target the evaluation backend type, default is Ascend. - --data_path the storage path of datasetd - --device_id the device which used to evaluate model. - --checkpoint_path the checkpoint file path used to evaluate model. 
-``` - -### Distribute Training - -``` -Usage: sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATA_PATH] - -parameters/options: - MINDSPORE_HCCL_CONFIG_PATH HCCL configuration file path. - DATA_PATH the storage path of dataset. -``` diff --git a/example/graph_to_mindrecord/README.md b/example/graph_to_mindrecord/README.md index cc6f6a1c70..df7ab33444 100644 --- a/example/graph_to_mindrecord/README.md +++ b/example/graph_to_mindrecord/README.md @@ -24,9 +24,6 @@ This example provides an efficient way to generate MindRecord. Users only need t 1. Download and prepare the Cora dataset as required. - > [Cora dataset download address](https://github.com/jzaldi/datasets/tree/master/cora) - - 2. Edit write_cora.sh and modify the parameters ``` --mindrecord_file: output MindRecord file. diff --git a/example/graph_to_mindrecord/citeseer/mr_api.py b/example/graph_to_mindrecord/citeseer/mr_api.py index 8b1f424b0a..aa9e2a2c4d 100644 --- a/example/graph_to_mindrecord/citeseer/mr_api.py +++ b/example/graph_to_mindrecord/citeseer/mr_api.py @@ -15,29 +15,27 @@ """ User-defined API for MindRecord GNN writer. 
""" -import csv import os +import pickle as pkl import numpy as np import scipy.sparse as sp +from mindspore import log as logger # parse args from command line parameter 'graph_api_args' # args delimiter is ':' args = os.environ['graph_api_args'].split(':') -CITESEER_CONTENT_FILE = args[0] -CITESEER_CITES_FILE = args[1] -CITESEER_MINDRECRD_LABEL_FILE = CITESEER_CONTENT_FILE + "_label_mindrecord" -CITESEER_MINDRECRD_ID_MAP_FILE = CITESEER_CONTENT_FILE + "_id_mindrecord" - -node_id_map = {} +CITESEER_PATH = args[0] +dataset_str = 'citeseer' # profile: (num_features, feature_data_types, feature_shapes) -node_profile = (2, ["float32", "int64"], [[-1], [-1]]) +node_profile = (2, ["float32", "int32"], [[-1], [-1]]) edge_profile = (0, [], []) +node_ids = [] + def _normalize_citeseer_features(features): - features = np.array(features) row_sum = np.array(features.sum(1)) r_inv = np.power(row_sum * 1.0, -1).flatten() r_inv[np.isinf(r_inv)] = 0. @@ -46,6 +44,14 @@ def _normalize_citeseer_features(features): return features +def _parse_index_file(filename): + """Parse index file.""" + index = [] + for line in open(filename): + index.append(int(line.strip())) + return index + + def yield_nodes(task_id=0): """ Generate node data @@ -53,30 +59,47 @@ def yield_nodes(task_id=0): Yields: data (dict): data row which is dict. 
""" - print("Node task is {}".format(task_id)) - label_types = {} - label_size = 0 - node_num = 0 - with open(CITESEER_CONTENT_FILE) as content_file: - content_reader = csv.reader(content_file, delimiter='\t') - line_count = 0 - for row in content_reader: - if not row[-1] in label_types: - label_types[row[-1]] = label_size - label_size += 1 - if not row[0] in node_id_map: - node_id_map[row[0]] = node_num - node_num += 1 - raw_features = [[int(x) for x in row[1:-1]]] - node = {'id': node_id_map[row[0]], 'type': 0, 'feature_1': _normalize_citeseer_features(raw_features), - 'feature_2': [label_types[row[-1]]]} - yield node - line_count += 1 - print('Processed {} lines for nodes.'.format(line_count)) - # print('label types {}.'.format(label_types)) - with open(CITESEER_MINDRECRD_LABEL_FILE, 'w') as f: - for k in label_types: - print(k + ',' + str(label_types[k]), file=f) + logger.info("Node task is {}".format(task_id)) + names = ['x', 'y', 'tx', 'ty', 'allx', 'ally'] + objects = [] + for name in names: + with open("{}/ind.{}.{}".format(CITESEER_PATH, dataset_str, name), 'rb') as f: + objects.append(pkl.load(f, encoding='latin1')) + x, y, tx, ty, allx, ally = tuple(objects) + test_idx_reorder = _parse_index_file( + "{}/ind.{}.test.index".format(CITESEER_PATH, dataset_str)) + test_idx_range = np.sort(test_idx_reorder) + + tx = _normalize_citeseer_features(tx) + allx = _normalize_citeseer_features(allx) + + # Fix citeseer dataset (there are some isolated nodes in the graph) + # Find isolated nodes, add them as zero-vecs into the right position + test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder)+1) + tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1])) + tx_extended[test_idx_range-min(test_idx_range), :] = tx + tx = tx_extended + ty_extended = np.zeros((len(test_idx_range_full), y.shape[1])) + ty_extended[test_idx_range-min(test_idx_range), :] = ty + ty = ty_extended + + features = sp.vstack((allx, tx)).tolil() + 
features[test_idx_reorder, :] = features[test_idx_range, :] + features = features.A + + labels = np.vstack((ally, ty)) + labels[test_idx_reorder, :] = labels[test_idx_range, :] + + line_count = 0 + for i, label in enumerate(labels): + if not 1 in label.tolist(): + continue + node = {'id': i, 'type': 0, 'feature_1': features[i].tolist(), + 'feature_2': label.tolist().index(1)} + line_count += 1 + node_ids.append(i) + yield node + logger.info('Processed {} lines for nodes.'.format(line_count)) def yield_edges(task_id=0): @@ -86,24 +109,21 @@ def yield_edges(task_id=0): Yields: data (dict): data row which is dict. """ - print("Edge task is {}".format(task_id)) - # print(map_string_int) - with open(CITESEER_CITES_FILE) as cites_file: - cites_reader = csv.reader(cites_file, delimiter='\t') + logger.info("Edge task is {}".format(task_id)) + with open("{}/ind.{}.graph".format(CITESEER_PATH, dataset_str), 'rb') as f: + graph = pkl.load(f, encoding='latin1') line_count = 0 - for row in cites_reader: - if not row[0] in node_id_map: - print('Source node {} does not exist.'.format(row[0])) - continue - if not row[1] in node_id_map: - print('Destination node {} does not exist.'.format(row[1])) - continue - line_count += 1 - edge = {'id': line_count, - 'src_id': node_id_map[row[0]], 'dst_id': node_id_map[row[1]], 'type': 0} - yield edge - - with open(CITESEER_MINDRECRD_ID_MAP_FILE, 'w') as f: - for k in node_id_map: - print(k + ',' + str(node_id_map[k]), file=f) - print('Processed {} lines for edges.'.format(line_count)) + for i in graph: + for dst_id in graph[i]: + if not i in node_ids: + logger.info('Source node {} does not exist.'.format(i)) + continue + if not dst_id in node_ids: + logger.info('Destination node {} does not exist.'.format( + dst_id)) + continue + edge = {'id': line_count, + 'src_id': i, 'dst_id': dst_id, 'type': 0} + line_count += 1 + yield edge + logger.info('Processed {} lines for edges.'.format(line_count)) diff --git 
a/example/graph_to_mindrecord/cora/mr_api.py b/example/graph_to_mindrecord/cora/mr_api.py index 0963fd78f7..aeeb0e04de 100644 --- a/example/graph_to_mindrecord/cora/mr_api.py +++ b/example/graph_to_mindrecord/cora/mr_api.py @@ -15,29 +15,24 @@ """ User-defined API for MindRecord GNN writer. """ -import csv import os +import pickle as pkl import numpy as np import scipy.sparse as sp # parse args from command line parameter 'graph_api_args' # args delimiter is ':' args = os.environ['graph_api_args'].split(':') -CORA_CONTENT_FILE = args[0] -CORA_CITES_FILE = args[1] -CORA_MINDRECRD_LABEL_FILE = CORA_CONTENT_FILE + "_label_mindrecord" -CORA_CONTENT_ID_MAP_FILE = CORA_CONTENT_FILE + "_id_mindrecord" - -node_id_map = {} +CORA_PATH = args[0] +dataset_str = 'cora' # profile: (num_features, feature_data_types, feature_shapes) -node_profile = (2, ["float32", "int64"], [[-1], [-1]]) +node_profile = (2, ["float32", "int32"], [[-1], [-1]]) edge_profile = (0, [], []) def _normalize_cora_features(features): - features = np.array(features) row_sum = np.array(features.sum(1)) r_inv = np.power(row_sum * 1.0, -1).flatten() r_inv[np.isinf(r_inv)] = 0. @@ -46,6 +41,14 @@ def _normalize_cora_features(features): return features +def _parse_index_file(filename): + """Parse index file.""" + index = [] + for line in open(filename): + index.append(int(line.strip())) + return index + + def yield_nodes(task_id=0): """ Generate node data @@ -54,32 +57,32 @@ def yield_nodes(task_id=0): data (dict): data row which is dict. 
""" print("Node task is {}".format(task_id)) - label_types = {} - label_size = 0 - node_num = 0 - with open(CORA_CONTENT_FILE) as content_file: - content_reader = csv.reader(content_file, delimiter=',') - line_count = 0 - for row in content_reader: - if line_count == 0: - line_count += 1 - continue - if not row[0] in node_id_map: - node_id_map[row[0]] = node_num - node_num += 1 - if not row[-1] in label_types: - label_types[row[-1]] = label_size - label_size += 1 - raw_features = [[int(x) for x in row[1:-1]]] - node = {'id': node_id_map[row[0]], 'type': 0, 'feature_1': _normalize_cora_features(raw_features), - 'feature_2': [label_types[row[-1]]]} - yield node - line_count += 1 + + names = ['tx', 'ty', 'allx', 'ally'] + objects = [] + for name in names: + with open("{}/ind.{}.{}".format(CORA_PATH, dataset_str, name), 'rb') as f: + objects.append(pkl.load(f, encoding='latin1')) + tx, ty, allx, ally = tuple(objects) + test_idx_reorder = _parse_index_file( + "{}/ind.{}.test.index".format(CORA_PATH, dataset_str)) + test_idx_range = np.sort(test_idx_reorder) + + features = sp.vstack((allx, tx)).tolil() + features[test_idx_reorder, :] = features[test_idx_range, :] + features = _normalize_cora_features(features) + features = features.A + + labels = np.vstack((ally, ty)) + labels[test_idx_reorder, :] = labels[test_idx_range, :] + + line_count = 0 + for i, label in enumerate(labels): + node = {'id': i, 'type': 0, 'feature_1': features[i].tolist(), + 'feature_2': label.tolist().index(1)} + line_count += 1 + yield node print('Processed {} lines for nodes.'.format(line_count)) - print('label types {}.'.format(label_types)) - with open(CORA_MINDRECRD_LABEL_FILE, 'w') as f: - for k in label_types: - print(k + ',' + str(label_types[k]), file=f) def yield_edges(task_id=0): @@ -90,24 +93,13 @@ def yield_edges(task_id=0): data (dict): data row which is dict. 
""" print("Edge task is {}".format(task_id)) - with open(CORA_CITES_FILE) as cites_file: - cites_reader = csv.reader(cites_file, delimiter=',') + with open("{}/ind.{}.graph".format(CORA_PATH, dataset_str), 'rb') as f: + graph = pkl.load(f, encoding='latin1') line_count = 0 - for row in cites_reader: - if line_count == 0: + for i in graph: + for dst_id in graph[i]: + edge = {'id': line_count, + 'src_id': i, 'dst_id': dst_id, 'type': 0} line_count += 1 - continue - if not row[0] in node_id_map: - print('Source node {} does not exist.'.format(row[0])) - continue - if not row[1] in node_id_map: - print('Destination node {} does not exist.'.format(row[1])) - continue - edge = {'id': line_count, - 'src_id': node_id_map[row[0]], 'dst_id': node_id_map[row[1]], 'type': 0} - yield edge - line_count += 1 + yield edge print('Processed {} lines for edges.'.format(line_count)) - with open(CORA_CONTENT_ID_MAP_FILE, 'w') as f: - for k in node_id_map: - print(k + ',' + str(node_id_map[k]), file=f) diff --git a/example/graph_to_mindrecord/graph_map_schema.py b/example/graph_to_mindrecord/graph_map_schema.py index e131de9f65..1da1ced2f7 100644 --- a/example/graph_to_mindrecord/graph_map_schema.py +++ b/example/graph_to_mindrecord/graph_map_schema.py @@ -16,6 +16,7 @@ Graph data convert tool for MindRecord. 
""" import numpy as np +from mindspore import log as logger __all__ = ['GraphMapSchema'] @@ -41,6 +42,7 @@ class GraphMapSchema: "edge_feature_index": {"type": "int32", "shape": [-1]} } + @property def get_schema(self): """ Get schema @@ -52,6 +54,7 @@ class GraphMapSchema: Set node features profile """ if num_features != len(features_data_type) or num_features != len(features_shape): + logger.info("Node feature profile is not match.") raise ValueError("Node feature profile is not match.") self.num_node_features = num_features @@ -66,6 +69,7 @@ class GraphMapSchema: Set edge features profile """ if num_features != len(features_data_type) or num_features != len(features_shape): + logger.info("Edge feature profile is not match.") raise ValueError("Edge feature profile is not match.") self.num_edge_features = num_features @@ -83,6 +87,10 @@ class GraphMapSchema: Returns: graph data with union schema """ + if node is None: + logger.info("node cannot be None.") + raise ValueError("node cannot be None.") + node_graph = {"first_id": node["id"], "second_id": 0, "third_id": 0, "attribute": 'n', "type": node["type"], "node_feature_index": []} for i in range(self.num_node_features): @@ -117,6 +125,10 @@ class GraphMapSchema: Returns: graph data with union schema """ + if edge is None: + logger.info("edge cannot be None.") + raise ValueError("edge cannot be None.") + edge_graph = {"first_id": edge["id"], "second_id": edge["src_id"], "third_id": edge["dst_id"], "attribute": 'e', "type": edge["type"], "edge_feature_index": []} diff --git a/example/graph_to_mindrecord/sns/__init__.py b/example/graph_to_mindrecord/sns/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/example/graph_to_mindrecord/sns/mr_api.py b/example/graph_to_mindrecord/sns/mr_api.py new file mode 100644 index 0000000000..4e01441601 --- /dev/null +++ b/example/graph_to_mindrecord/sns/mr_api.py @@ -0,0 +1,81 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache 
License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +""" +User-defined API for MindRecord GNN writer. +""" +social_data = [[348, 350], [348, 327], [348, 329], [348, 331], [348, 335], + [348, 336], [348, 337], [348, 338], [348, 340], [348, 341], + [348, 342], [348, 343], [348, 344], [348, 345], [348, 346], + [348, 347], [347, 351], [347, 327], [347, 329], [347, 331], + [347, 335], [347, 341], [347, 345], [347, 346], [346, 335], + [346, 340], [346, 339], [346, 349], [346, 353], [346, 354], + [346, 341], [346, 345], [345, 335], [345, 336], [345, 341], + [344, 338], [344, 342], [343, 332], [343, 338], [343, 342], + [342, 332], [340, 349], [334, 349], [333, 349], [330, 349], + [328, 349], [359, 349], [358, 352], [358, 349], [358, 354], + [358, 356], [357, 350], [357, 354], [357, 356], [356, 350], + [355, 352], [353, 350], [352, 349], [351, 349], [350, 349]] + +# profile: (num_features, feature_data_types, feature_shapes) +node_profile = (0, [], []) +edge_profile = (0, [], []) + + +def yield_nodes(task_id=0): + """ + Generate node data + + Yields: + data (dict): data row which is dict. 
+ """ + print("Node task is {}".format(task_id)) + node_list = [] + for edge in social_data: + src, dst = edge + if src not in node_list: + node_list.append(src) + if dst not in node_list: + node_list.append(dst) + node_list.sort() + print(node_list) + for node_id in node_list: + node = {'id': node_id, 'type': 1} + yield node + + +def yield_edges(task_id=0): + """ + Generate edge data + + Yields: + data (dict): data row which is dict. + """ + print("Edge task is {}".format(task_id)) + line_count = 0 + for undirected_edge in social_data: + line_count += 1 + edge = { + 'id': line_count, + 'src_id': undirected_edge[0], + 'dst_id': undirected_edge[1], + 'type': 1} + yield edge + line_count += 1 + edge = { + 'id': line_count, + 'src_id': undirected_edge[1], + 'dst_id': undirected_edge[0], + 'type': 1} + yield edge diff --git a/example/graph_to_mindrecord/write_citeseer.sh b/example/graph_to_mindrecord/write_citeseer.sh index 33235372fa..523b2b8850 100644 --- a/example/graph_to_mindrecord/write_citeseer.sh +++ b/example/graph_to_mindrecord/write_citeseer.sh @@ -9,4 +9,4 @@ python writer.py --mindrecord_script citeseer \ --mindrecord_partitions 1 \ --mindrecord_header_size_by_bit 18 \ --mindrecord_page_size_by_bit 20 \ ---graph_api_args "$SRC_PATH/citeseer.content:$SRC_PATH/citeseer.cites" +--graph_api_args "$SRC_PATH" diff --git a/example/graph_to_mindrecord/write_cora.sh b/example/graph_to_mindrecord/write_cora.sh index 84ccf34f5e..fd1b6fc92a 100644 --- a/example/graph_to_mindrecord/write_cora.sh +++ b/example/graph_to_mindrecord/write_cora.sh @@ -9,4 +9,4 @@ python writer.py --mindrecord_script cora \ --mindrecord_partitions 1 \ --mindrecord_header_size_by_bit 18 \ --mindrecord_page_size_by_bit 20 \ ---graph_api_args "$SRC_PATH/cora_content.csv:$SRC_PATH/cora_cites.csv" +--graph_api_args "$SRC_PATH" diff --git a/example/graph_to_mindrecord/write_sns.sh b/example/graph_to_mindrecord/write_sns.sh new file mode 100644 index 0000000000..f564ddc8ff --- /dev/null +++ 
b/example/graph_to_mindrecord/write_sns.sh @@ -0,0 +1,10 @@ +#!/bin/bash +MINDRECORD_PATH=/tmp/sns + +rm -f $MINDRECORD_PATH/* + +python writer.py --mindrecord_script sns \ +--mindrecord_file "$MINDRECORD_PATH/sns" \ +--mindrecord_partitions 1 \ +--mindrecord_header_size_by_bit 14 \ +--mindrecord_page_size_by_bit 15 diff --git a/example/graph_to_mindrecord/writer.py b/example/graph_to_mindrecord/writer.py index 1024c82372..9dce63e265 100644 --- a/example/graph_to_mindrecord/writer.py +++ b/example/graph_to_mindrecord/writer.py @@ -164,7 +164,7 @@ if __name__ == "__main__": num_features, feature_data_types, feature_shapes = mr_api.edge_profile graph_map_schema.set_edge_feature_profile(num_features, feature_data_types, feature_shapes) - graph_schema = graph_map_schema.get_schema() + graph_schema = graph_map_schema.get_schema # init writer writer = init_writer(graph_schema) diff --git a/example/nlp_to_mindrecord/CLUERNER2020/README.md b/example/nlp_to_mindrecord/CLUERNER2020/README.md new file mode 100644 index 0000000000..c862156a47 --- /dev/null +++ b/example/nlp_to_mindrecord/CLUERNER2020/README.md @@ -0,0 +1,82 @@ +# Guideline to Convert Training Data CLUERNER2020 to MindRecord For Bert Fine Tuning + + + +- [What does the example do](#what-does-the-example-do) +- [How to use the example to process CLUERNER2020](#how-to-use-the-example-to-process-cluerner2020) + - [Download CLUERNER2020 and unzip](#download-cluerner2020-and-unzip) + - [Generate MindRecord](#generate-mindrecord) + - [Create MindDataset By MindRecord](#create-minddataset-by-mindrecord) + + + + +## What does the example do + +This example is based on [CLUERNER2020](https://www.cluebenchmarks.com/introduce.html) training data, generating MindRecord file, and finally used for Bert Fine Tuning progress. + +1. run.sh: generate MindRecord entry script +2. run_read.py: create MindDataset by MindRecord entry script. + - create_dataset.py: use MindDataset to read MindRecord to generate dataset. 
+ +## How to use the example to process CLUERNER2020 + +Download CLUERNER2020, convert it to MindRecord, use MindDataset to read MindRecord. + +### Download CLUERNER2020 and unzip + +1. Download the training data zip. + > [CLUERNER2020 dataset download address](https://www.cluebenchmarks.com/introduce.html) **-> 任务介绍 -> CLUENER 细粒度命名实体识别 -> cluener下载链接** + +2. Unzip the training data to dir example/nlp_to_mindrecord/CLUERNER2020/cluener_public. + ``` + unzip -d {your-mindspore}/example/nlp_to_mindrecord/CLUERNER2020/data/cluener_public cluener_public.zip + ``` + +### Generate MindRecord + +1. Run the run.sh script. + ```bash + bash run.sh + ``` + +2. Output like this: + ``` + ... + [INFO] ME(17603:139620983514944,MainProcess):2020-04-28-16:56:12.498.235 [mindspore/mindrecord/filewriter.py:313] The list of mindrecord files created are: ['data/train.mindrecord'], and the list of index files are: ['data/train.mindrecord.db'] + ... + [INFO] ME(17603,python):2020-04-28-16:56:13.400.175 [mindspore/ccsrc/mindrecord/io/shard_writer.cc:667] WriteRawData] Write 1 records successfully. + [INFO] ME(17603,python):2020-04-28-16:56:13.400.863 [mindspore/ccsrc/mindrecord/io/shard_writer.cc:667] WriteRawData] Write 1 records successfully. + [INFO] ME(17603,python):2020-04-28-16:56:13.401.534 [mindspore/ccsrc/mindrecord/io/shard_writer.cc:667] WriteRawData] Write 1 records successfully. + [INFO] ME(17603,python):2020-04-28-16:56:13.402.179 [mindspore/ccsrc/mindrecord/io/shard_writer.cc:667] WriteRawData] Write 1 records successfully. + [INFO] ME(17603,python):2020-04-28-16:56:13.402.702 [mindspore/ccsrc/mindrecord/io/shard_writer.cc:667] WriteRawData] Write 1 records successfully. + ... + [INFO] ME(17603:139620983514944,MainProcess):2020-04-28-16:56:13.431.208 [mindspore/mindrecord/filewriter.py:313] The list of mindrecord files created are: ['data/dev.mindrecord'], and the list of index files are: ['data/dev.mindrecord.db'] + ``` + +3. 
Generate files like this: + ```bash + $ ls output/ + dev.mindrecord dev.mindrecord.db README.md train.mindrecord train.mindrecord.db + ``` + +### Create MindDataset By MindRecord + +1. Run the run_read.sh script. + ```bash + bash run_read.sh + ``` + +2. Output like this: + ``` + ... + example 1340: input_ids: [ 101 3173 1290 4852 7676 3949 122 3299 123 126 3189 4510 8020 6381 5442 7357 2590 3636 8021 7676 3949 4294 1166 6121 3124 1277 6121 3124 7270 2135 3295 5789 3326 123 126 3189 1355 6134 1093 1325 3173 2399 6590 6791 8024 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] + example 1340: input_mask: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] + example 1340: segment_ids: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] + example 1340: label_ids: [ 0 18 19 20 2 4 0 0 0 0 0 0 0 34 36 26 27 28 0 34 35 35 35 35 35 35 35 35 35 36 26 27 28 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] + example 1341: input_ids: [ 101 1728 711 4293 3868 1168 2190 2150 3791 934 3633 3428 4638 6237 7025 8024 3297 1400 5310 3362 6206 5023 5401 1744 3297 7770 3791 7368 976 1139 1104 2137 511 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] + example 1341: input_mask: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] + example 1341: segment_ids: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] + example 1341: label_ids: [ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 18 19 19 19 19 20 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] + ... 
+ ``` diff --git a/example/nlp_to_mindrecord/CLUERNER2020/create_dataset.py b/example/nlp_to_mindrecord/CLUERNER2020/create_dataset.py new file mode 100644 index 0000000000..616bc71028 --- /dev/null +++ b/example/nlp_to_mindrecord/CLUERNER2020/create_dataset.py @@ -0,0 +1,36 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""create MindDataset by MindRecord""" +import mindspore.dataset as ds + +def create_dataset(data_file): + """create MindDataset""" + num_readers = 4 + data_set = ds.MindDataset(dataset_file=data_file, num_parallel_workers=num_readers, shuffle=True) + index = 0 + for item in data_set.create_dict_iterator(): + # print("example {}: {}".format(index, item)) + print("example {}: input_ids: {}".format(index, item['input_ids'])) + print("example {}: input_mask: {}".format(index, item['input_mask'])) + print("example {}: segment_ids: {}".format(index, item['segment_ids'])) + print("example {}: label_ids: {}".format(index, item['label_ids'])) + index += 1 + if index % 1000 == 0: + print("read rows: {}".format(index)) + print("total rows: {}".format(index)) + +if __name__ == '__main__': + create_dataset('output/train.mindrecord') + create_dataset('output/dev.mindrecord') diff --git a/example/nlp_to_mindrecord/CLUERNER2020/data/.gitignore b/example/nlp_to_mindrecord/CLUERNER2020/data/.gitignore new file mode 100644 index 
0000000000..cbbd6256c0 --- /dev/null +++ b/example/nlp_to_mindrecord/CLUERNER2020/data/.gitignore @@ -0,0 +1 @@ +cluener_public diff --git a/example/nlp_to_mindrecord/CLUERNER2020/data/README.md b/example/nlp_to_mindrecord/CLUERNER2020/data/README.md new file mode 100644 index 0000000000..b54948808e --- /dev/null +++ b/example/nlp_to_mindrecord/CLUERNER2020/data/README.md @@ -0,0 +1 @@ +## The input dataset diff --git a/example/nlp_to_mindrecord/CLUERNER2020/output/README.md b/example/nlp_to_mindrecord/CLUERNER2020/output/README.md new file mode 100644 index 0000000000..7904933f43 --- /dev/null +++ b/example/nlp_to_mindrecord/CLUERNER2020/output/README.md @@ -0,0 +1 @@ +## output dir diff --git a/example/nlp_to_mindrecord/CLUERNER2020/run.sh b/example/nlp_to_mindrecord/CLUERNER2020/run.sh new file mode 100644 index 0000000000..15c6aa4362 --- /dev/null +++ b/example/nlp_to_mindrecord/CLUERNER2020/run.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +rm -f output/train.mindrecord* +rm -f output/dev.mindrecord* + +if [ ! -d "../../../third_party/to_mindrecord/CLUERNER2020" ]; then + echo "The patch base dir ../../../third_party/to_mindrecord/CLUERNER2020 is not exist." + exit 1 +fi + +if [ ! 
-f "../../../third_party/patch/to_mindrecord/CLUERNER2020/data_processor_seq.patch" ]; then + echo "The patch file ../../../third_party/patch/to_mindrecord/CLUERNER2020/data_processor_seq.patch is not exist." + exit 1 +fi + +# patch for data_processor_seq.py +patch -p0 -d ../../../third_party/to_mindrecord/CLUERNER2020/ -o data_processor_seq_patched.py < ../../../third_party/patch/to_mindrecord/CLUERNER2020/data_processor_seq.patch +if [ $? -ne 0 ]; then + echo "Patch ../../../third_party/to_mindrecord/CLUERNER2020/data_processor_seq.py failed" + exit 1 +fi + +# use patched script +python ../../../third_party/to_mindrecord/CLUERNER2020/data_processor_seq_patched.py \ +--vocab_file=../../../third_party/to_mindrecord/CLUERNER2020/vocab.txt \ +--label2id_file=../../../third_party/to_mindrecord/CLUERNER2020/label2id.json diff --git a/example/nlp_to_mindrecord/CLUERNER2020/run_read.sh b/example/nlp_to_mindrecord/CLUERNER2020/run_read.sh new file mode 100644 index 0000000000..1ffe4de1cf --- /dev/null +++ b/example/nlp_to_mindrecord/CLUERNER2020/run_read.sh @@ -0,0 +1,17 @@ +#!/bin/bash +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +python create_dataset.py diff --git a/example/nlp_to_mindrecord/enwiki/README.md b/example/nlp_to_mindrecord/enwiki/README.md new file mode 100644 index 0000000000..e92e8dbcc6 --- /dev/null +++ b/example/nlp_to_mindrecord/enwiki/README.md @@ -0,0 +1,173 @@ +# Guideline to Convert Training Data enwiki to MindRecord For Bert Pre Training + + + +- [What does the example do](#what-does-the-example-do) +- [How to use the example to process enwiki](#how-to-use-the-example-to-process-enwiki) + - [Download enwiki training data](#download-enwiki-training-data) + - [Process the enwiki](#process-the-enwiki) + - [Generate MindRecord](#generate-mindrecord) + - [Create MindDataset By MindRecord](#create-minddataset-by-mindrecord) + + + + +## What does the example do + +This example is based on [enwiki](https://dumps.wikimedia.org/enwiki) training data, generating MindRecord file, and finally used for Bert network training. + +1. run.sh: generate MindRecord entry script. +2. run_read.py: create MindDataset by MindRecord entry script. + - create_dataset.py: use MindDataset to read MindRecord to generate dataset. + +## How to use the example to process enwiki + +Download enwiki data, process it, convert it to MindRecord, use MindDataset to read MindRecord. + +### Download enwiki training data + +> [enwiki dataset download address](https://dumps.wikimedia.org/enwiki) **-> 20200501 -> enwiki-20200501-pages-articles-multistream.xml.bz2** + +### Process the enwiki + +1. Please follow the steps in [process enwiki](https://github.com/mlperf/training/tree/master/language_model/tensorflow/bert) +- All permissions of this step belong to the link address website. + +### Generate MindRecord + +1. Run the run.sh script. + ``` + bash run.sh input_dir output_dir vocab_file + ``` + - input_dir: the directory which contains files like 'part-00251-of-00500'. 
+ - output_dir: which will store the output mindrecord files. + - vocab_file: the vocab file which you can download from other opensource project. + +2. The output like this: + ``` + ... + Begin preprocess Wed Jun 10 09:21:23 CST 2020 + Begin preprocess input file: /mnt/data/results/part-00000-of-00500 + Begin output file: part-00000-of-00500.mindrecord + Total task: 510, processing: 1 + Begin preprocess input file: /mnt/data/results/part-00001-of-00500 + Begin output file: part-00001-of-00500.mindrecord + Total task: 510, processing: 2 + Begin preprocess input file: /mnt/data/results/part-00002-of-00500 + Begin output file: part-00002-of-00500.mindrecord + Total task: 510, processing: 3 + Begin preprocess input file: /mnt/data/results/part-00003-of-00500 + Begin output file: part-00003-of-00500.mindrecord + Total task: 510, processing: 4 + Begin preprocess input file: /mnt/data/results/part-00004-of-00500 + Begin output file: part-00004-of-00500.mindrecord + Total task: 510, processing: 4 + ... + ``` + +3. Generate files like this: + ```bash + $ ls {your_output_dir}/ + part-00000-of-00500.mindrecord part-00000-of-00500.mindrecord.db part-00001-of-00500.mindrecord part-00001-of-00500.mindrecord.db part-00002-of-00500.mindrecord part-00002-of-00500.mindrecord.db ... + ``` + +### Create MindDataset By MindRecord + +1. Run the run_read.sh script. + ```bash + bash run_read.sh input_dir + ``` + - input_dir: the directory which contains mindrecord files. + +2. The output like this: + ``` + ... 
+ example 633: input_ids: [ 101 2043 19781 4305 2140 4520 2041 1010 103 2034 2455 2002 + 7879 2003 1996 2455 1997 103 26378 4160 1012 102 7291 2001 + 1996 103 1011 2343 1997 6327 1010 3423 1998 103 4262 2005 + 1996 2118 1997 2329 3996 103 102 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0] + example 633: input_mask: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 + 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] + example 633: segment_ids: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 + 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] + example 633: masked_lm_positions: [ 8 17 20 25 33 41 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0] + example 633: masked_lm_ids: [ 1996 16137 1012 3580 2451 1012 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0] + example 633: masked_lm_weights: [1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. + 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 
0. 0. 0. 0. 0. 0. + 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. + 0. 0. 0. 0.] + example 633: next_sentence_labels: [1] + ... + ``` diff --git a/example/nlp_to_mindrecord/enwiki/create_dataset.py b/example/nlp_to_mindrecord/enwiki/create_dataset.py new file mode 100644 index 0000000000..d90d12b7f2 --- /dev/null +++ b/example/nlp_to_mindrecord/enwiki/create_dataset.py @@ -0,0 +1,43 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""create MindDataset by MindRecord""" +import argparse +import mindspore.dataset as ds + +def create_dataset(data_file): + """create MindDataset""" + num_readers = 4 + data_set = ds.MindDataset(dataset_file=data_file, num_parallel_workers=num_readers, shuffle=True) + index = 0 + for item in data_set.create_dict_iterator(): + # print("example {}: {}".format(index, item)) + print("example {}: input_ids: {}".format(index, item['input_ids'])) + print("example {}: input_mask: {}".format(index, item['input_mask'])) + print("example {}: segment_ids: {}".format(index, item['segment_ids'])) + print("example {}: masked_lm_positions: {}".format(index, item['masked_lm_positions'])) + print("example {}: masked_lm_ids: {}".format(index, item['masked_lm_ids'])) + print("example {}: masked_lm_weights: {}".format(index, item['masked_lm_weights'])) + print("example {}: next_sentence_labels: {}".format(index, item['next_sentence_labels'])) + index += 1 + if index % 1000 == 0: + print("read rows: {}".format(index)) + print("total rows: {}".format(index)) + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument("-i", "--input_file", nargs='+', type=str, help='Input mindreord file') + args = parser.parse_args() + + create_dataset(args.input_file) diff --git a/example/nlp_to_mindrecord/enwiki/run.sh b/example/nlp_to_mindrecord/enwiki/run.sh new file mode 100644 index 0000000000..cf66bed0fd --- /dev/null +++ b/example/nlp_to_mindrecord/enwiki/run.sh @@ -0,0 +1,133 @@ +#!/bin/bash +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +if [ $# -ne 3 ]; then + echo "Usage: $0 input_dir output_dir vocab_file" + exit 1 +fi + +if [ ! -d $1 ]; then + echo "The input dir: $1 is not exist." + exit 1 +fi + +if [ ! -d $2 ]; then + echo "The output dir: $2 is not exist." + exit 1 +fi +rm -fr $2/*.mindrecord* + +if [ ! -f $3 ]; then + echo "The vocab file: $3 is not exist." + exit 1 +fi + +data_dir=$1 +output_dir=$2 +vocab_file=$3 +file_list=() +output_filename=() +file_index=0 + +function getdir() { + elements=`ls $1` + for element in ${elements[*]}; + do + dir_or_file=$1"/"$element + if [ -d $dir_or_file ]; + then + getdir $dir_or_file + else + file_list[$file_index]=$dir_or_file + echo "${dir_or_file}" | tr '/' '\n' > dir_file_list.txt # dir dir file to mapfile + mapfile parent_dir < dir_file_list.txt + rm dir_file_list.txt >/dev/null 2>&1 + tmp_output_filename=${parent_dir[${#parent_dir[@]}-1]}".mindrecord" + output_filename[$file_index]=`echo ${tmp_output_filename} | sed 's/ //g'` + file_index=`expr $file_index + 1` + fi + done +} + +getdir "${data_dir}" +# echo "The input files: "${file_list[@]} +# echo "The output files: "${output_filename[@]} + +if [ ! -d "../../../third_party/to_mindrecord/zhwiki" ]; then + echo "The patch base dir ../../../third_party/to_mindrecord/zhwiki is not exist." + exit 1 +fi + +if [ ! 
-f "../../../third_party/patch/to_mindrecord/zhwiki/create_pretraining_data.patch" ]; then + echo "The patch file ../../../third_party/patch/to_mindrecord/zhwiki/create_pretraining_data.patch is not exist." + exit 1 +fi + +# patch for create_pretraining_data.py +patch -p0 -d ../../../third_party/to_mindrecord/zhwiki/ -o create_pretraining_data_patched.py < ../../../third_party/patch/to_mindrecord/zhwiki/create_pretraining_data.patch +if [ $? -ne 0 ]; then + echo "Patch ../../../third_party/to_mindrecord/zhwiki/create_pretraining_data.py failed" + exit 1 +fi + +# get the cpu core count +num_cpu_core=`cat /proc/cpuinfo | grep "processor" | wc -l` +avaiable_core_size=`expr $num_cpu_core / 3 \* 2` + +echo "Begin preprocess `date`" + +# using patched script to generate mindrecord +file_list_len=`expr ${#file_list[*]} - 1` +for index in $(seq 0 $file_list_len); do + echo "Begin preprocess input file: ${file_list[$index]}" + echo "Begin output file: ${output_filename[$index]}" + python ../../../third_party/to_mindrecord/zhwiki/create_pretraining_data_patched.py \ + --input_file=${file_list[$index]} \ + --output_file=${output_dir}/${output_filename[$index]} \ + --partition_number=1 \ + --vocab_file=${vocab_file} \ + --do_lower_case=True \ + --max_seq_length=512 \ + --max_predictions_per_seq=76 \ + --masked_lm_prob=0.15 \ + --random_seed=12345 \ + --dupe_factor=10 >/tmp/${output_filename[$index]}.log 2>&1 & + process_count=`ps -ef | grep create_pretraining_data_patched | grep -v grep | wc -l` + echo "Total task: ${#file_list[*]}, processing: ${process_count}" + if [ $process_count -ge $avaiable_core_size ]; then + while [ 1 ]; do + process_num=`ps -ef | grep create_pretraining_data_patched | grep -v grep | wc -l` + if [ $process_count -gt $process_num ]; then + process_count=$process_num + break; + fi + sleep 2 + done + fi +done + +process_num=`ps -ef | grep create_pretraining_data_patched | grep -v grep | wc -l` +while [ 1 ]; do + if [ $process_num -eq 0 ]; then + break; + 
fi + echo "There are still ${process_num} preprocess running ..." + sleep 2 + process_num=`ps -ef | grep create_pretraining_data_patched | grep -v grep | wc -l` +done + +echo "Preprocess all the data success." +echo "End preprocess `date`" diff --git a/example/nlp_to_mindrecord/enwiki/run_read.sh b/example/nlp_to_mindrecord/enwiki/run_read.sh new file mode 100644 index 0000000000..737e9375c4 --- /dev/null +++ b/example/nlp_to_mindrecord/enwiki/run_read.sh @@ -0,0 +1,44 @@ +#!/bin/bash +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +if [ $# -ne 1 ]; then + echo "Usage: $0 input_dir" + exit 1 +fi + +if [ ! -d $1 ]; then + echo "The input dir: $1 is not exist." 
+ exit 1 +fi + +file_list=() +file_index=0 + +# get all the mindrecord file from output dir +function getdir() { + elements=`ls $1/part-*.mindrecord` + for element in ${elements[*]}; + do + file_list[$file_index]=$element + file_index=`expr $file_index + 1` + done +} + +getdir $1 +echo "Get all the mindrecord files: "${file_list[*]} + +# create dataset for train +python create_dataset.py --input_file ${file_list[*]} diff --git a/example/nlp_to_mindrecord/zhwiki/README.md b/example/nlp_to_mindrecord/zhwiki/README.md new file mode 100644 index 0000000000..1a9de05114 --- /dev/null +++ b/example/nlp_to_mindrecord/zhwiki/README.md @@ -0,0 +1,113 @@ +# Guideline to Convert Training Data zhwiki to MindRecord For Bert Pre Training + + + +- [What does the example do](#what-does-the-example-do) +- [Run simple test](#run-simple-test) +- [How to use the example to process zhwiki](#how-to-use-the-example-to-process-zhwiki) + - [Download zhwiki training data](#download-zhwiki-training-data) + - [Extract the zhwiki](#extract-the-zhwiki) + - [Generate MindRecord](#generate-mindrecord) + - [Create MindDataset By MindRecord](#create-minddataset-by-mindrecord) + + + + +## What does the example do + +This example is based on [zhwiki](https://dumps.wikimedia.org/zhwiki) training data, generating MindRecord file, and finally used for Bert network training. + +1. run.sh: generate MindRecord entry script. +2. run_read.py: create MindDataset by MindRecord entry script. + - create_dataset.py: use MindDataset to read MindRecord to generate dataset. + +## Run simple test + +Follow the step: + +```bash +bash run_simple.sh # generate output/simple.mindrecord* by ../../../third_party/to_mindrecord/zhwiki/sample_text.txt +bash run_read_simple.sh # use MindDataset to read output/simple.mindrecord* +``` + +## How to use the example to process zhwiki + +Download zhwiki data, extract it, convert it to MindRecord, use MindDataset to read MindRecord. 
+ +### Download zhwiki training data + +> [zhwiki dataset download address](https://dumps.wikimedia.org/zhwiki) **-> 20200401 -> zhwiki-20200401-pages-articles-multistream.xml.bz2** + +- put the zhwiki-20200401-pages-articles-multistream.xml.bz2 in {your-mindspore}/example/nlp_to_mindrecord/zhwiki/data directory. + +### Extract the zhwiki + +1. Download [wikiextractor](https://github.com/attardi/wikiextractor) script to {your-mindspore}/example/nlp_to_mindrecord/zhwiki/data directory. + + ``` + $ ls data/ + README.md wikiextractor zhwiki-20200401-pages-articles-multistream.xml.bz2 + ``` + +2. Extract the zhwiki. + ```python + python data/wikiextractor/WikiExtractor.py data/zhwiki-20200401-pages-articles-multistream.xml.bz2 --processes 4 --templates data/template --bytes 8M --min_text_length 0 --filter_disambig_pages --output data/extract + ``` + +3. Generate like this: + ``` + $ ls data/extract + AA AB + ``` + +### Generate MindRecord + +1. Run the run.sh script. + ``` + bash run.sh + ``` + > Caution: This process maybe slow, please wait patiently. If you do not have a machine with enough memory and cpu, it is recommended that you modify the script to generate mindrecord in step by step. + +2. The output like this: + ``` + patching file create_pretraining_data_patched.py (read from create_pretraining_data.py) + Begin preprocess input file: ./data/extract/AA/wiki_00 + Begin output file: AAwiki_00.mindrecord + Total task: 5, processing: 1 + Begin preprocess input file: ./data/extract/AA/wiki_01 + Begin output file: AAwiki_01.mindrecord + Total task: 5, processing: 2 + Begin preprocess input file: ./data/extract/AA/wiki_02 + Begin output file: AAwiki_02.mindrecord + Total task: 5, processing: 3 + Begin preprocess input file: ./data/extract/AB/wiki_02 + Begin output file: ABwiki_02.mindrecord + Total task: 5, processing: 4 + ... + ``` + +3. 
Generate files like this: + ```bash + $ ls output/ + AAwiki_00.mindrecord AAwiki_00.mindrecord.db AAwiki_01.mindrecord AAwiki_01.mindrecord.db AAwiki_02.mindrecord AAwiki_02.mindrecord.db ... ABwiki_00.mindrecord ABwiki_00.mindrecord.db ... + ``` + +### Create MindDataset By MindRecord + +1. Run the run_read.sh script. + ```bash + bash run_read.sh + ``` + +2. The output like this: + ``` + ... + example 74: input_ids: [ 101 8168 118 12847 8783 9977 15908 117 8256 9245 11643 8168 8847 8588 11575 8154 8228 143 8384 8376 9197 10241 103 10564 11421 8199 12268 112 161 8228 11541 9586 8436 8174 8363 9864 9702 103 103 119 103 9947 10564 103 8436 8806 11479 103 8912 119 103 103 103 12209 8303 103 8757 8824 117 8256 103 8619 8168 11541 102 11684 8196 103 8228 8847 11523 117 9059 9064 12410 8358 8181 10764 117 11167 11706 9920 148 8332 11390 8936 8205 10951 11997 103 8154 117 103 8670 10467 112 161 10951 13139 12413 117 10288 143 10425 8205 152 10795 8472 8196 103 161 12126 9172 13129 12106 8217 8174 12244 8205 143 103 8461 8277 10628 160 8221 119 102] + example 74: input_mask: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1] + example 74: segment_ids: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1] + example 74: masked_lm_positions: [ 6 22 37 38 40 43 47 50 51 52 55 60 67 76 89 92 98 109 120 0] + example 74: masked_lm_ids: [ 8118 8165 8329 8890 8554 8458 119 8850 8565 10392 8174 11467 10291 8181 8549 12718 13139 112 158 0] + example 74: masked_lm_weights: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0.] + example 74: next_sentence_labels: [0] + ... 
+ ``` diff --git a/example/nlp_to_mindrecord/zhwiki/create_dataset.py b/example/nlp_to_mindrecord/zhwiki/create_dataset.py new file mode 100644 index 0000000000..d90d12b7f2 --- /dev/null +++ b/example/nlp_to_mindrecord/zhwiki/create_dataset.py @@ -0,0 +1,43 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""create MindDataset by MindRecord""" +import argparse +import mindspore.dataset as ds + +def create_dataset(data_file): + """create MindDataset""" + num_readers = 4 + data_set = ds.MindDataset(dataset_file=data_file, num_parallel_workers=num_readers, shuffle=True) + index = 0 + for item in data_set.create_dict_iterator(): + # print("example {}: {}".format(index, item)) + print("example {}: input_ids: {}".format(index, item['input_ids'])) + print("example {}: input_mask: {}".format(index, item['input_mask'])) + print("example {}: segment_ids: {}".format(index, item['segment_ids'])) + print("example {}: masked_lm_positions: {}".format(index, item['masked_lm_positions'])) + print("example {}: masked_lm_ids: {}".format(index, item['masked_lm_ids'])) + print("example {}: masked_lm_weights: {}".format(index, item['masked_lm_weights'])) + print("example {}: next_sentence_labels: {}".format(index, item['next_sentence_labels'])) + index += 1 + if index % 1000 == 0: + print("read rows: {}".format(index)) + print("total rows: {}".format(index)) + 
+if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument("-i", "--input_file", nargs='+', type=str, help='Input mindreord file') + args = parser.parse_args() + + create_dataset(args.input_file) diff --git a/example/nlp_to_mindrecord/zhwiki/data/.gitignore b/example/nlp_to_mindrecord/zhwiki/data/.gitignore new file mode 100644 index 0000000000..f15cab0c89 --- /dev/null +++ b/example/nlp_to_mindrecord/zhwiki/data/.gitignore @@ -0,0 +1,3 @@ +wikiextractor/ +zhwiki-20200401-pages-articles-multistream.xml.bz2 +extract/ diff --git a/example/nlp_to_mindrecord/zhwiki/data/README.md b/example/nlp_to_mindrecord/zhwiki/data/README.md new file mode 100644 index 0000000000..b54948808e --- /dev/null +++ b/example/nlp_to_mindrecord/zhwiki/data/README.md @@ -0,0 +1 @@ +## The input dataset diff --git a/example/nlp_to_mindrecord/zhwiki/output/README.md b/example/nlp_to_mindrecord/zhwiki/output/README.md new file mode 100644 index 0000000000..b7cfba1b47 --- /dev/null +++ b/example/nlp_to_mindrecord/zhwiki/output/README.md @@ -0,0 +1 @@ +## Output the mindrecord diff --git a/example/nlp_to_mindrecord/zhwiki/run.sh b/example/nlp_to_mindrecord/zhwiki/run.sh new file mode 100644 index 0000000000..a057031e6b --- /dev/null +++ b/example/nlp_to_mindrecord/zhwiki/run.sh @@ -0,0 +1,112 @@ +#!/bin/bash +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +rm -f output/*.mindrecord* + +data_dir="./data/extract" +file_list=() +output_filename=() +file_index=0 + +function getdir() { + elements=`ls $1` + for element in ${elements[*]}; + do + dir_or_file=$1"/"$element + if [ -d $dir_or_file ]; + then + getdir $dir_or_file + else + file_list[$file_index]=$dir_or_file + echo "${dir_or_file}" | tr '/' '\n' > dir_file_list.txt # dir dir file to mapfile + mapfile parent_dir < dir_file_list.txt + rm dir_file_list.txt >/dev/null 2>&1 + tmp_output_filename=${parent_dir[${#parent_dir[@]}-2]}${parent_dir[${#parent_dir[@]}-1]}".mindrecord" + output_filename[$file_index]=`echo ${tmp_output_filename} | sed 's/ //g'` + file_index=`expr $file_index + 1` + fi + done +} + +getdir "${data_dir}" +# echo "The input files: "${file_list[@]} +# echo "The output files: "${output_filename[@]} + +if [ ! -d "../../../third_party/to_mindrecord/zhwiki" ]; then + echo "The patch base dir ../../../third_party/to_mindrecord/zhwiki is not exist." + exit 1 +fi + +if [ ! -f "../../../third_party/patch/to_mindrecord/zhwiki/create_pretraining_data.patch" ]; then + echo "The patch file ../../../third_party/patch/to_mindrecord/zhwiki/create_pretraining_data.patch is not exist." + exit 1 +fi + +# patch for create_pretraining_data.py +patch -p0 -d ../../../third_party/to_mindrecord/zhwiki/ -o create_pretraining_data_patched.py < ../../../third_party/patch/to_mindrecord/zhwiki/create_pretraining_data.patch +if [ $? 
-ne 0 ]; then + echo "Patch ../../../third_party/to_mindrecord/zhwiki/create_pretraining_data.py failed" + exit 1 +fi + +# get the cpu core count +num_cpu_core=`cat /proc/cpuinfo | grep "processor" | wc -l` +avaiable_core_size=`expr $num_cpu_core / 3 \* 2` + +echo "Begin preprocess `date`" + +# using patched script to generate mindrecord +file_list_len=`expr ${#file_list[*]} - 1` +for index in $(seq 0 $file_list_len); do + echo "Begin preprocess input file: ${file_list[$index]}" + echo "Begin output file: ${output_filename[$index]}" + python ../../../third_party/to_mindrecord/zhwiki/create_pretraining_data_patched.py \ + --input_file=${file_list[$index]} \ + --output_file=output/${output_filename[$index]} \ + --partition_number=1 \ + --vocab_file=../../../third_party/to_mindrecord/zhwiki/vocab.txt \ + --do_lower_case=True \ + --max_seq_length=128 \ + --max_predictions_per_seq=20 \ + --masked_lm_prob=0.15 \ + --random_seed=12345 \ + --dupe_factor=10 >/tmp/${output_filename[$index]}.log 2>&1 & # user defined + process_count=`ps -ef | grep create_pretraining_data_patched | grep -v grep | wc -l` + echo "Total task: ${#file_list[*]}, processing: ${process_count}" + if [ $process_count -ge $avaiable_core_size ]; then + while [ 1 ]; do + process_num=`ps -ef | grep create_pretraining_data_patched | grep -v grep | wc -l` + if [ $process_count -gt $process_num ]; then + process_count=$process_num + break; + fi + sleep 2 + done + fi +done + +process_num=`ps -ef | grep create_pretraining_data_patched | grep -v grep | wc -l` +while [ 1 ]; do + if [ $process_num -eq 0 ]; then + break; + fi + echo "There are still ${process_num} preprocess running ..." + sleep 2 + process_num=`ps -ef | grep create_pretraining_data_patched | grep -v grep | wc -l` +done + +echo "Preprocess all the data success." 
+echo "End preprocess `date`" diff --git a/example/nlp_to_mindrecord/zhwiki/run_read.sh b/example/nlp_to_mindrecord/zhwiki/run_read.sh new file mode 100644 index 0000000000..3cc368457b --- /dev/null +++ b/example/nlp_to_mindrecord/zhwiki/run_read.sh @@ -0,0 +1,34 @@ +#!/bin/bash +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +file_list=() +file_index=0 + +# get all the mindrecord file from output dir +function getdir() { + elements=`ls $1/[A-Z]*.mindrecord` + for element in ${elements[*]}; + do + file_list[$file_index]=$element + file_index=`expr $file_index + 1` + done +} + +getdir "./output" +echo "Get all the mindrecord files: "${file_list[*]} + +# create dataset for train +python create_dataset.py --input_file ${file_list[*]} diff --git a/example/nlp_to_mindrecord/zhwiki/run_read_simple.sh b/example/nlp_to_mindrecord/zhwiki/run_read_simple.sh new file mode 100644 index 0000000000..1c26dec449 --- /dev/null +++ b/example/nlp_to_mindrecord/zhwiki/run_read_simple.sh @@ -0,0 +1,18 @@ +#!/bin/bash +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +# create dataset for train +python create_dataset.py --input_file=output/simple.mindrecord0 diff --git a/example/nlp_to_mindrecord/zhwiki/run_simple.sh b/example/nlp_to_mindrecord/zhwiki/run_simple.sh new file mode 100644 index 0000000000..20c1d98d66 --- /dev/null +++ b/example/nlp_to_mindrecord/zhwiki/run_simple.sh @@ -0,0 +1,47 @@ +#!/bin/bash +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +rm -f output/simple.mindrecord* + +if [ ! -d "../../../third_party/to_mindrecord/zhwiki" ]; then + echo "The patch base dir ../../../third_party/to_mindrecord/zhwiki is not exist." + exit 1 +fi + +if [ ! -f "../../../third_party/patch/to_mindrecord/zhwiki/create_pretraining_data.patch" ]; then + echo "The patch file ../../../third_party/patch/to_mindrecord/zhwiki/create_pretraining_data.patch is not exist." 
+ exit 1 +fi + +# patch for create_pretraining_data.py +patch -p0 -d ../../../third_party/to_mindrecord/zhwiki/ -o create_pretraining_data_patched.py < ../../../third_party/patch/to_mindrecord/zhwiki/create_pretraining_data.patch +if [ $? -ne 0 ]; then + echo "Patch ../../../third_party/to_mindrecord/zhwiki/create_pretraining_data.py failed" + exit 1 +fi + +# using patched script to generate mindrecord +python ../../../third_party/to_mindrecord/zhwiki/create_pretraining_data_patched.py \ +--input_file=../../../third_party/to_mindrecord/zhwiki/sample_text.txt \ +--output_file=output/simple.mindrecord \ +--partition_number=4 \ +--vocab_file=../../../third_party/to_mindrecord/zhwiki/vocab.txt \ +--do_lower_case=True \ +--max_seq_length=128 \ +--max_predictions_per_seq=20 \ +--masked_lm_prob=0.15 \ +--random_seed=12345 \ +--dupe_factor=10 # user defined diff --git a/example/resnet50_cifar10/train.py b/example/resnet50_cifar10/train.py index 275f7188a7..323695ae29 100755 --- a/example/resnet50_cifar10/train.py +++ b/example/resnet50_cifar10/train.py @@ -15,6 +15,7 @@ """train_imagenet.""" import os import argparse +import numpy as np from dataset import create_dataset from lr_generator import get_lr from config import config @@ -45,6 +46,7 @@ if __name__ == '__main__': target = args_opt.device_target ckpt_save_dir = config.save_checkpoint_path context.set_context(mode=context.GRAPH_MODE, device_target=target, save_graphs=False) + np.random.seed(1) if not args_opt.do_eval and args_opt.run_distribute: if target == "Ascend": device_id = int(os.getenv('DEVICE_ID')) diff --git a/example/resnet50_imagenet2012/train.py b/example/resnet50_imagenet2012/train.py index a76de78f6d..6896320ece 100755 --- a/example/resnet50_imagenet2012/train.py +++ b/example/resnet50_imagenet2012/train.py @@ -15,6 +15,7 @@ """train_imagenet.""" import os import argparse +import numpy as np from dataset import create_dataset from lr_generator import get_lr from config import config @@ -48,6 +49,7 @@ 
if __name__ == '__main__': target = args_opt.device_target ckpt_save_dir = config.save_checkpoint_path context.set_context(mode=context.GRAPH_MODE, device_target=target, save_graphs=False) + np.random.seed(1) if not args_opt.do_eval and args_opt.run_distribute: if target == "Ascend": device_id = int(os.getenv('DEVICE_ID')) @@ -77,12 +79,12 @@ if __name__ == '__main__': for _, cell in net.cells_and_names(): if isinstance(cell, nn.Conv2d): cell.weight.default_input = weight_init.initializer(weight_init.XavierUniform(), - cell.weight.default_input.shape(), - cell.weight.default_input.dtype()).to_tensor() + cell.weight.default_input.shape, + cell.weight.default_input.dtype).to_tensor() if isinstance(cell, nn.Dense): cell.weight.default_input = weight_init.initializer(weight_init.TruncatedNormal(), - cell.weight.default_input.shape(), - cell.weight.default_input.dtype()).to_tensor() + cell.weight.default_input.shape, + cell.weight.default_input.dtype).to_tensor() if not config.use_label_smooth: config.label_smooth_factor = 0.0 diff --git a/example/resnet50_imagenet2012_THOR/model/dataset_helper.py b/example/resnet50_imagenet2012_THOR/model/dataset_helper.py index 474bccf42f..77f67344c2 100644 --- a/example/resnet50_imagenet2012_THOR/model/dataset_helper.py +++ b/example/resnet50_imagenet2012_THOR/model/dataset_helper.py @@ -15,6 +15,7 @@ """Dataset help for minddata dataset""" from mindspore._checkparam import check_bool from mindspore.parallel._utils import _get_device_num, _get_parallel_mode +from mindspore.train.dataset_helper import _send_data from mindspore.train._utils import _exec_datagraph, _get_types_and_shapes, \ _to_full_shapes from mindspore.train.parallel_utils import ParallelMode @@ -67,7 +68,13 @@ class _DatasetIter: self.loop_size = dataset.get_dataset_size() else: self.loop_size = dataset.__loop_size__ - dataset.__ME_INITED__ = _exec_datagraph(dataset, self.loop_size).queue_name + dataset.__TRANSFER_DATASET__ = _exec_datagraph(dataset, self.loop_size) + 
dataset.__ME_INITED__ = dataset.__TRANSFER_DATASET__.queue_name + + if not hasattr(dataset, '__no_send__'): + _send_data(dataset) + else: + _send_data(dataset) self.ind = 0 self.dataset = dataset diff --git a/example/resnet50_imagenet2012_THOR/model/model_thor.py b/example/resnet50_imagenet2012_THOR/model/model_thor.py index f3418437a3..25e3dd7f82 100644 --- a/example/resnet50_imagenet2012_THOR/model/model_thor.py +++ b/example/resnet50_imagenet2012_THOR/model/model_thor.py @@ -29,7 +29,7 @@ from mindspore.nn.wrap.cell_wrapper import _VirtualDatasetCell from mindspore.parallel._utils import _get_parallel_mode, _get_device_num, _get_global_rank, \ _get_parameter_broadcast, _device_number_check, _parameter_broadcast_check from mindspore.train import amp -from mindspore.train.callback import _InternalCallbackParam, RunContext, _build_callbacks +from mindspore.train.callback import _InternalCallbackParam, RunContext, _CallbackManager from mindspore.train.parallel_utils import ParallelMode from model.dataset_helper import DatasetHelper @@ -374,7 +374,6 @@ class Model: self._train_network.set_broadcast_flag() # build callback list - list_callback = _build_callbacks(callbacks) cb_params = _InternalCallbackParam() cb_params.train_network = self._train_network cb_params.epoch_num = epoch @@ -385,17 +384,17 @@ class Model: cb_params.parallel_mode = self._parallel_mode cb_params.device_number = self._device_number cb_params.train_dataset = train_dataset - cb_params.list_callback = list_callback + cb_params.list_callback = callbacks - if dataset_sink_mode: - if context.get_context("mode") == context.PYNATIVE_MODE: + with _CallbackManager(callbacks) as list_callback: + if not dataset_sink_mode: + self._train_process(epoch, train_dataset, list_callback, cb_params) + elif context.get_context("mode") == context.PYNATIVE_MODE: logger.warning("The pynative mode cannot support dataset sink mode currently." 
"So the training process will be performed with dataset not sink.") self._train_process(epoch, train_dataset, list_callback, cb_params) else: self._train_dataset_sink_process(epoch, train_dataset, list_callback, cb_params) - else: - self._train_process(epoch, train_dataset, list_callback, cb_params) def _train_dataset_sink_process(self, epoch, train_dataset, list_callback=None, cb_params=None): """ @@ -408,7 +407,7 @@ class Model: returned and passed to the network. Otherwise, a tuple (data, label) should be returned, and the data and label are passed to the network and loss function respectively. - list_callback (_ListCallback): Executor of callback list. Default: None. + list_callback (Callback): Executor of callback list. Default: None. cb_params (_InternalCallbackParam): Callback parameters. Default: None. """ iter_first_order = self._frequency - 1 @@ -473,7 +472,7 @@ class Model: returned and passed to the network. Otherwise, a tuple (data, label) should be returned, and the data and label are passed to the network and loss function respectively. - list_callback (_ListCallback): Executor of callback list. Default: None. + list_callback (Callback): Executor of callback list. Default: None. cb_params (_InternalCallbackParam): Callback parameters. Default: None. """ dataset_helper, _ = self._exec_preprocess(self._train_network, @@ -580,7 +579,7 @@ class Model: Args: valid_dataset (Dataset): Dataset to evaluate the model. - list_callback (ListCallback): Executor of callback list. Default: None. + list_callback (Callback): Executor of callback list. Default: None. cb_params (_InternalCallbackParam): Callback parameters. Default: None. Returns: @@ -619,7 +618,7 @@ class Model: Args: valid_dataset (Dataset): Dataset to evaluate the model. - list_callback (ListCallback): Executor of callback list. Default: None. + list_callback (Callback): Executor of callback list. Default: None. cb_params (_InternalCallbackParam): Callback parameters. Default: None. 
Returns: @@ -678,7 +677,6 @@ class Model: if not self._metric_fns: raise ValueError("metric fn can not be None or empty.") - list_callback = _build_callbacks(callbacks) cb_params = _InternalCallbackParam() cb_params.eval_network = self._eval_network cb_params.valid_dataset = valid_dataset @@ -691,9 +689,10 @@ class Model: self._clear_metrics() - if dataset_sink_mode: - return self._eval_dataset_sink_process(valid_dataset, list_callback, cb_params) - return self._eval_process(valid_dataset, list_callback, cb_params) + with _CallbackManager(callbacks) as list_callback: + if dataset_sink_mode: + return self._eval_dataset_sink_process(valid_dataset, list_callback, cb_params) + return self._eval_process(valid_dataset, list_callback, cb_params) def predict(self, *predict_data): """ diff --git a/example/resnet50_imagenet2012_THOR/model/thor.py b/example/resnet50_imagenet2012_THOR/model/thor.py index 0da1714fe6..6786cb7485 100644 --- a/example/resnet50_imagenet2012_THOR/model/thor.py +++ b/example/resnet50_imagenet2012_THOR/model/thor.py @@ -151,6 +151,8 @@ class THOR(Optimizer): temp_g = self.mul(temp_g, matrix_G_inv_max) temp_max = self.mul(matrix_A_max_allreduce[i], matrix_G_max_allreduce[i]) temp_max = self.mul(temp_max, self.feature_map[i]) + temp_a = self.cast(temp_a, mstype.float16) + temp_g = self.cast(temp_g, mstype.float16) if i == 53: g = self.cube_matmul_left_fc(temp_g, g) g = self.cube_matmul_right_fc(g, temp_a, temp_max) diff --git a/example/resnet50_imagenet2012_THOR/model/thor_layer.py b/example/resnet50_imagenet2012_THOR/model/thor_layer.py index fea74605b6..d84cbf7a93 100644 --- a/example/resnet50_imagenet2012_THOR/model/thor_layer.py +++ b/example/resnet50_imagenet2012_THOR/model/thor_layer.py @@ -13,6 +13,8 @@ # limitations under the License. 
# ============================================================================ """thor_layer""" +import numpy as np + import mindspore as ms import mindspore.common.dtype as mstype from mindspore._checkparam import check_bool, twice, check_int_positive @@ -23,7 +25,6 @@ from mindspore.common.tensor import Tensor from mindspore.nn.cell import Cell from mindspore.nn.layer.activation import get_activation from mindspore.ops import operations as P -import numpy as np C0 = 16 def caculate_device_shape(matrix_dim, channel, is_A): @@ -171,7 +172,6 @@ class Conv2d_Thor(_Conv): self.G_inv_max = Parameter(initializer(0, [1], mstype.float32), name="G_inv_max", requires_grad=False) self.fake_G = Tensor( np.reshape(np.identity(self.matrix_G_device_dim).astype(np.float16), self.matrix_G_device_shape)) - self.fake_G_inv_max = Tensor(np.zeros([1,]).astype(np.float32)) self.shape = P.Shape() self.reshape = P.Reshape() @@ -286,7 +286,6 @@ class Conv2d_Thor(_Conv): matrix_A_inv = self.device_shape_pad(matrix_A_inv) matrix_A_inv = self.reshape(matrix_A_inv, self.matrix_A_device_temp_shape) matrix_A_inv = self.transpose(matrix_A_inv, (2, 0, 1, 3)) - self.G_inv_max = self.fake_G_inv_max self.matrix_A_inv = matrix_A_inv self.matrix_G_inv = self.fake_G out = self.conv2d(x, self.weight) @@ -339,15 +338,15 @@ class Dense_Thor(Cell): self.has_bias = check_bool(has_bias) self.thor = True if isinstance(weight_init, Tensor): - if weight_init.dim() != 2 or weight_init.shape()[0] != out_channels or \ - weight_init.shape()[1] != in_channels: + if weight_init.dim() != 2 or weight_init.shape[0] != out_channels or \ + weight_init.shape[1] != in_channels: raise ValueError("weight_init shape error") self.weight = Parameter(initializer(weight_init, [out_channels, in_channels]), name="weight") if self.has_bias: if isinstance(bias_init, Tensor): - if bias_init.dim() != 1 or bias_init.shape()[0] != out_channels: + if bias_init.dim() != 1 or bias_init.shape[0] != out_channels: raise ValueError("bias_init 
shape error") self.bias = Parameter(initializer(bias_init, [out_channels]), name="bias") diff --git a/example/resnet50_imagenet2012_THOR/train.py b/example/resnet50_imagenet2012_THOR/train.py index 881f3cf598..309018da57 100644 --- a/example/resnet50_imagenet2012_THOR/train.py +++ b/example/resnet50_imagenet2012_THOR/train.py @@ -17,6 +17,8 @@ import argparse import os import random +import numpy as np + from mindspore import Tensor from mindspore import context from mindspore.communication.management import init @@ -28,7 +30,6 @@ from model.model_thor import Model from model.resnet import resnet50 from model.thor import THOR -import numpy as np from config import config from crossentropy import CrossEntropy from dataset_imagenet import create_dataset diff --git a/example/ssd_coco2017/README.md b/example/ssd_coco2017/README.md deleted file mode 100644 index bd43344b8b..0000000000 --- a/example/ssd_coco2017/README.md +++ /dev/null @@ -1,88 +0,0 @@ -# SSD Example - -## Description - -SSD network based on MobileNetV2, with support for training and evaluation. - -## Requirements - -- Install [MindSpore](https://www.mindspore.cn/install/en). - -- Dataset - - We use coco2017 as training dataset in this example by default, and you can also use your own datasets. - - 1. If coco dataset is used. **Select dataset to coco when run script.** - Install Cython and pycocotool. - - ``` - pip install Cython - - pip install pycocotools - ``` - And change the COCO_ROOT and other settings you need in `config.py`. The directory structure is as follows: - - - ``` - └─coco2017 - ├── annotations # annotation jsons - ├── train2017 # train dataset - └── val2017 # infer dataset - ``` - - 2. If your own dataset is used. 
**Select dataset to other when run script.** - Organize the dataset infomation into a TXT file, each row in the file is as follows: - - ``` - train2017/0000001.jpg 0,259,401,459,7 35,28,324,201,2 0,30,59,80,2 - ``` - - Each row is an image annotation which split by space, the first column is a relative path of image, the others are box and class infomations of the format [xmin,ymin,xmax,ymax,class]. We read image from an image path joined by the `IMAGE_DIR`(dataset directory) and the relative path in `ANNO_PATH`(the TXT file path), `IMAGE_DIR` and `ANNO_PATH` are setting in `config.py`. - - -## Running the example - -### Training - -To train the model, run `train.py`. If the `MINDRECORD_DIR` is empty, it will generate [mindrecord](https://www.mindspore.cn/tutorial/en/master/use/data_preparation/converting_datasets.html) files by `COCO_ROOT`(coco dataset) or `IMAGE_DIR` and `ANNO_PATH`(own dataset). **Note if MINDRECORD_DIR isn't empty, it will use MINDRECORD_DIR instead of raw images.** - - -- Stand alone mode - - ``` - python train.py --dataset coco - - ``` - - You can run ```python train.py -h``` to get more information. - - -- Distribute mode - - ``` - sh run_distribute_train.sh 8 150 coco /data/hccl.json - ``` - - The input parameters are device numbers, epoch size, dataset mode and [hccl json configuration file](https://www.mindspore.cn/tutorial/en/master/advanced_use/distributed_training.html). **It is better to use absolute path.** - -You will get the loss value of each step as following: - -``` -epoch: 1 step: 455, loss is 5.8653416 -epoch: 2 step: 455, loss is 5.4292373 -epoch: 3 step: 455, loss is 5.458992 -... -epoch: 148 step: 455, loss is 1.8340507 -epoch: 149 step: 455, loss is 2.0876894 -epoch: 150 step: 455, loss is 2.239692 -``` - -### Evaluation - -for evaluation , run `eval.py` with `ckpt_path`. `ckpt_path` is the path of [checkpoint](https://www.mindspore.cn/tutorial/en/master/use/saving_and_loading_model_parameters.html) file. 
- -``` -python eval.py --ckpt_path ssd.ckpt --dataset coco -``` - -You can run ```python eval.py -h``` to get more information. diff --git a/example/ssd_coco2017/config.py b/example/ssd_coco2017/config.py deleted file mode 100644 index 452aaf9700..0000000000 --- a/example/ssd_coco2017/config.py +++ /dev/null @@ -1,64 +0,0 @@ -# Copyright 2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ - -"""Config parameters for SSD models.""" - - -class ConfigSSD: - """ - Config parameters for SSD. - - Examples: - ConfigSSD(). - """ - IMG_SHAPE = [300, 300] - NUM_SSD_BOXES = 1917 - NEG_PRE_POSITIVE = 3 - MATCH_THRESHOLD = 0.5 - - NUM_DEFAULT = [3, 6, 6, 6, 6, 6] - EXTRAS_IN_CHANNELS = [256, 576, 1280, 512, 256, 256] - EXTRAS_OUT_CHANNELS = [576, 1280, 512, 256, 256, 128] - EXTRAS_STRIDES = [1, 1, 2, 2, 2, 2] - EXTRAS_RATIO = [0.2, 0.2, 0.2, 0.25, 0.5, 0.25] - FEATURE_SIZE = [19, 10, 5, 3, 2, 1] - SCALES = [21, 45, 99, 153, 207, 261, 315] - ASPECT_RATIOS = [(1,), (2, 3), (2, 3), (2, 3), (2, 3), (2, 3)] - STEPS = (16, 32, 64, 100, 150, 300) - PRIOR_SCALING = (0.1, 0.2) - - - # `MINDRECORD_DIR` and `COCO_ROOT` are better to use absolute path. 
- MINDRECORD_DIR = "MindRecord_COCO" - COCO_ROOT = "coco2017" - TRAIN_DATA_TYPE = "train2017" - VAL_DATA_TYPE = "val2017" - INSTANCES_SET = "annotations/instances_{}.json" - COCO_CLASSES = ('background', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', - 'train', 'truck', 'boat', 'traffic light', 'fire', 'hydrant', - 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', - 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', - 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', - 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', - 'kite', 'baseball bat', 'baseball glove', 'skateboard', - 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', - 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', - 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', - 'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', - 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', - 'keyboard', 'cell phone', 'microwave oven', 'toaster', 'sink', - 'refrigerator', 'book', 'clock', 'vase', 'scissors', - 'teddy bear', 'hair drier', 'toothbrush') - NUM_CLASSES = len(COCO_CLASSES) diff --git a/example/ssd_coco2017/dataset.py b/example/ssd_coco2017/dataset.py deleted file mode 100644 index b88b22c862..0000000000 --- a/example/ssd_coco2017/dataset.py +++ /dev/null @@ -1,375 +0,0 @@ -# Copyright 2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================ - -"""SSD dataset""" -from __future__ import division - -import os -import math -import itertools as it -import numpy as np -import cv2 - -import mindspore.dataset as de -import mindspore.dataset.transforms.vision.c_transforms as C -from mindspore.mindrecord import FileWriter -from config import ConfigSSD - -config = ConfigSSD() - -class GeneratDefaultBoxes(): - """ - Generate Default boxes for SSD, follows the order of (W, H, archor_sizes). - `self.default_boxes` has a shape of [archor_sizes, H, W, 4], the last dimension is [x, y, w, h]. - `self.default_boxes_ltrb` has a shape as `self.default_boxes`, the last dimension is [x1, y1, x2, y2]. - """ - def __init__(self): - fk = config.IMG_SHAPE[0] / np.array(config.STEPS) - self.default_boxes = [] - for idex, feature_size in enumerate(config.FEATURE_SIZE): - sk1 = config.SCALES[idex] / config.IMG_SHAPE[0] - sk2 = config.SCALES[idex + 1] / config.IMG_SHAPE[0] - sk3 = math.sqrt(sk1 * sk2) - - if config.NUM_DEFAULT[idex] == 3: - all_sizes = [(0.5, 1.0), (1.0, 1.0), (1.0, 0.5)] - else: - all_sizes = [(sk1, sk1), (sk3, sk3)] - for aspect_ratio in config.ASPECT_RATIOS[idex]: - w, h = sk1 * math.sqrt(aspect_ratio), sk1 / math.sqrt(aspect_ratio) - all_sizes.append((w, h)) - all_sizes.append((h, w)) - - assert len(all_sizes) == config.NUM_DEFAULT[idex] - - for i, j in it.product(range(feature_size), repeat=2): - for w, h in all_sizes: - cx, cy = (j + 0.5) / fk[idex], (i + 0.5) / fk[idex] - box = [np.clip(k, 0, 1) for k in (cx, cy, w, h)] - self.default_boxes.append(box) - - def to_ltrb(cx, cy, w, h): - return cx - w / 2, cy - h / 2, cx + w / 2, cy + h / 2 - - # For IoU calculation - self.default_boxes_ltrb = np.array(tuple(to_ltrb(*i) for i in self.default_boxes), dtype='float32') - self.default_boxes = np.array(self.default_boxes, dtype='float32') - - -default_boxes_ltrb = GeneratDefaultBoxes().default_boxes_ltrb -default_boxes = 
GeneratDefaultBoxes().default_boxes -x1, y1, x2, y2 = np.split(default_boxes_ltrb[:, :4], 4, axis=-1) -vol_anchors = (x2 - x1) * (y2 - y1) -matching_threshold = config.MATCH_THRESHOLD - - -def ssd_bboxes_encode(boxes): - """ - Labels anchors with ground truth inputs. - - Args: - boxex: ground truth with shape [N, 5], for each row, it stores [x, y, w, h, cls]. - - Returns: - gt_loc: location ground truth with shape [num_anchors, 4]. - gt_label: class ground truth with shape [num_anchors, 1]. - num_matched_boxes: number of positives in an image. - """ - - def jaccard_with_anchors(bbox): - """Compute jaccard score a box and the anchors.""" - # Intersection bbox and volume. - xmin = np.maximum(x1, bbox[0]) - ymin = np.maximum(y1, bbox[1]) - xmax = np.minimum(x2, bbox[2]) - ymax = np.minimum(y2, bbox[3]) - w = np.maximum(xmax - xmin, 0.) - h = np.maximum(ymax - ymin, 0.) - - # Volumes. - inter_vol = h * w - union_vol = vol_anchors + (bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) - inter_vol - jaccard = inter_vol / union_vol - return np.squeeze(jaccard) - - pre_scores = np.zeros((config.NUM_SSD_BOXES), dtype=np.float32) - t_boxes = np.zeros((config.NUM_SSD_BOXES, 4), dtype=np.float32) - t_label = np.zeros((config.NUM_SSD_BOXES), dtype=np.int64) - for bbox in boxes: - label = int(bbox[4]) - scores = jaccard_with_anchors(bbox) - mask = (scores > matching_threshold) - if not np.any(mask): - mask[np.argmax(scores)] = True - - mask = mask & (scores > pre_scores) - pre_scores = np.maximum(pre_scores, scores) - t_label = mask * label + (1 - mask) * t_label - for i in range(4): - t_boxes[:, i] = mask * bbox[i] + (1 - mask) * t_boxes[:, i] - - index = np.nonzero(t_label) - - # Transform to ltrb. - bboxes = np.zeros((config.NUM_SSD_BOXES, 4), dtype=np.float32) - bboxes[:, [0, 1]] = (t_boxes[:, [0, 1]] + t_boxes[:, [2, 3]]) / 2 - bboxes[:, [2, 3]] = t_boxes[:, [2, 3]] - t_boxes[:, [0, 1]] - - # Encode features. 
- bboxes_t = bboxes[index] - default_boxes_t = default_boxes[index] - bboxes_t[:, :2] = (bboxes_t[:, :2] - default_boxes_t[:, :2]) / (default_boxes_t[:, 2:] * config.PRIOR_SCALING[0]) - bboxes_t[:, 2:4] = np.log(bboxes_t[:, 2:4] / default_boxes_t[:, 2:4]) / config.PRIOR_SCALING[1] - bboxes[index] = bboxes_t - - num_match_num = np.array([len(np.nonzero(t_label)[0])], dtype=np.int32) - return bboxes, t_label.astype(np.int32), num_match_num - -def ssd_bboxes_decode(boxes, index): - """Decode predict boxes to [x, y, w, h]""" - boxes_t = boxes[index] - default_boxes_t = default_boxes[index] - boxes_t[:, :2] = boxes_t[:, :2] * config.PRIOR_SCALING[0] * default_boxes_t[:, 2:] + default_boxes_t[:, :2] - boxes_t[:, 2:4] = np.exp(boxes_t[:, 2:4] * config.PRIOR_SCALING[1]) * default_boxes_t[:, 2:4] - - bboxes = np.zeros((len(boxes_t), 4), dtype=np.float32) - - bboxes[:, [0, 1]] = boxes_t[:, [0, 1]] - boxes_t[:, [2, 3]] / 2 - bboxes[:, [2, 3]] = boxes_t[:, [0, 1]] + boxes_t[:, [2, 3]] / 2 - - return bboxes - -def preprocess_fn(image, box, is_training): - """Preprocess function for dataset.""" - - def _rand(a=0., b=1.): - """Generate random.""" - return np.random.rand() * (b - a) + a - - def _infer_data(image, input_shape, box): - img_h, img_w, _ = image.shape - input_h, input_w = input_shape - - scale = min(float(input_w) / float(img_w), float(input_h) / float(img_h)) - nw = int(img_w * scale) - nh = int(img_h * scale) - - image = cv2.resize(image, (nw, nh)) - - new_image = np.zeros((input_h, input_w, 3), np.float32) - dh = (input_h - nh) // 2 - dw = (input_w - nw) // 2 - new_image[dh: (nh + dh), dw: (nw + dw), :] = image - image = new_image - - #When the channels of image is 1 - if len(image.shape) == 2: - image = np.expand_dims(image, axis=-1) - image = np.concatenate([image, image, image], axis=-1) - - box = box.astype(np.float32) - - box[:, [0, 2]] = (box[:, [0, 2]] * scale + dw) / input_w - box[:, [1, 3]] = (box[:, [1, 3]] * scale + dh) / input_h - return image, 
np.array((img_h, img_w), np.float32), box - - def _data_aug(image, box, is_training, image_size=(300, 300)): - """Data augmentation function.""" - ih, iw, _ = image.shape - w, h = image_size - - if not is_training: - return _infer_data(image, image_size, box) - # Random settings - scale_w = _rand(0.75, 1.25) - scale_h = _rand(0.75, 1.25) - - flip = _rand() < .5 - nw = iw * scale_w - nh = ih * scale_h - scale = min(w / nw, h / nh) - nw = int(scale * nw) - nh = int(scale * nh) - - # Resize image - image = cv2.resize(image, (nw, nh)) - - # place image - new_image = np.zeros((h, w, 3), dtype=np.float32) - dw = (w - nw) // 2 - dh = (h - nh) // 2 - new_image[dh:dh + nh, dw:dw + nw, :] = image - image = new_image - - # Flip image or not - if flip: - image = cv2.flip(image, 1, dst=None) - - # Convert image to gray or not - gray = _rand() < .25 - if gray: - image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) - - # When the channels of image is 1 - if len(image.shape) == 2: - image = np.expand_dims(image, axis=-1) - image = np.concatenate([image, image, image], axis=-1) - - box = box.astype(np.float32) - - # Transform box with shape[x1, y1, x2, y2]. - box[:, [0, 2]] = (box[:, [0, 2]] * scale * scale_w + dw) / w - box[:, [1, 3]] = (box[:, [1, 3]] * scale * scale_h + dh) / h - - if flip: - box[:, [0, 2]] = 1 - box[:, [2, 0]] - - box, label, num_match_num = ssd_bboxes_encode(box) - return image, box, label, num_match_num - return _data_aug(image, box, is_training, image_size=config.IMG_SHAPE) - - -def create_coco_label(is_training): - """Get image path and annotation from COCO.""" - from pycocotools.coco import COCO - - coco_root = config.COCO_ROOT - data_type = config.VAL_DATA_TYPE - if is_training: - data_type = config.TRAIN_DATA_TYPE - - #Classes need to train or test. 
- train_cls = config.COCO_CLASSES - train_cls_dict = {} - for i, cls in enumerate(train_cls): - train_cls_dict[cls] = i - - anno_json = os.path.join(coco_root, config.INSTANCES_SET.format(data_type)) - - coco = COCO(anno_json) - classs_dict = {} - cat_ids = coco.loadCats(coco.getCatIds()) - for cat in cat_ids: - classs_dict[cat["id"]] = cat["name"] - - image_ids = coco.getImgIds() - image_files = [] - image_anno_dict = {} - - for img_id in image_ids: - image_info = coco.loadImgs(img_id) - file_name = image_info[0]["file_name"] - anno_ids = coco.getAnnIds(imgIds=img_id, iscrowd=None) - anno = coco.loadAnns(anno_ids) - image_path = os.path.join(coco_root, data_type, file_name) - annos = [] - for label in anno: - bbox = label["bbox"] - class_name = classs_dict[label["category_id"]] - if class_name in train_cls: - x_min, x_max = bbox[0], bbox[0] + bbox[2] - y_min, y_max = bbox[1], bbox[1] + bbox[3] - annos.append(list(map(round, [x_min, y_min, x_max, y_max])) + [train_cls_dict[class_name]]) - if len(annos) >= 1: - image_files.append(image_path) - image_anno_dict[image_path] = np.array(annos) - return image_files, image_anno_dict - - -def anno_parser(annos_str): - """Parse annotation from string to list.""" - annos = [] - for anno_str in annos_str: - anno = list(map(int, anno_str.strip().split(','))) - annos.append(anno) - return annos - - -def filter_valid_data(image_dir, anno_path): - """Filter valid image file, which both in image_dir and anno_path.""" - image_files = [] - image_anno_dict = {} - if not os.path.isdir(image_dir): - raise RuntimeError("Path given is not valid.") - if not os.path.isfile(anno_path): - raise RuntimeError("Annotation file is not valid.") - - with open(anno_path, "rb") as f: - lines = f.readlines() - for line in lines: - line_str = line.decode("utf-8").strip() - line_split = str(line_str).split(' ') - file_name = line_split[0] - image_path = os.path.join(image_dir, file_name) - if os.path.isfile(image_path): - image_anno_dict[image_path] = 
anno_parser(line_split[1:]) - image_files.append(image_path) - return image_files, image_anno_dict - - -def data_to_mindrecord_byte_image(dataset="coco", is_training=True, prefix="ssd.mindrecord", file_num=8): - """Create MindRecord file.""" - mindrecord_dir = config.MINDRECORD_DIR - mindrecord_path = os.path.join(mindrecord_dir, prefix) - writer = FileWriter(mindrecord_path, file_num) - if dataset == "coco": - image_files, image_anno_dict = create_coco_label(is_training) - else: - image_files, image_anno_dict = filter_valid_data(config.IMAGE_DIR, config.ANNO_PATH) - - ssd_json = { - "image": {"type": "bytes"}, - "annotation": {"type": "int32", "shape": [-1, 5]}, - } - writer.add_schema(ssd_json, "ssd_json") - - for image_name in image_files: - with open(image_name, 'rb') as f: - img = f.read() - annos = np.array(image_anno_dict[image_name], dtype=np.int32) - row = {"image": img, "annotation": annos} - writer.write_raw_data([row]) - writer.commit() - - -def create_ssd_dataset(mindrecord_file, batch_size=32, repeat_num=10, device_num=1, rank=0, - is_training=True, num_parallel_workers=4): - """Creatr SSD dataset with MindDataset.""" - ds = de.MindDataset(mindrecord_file, columns_list=["image", "annotation"], num_shards=device_num, shard_id=rank, - num_parallel_workers=num_parallel_workers, shuffle=is_training) - decode = C.Decode() - ds = ds.map(input_columns=["image"], operations=decode) - compose_map_func = (lambda image, annotation: preprocess_fn(image, annotation, is_training)) - - if is_training: - hwc_to_chw = C.HWC2CHW() - ds = ds.map(input_columns=["image", "annotation"], - output_columns=["image", "box", "label", "num_match_num"], - columns_order=["image", "box", "label", "num_match_num"], - operations=compose_map_func, python_multiprocessing=True, num_parallel_workers=num_parallel_workers) - ds = ds.map(input_columns=["image"], operations=hwc_to_chw, python_multiprocessing=True, - num_parallel_workers=num_parallel_workers) - ds = ds.batch(batch_size, 
drop_remainder=True) - ds = ds.repeat(repeat_num) - else: - hwc_to_chw = C.HWC2CHW() - ds = ds.map(input_columns=["image", "annotation"], - output_columns=["image", "image_shape", "annotation"], - columns_order=["image", "image_shape", "annotation"], - operations=compose_map_func) - ds = ds.map(input_columns=["image"], operations=hwc_to_chw, num_parallel_workers=num_parallel_workers) - ds = ds.batch(batch_size, drop_remainder=True) - ds = ds.repeat(repeat_num) - return ds diff --git a/example/ssd_coco2017/util.py b/example/ssd_coco2017/util.py deleted file mode 100644 index 6e10285375..0000000000 --- a/example/ssd_coco2017/util.py +++ /dev/null @@ -1,206 +0,0 @@ -# Copyright 2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================ -"""metrics utils""" - -import numpy as np -from config import ConfigSSD -from dataset import ssd_bboxes_decode - - -def calc_iou(bbox_pred, bbox_ground): - """Calculate iou of predicted bbox and ground truth.""" - bbox_pred = np.expand_dims(bbox_pred, axis=0) - - pred_w = bbox_pred[:, 2] - bbox_pred[:, 0] - pred_h = bbox_pred[:, 3] - bbox_pred[:, 1] - pred_area = pred_w * pred_h - - gt_w = bbox_ground[:, 2] - bbox_ground[:, 0] - gt_h = bbox_ground[:, 3] - bbox_ground[:, 1] - gt_area = gt_w * gt_h - - iw = np.minimum(bbox_pred[:, 2], bbox_ground[:, 2]) - np.maximum(bbox_pred[:, 0], bbox_ground[:, 0]) - ih = np.minimum(bbox_pred[:, 3], bbox_ground[:, 3]) - np.maximum(bbox_pred[:, 1], bbox_ground[:, 1]) - - iw = np.maximum(iw, 0) - ih = np.maximum(ih, 0) - intersection_area = iw * ih - - union_area = pred_area + gt_area - intersection_area - union_area = np.maximum(union_area, np.finfo(float).eps) - - iou = intersection_area * 1. 
/ union_area - return iou - - -def apply_nms(all_boxes, all_scores, thres, max_boxes): - """Apply NMS to bboxes.""" - x1 = all_boxes[:, 0] - y1 = all_boxes[:, 1] - x2 = all_boxes[:, 2] - y2 = all_boxes[:, 3] - areas = (x2 - x1 + 1) * (y2 - y1 + 1) - - order = all_scores.argsort()[::-1] - keep = [] - - while order.size > 0: - i = order[0] - keep.append(i) - - if len(keep) >= max_boxes: - break - - xx1 = np.maximum(x1[i], x1[order[1:]]) - yy1 = np.maximum(y1[i], y1[order[1:]]) - xx2 = np.minimum(x2[i], x2[order[1:]]) - yy2 = np.minimum(y2[i], y2[order[1:]]) - - w = np.maximum(0.0, xx2 - xx1 + 1) - h = np.maximum(0.0, yy2 - yy1 + 1) - inter = w * h - - ovr = inter / (areas[i] + areas[order[1:]] - inter) - - inds = np.where(ovr <= thres)[0] - - order = order[inds + 1] - return keep - - -def calc_ap(recall, precision): - """Calculate AP.""" - correct_recall = np.concatenate(([0.], recall, [1.])) - correct_precision = np.concatenate(([0.], precision, [0.])) - - for i in range(correct_recall.size - 1, 0, -1): - correct_precision[i - 1] = np.maximum(correct_precision[i - 1], correct_precision[i]) - - i = np.where(correct_recall[1:] != correct_recall[:-1])[0] - - ap = np.sum((correct_recall[i + 1] - correct_recall[i]) * correct_precision[i + 1]) - - return ap - -def metrics(pred_data): - """Calculate mAP of predicted bboxes.""" - config = ConfigSSD() - num_classes = config.NUM_CLASSES - - all_detections = [None for i in range(num_classes)] - all_pred_scores = [None for i in range(num_classes)] - all_annotations = [None for i in range(num_classes)] - average_precisions = {} - num = [0 for i in range(num_classes)] - accurate_num = [0 for i in range(num_classes)] - - for sample in pred_data: - pred_boxes = sample['boxes'] - boxes_scores = sample['box_scores'] - annotation = sample['annotation'] - - annotation = np.squeeze(annotation, axis=0) - - pred_labels = np.argmax(boxes_scores, axis=-1) - index = np.nonzero(pred_labels) - pred_boxes = ssd_bboxes_decode(pred_boxes, index) 
- - pred_boxes = pred_boxes.clip(0, 1) - boxes_scores = np.max(boxes_scores, axis=-1) - boxes_scores = boxes_scores[index] - pred_labels = pred_labels[index] - - top_k = 50 - - for c in range(1, num_classes): - if len(pred_labels) >= 1: - class_box_scores = boxes_scores[pred_labels == c] - class_boxes = pred_boxes[pred_labels == c] - - nms_index = apply_nms(class_boxes, class_box_scores, config.MATCH_THRESHOLD, top_k) - - class_boxes = class_boxes[nms_index] - class_box_scores = class_box_scores[nms_index] - - cmask = class_box_scores > 0.5 - class_boxes = class_boxes[cmask] - class_box_scores = class_box_scores[cmask] - - all_detections[c] = class_boxes - all_pred_scores[c] = class_box_scores - - for c in range(1, num_classes): - if len(annotation) >= 1: - all_annotations[c] = annotation[annotation[:, 4] == c, :4] - - for c in range(1, num_classes): - false_positives = np.zeros((0,)) - true_positives = np.zeros((0,)) - scores = np.zeros((0,)) - num_annotations = 0.0 - - annotations = all_annotations[c] - num_annotations += annotations.shape[0] - detections = all_detections[c] - pred_scores = all_pred_scores[c] - - for index, detection in enumerate(detections): - scores = np.append(scores, pred_scores[index]) - if len(annotations) >= 1: - IoUs = calc_iou(detection, annotations) - assigned_anno = np.argmax(IoUs) - max_overlap = IoUs[assigned_anno] - - if max_overlap >= 0.5: - false_positives = np.append(false_positives, 0) - true_positives = np.append(true_positives, 1) - else: - false_positives = np.append(false_positives, 1) - true_positives = np.append(true_positives, 0) - else: - false_positives = np.append(false_positives, 1) - true_positives = np.append(true_positives, 0) - - if num_annotations == 0: - if c not in average_precisions.keys(): - average_precisions[c] = 0 - continue - accurate_num[c] = 1 - indices = np.argsort(-scores) - false_positives = false_positives[indices] - true_positives = true_positives[indices] - - false_positives = 
np.cumsum(false_positives) - true_positives = np.cumsum(true_positives) - - recall = true_positives * 1. / num_annotations - precision = true_positives * 1. / np.maximum(true_positives + false_positives, np.finfo(np.float64).eps) - - average_precision = calc_ap(recall, precision) - - if c not in average_precisions.keys(): - average_precisions[c] = average_precision - else: - average_precisions[c] += average_precision - - num[c] += 1 - - count = 0 - for key in average_precisions: - if num[key] != 0: - count += (average_precisions[key] / num[key]) - - mAP = count * 1. / accurate_num.count(1) - return mAP diff --git a/graphengine b/graphengine index c27e428e96..8891f0546c 160000 --- a/graphengine +++ b/graphengine @@ -1 +1 @@ -Subproject commit c27e428e9698dd4f9b198008596676bc2d1b49aa +Subproject commit 8891f0546c4a250095ff68e1262f58772b938fd9 diff --git a/include/inference.h b/include/inference.h new file mode 100644 index 0000000000..7e5ee27d49 --- /dev/null +++ b/include/inference.h @@ -0,0 +1,44 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_INCLUDE_MS_SESSION_H +#define MINDSPORE_INCLUDE_MS_SESSION_H + +#include +#include +#include +#include "include/ms_tensor.h" + +namespace mindspore { +class FuncGraph; +namespace inference { +class MS_API MSSession { + public: + MSSession() = default; + + static std::shared_ptr CreateSession(const std::string &device, uint32_t device_id); + + virtual uint32_t CompileGraph(std::shared_ptr funcGraphPtr) = 0; + + virtual MultiTensor RunGraph(uint32_t graph_id, const std::vector> &inputs) = 0; +}; + +std::shared_ptr MS_API LoadModel(const char *model_buf, size_t size, const std::string &device); + +void MS_API ExitInference(); +} // namespace inference +} // namespace mindspore +#endif // MINDSPORE_INCLUDE_MS_SESSION_H diff --git a/include/ms_tensor.h b/include/ms_tensor.h new file mode 100644 index 0000000000..1f9661df5e --- /dev/null +++ b/include/ms_tensor.h @@ -0,0 +1,69 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_INCLUDE_MS_TENSOR_H_ +#define MINDSPORE_INCLUDE_MS_TENSOR_H_ + +#include +#include +#include +#include "ir/dtype/type_id.h" + +namespace mindspore { +#define MS_API __attribute__((visibility("default"))) +namespace inference { +class MS_API MSTensor { + public: + MSTensor() = default; + // brief Create a MSTensor pointer. + // + // param data_type DataTypeId of tensor to be created. + // param shape Shape of tensor to be created. 
+ // return MSTensor pointer. + static MSTensor *CreateTensor(TypeId data_type, const std::vector &shape); + + ~MSTensor() = default; + + virtual TypeId data_type() const = 0; + + virtual TypeId set_data_type(const TypeId data_type) = 0; + + virtual std::vector shape() const = 0; + + virtual size_t set_shape(const std::vector &shape) = 0; + + virtual int DimensionSize(size_t index) const = 0; + // brief Get number of element in MSTensor. + // + // return Number of element in MSTensor. + virtual int ElementsNum() const = 0; + + virtual std::size_t hash() const = 0; + // brief Get byte size of data in MSTensor. + // + // return Byte size of data in MSTensor. + virtual size_t Size() const = 0; + // brief Get pointer of data in MSTensor. + // + // The data pointer can be used to both write or read data in MSTensor. + // + // return A pointer points to data in MSTensor. + virtual void *MutableData() const = 0; +}; +using MultiTensor = std::vector>; +} // namespace inference +} // namespace mindspore +#endif // MINDSPORE_INCLUDE_MS_TENSOR_H_ diff --git a/mindspore/_akg/gpu/__init__.py b/mindspore/_akg/gpu/__init__.py index f9db48c634..4c11499594 100644 --- a/mindspore/_akg/gpu/__init__.py +++ b/mindspore/_akg/gpu/__init__.py @@ -35,3 +35,5 @@ from .logical_not import LogicalNot, gpu_schedule_LogicalNot from .logical_and import LogicalAnd, gpu_schedule_LogicalAnd from .sub import Sub, gpu_schedule_Sub from .less_equal import LessEqual, gpu_schedule_LessEqual +from .notequal import NotEqual, gpu_schedule_NotEqual +from .greater_equal import GreaterEqual, gpu_schedule_GreaterEqual diff --git a/mindspore/_akg/gpu/greater_equal.py b/mindspore/_akg/gpu/greater_equal.py new file mode 100644 index 0000000000..0212cac03c --- /dev/null +++ b/mindspore/_akg/gpu/greater_equal.py @@ -0,0 +1,41 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""greater_equal""" +import _akg.tvm +from _akg.ops.math import greater_equal +from _akg.topi.generic import schedule_elemwise + +def GreaterEqual(x, y): + """GreaterEqual.""" + return greater_equal.greater_equal(x, y) + + +def gpu_schedule_GreaterEqual(outs): + """ + GPU schedule for GreaterEqual. + + Args: + outs (tvm.tensor.Tensor): Outputs of compute. + + Returns: + sch (schedule.Schedule): The created schedule. + """ + device = 'cuda' + ctx = _akg.tvm.context(device, 0) + if not ctx.exist: + raise SystemError("Skip because %s is not enabled" % device) + with _akg.tvm.target.create(device): + sch = schedule_elemwise(outs) + return sch diff --git a/mindspore/_akg/gpu/notequal.py b/mindspore/_akg/gpu/notequal.py new file mode 100644 index 0000000000..3e3a6561a1 --- /dev/null +++ b/mindspore/_akg/gpu/notequal.py @@ -0,0 +1,41 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""notequal""" +import _akg.tvm +from _akg.ops.math import notequal +from _akg.topi.generic import schedule_elemwise + +def NotEqual(x, y): + """notequal.""" + return notequal.notequal(x, y) + + +def gpu_schedule_NotEqual(outs): + """ + gpu schedule for NotEqual. + + Args: + outs (tvm.tensor.Tensor): outputs of compute. + + Returns: + sch (schedule.Schedule): The created schedule. + """ + device = 'cuda' + ctx = _akg.tvm.context(device, 0) + if not ctx.exist: + raise SystemError("Skip because %s is not enabled" % device) + with _akg.tvm.target.create(device): + sch = schedule_elemwise(outs) + return sch diff --git a/mindspore/_akg/ops/math/greater_equal.py b/mindspore/_akg/ops/math/greater_equal.py new file mode 100644 index 0000000000..00ad016643 --- /dev/null +++ b/mindspore/_akg/ops/math/greater_equal.py @@ -0,0 +1,54 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""operator dsl function: greaterequal""" +import _akg.tvm +import _akg.topi +from _akg.utils.dsl_create import produce_shapes +from _akg.utils import validation_check as vc_util + + +@vc_util.check_input_type(_akg.tvm.tensor.Tensor, _akg.tvm.tensor.Tensor) +def greater_equal(input1, input2): + """ + Check whether input1 greaterquals to input2. + + Args: + input1 (tvm.tensor.Tensor): Tensor. + input2 (tvm.tensor.Tensor): Tensor. + + Returns: + tvm.tensor.Tensor. If input1 greaterquals to input2 return True, else return False. 
+ """ + shape1 = [x.value for x in input1.shape] + shape2 = [x.value for x in input2.shape] + vc_util.check_shape(shape1) + vc_util.check_shape(shape2) + + shape1, shape2, shape = produce_shapes(shape1, shape2) + + vc_util.elemwise_dtype_check(input1.dtype, input2.dtype) + dtype = input1.dtype + + # get greaterquals compute + t_value = _akg.tvm.compute(shape, lambda *indice: _akg.tvm.const(1, dtype), "T") + f_value = _akg.tvm.compute(shape, lambda *indice: _akg.tvm.const(0, dtype), "F") + + input1_bro = _akg.topi.broadcast_to(input1, shape) + input2_bro = _akg.topi.broadcast_to(input2, shape) + c_out = _akg.tvm.compute(shape, lambda *indice: _akg.tvm.expr.Select(input1_bro[indice] >= input2_bro[indice], + t_value[indice], f_value[indice]), name="C") + res = _akg.tvm.compute(shape, lambda *indice: c_out(*indice).astype("bool"), name="res") + + return res diff --git a/mindspore/_akg/ops/math/notequal.py b/mindspore/_akg/ops/math/notequal.py new file mode 100644 index 0000000000..16d5e4a0f4 --- /dev/null +++ b/mindspore/_akg/ops/math/notequal.py @@ -0,0 +1,54 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""operator dsl function: notequal""" +import _akg.tvm +import _akg.topi +from _akg.utils.dsl_create import produce_shapes +from _akg.utils import validation_check as vc_util + + +@vc_util.check_input_type(_akg.tvm.tensor.Tensor, _akg.tvm.tensor.Tensor) +def notequal(input1, input2): + """ + check whether input1 notequals to input2. + + Args: + input1 (tvm.tensor.Tensor): Tensor. + input2 (tvm.tensor.Tensor): Tensor. + + Returns: + tvm.tensor.Tensor. If input1 notequal to input2 return True, else return False. + """ + shape1 = [x.value for x in input1.shape] + shape2 = [x.value for x in input2.shape] + vc_util.check_shape(shape1) + vc_util.check_shape(shape2) + + shape1, shape2, shape = produce_shapes(shape1, shape2) + + vc_util.elemwise_dtype_check(input1.dtype, input2.dtype) + dtype = input1.dtype + + # get notequal compute + t_value = _akg.tvm.compute(shape, lambda *indice: _akg.tvm.const(1, dtype), "T") + f_value = _akg.tvm.compute(shape, lambda *indice: _akg.tvm.const(0, dtype), "F") + + input1_bro = _akg.topi.broadcast_to(input1, shape) + input2_bro = _akg.topi.broadcast_to(input2, shape) + c_out = _akg.tvm.compute(shape, lambda *indice: _akg.tvm.expr.Select(input1_bro[indice] != input2_bro[indice], + t_value[indice], f_value[indice]), name="C") + res = _akg.tvm.compute(shape, lambda *indice: c_out(*indice).astype("bool"), name="res") + + return res diff --git a/mindspore/_extends/builtin_operations.py b/mindspore/_extends/builtin_operations.py index a423fe6395..6bd382c1b6 100644 --- a/mindspore/_extends/builtin_operations.py +++ b/mindspore/_extends/builtin_operations.py @@ -13,7 +13,6 @@ # limitations under the License. 
# ============================================================================ """builtin_operations""" -import functools import numpy as np from mindspore.common.tensor import Tensor from mindspore.common.dtype import dtype_to_nptype, get_py_obj_dtype @@ -114,6 +113,24 @@ def bool_or(x, y): """Implement `bool_or`.""" return x or y +def vm_compare(*args): + """Implement `vm_compare` for tensor.""" + obj_str = args[-1] + if obj_str == "shape": + fn = getattr(args[0].asnumpy(), obj_str) + return fn + if len(args) == 2: + fn = getattr(args[0].asnumpy(), obj_str) + return Tensor(fn()) + if isinstance(args[0], Tensor): + fn = getattr(args[0].asnumpy(), obj_str) + y = args[1].asnumpy() if isinstance(args[1], Tensor) else args[1] + else: + obj_str = "__r" + obj_str[2:] + fn = getattr(args[1].asnumpy(), obj_str) + y = args[0] + return Tensor(np.array(fn(y))) + def make_list(*xs): """Implement `make_list`.""" @@ -124,17 +141,8 @@ def list_len(x): """Implement `list_len`.""" return len(x) - -# only used in PyNative mode -def partial(*args): - """Implement `partial`.""" - func = args[0].__call__ - partial_func = functools.partial(func, *args[1:]) - return partial_func - - -# only used in PyNative mode -def depend(value, expr): +def Depend(value, expr): + """Implement `Depend`.""" return value # only used in PyNative mode diff --git a/mindspore/_extends/parallel_compile/akg_compiler/__init__.py b/mindspore/_extends/parallel_compile/akg_compiler/__init__.py new file mode 100644 index 0000000000..e30774307c --- /dev/null +++ b/mindspore/_extends/parallel_compile/akg_compiler/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ diff --git a/mindspore/_extends/parallel_compile/akg_compiler/compiler.py b/mindspore/_extends/parallel_compile/akg_compiler/compiler.py new file mode 100644 index 0000000000..de78aad7e4 --- /dev/null +++ b/mindspore/_extends/parallel_compile/akg_compiler/compiler.py @@ -0,0 +1,35 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""Providing akg compile with json""" +import sys +def run_compiler(op_json): + """ + Run AKG compiler to compile op with subprocess, if this process of + compilation failed, an exception will be raised + + Args: + op_json (str): json string of the op + + Returns: + None + """ + p = __import__("akg", globals(), locals(), ['ms'], 0) + func = getattr(p.ms, "compilewithjson") + res = func(op_json) + if not res: + raise ValueError("Compile error") + +if __name__ == "__main__": + run_compiler(sys.argv[1]) diff --git a/mindspore/_extends/parallel_compile/akg_compiler/multi_process_compiler.py b/mindspore/_extends/parallel_compile/akg_compiler/multi_process_compiler.py new file mode 100644 index 0000000000..ffe9c85dc3 --- /dev/null +++ b/mindspore/_extends/parallel_compile/akg_compiler/multi_process_compiler.py @@ -0,0 +1,71 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Providing multi process compile with json""" +import os +import subprocess +import sys +from multiprocessing import Pool, cpu_count + + +def _compile_akg_task(*json_strs): + """ + compile func called in single process + + Parameters: + json_strs: list. List contains multiple kernel infos, suitable for json compile api. 
+ """ + akg_compiler = os.path.join(os.path.split( + os.path.realpath(__file__))[0], "compiler.py") + for json_str in json_strs: + res = subprocess.run( + [sys.executable, akg_compiler, json_str], text=True) + if res.returncode != 0: + raise ValueError("Failed, args: {}!".format(json_str)) + + +def compile_akg_kernel_parallel(json_infos, process, waitime): + """ + compile kernel use multi processes + + Parameters: + json_infos: list. list contain kernel info(task id and json str) + process: int. processes num + waittime: int. max time the function blocked + + Returns: + True for all compile success, False for some failed. + """ + if not isinstance(json_infos, list): + raise ValueError("json_infos must be a list") + if not isinstance(process, int): + raise ValueError("process must be a num") + if not isinstance(waitime, int): + raise ValueError("waittime must be a num") + + if process == 0 and json_infos: + process = 1 + + cpu_proc_num = cpu_count() + max_proc_num = 16 + process = min([cpu_proc_num, max_proc_num, process]) + + args = [[] for _ in range(process)] + for p, info in enumerate(json_infos): + args[p % process].append(info) + + with Pool(processes=process) as pool: + res = pool.starmap_async(_compile_akg_task, args) + res.get(timeout=waitime) + return True diff --git a/mindspore/_extends/parallel_compile/multi_compiler.py b/mindspore/_extends/parallel_compile/multi_compiler.py deleted file mode 100644 index 86e1b684d2..0000000000 --- a/mindspore/_extends/parallel_compile/multi_compiler.py +++ /dev/null @@ -1,107 +0,0 @@ -# Copyright 2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""Providing multi process compile with json""" -import json -import math -import os -import subprocess -import sys -from multiprocessing import Pool - - -def _compiletask(platform, *jsons): - """ - compile func called in single process - - Parameters: - platform: str. AKG platform or TBE platform - *jsons: str. json str contain kernel info, suitable for json compile - api - - """ - if platform == "AKG": - p = __import__("_akg", globals(), locals(), ['ms'], 0) - func = getattr(p.ms, "compilewithjson") - for json_item in jsons: - res = func(json_item) - if not res: - raise ValueError("Compile error") - if platform == "TBE": - tbe_compiler = os.path.join(os.path.split(os.path.realpath(__file__))[0], "tbe_compiler", "compiler.py") - for json_item in jsons: - res = subprocess.run([sys.executable, tbe_compiler], input=json_item, text=True) - if res.returncode != 0: - raise ValueError("Tbe compile error") - - -def compilekernelparallel(jsons, process, waitime): - """ - compile kernel use multi processes - - Parameters: - jsons: list. json str list contain kernel info - process: int. processes num - waittime: int. 
max time the function blocked - """ - if not isinstance(jsons, list): - raise ValueError("jsons must be a list") - if not isinstance(process, int): - raise ValueError("process must be a num") - if not isinstance(waitime, int): - raise ValueError("waittime must be a num") - - jsons_akg = [] - jsons_tbe = [] - for json_ in jsons: - j = json.loads(json_) - if j["platform"] == "TBE": - jsons_tbe.append(json_) - continue - if j["platform"] == "AKG": - jsons_akg.append(json_) - continue - raise RuntimeError( - "not support this platform {0}".format(j["platform"])) - if jsons_akg: - process_akg = math.floor(len(jsons)/len(jsons_akg)*process) - else: - process_akg = 0 - - if process_akg == 0 and jsons_akg: - process_akg = 1 - process_tbe = process-process_akg - if process_tbe == 0 and jsons_tbe: - process_tbe = 1 - raise RuntimeWarning("we add a process for compile more operator") - - args = [[] for _ in range(process_akg+process_tbe)] - args_lens = len(args) - for p in range(args_lens): - if p < process_tbe: - args[p].append("TBE") - else: - args[p].append("AKG") - jsons_tbe_lens = len(jsons_tbe) - for p in range(jsons_tbe_lens): - args[p % process_tbe].append(jsons_tbe[p]) - jsons_akg_lens = len(jsons_akg) - for p in range(jsons_akg_lens): - args[process-p % process_akg-1].append(jsons_akg[p]) - for p in range(args_lens): - args[p] = tuple(args[p]) - with Pool(processes=process) as pool: - res = pool.starmap_async(_compiletask, args) - res.get(timeout=waitime) - return True diff --git a/mindspore/_extends/parallel_compile/tbe_compiler/common.py b/mindspore/_extends/parallel_compile/tbe_compiler/common.py index 1aeba9889d..3d55cf60a2 100644 --- a/mindspore/_extends/parallel_compile/tbe_compiler/common.py +++ b/mindspore/_extends/parallel_compile/tbe_compiler/common.py @@ -15,13 +15,6 @@ """tbe common""" import json import os -from attrdict import AttrDict - -class ParamType(AttrDict): - Required = "required" - Dynamic = "dynamic" - Optional = "optional" - class 
TBEException(Exception): """tbe exception class""" @@ -112,7 +105,7 @@ def get_input_output(io_info, args): if len(item) > 1: arg.append(info) else: - if info['param_type'] == ParamType.Dynamic: + if info['param_type'] == 'dynamic': arg.append(info) args.append(arg) else: diff --git a/mindspore/_extends/parallel_compile/tbe_compiler/compiler.py b/mindspore/_extends/parallel_compile/tbe_compiler/compiler.py index c385f7dee0..a241bf9e10 100755 --- a/mindspore/_extends/parallel_compile/tbe_compiler/compiler.py +++ b/mindspore/_extends/parallel_compile/tbe_compiler/compiler.py @@ -28,7 +28,8 @@ build_in_impl_path = get_build_in_impl_path() # op function list op_build = "compile" op_pre_build = "pre_build" - +fusion_pattern_start_flag = "fusion_pattern_start" +fusion_pattern_end_flag = "fusion_pattern_end" def _initialize(impl_path): """Initialize""" @@ -42,7 +43,6 @@ def _initialize(impl_path): sys.path.insert(0, op_module_name) - def build_op(build_type, json_str): """ call op functions with function name and input args json_str @@ -108,7 +108,7 @@ def build_op(build_type, json_str): # pre build if build_type == op_pre_build: - op_func(*inputs_args, *outputs_args, *attrs_args, kernel_name) + op_func(*inputs_args, *outputs_args, *attrs_args, kernel_name=kernel_name) # disable only pattern configuration op_build_cfg_en() return get_op_pattern() @@ -159,11 +159,14 @@ def compile_with_json(json_str): json_info = json.loads(json_str) if "fusion_op" in json_info: ret = compile_fusion_op(json_str) + elif "compile_type" in json_info: + ret = build_op(op_pre_build, json_str) else: ret = build_op(op_build, json_str) return ret - if __name__ == "__main__": in_args = sys.stdin.readline() - compile_with_json(in_args) + result = compile_with_json(in_args) + sys.stdout.write(fusion_pattern_start_flag + str(result) + fusion_pattern_end_flag) + sys.stdout.flush() diff --git a/mindspore/_extends/parallel_compile/tbe_compiler/tbe_process.py 
b/mindspore/_extends/parallel_compile/tbe_compiler/tbe_process.py index 9a3846c4f9..80b50c45a9 100644 --- a/mindspore/_extends/parallel_compile/tbe_compiler/tbe_process.py +++ b/mindspore/_extends/parallel_compile/tbe_compiler/tbe_process.py @@ -75,7 +75,6 @@ def check_supported(op_json: str): return ret - def run_compiler(op_json): """ run compiler to compile op with subprocess @@ -88,15 +87,16 @@ def run_compiler(op_json): """ try: tbe_compiler = os.path.join(os.path.split(os.path.realpath(__file__))[0], "compiler.py") - subprocess.run([sys.executable, tbe_compiler], input=op_json, timeout=300, - text=True, capture_output=True, check=True) - return "Success", "Success" + completed_object = subprocess.run([sys.executable, tbe_compiler], input=op_json, timeout=300, + text=True, capture_output=True, check=True) + if completed_object: + out = completed_object.stdout + return "Success", out except subprocess.TimeoutExpired: tb = traceback.format_exc() - return "TBEException", "CompileTimeOut: " + tb + "\ninput_args: " + op_json + return "TBEException", "PreCompileTimeOut: " + tb + "\ninput_args: " + op_json except subprocess.CalledProcessError as e: - return "TBEException", "CompileProcessFailed:\n" + e.stdout + "\n" + e.stderr + "\ninput_args: " + op_json - + return "TBEException", "PreCompileProcessFailed:\n" + e.stdout + "\n" + e.stderr + "\ninput_args: " + op_json class CompilerPool: """compiler pool""" @@ -154,11 +154,11 @@ class CompilerPool: task_id, task_future = self.__running_tasks.pop(0) ret_type, result = task_future.get(330) if ret_type == "Success": - ret = task_id, "Success" + ret = task_id, "Success", result elif ret_type in ("Exception", "TBEException"): - ret = task_id, ret_type + ":" + result + ret = task_id, ret_type + ":" + result, "_" else: - ret = task_id, "Exception: Not support return type:" + str(ret_type) + ret = task_id, "Exception: Not support return type:" + str(ret_type), "_" return ret def reset_task_info(self): diff --git 
a/mindspore/_extends/parse/__init__.py b/mindspore/_extends/parse/__init__.py index 62ba2e5406..323932560a 100644 --- a/mindspore/_extends/parse/__init__.py +++ b/mindspore/_extends/parse/__init__.py @@ -19,14 +19,15 @@ Interfaces for parser module in c++. from .parser import (Parser, create_obj_instance, generate_scope, get_bprop_method_of_class, get_class_instance_type, get_class_member_namespace_symbol, create_slice_obj, - get_dataclass_attributes, get_dataclass_methods, + get_dataclass_attributes, get_dataclass_methods, get_obj_id, get_module_namespace, get_obj_type, get_object_key, - get_parse_method_of_class, get_scope_name, - is_class_member, parse_cb, resolve_symbol, create_ellipsis_obj) + get_default_input, get_parse_method_of_class, get_scope_name, + is_class_member, parse_cb, resolve_symbol) from .serialize import * __all__ = ['parse_cb', 'get_parse_method_of_class', 'get_bprop_method_of_class', 'resolve_symbol', - 'get_object_key', 'get_class_instance_type', 'is_class_member', 'get_obj_type', - 'create_obj_instance', 'get_module_namespace', 'get_class_member_namespace_symbol', - 'Parser', 'get_dataclass_attributes', 'get_dataclass_methods', 'dump_obj', 'load_obj', - 'get_dataclass_methods', 'get_scope_name', 'create_slice_obj', 'create_ellipsis_obj'] + 'get_object_key', 'get_default_input', 'get_class_instance_type', 'is_class_member', + 'get_obj_type', 'get_obj_id', 'create_obj_instance', 'get_module_namespace', + 'get_class_member_namespace_symbol', 'get_obj_id', 'Parser', 'get_dataclass_attributes', + 'get_dataclass_methods', 'dump_obj', 'load_obj', 'get_dataclass_methods', 'get_scope_name', + 'create_slice_obj'] diff --git a/mindspore/_extends/parse/parser.py b/mindspore/_extends/parse/parser.py index 462565fd7f..2a1c9e0943 100644 --- a/mindspore/_extends/parse/parser.py +++ b/mindspore/_extends/parse/parser.py @@ -29,7 +29,6 @@ from mindspore.common.dtype import pytype_to_dtype from mindspore.common.api import _MindSporeFunction from .namespace 
import CellNamespace, ClosureNamespace, ClassMemberNamespace from .resources import parse_object_map, convert_object_map, trope_ns, SYMBOL_UNDEFINE, NO_IMPLEMENT -from ..utils import Slice, Ellipsis_ # define return value RET_SUCCESS = 0 @@ -70,14 +69,9 @@ parse_expr_statement_white_list = ( "append", ) -def create_ellipsis_obj(): - """Create Slice object""" - return Ellipsis_() - - def create_slice_obj(start, end, step): - """Create Slice object""" - return Slice(start, end, step) + """Create slice object""" + return slice(start, end, step) def parse_cb(func, parse_method=None): @@ -209,6 +203,14 @@ def get_object_key(obj): obj_id = instance_id + obj_id return obj_id, obj_key +def get_default_input(obj): + if hasattr(obj, '__parameter__'): + return obj.default_input + if isinstance(obj, tuple): + convert = lambda x: x.default_input if hasattr(x, '__parameter__') else x + args = tuple(convert(x) for x in obj) + return args + return obj def is_class_member(node): """Check the attr is class member variable.""" @@ -221,6 +223,9 @@ def is_class_member(node): return True return False +def get_obj_id(obj): + """Get the obj id.""" + return str(id(obj)) def get_obj_type(obj): """Get the obj type.""" diff --git a/mindspore/_extends/parse/resources.py b/mindspore/_extends/parse/resources.py index 60847c4338..2ae8b7172f 100644 --- a/mindspore/_extends/parse/resources.py +++ b/mindspore/_extends/parse/resources.py @@ -126,7 +126,7 @@ convert_object_map = { T.make_list: F.make_list, T.make_slice: F.make_slice, T.range: F.make_range, - + T.while_cond: M.while_cond, # lib function math.floor: NO_IMPLEMENT, math.trunc: NO_IMPLEMENT, diff --git a/mindspore/_extends/parse/standard_method.py b/mindspore/_extends/parse/standard_method.py index 2c94240ba2..0f3f843b63 100644 --- a/mindspore/_extends/parse/standard_method.py +++ b/mindspore/_extends/parse/standard_method.py @@ -16,8 +16,10 @@ # ============================================================================ 
"""standard_method""" from dataclasses import dataclass +from mindspore.common import dtype as mstype from ...ops import functional as F from ...ops import operations as P +from ...ops.primitive import constexpr from ...ops.composite import tail, core, MultitypeFuncGraph, env_get, hyper_add, \ zeros_like, ones_like from ...ops.composite.base import _append @@ -102,11 +104,44 @@ def bool_(x): return x.__bool__() -def tensor_bool(x): - """return immedate x, x is a tensor of bool value""" +def while_cond(x): + """For while condtion, if the condition is a tensor, the loop will not be unrolled""" + if F.issubclass_(F.typeof(x), F.typeof(mstype.tensor)): + is_cond = check_is_tensor_bool_cond(F.shape(x)) + if is_cond: + return F.cast(x, mstype.bool_) return x +@constexpr +def check_is_tensor_bool_cond(shp): + """check if tensor is a bool condition""" + if shp in ((), (1,)): + return True + raise ValueError("tensor as bool condition, its shape should be () or (1,), but got ", shp) + +@constexpr +def const_tensor_to_bool(x): + """convert bool tensor to bool condition""" + if x is None: + raise ValueError("Only constant tensor bool can be converted to bool") + x = x.asnumpy() + if x.shape not in ((), (1,)): + raise ValueError("Tensor to bool should input shape () or (1), but got ", x.shape) + if x.shape == (): + value = bool(x) + else: + value = bool(x[0]) + return value + +def tensor_bool(x): + """tensor as conditon, if is constant, return immediate bool value""" + is_cond = check_is_tensor_bool_cond(F.shape(x)) + if is_cond and F.isconstant(x): + return const_tensor_to_bool(x) + return F.cast(x, mstype.bool_) + + def and_(x, y): """Implementation of `and` (`&`).""" return x.__and__(y) diff --git a/mindspore/_extends/parse/trope.py b/mindspore/_extends/parse/trope.py index 7b40adcd16..f169c58fb9 100644 --- a/mindspore/_extends/parse/trope.py +++ b/mindspore/_extends/parse/trope.py @@ -91,3 +91,7 @@ def to_array(x): # pragma: no cover def not_contains(x): # pragma: no cover 
"""Not in function.""" raise RuntimeError('This operation is not meant to be called directly.') + +def while_cond(x): # pragma: no cover + """Not in function.""" + raise RuntimeError('This operation is not meant to be called directly.') diff --git a/mindspore/_extends/utils.py b/mindspore/_extends/utils.py index fecbf546f5..8469ddda8b 100644 --- a/mindspore/_extends/utils.py +++ b/mindspore/_extends/utils.py @@ -19,7 +19,6 @@ import logging import os import inspect from functools import wraps -from dataclasses import dataclass def cal_sha256(file_path): @@ -100,20 +99,3 @@ def cell_attr_register(fn=None, attrs=None): if fn is not None: return wrap_cell(fn) return wrap_cell - - -@dataclass -class Slice: - """ - Slice class - """ - start: int - end: int - step: int - - -@dataclass -class Ellipsis_: - """ - Ellipsis class - """ diff --git a/mindspore/ccsrc/CMakeLists.txt b/mindspore/ccsrc/CMakeLists.txt index 4184d29281..c435672bde 100644 --- a/mindspore/ccsrc/CMakeLists.txt +++ b/mindspore/ccsrc/CMakeLists.txt @@ -8,6 +8,10 @@ if (CMAKE_SYSTEM_NAME MATCHES "Windows") add_compile_definitions(BUILDING_DLL) endif() +if (ENABLE_MPI) + add_compile_definitions(ENABLE_MPI) +endif () + if(ENABLE_GPU) find_package(CUDA REQUIRED) find_package(Threads) @@ -35,7 +39,7 @@ if(ENABLE_GPU) "device/gpu/*.cu" "kernel/gpu/*.cu" "kernel/akg/gpu/*.cc" - "kernel/akg/akgkernelbuild.cc" + "kernel/akg/akg_kernel_build.cc" "kernel/akg/akg_kernel_attrs_process.cc" ) @@ -75,7 +79,9 @@ if (ENABLE_DUMP_PROTO) file(GLOB_RECURSE PROTO_PY RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "utils/anf_ir.proto" "utils/summary.proto" + "utils/lineage.proto" "utils/checkpoint.proto" + "utils/print.proto" ) ms_protobuf_generate_py(PY_SRCS PY_HDRS PY_PYS ${PROTO_PY}) @@ -120,7 +126,11 @@ endforeach () set_property(SOURCE ${SUB_OBJECTS_SRC} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_ME) add_library(mindspore STATIC ${SUB_OBJECTS_SRC}) target_link_libraries(mindspore proto_input) 
-target_link_libraries(mindspore securec mindspore::flatbuffers) +if (ENABLE_CPU AND ENABLE_MPI) + target_link_libraries(mindspore securec mindspore::flatbuffers mindspore::ompi) +else () + target_link_libraries(mindspore securec mindspore::flatbuffers) +endif () if (NOT WIN32) target_link_libraries(mindspore dl) endif() @@ -227,3 +237,29 @@ if (ENABLE_MINDDATA) add_subdirectory(mindrecord) add_subdirectory(dataset) endif () + +# build inference +set(LOAD_ONNX_SRC + ${CMAKE_CURRENT_SOURCE_DIR}/utils/load_onnx/anf_converter.cc + ${CMAKE_CURRENT_SOURCE_DIR}/utils/load_onnx/anf_model_parser.cc + ) +add_library(inference SHARED + ${CMAKE_CURRENT_SOURCE_DIR}/session/session.cc + ${LOAD_ONNX_SRC} + ) +target_link_libraries(inference PRIVATE ${PYTHON_LIBRARIES} ${SECUREC_LIBRARY} + -Wl,--whole-archive mindspore -Wl,--no-whole-archive mindspore_gvar mindspore::protobuf) + +if (ENABLE_CPU) + target_link_libraries(inference PRIVATE mindspore::dnnl mindspore::mkldnn) +endif () + +if (USE_GLOG) + target_link_libraries(inference PRIVATE mindspore::glog) +else() + if (CMAKE_SYSTEM_NAME MATCHES "Linux") + target_link_options(inference PRIVATE -Wl,-init,mindspore_log_init) + elseif (CMAKE_SYSTEM_NAME MATCHES "Darwin") + set_target_properties(inference PROPERTIES MACOSX_RPATH ON) + endif () +endif() diff --git a/mindspore/ccsrc/common/trans.cc b/mindspore/ccsrc/common/trans.cc index 55e4761036..9cf6eb3a5a 100644 --- a/mindspore/ccsrc/common/trans.cc +++ b/mindspore/ccsrc/common/trans.cc @@ -14,11 +14,9 @@ * limitations under the License. 
*/ #include "common/trans.h" -#include #include #include #include -#include "./securec.h" #include "common/utils.h" #include "session/anf_runtime_algorithm.h" #include "kernel/kernel.h" @@ -29,34 +27,7 @@ namespace mindspore { namespace trans { -namespace { -std::vector PaddingShapeTo4dByDefault(const std::vector &shape) { - std::vector shape_4d(4, 1); - switch (shape.size()) { - case 0: - return shape_4d; - case 1: - shape_4d[1] = shape[0]; - break; - case 2: - shape_4d[1] = shape[0]; - shape_4d[2] = shape[1]; - break; - case 3: - shape_4d[1] = shape[0]; - shape_4d[2] = shape[1]; - shape_4d[3] = shape[2]; - break; - case 4: - std::copy(shape.begin(), shape.end(), shape_4d.begin()); - break; - default: - MS_LOG(EXCEPTION) << "Unexpect shape size = " << shape.size(); - } - return shape_4d; -} -} // namespace -const size_t kNchwDims = 4; +enum kAxis : int { kN = 0, kC, kH, kW, kNchwDims, kNdhwc }; const std::map type_map = {{kNumberTypeBool, 1}, {kNumberTypeInt, 4}, {kNumberTypeInt8, 1}, {kNumberTypeInt16, 2}, {kNumberTypeInt32, 4}, {kNumberTypeInt64, 8}, {kNumberTypeUInt, 4}, {kNumberTypeUInt8, 1}, {kNumberTypeUInt16, 2}, @@ -84,7 +55,10 @@ inline void SetData(size_t size, bool pad_zero, size_t src_idx, size_t dst_idx, template T DivCeil(T n1, T n2) { - return (n2 != 0) ? 
(n1 - 1) / n2 + 1 : 0; + if (n2 != 0) { + return (n1 - 1) / n2 + 1; + } + return 0; } enum DataTypeTransMode { @@ -226,8 +200,7 @@ size_t CubeSizeByType(const TypeId data_type) { } size_t ShapeSize(const std::vector &shape) { - size_t product = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); - return product; + return std::accumulate(shape.begin(), shape.end(), IntToSize(1), std::multiplies()); } size_t TypeIdSize(const TypeId data_type) { @@ -239,57 +212,9 @@ size_t TypeIdSize(const TypeId data_type) { return unsupported_type_error; } -bool IsNeedPadding(const std::string &format, const size_t shape_size) { - if (shape_size == 0) { - return false; - } - if (format == kOpFormat_DEFAULT || format == kOpFormat_FRAC_NZ) { - return false; - } else if (shape_size < 4) { - return true; - } - return false; -} - -std::vector GetRuntimePaddingShape(const AnfNodePtr &node, size_t index) { - std::vector shape; - std::vector host_shape; - if (node->isa()) { - auto value_node = node->cast(); - auto node_value = value_node->value(); - auto tensor = node_value->cast(); - if (tensor == nullptr) { - MS_LOG(EXCEPTION) << " the node[ " << node->DebugString() << "]'s cannot convert "; - } - auto shape_temp = tensor->shape(); - (void)std::transform(shape_temp.begin(), shape_temp.end(), std::back_inserter(host_shape), IntToSize); - if (host_shape.empty()) { - host_shape.push_back(1); - } - } else { - host_shape = AnfAlgo::GetOutputInferShape(node, index); - } - if (trans::IsNeedPadding(AnfAlgo::GetOutputFormat(node, 0), host_shape.size())) { - host_shape = trans::PaddingShapeTo4d(host_shape, AnfAlgo::GetOutputReshapeType(node, 0)); - } - std::transform(host_shape.begin(), host_shape.end(), std::back_inserter(shape), SizeToInt); - return shape; -} - -std::vector PaddingShapeTo4d(const std::vector &shape, const std::vector &padding_axis) { - if (padding_axis.empty() || shape.size() != padding_axis.size()) { - return PaddingShapeTo4dByDefault(shape); - } - std::vector 
shape_4d(4, 1); - for (size_t index = 0; index < padding_axis.size(); index++) { - shape_4d[padding_axis[index]] = shape[index]; - } - return shape_4d; -} - namespace { bool CheckDims(const std::vector &shape) { - if (shape.size() != 4) { + if (shape.size() != kNchwDims) { MS_LOG(ERROR) << "Host shape dims shoud be 4"; return false; } @@ -308,10 +233,10 @@ std::vector NhwcDeviceShape(const std::vector &shape) { MS_LOG(EXCEPTION) << "Ccheck dims failed."; } std::vector device_shape; - device_shape.push_back(shape[0]); - device_shape.push_back(shape[2]); - device_shape.push_back(shape[3]); - device_shape.push_back(shape[1]); + device_shape.push_back(shape[kN]); + device_shape.push_back(shape[kH]); + device_shape.push_back(shape[kW]); + device_shape.push_back(shape[kC]); return device_shape; } @@ -320,10 +245,10 @@ std::vector HwchDeviceShape(const std::vector &shape) { MS_LOG(EXCEPTION) << "Check dims failed."; } std::vector device_shape; - device_shape.push_back(shape[2]); - device_shape.push_back(shape[3]); - device_shape.push_back(shape[1]); - device_shape.push_back(shape[0]); + device_shape.push_back(shape[kH]); + device_shape.push_back(shape[kW]); + device_shape.push_back(shape[kC]); + device_shape.push_back(shape[kN]); return device_shape; } @@ -332,9 +257,9 @@ std::vector FracZDeviceShape(const std::vector &shape) { MS_LOG(EXCEPTION) << "Check dims failed."; } std::vector device_shape; - size_t cout16 = ((shape[0] + kCubeSize - 1) / kCubeSize) * kCubeSize; - size_t cin16 = ((shape[1] + kCubeSize - 1) / kCubeSize) * kCubeSize; - device_shape.push_back(shape[2] * shape[3] * cin16 / kCubeSize); + const size_t cout16 = ((shape[kN] + kCubeSize - 1) / kCubeSize) * kCubeSize; + const size_t cin16 = ((shape[kC] + kCubeSize - 1) / kCubeSize) * kCubeSize; + device_shape.push_back(shape[kH] * shape[kW] * cin16 / kCubeSize); device_shape.push_back(cout16 / kCubeSize); device_shape.push_back(kCubeSize); device_shape.push_back(kCubeSize); @@ -346,12 +271,12 @@ std::vector 
Nc1hwc0DeviceShape(const std::vector &shape) { MS_LOG(EXCEPTION) << "Check dims failed."; } std::vector device_shape; - size_t C1 = (shape[1] + kCubeSize - 1) / kCubeSize; - size_t C0 = kCubeSize; - device_shape.push_back(shape[0]); + const size_t C1 = (shape[kC] + kCubeSize - 1) / kCubeSize; + const size_t C0 = kCubeSize; + device_shape.push_back(shape[kN]); device_shape.push_back(C1); - device_shape.push_back(shape[2]); - device_shape.push_back(shape[3]); + device_shape.push_back(shape[kH]); + device_shape.push_back(shape[kW]); device_shape.push_back(C0); return device_shape; } @@ -361,10 +286,10 @@ std::vector C1hwncoc0DeviceShape(const std::vector &shape) { MS_LOG(EXCEPTION) << "Check dims failed."; } std::vector device_shape; - device_shape.push_back((shape[1] - 1) / kCubeSize + 1); - device_shape.push_back(shape[2]); - device_shape.push_back(shape[3]); - device_shape.push_back(shape[0]); + device_shape.push_back((shape[kC] - 1) / kCubeSize + 1); + device_shape.push_back(shape[kH]); + device_shape.push_back(shape[kW]); + device_shape.push_back(shape[kN]); device_shape.push_back(kCubeSize); device_shape.push_back(kCubeSize); return device_shape; @@ -375,9 +300,9 @@ std::vector FracZc04DeviceShape(const std::vector &shape) { MS_LOG(EXCEPTION) << "Check dims failed."; } std::vector device_shape; - size_t c0 = 4; - auto first_dim = DivCeil(c0 * shape.at(2) * shape.at(3), kCubeSize); - auto no = DivCeil(shape.at(0), kCubeSize); + const size_t c0 = 4; + auto first_dim = DivCeil(c0 * shape[kH] * shape[kW], kCubeSize); + auto no = DivCeil(shape.at(kN), kCubeSize); device_shape.push_back(first_dim); device_shape.push_back(no); device_shape.push_back(kCubeSize); @@ -390,24 +315,101 @@ std::vector Nc1hwc04DeviceShape(const std::vector &shape) { MS_LOG(EXCEPTION) << "Check dims failed."; } std::vector device_shape; - size_t C1 = 1; - size_t C0 = 4; - device_shape.push_back(shape[0]); + const size_t C1 = 1; + const size_t C0 = 4; + device_shape.push_back(shape[kN]); 
device_shape.push_back(C1); - device_shape.push_back(shape[2]); - device_shape.push_back(shape[3]); + device_shape.push_back(shape[kH]); + device_shape.push_back(shape[kW]); device_shape.push_back(C0); return device_shape; } std::vector NdhwcDeviceShape(const std::vector &shape) { - if (shape.size() < 5) { + if (shape.size() < kNdhwc) { MS_LOG(EXCEPTION) << "Shape dims must be 5 when format is ndhwc."; } return shape; } + +std::vector PaddingShapeTo4dByDefault(const std::vector &shape) { + std::vector shape_4d(kNchwDims, 1); + switch (shape.size()) { + case 0: + return shape_4d; + case 1: + shape_4d[kC] = shape[kN]; + break; + case 2: + shape_4d[kC] = shape[kN]; + shape_4d[kH] = shape[kC]; + break; + case 3: + shape_4d[kC] = shape[kN]; + shape_4d[kH] = shape[kC]; + shape_4d[kW] = shape[kH]; + break; + case 4: + std::copy(shape.begin(), shape.end(), shape_4d.begin()); + break; + default: + MS_LOG(EXCEPTION) << "Unexpect shape size = " << shape.size(); + } + return shape_4d; +} } // namespace +bool IsNeedPadding(const std::string &format, const size_t shape_size) { + if (shape_size == 0) { + return false; + } + if (format == kOpFormat_DEFAULT || format == kOpFormat_FRAC_NZ) { + return false; + } else if (shape_size < kNchwDims) { + return true; + } + return false; +} + +std::vector GetRuntimePaddingShape(const AnfNodePtr &node, size_t index) { + MS_EXCEPTION_IF_NULL(node); + std::vector shape; + std::vector host_shape; + if (node->isa()) { + auto value_node = node->cast(); + MS_EXCEPTION_IF_NULL(value_node); + auto node_value = value_node->value(); + MS_EXCEPTION_IF_NULL(node_value); + auto tensor = node_value->cast(); + if (tensor == nullptr) { + MS_LOG(EXCEPTION) << " The node[ " << node->DebugString() << "]'s cannot convert "; + } + auto shape_temp = tensor->shape(); + (void)std::transform(shape_temp.begin(), shape_temp.end(), std::back_inserter(host_shape), IntToSize); + if (host_shape.empty()) { + host_shape.push_back(1); + } + } else { + host_shape = 
AnfAlgo::GetOutputInferShape(node, index); + } + if (trans::IsNeedPadding(AnfAlgo::GetOutputFormat(node, 0), host_shape.size())) { + host_shape = trans::PaddingShapeTo4d(host_shape, AnfAlgo::GetOutputReshapeType(node, 0)); + } + std::transform(host_shape.begin(), host_shape.end(), std::back_inserter(shape), SizeToInt); + return shape; +} + +std::vector PaddingShapeTo4d(const std::vector &shape, const std::vector &padding_axis) { + if (padding_axis.empty() || shape.size() != padding_axis.size()) { + return PaddingShapeTo4dByDefault(shape); + } + std::vector shape_4d(kNchwDims, 1); + for (size_t index = 0; index < padding_axis.size(); index++) { + shape_4d[padding_axis[index]] = shape[index]; + } + return shape_4d; +} + std::vector TransShapeToDevice(const std::vector &shape, const std::string &format) { using DeviceShapeTransfer = std::function(const std::vector &)>; const std::map device_shape_map{{kOpFormat_NCHW, NchwDeviceShape}, @@ -426,6 +428,10 @@ std::vector TransShapeToDevice(const std::vector &shape, const s auto temp_shape = shape; std::vector device_shape; if (format == kOpFormat_FRAC_NZ) { + if (shape.size() == 1 && (shape[0] == 1 || shape[0] % kCubeSize == 0)) { + // For [1] and [1024] shape we can trait it as NZ shape + return shape; + } if (shape.size() < 2) { MS_LOG(EXCEPTION) << "Format" << format << " is not support shape " << shape.size(); } else { @@ -439,7 +445,7 @@ std::vector TransShapeToDevice(const std::vector &shape, const s device_shape.push_back(kCubeSize); return device_shape; } - if (shape.size() != 4) { + if (shape.size() != kNchwDims) { MS_LOG(WARNING) << "Get Device Shape using a shape size is less than 4 ,should be Padding shape by Default firstly"; temp_shape = PaddingShapeTo4dByDefault(shape); } @@ -455,6 +461,8 @@ bool CheckArgs(const FormatArgs &args, size_t *size, size_t *total_size) { MS_LOG(ERROR) << "Invalid host shape, host shape dims:" << args.host_shape.size() << ", expect dims:" << kNchwDims; return false; } + 
MS_EXCEPTION_IF_NULL(size); + MS_EXCEPTION_IF_NULL(total_size); *size = TypeIdSize(args.src_data_type); if (*size < 1) { MS_LOG(ERROR) << "Illegal dtype."; @@ -540,10 +548,10 @@ bool NchwTo4D(const FormatArgs &args, void *result) { MS_LOG(ERROR) << "Check args failed."; return false; } - size_t n = args.host_shape[0]; - size_t c = args.host_shape[1]; - size_t h = args.host_shape[2]; - size_t w = args.host_shape[3]; + auto n = args.host_shape[kN]; + auto c = args.host_shape[kC]; + auto h = args.host_shape[kH]; + auto w = args.host_shape[kW]; for (size_t ni = 0; ni < n; ni++) { for (size_t ci = 0; ci < c; ci++) { for (size_t hi = 0; hi < h; hi++) { @@ -572,10 +580,10 @@ bool ToNchw(const FormatArgs &args, void *result) { MS_LOG(ERROR) << "Check args failed."; return false; } - size_t n = args.host_shape[0]; - size_t c = args.host_shape[1]; - size_t h = args.host_shape[2]; - size_t w = args.host_shape[3]; + auto n = args.host_shape[kN]; + auto c = args.host_shape[kC]; + auto h = args.host_shape[kH]; + auto w = args.host_shape[kW]; for (size_t ni = 0; ni < n; ni++) { for (size_t ci = 0; ci < c; ci++) { for (size_t hi = 0; hi < h; hi++) { @@ -602,32 +610,32 @@ bool NchwToFracZ(const FormatArgs &args, void *result) { MS_LOG(ERROR) << "Invalid host shape, host shape dims:" << args.host_shape.size() << ", expect dims:" << kNchwDims; return false; } - size_t size = TypeIdSize(args.src_data_type); + auto size = TypeIdSize(args.src_data_type); if (size < 1) { MS_LOG(ERROR) << "Illegal dtype."; return false; } - auto n = args.host_shape[0]; - auto c = args.host_shape[1]; - auto h = args.host_shape[2]; - auto w = args.host_shape[3]; + auto n = args.host_shape[kN]; + auto c = args.host_shape[kC]; + auto h = args.host_shape[kH]; + auto w = args.host_shape[kW]; - size_t c0 = CubeSizeByType(args.src_data_type); + auto c0 = CubeSizeByType(args.src_data_type); if (c0 < 1) { MS_LOG(ERROR) << "Illegal dtype."; return false; } - size_t c1 = DivCeil(c, c0); - size_t hw = h * w; - size_t 
chw = c * hw; - size_t hwc0 = hw * c0; - size_t nchw = n * chw; - - size_t hf_cnt = DivCeil(n, kCubeSize); - size_t vf_cnt = c1 * hw; - size_t fractal_ele_cnt = c0 * kCubeSize; - size_t total_ele_cnt = hf_cnt * vf_cnt * fractal_ele_cnt; - size_t dst_size = total_ele_cnt * size; + auto c1 = DivCeil(c, c0); + auto hw = h * w; + auto chw = c * hw; + auto hwc0 = hw * c0; + auto nchw = n * chw; + + auto hf_cnt = DivCeil(n, kCubeSize); + auto vf_cnt = c1 * hw; + auto fractal_ele_cnt = c0 * kCubeSize; + auto total_ele_cnt = hf_cnt * vf_cnt * fractal_ele_cnt; + auto dst_size = total_ele_cnt * size; if (dst_size != args.device_size) { MS_LOG(ERROR) << "Illegal total data size." << "dst size is :" << dst_size << "device size is :" << args.device_size; @@ -647,7 +655,7 @@ bool NchwToFracZ(const FormatArgs &args, void *result) { auto src_ni = hfi * kCubeSize + col; auto src_idx = src_row_offset + chw * col; auto dst_idx = gfi * fractal_ele_cnt + col * c0 + row; - auto pad_zero = (src_ni >= n || src_idx >= nchw || src_ci >= c) ? 
true : false; + auto pad_zero = src_ni >= n || src_idx >= nchw || src_ci >= c; SetData(size, pad_zero, src_idx, dst_idx, args, result); } } @@ -663,12 +671,12 @@ bool FracZToNchw(const FormatArgs &args, void *result) { MS_LOG(ERROR) << "Invalid host shape, host shape dims:" << args.host_shape.size() << ", expect dims:" << kNchwDims; return false; } - size_t size = TypeIdSize(args.src_data_type); + auto size = TypeIdSize(args.src_data_type); if (size < 1) { MS_LOG(ERROR) << "Illegal dtype."; return false; } - size_t total_size = ShapeSize(args.device_shape) * size; + auto total_size = ShapeSize(args.device_shape) * size; if (total_size != args.device_size) { MS_LOG(ERROR) << "Illegal total data size, total_size:" << total_size << ", device_size:" << args.device_size; return false; @@ -677,18 +685,16 @@ bool FracZToNchw(const FormatArgs &args, void *result) { auto n0 = args.device_shape.at(1); auto ni = args.device_shape.at(2); auto c0 = args.device_shape.at(3); - - auto n = args.host_shape[0]; - auto c = args.host_shape[1]; - auto h = args.host_shape[2]; - auto w = args.host_shape[3]; - - size_t nc = ni * n0; - size_t ncc0 = nc * c0; - size_t wncc0 = w * ncc0; - size_t hwncc0 = h * wncc0; - size_t hw = h * w; - size_t chw = c * hw; + auto n = args.host_shape[kN]; + auto c = args.host_shape[kC]; + auto h = args.host_shape[kH]; + auto w = args.host_shape[kW]; + auto nc = ni * n0; + auto ncc0 = nc * c0; + auto wncc0 = w * ncc0; + auto hwncc0 = h * wncc0; + auto hw = h * w; + auto chw = c * hw; for (size_t n_idx = 0; n_idx < n; n_idx++) { size_t n_head_addr = n_idx * chw; @@ -720,20 +726,18 @@ bool NchwToFracZc04(const FormatArgs &args, void *result) { MS_LOG(ERROR) << "Check args failed."; return false; } - size_t cube = kCubeSize; - size_t n = args.host_shape[0]; - size_t c = args.host_shape[1]; - size_t h = args.host_shape[2]; - size_t w = args.host_shape[3]; - - size_t c0 = 4; - size_t c1 = DivCeil(c, c0); - size_t hwc0 = h * w * c0; - size_t hwc = h * w * c; - 
size_t nhwc = n * h * w * c; - - size_t n_cnt = DivCeil(n, cube); - size_t v_cnt = DivCeil(h * w * c0 * c1, cube); + auto cube = kCubeSize; + auto n = args.host_shape[kN]; + auto c = args.host_shape[kC]; + auto h = args.host_shape[kH]; + auto w = args.host_shape[kW]; + const size_t c0 = 4; + auto c1 = DivCeil(c, c0); + auto hwc0 = h * w * c0; + auto hwc = h * w * c; + auto nhwc = n * h * w * c; + auto n_cnt = DivCeil(n, cube); + auto v_cnt = DivCeil(h * w * c0 * c1, cube); size_t dst_idx = 0; for (size_t vi = 0; vi < v_cnt; vi++) { @@ -929,7 +933,7 @@ bool NchwToNc1hwc0(const FormatArgs &args, void *result) { MS_LOG(ERROR) << "Invalid host shape, host shape dims:" << args.host_shape.size() << ", expect dims:" << kNchwDims; return false; } - size_t size = TypeIdSize(args.src_data_type); + auto size = TypeIdSize(args.src_data_type); if (size < 1) { MS_LOG(ERROR) << "Illegal dtype."; return false; @@ -940,20 +944,23 @@ bool NchwToNc1hwc0(const FormatArgs &args, void *result) { return false; } - auto n = args.host_shape[0]; - auto c = args.host_shape[1]; - auto h = args.host_shape[2]; - auto w = args.host_shape[3]; - size_t c0 = CubeSizeByType(args.src_data_type); + auto n = args.host_shape[kN]; + auto c = args.host_shape[kC]; + auto h = args.host_shape[kH]; + auto w = args.host_shape[kW]; + auto c0 = CubeSizeByType(args.src_data_type); if (c0 < 1) { MS_LOG(ERROR) << "Illegal dtype."; return false; } - size_t c1 = DivCeil(c, c0); - size_t hw = h * w; - size_t chw = c * hw; - size_t c1hwc0 = c1 * hw * c0; - size_t wc0 = w * c0; + if (args.device_format == kOpFormat_NC1HWC0_C04) { + c0 = 4; + } + auto c1 = DivCeil(c, c0); + auto hw = h * w; + auto chw = c * hw; + auto c1hwc0 = c1 * hw * c0; + auto wc0 = w * c0; for (size_t n_idx = 0; n_idx < n; n_idx++) { size_t n_head_addr = n_idx * c1hwc0; @@ -967,7 +974,7 @@ bool NchwToNc1hwc0(const FormatArgs &args, void *result) { size_t dst_idx = c0_idx + w_head_addr; size_t c_idx = c0_idx + c1_idx * c0; size_t src_idx = n_idx * 
chw + c_idx * hw + h_idx * w + w_idx; - auto pad_zero = (c_idx < c) ? false : true; + auto pad_zero = c_idx >= c; SetData(size, pad_zero, src_idx, dst_idx, args, result); } } @@ -984,29 +991,29 @@ bool Nc1hwc0ToNchw(const FormatArgs &args, void *result) { MS_LOG(ERROR) << "Invalid host shape, host shape dims:" << args.host_shape.size() << ", expect dims:" << kNchwDims; return false; } - size_t size = TypeIdSize(args.src_data_type); + auto size = TypeIdSize(args.src_data_type); if (size < 1) { MS_LOG(ERROR) << "Illegal dtype."; return false; } - size_t total_size = ShapeSize(args.device_shape) * size; + auto total_size = ShapeSize(args.device_shape) * size; if (total_size != args.device_size) { MS_LOG(ERROR) << "Illegal total data size, total_size:" << total_size << ", device_size:" << args.device_size; return false; } - auto n = args.host_shape[0]; - auto c = args.host_shape[1]; - auto h = args.host_shape[2]; - auto w = args.host_shape[3]; + auto n = args.host_shape[kN]; + auto c = args.host_shape[kC]; + auto h = args.host_shape[kH]; + auto w = args.host_shape[kW]; auto c1 = args.device_shape[1]; auto c0 = args.device_shape[4]; - size_t hw = h * w; - size_t chw = c * hw; - size_t wc0 = w * c0; - size_t hwc0 = h * wc0; - size_t c1hwc0 = c1 * hwc0; + auto hw = h * w; + auto chw = c * hw; + auto wc0 = w * c0; + auto hwc0 = h * wc0; + auto c1hwc0 = c1 * hwc0; for (size_t n_idx = 0; n_idx < n; n_idx++) { size_t n_head_addr = n_idx * chw; @@ -1037,13 +1044,15 @@ bool NchwToC1hwncoc0(const FormatArgs &args, void *result) { MS_LOG(ERROR) << "Check args failed."; return false; } - auto n = args.host_shape[0]; - auto c = args.host_shape[1]; - auto h = args.host_shape[2]; - auto w = args.host_shape[3]; + auto n = args.host_shape[kN]; + auto c = args.host_shape[kC]; + auto h = args.host_shape[kH]; + auto w = args.host_shape[kW]; + const int co_idx = 4; + const int c0_idx = 5; auto c1 = args.device_shape[0]; - auto co = args.device_shape[4]; - auto c0 = args.device_shape[5]; + 
auto co = args.device_shape[co_idx]; + auto c0 = args.device_shape[c0_idx]; for (size_t c1_i = 0; c1_i < c1; c1_i++) { for (size_t h_i = 0; h_i < h; h_i++) { @@ -1055,7 +1064,7 @@ bool NchwToC1hwncoc0(const FormatArgs &args, void *result) { co_i * c0 + c0_i; size_t c_i = c0_i + c1_i * c0; size_t src_idx = n_i * c * h * w + c_i * h * w + h_i * w + w_i; - auto pad_zero = (c_i < c && c0_i == co_i) ? false : true; + auto pad_zero = !(c_i < c && c0_i == co_i); SetData(size, pad_zero, src_idx, dst_idx, args, result); } } @@ -1076,12 +1085,14 @@ bool C1hwncoc0ToNchw(const FormatArgs &args, void *result) { MS_LOG(ERROR) << "Check args failed."; return false; } - auto n = args.host_shape[0]; - auto c = args.host_shape[1]; - auto h = args.host_shape[2]; - auto w = args.host_shape[3]; - auto co = args.device_shape[4]; - auto c0 = args.device_shape[5]; + auto n = args.host_shape[kN]; + auto c = args.host_shape[kC]; + auto h = args.host_shape[kH]; + auto w = args.host_shape[kW]; + const int co_idx = 4; + const int c0_idx = 5; + auto co = args.device_shape[co_idx]; + auto c0 = args.device_shape[c0_idx]; for (size_t n_i = 0; n_i < n; n_i++) { for (size_t c_i = 0; c_i < c; c_i++) { for (size_t h_i = 0; h_i < h; h_i++) { diff --git a/mindspore/ccsrc/dataset/CMakeLists.txt b/mindspore/ccsrc/dataset/CMakeLists.txt index 068aec8873..9238be93f2 100644 --- a/mindspore/ccsrc/dataset/CMakeLists.txt +++ b/mindspore/ccsrc/dataset/CMakeLists.txt @@ -62,6 +62,7 @@ add_dependencies(engine-datasetops-source core) add_dependencies(engine-datasetops-source-sampler core) add_dependencies(engine-datasetops core) add_dependencies(engine-opt core) +add_dependencies(engine-perf core) add_dependencies(engine-gnn core) add_dependencies(engine core) add_dependencies(text core) @@ -81,6 +82,7 @@ set(submodules $ $ $ + $ $ $ $ @@ -106,10 +108,11 @@ target_link_libraries(_c_dataengine PRIVATE mindspore mindspore_gvar) if (${CMAKE_SYSTEM_NAME} MATCHES "Windows") target_link_libraries(_c_dataengine PRIVATE 
mindspore::pybind11_module ${PYTHON_LIBRARIES} mindspore::protobuf ${SECUREC_LIBRARY}) else() + set(ICU_LIB mindspore::icuuc mindspore::icudata mindspore::icui18n) target_link_libraries(_c_dataengine PRIVATE mindspore::pybind11_module -ldl mindspore::protobuf ${SECUREC_LIBRARY}) endif() target_link_libraries(_c_dataengine PUBLIC mindspore::jpeg_turbo mindspore::opencv_core mindspore::opencv_imgcodecs - mindspore::opencv_imgproc mindspore::tinyxml2) + mindspore::opencv_imgproc mindspore::tinyxml2 ${ICU_LIB}) if (ENABLE_GPUQUE) target_link_libraries(_c_dataengine PRIVATE gpu_queue ${CUDNN_PATH}/lib64/libcudnn.so diff --git a/mindspore/ccsrc/dataset/api/de_pipeline.cc b/mindspore/ccsrc/dataset/api/de_pipeline.cc index 0194785090..ce70476423 100644 --- a/mindspore/ccsrc/dataset/api/de_pipeline.cc +++ b/mindspore/ccsrc/dataset/api/de_pipeline.cc @@ -19,56 +19,64 @@ #include #include "common/utils.h" -#include "dataset/kernels/py_func_op.h" -#include "dataset/engine/datasetops/source/image_folder_op.h" -#include "dataset/engine/datasetops/source/mnist_op.h" -#include "dataset/engine/datasetops/source/voc_op.h" #include "dataset/core/tensor.h" #include "dataset/engine/dataset_iterator.h" -#include "dataset/engine/datasetops/source/manifest_op.h" -#include "dataset/engine/datasetops/source/cifar_op.h" +#include "dataset/engine/datasetops/bucket_batch_by_length_op.h" +#include "dataset/engine/datasetops/filter_op.h" #include "dataset/engine/datasetops/source/celeba_op.h" +#include "dataset/engine/datasetops/source/cifar_op.h" +#include "dataset/engine/datasetops/source/clue_op.h" +#include "dataset/engine/datasetops/source/coco_op.h" +#include "dataset/engine/datasetops/source/image_folder_op.h" +#include "dataset/engine/datasetops/source/manifest_op.h" +#include "dataset/engine/datasetops/source/mnist_op.h" #include "dataset/engine/datasetops/source/random_data_op.h" #include "dataset/engine/datasetops/source/text_file_op.h" -#include 
"dataset/engine/datasetops/filter_op.h" +#include "dataset/engine/datasetops/source/voc_op.h" +#include "dataset/kernels/py_func_op.h" +#include "dataset/util/random.h" +#include "dataset/util/status.h" #include "mindrecord/include/shard_category.h" +#include "mindrecord/include/shard_distributed_sample.h" #include "mindrecord/include/shard_sample.h" #include "mindrecord/include/shard_shuffle.h" -#include "dataset/util/random.h" -#include "dataset/util/status.h" -#include "utils/log_adapter.h" #include "pybind11/stl.h" +#include "utils/log_adapter.h" namespace mindspore { namespace dataset { using pFunction = Status (DEPipeline::*)(const py::dict &, std::shared_ptr *); -static std::unordered_map g_parse_op_func_ = {{kStorage, &DEPipeline::ParseStorageOp}, - {kShuffle, &DEPipeline::ParseShuffleOp}, - {kMindrecord, &DEPipeline::ParseMindRecordOp}, - {kMap, &DEPipeline::ParseMapOp}, - {kFilter, &DEPipeline::ParseFilterOp}, - {kBatch, &DEPipeline::ParseBatchOp}, - {kBarrier, &DEPipeline::ParseBarrierOp}, - {kRepeat, &DEPipeline::ParseRepeatOp}, - {kSkip, &DEPipeline::ParseSkipOp}, - {kZip, &DEPipeline::ParseZipOp}, - {kConcat, &DEPipeline::ParseConcatOp}, - {kRename, &DEPipeline::ParseRenameOp}, - {kDeviceQueue, &DEPipeline::ParseDeviceQueueOp}, - {kGenerator, &DEPipeline::ParseGeneratorOp}, - {kTfReader, &DEPipeline::ParseTFReaderOp}, - {kProject, &DEPipeline::ParseProjectOp}, - {kTake, &DEPipeline::ParseTakeOp}, - {kImageFolder, &DEPipeline::ParseImageFolderOp}, - {kMnist, &DEPipeline::ParseMnistOp}, - {kManifest, &DEPipeline::ParseManifestOp}, - {kVoc, &DEPipeline::ParseVOCOp}, - {kCifar10, &DEPipeline::ParseCifar10Op}, - {kCifar100, &DEPipeline::ParseCifar100Op}, - {kCelebA, &DEPipeline::ParseCelebAOp}, - {kRandomData, &DEPipeline::ParseRandomDataOp}, - {kTextFile, &DEPipeline::ParseTextFileOp}}; +static std::unordered_map g_parse_op_func_ = { + {kShuffle, &DEPipeline::ParseShuffleOp}, + {kMindrecord, &DEPipeline::ParseMindRecordOp}, + {kMap, 
&DEPipeline::ParseMapOp}, + {kFilter, &DEPipeline::ParseFilterOp}, + {kBatch, &DEPipeline::ParseBatchOp}, + {kBucketBatch, &DEPipeline::ParseBucketBatchByLengthOp}, + {kBarrier, &DEPipeline::ParseBarrierOp}, + {kRepeat, &DEPipeline::ParseRepeatOp}, + {kSkip, &DEPipeline::ParseSkipOp}, + {kZip, &DEPipeline::ParseZipOp}, + {kConcat, &DEPipeline::ParseConcatOp}, + {kRename, &DEPipeline::ParseRenameOp}, + {kDeviceQueue, &DEPipeline::ParseDeviceQueueOp}, + {kGenerator, &DEPipeline::ParseGeneratorOp}, + {kTfReader, &DEPipeline::ParseTFReaderOp}, + {kProject, &DEPipeline::ParseProjectOp}, + {kTake, &DEPipeline::ParseTakeOp}, + {kImageFolder, &DEPipeline::ParseImageFolderOp}, + {kMnist, &DEPipeline::ParseMnistOp}, + {kManifest, &DEPipeline::ParseManifestOp}, + {kVoc, &DEPipeline::ParseVOCOp}, + {kCoco, &DEPipeline::ParseCocoOp}, + {kCifar10, &DEPipeline::ParseCifar10Op}, + {kCifar100, &DEPipeline::ParseCifar100Op}, + {kCelebA, &DEPipeline::ParseCelebAOp}, + {kRandomData, &DEPipeline::ParseRandomDataOp}, + {kTextFile, &DEPipeline::ParseTextFileOp}, + {kBuildVocab, &DEPipeline::ParseBuildVocabOp}, + {kClue, &DEPipeline::ParseClueOp}}; DEPipeline::DEPipeline() : iterator_(nullptr) { try { @@ -292,70 +300,6 @@ Status DEPipeline::SetBatchParameters(const py::dict &args) { return Status::OK(); } -Status DEPipeline::ValidateArgStorageOp(const py::dict &args) { - // Required arguments - if (((args.contains("dataset_files") && args["dataset_files"].is_none()) || args["schema"].is_none()) && - ((args.contains("dataset_dir") && args["dataset_dir"].is_none()) || - (args["schema"].is_none() && args["schema_json_string"].is_none()))) { - std::string err_msg = "Error: at least one of dataset_files or schema_file is missing"; - RETURN_STATUS_UNEXPECTED(err_msg); - } - - return Status::OK(); -} - -Status DEPipeline::ParseStorageOp(const py::dict &args, std::shared_ptr *ptr) { - RETURN_IF_NOT_OK(ValidateArgStorageOp(args)); - std::shared_ptr builder; - if (args.contains("dataset_files") && 
!args["dataset_files"].is_none()) { - builder = std::make_shared(); - (void)builder->SetDatasetFileList(ToStringVector(args["dataset_files"])); - (void)builder->SetSchemaFile(ToString(args["schema"])); - } else if (args.contains("dataset_dir") && !args["dataset_dir"].is_none()) { - builder = std::make_shared(); - (void)builder->SetDatasetFilesDir(ToString(args["dataset_dir"])); - if (!args["schema"].is_none()) { - (void)builder->SetSchemaFile(ToString(args["schema"])); - } else if (!args["schema_json_string"].is_none()) { - std::unique_ptr schema = std::make_unique(); - std::string s = ToString(args["schema_json_string"]); - RETURN_IF_NOT_OK(schema->LoadSchemaString(s, std::vector())); - (void)builder->SetNumRows(schema->num_rows()); - (void)builder->SetSchema(std::move(schema)); - } - } - - // Optional arguments - for (auto arg : args) { - std::string key = py::str(arg.first); - py::handle value = arg.second; - if (!value.is_none()) { - if (key == "num_parallel_workers") { - (void)builder->SetNumWorkers(ToInt(value)); - } else if (key == "prefetch_size") { - (void)builder->SetOpConnectorSize(ToInt(value)); - } else if (key == "columns_list") { - (void)builder->SetColumnsToLoad(ToStringVector(value)); - } else if (key == "distribution") { - (void)builder->SetDataDistributionFile(ToString(value)); - } else if (key == "labels_filename") { - (void)builder->setLabelsFileName(ToString(value)); - } else if (key == "dataset_usage") { - (void)builder->SetDatasetUsage(ToString(value)); - } - } - } - (void)builder->SetBatchSize(temp_batch_size_); - (void)builder->SetDropRemainder(temp_drop_remainder_); - - std::shared_ptr op; - RETURN_IF_NOT_OK(builder->Build(&op)); - num_rows_ = op->num_rows(); - num_classes_ = op->num_classes(); - *ptr = op; - return Status::OK(); -} - Status DEPipeline::ParseShuffleOp(const py::dict &args, std::shared_ptr *ptr) { std::shared_ptr builder = std::make_shared(); if (!args["buffer_size"].is_none()) { @@ -382,35 +326,27 @@ Status 
DEPipeline::ParseShuffleOp(const py::dict &args, std::shared_ptr *in_partitions) { - if (args["partitions"].is_none()) { - std::string err_msg = "Error: partitions is not set (None)"; - RETURN_STATUS_UNEXPECTED(err_msg); - } - - py::list list = py::reinterpret_borrow(args["partitions"]); - for (auto l : list) { - if (!l.is_none()) { - in_partitions->push_back(ToInt(l)); +Status DEPipeline::BuildMindrecordSamplerChain(const py::handle &handle, + std::vector> *operators, + int num_padded) { + auto sampler = py::reinterpret_borrow(handle); + auto create = sampler.attr("create_for_minddataset"); + auto op = create().cast>(); + std::stack> stack_ops; + while (op != nullptr) { + auto sampler_op = std::dynamic_pointer_cast(op); + if (sampler_op && num_padded > 0) { + sampler_op->SetNumPaddedSamples(num_padded); + stack_ops.push(sampler_op); + } else { + stack_ops.push(op); } + op = op->GetChildOp(); } - - if (in_partitions->size() != 2) { - std::string err_msg = "Error: partitions is invalid or not set."; - RETURN_STATUS_UNEXPECTED(err_msg); - } - - constexpr int kMaxPartitions = 64; - if (in_partitions->at(0) <= 0 || in_partitions->at(0) > kMaxPartitions) { - std::string err_msg = "Error: partitions is invalid or not set."; - RETURN_STATUS_UNEXPECTED(err_msg); - } - - if (in_partitions->at(1) < 0 || in_partitions->at(1) >= in_partitions->at(0)) { - std::string err_msg = "Error: partitions is invalid or not set."; - RETURN_STATUS_UNEXPECTED(err_msg); + while (!stack_ops.empty()) { + operators->push_back(stack_ops.top()); + stack_ops.pop(); } - return Status::OK(); } @@ -438,6 +374,10 @@ Status DEPipeline::ParseMindRecordOp(const py::dict &args, std::shared_ptrSetColumnsToLoad(in_col_names); } + if (!args["padded_sample"].is_none()) { + (void)builder->SetPaddedSample(args["padded_sample"]); + (void)builder->SetNumToPadSamples(ToInt(args["num_padded"])); + } std::vector> operators; for (auto arg : args) { std::string key = py::str(arg.first); @@ -447,27 +387,16 @@ Status 
DEPipeline::ParseMindRecordOp(const py::dict &args, std::shared_ptrSetNumMindRecordWorkers(ToInt(value)); } else if (key == "block_reader" && ToBool(value) == true) { (void)builder->SetBlockReader(); - } else if (key == "global_shuffle" && ToBool(value) == true) { - uint32_t seed = args["partitions"].is_none() ? GetSeed() : 0; - operators.push_back(std::make_shared(seed)); } else if (key == "sampler") { - auto create = py::reinterpret_borrow(value).attr("_create_for_minddataset"); - std::shared_ptr sample_op = - create().cast>(); - operators.push_back(sample_op); + int num_padded = 0; + if (!args["num_padded"].is_none()) { + num_padded = ToInt(args["num_padded"]); + } + RETURN_IF_NOT_OK(BuildMindrecordSamplerChain(value, &operators, num_padded)); } } } - std::vector in_partitions; - if (!args["partitions"].is_none()) { - auto ret = CheckMindRecordPartitionInfo(args, &in_partitions); - if (Status::OK() != ret) { - return ret; - } - operators.push_back(std::make_shared(1, in_partitions[0], in_partitions[1])); - } - if (!operators.empty()) { (void)builder->SetOperators(operators); } @@ -493,6 +422,8 @@ Status DEPipeline::ParseMapOp(const py::dict &args, std::shared_ptr * (void)builder->SetInColNames(in_col_names); } else if (key == "output_columns") { (void)builder->SetOutColNames(ToStringVector(value)); + } else if (key == "columns_order") { + (void)builder->SetColOrder(ToStringVector(value)); } else if (key == "num_parallel_workers") { (void)builder->SetNumWorkers(ToInt(value)); } else if (key == "prefetch_size") { @@ -642,18 +573,8 @@ Status DEPipeline::ParseBatchOp(const py::dict &args, std::shared_ptr (void)builder->SetColumnsToMap(ToStringVector(value)); } if (key == "pad_info") { - std::map> pad_info; - for (auto p : py::reinterpret_borrow(value)) { - if (!p.second.is_none()) { - py::tuple tp = py::reinterpret_borrow(p.second); - CHECK_FAIL_RETURN_UNEXPECTED(tp.size() == 2, "tuple in pad_info must be (list,int) or (list,float)"); - TensorShape shape = 
tp[0].is_none() ? TensorShape::CreateUnknownRankShape() : TensorShape(tp[0]); - float pad_val = tp[1].is_none() ? 0 : ToFloat(tp[1]); - (void)pad_info.insert({ToString(p.first), {shape, pad_val}}); - } else { // tuple is None - (void)pad_info.insert({ToString(p.first), {TensorShape({}), 0}}); - } - } + PadInfo pad_info; + RETURN_IF_NOT_OK(ParsePadInfo(value, &pad_info)); (void)builder->SetPaddingMap(pad_info, true); } } @@ -665,6 +586,56 @@ Status DEPipeline::ParseBatchOp(const py::dict &args, std::shared_ptr return Status::OK(); } +Status DEPipeline::ParseBucketBatchByLengthOp(const py::dict &args, std::shared_ptr *ptr) { + std::vector mandatory_arguments = {"length_dependent_columns", "bucket_boundaries", + "bucket_batch_sizes"}; + for (auto name : mandatory_arguments) { + if (args[name.c_str()].is_none()) { + std::string err_msg = "Error: " + name + " is not set."; + RETURN_STATUS_UNEXPECTED(err_msg); + } + } + + std::shared_ptr builder = std::make_shared( + ToStringVector(args[mandatory_arguments[0].c_str()]), ToIntVector(args[mandatory_arguments[1].c_str()]), + ToIntVector(args[mandatory_arguments[2].c_str()])); + + for (auto arg : args) { + std::string key = py::str(arg.first); + py::handle value = arg.second; + if (!value.is_none()) { + if (key == "length_dependent_columns") { + (void)builder->SetLengthDependentColumns(ToStringVector(value)); + } + if (key == "bucket_boundaries") { + (void)builder->SetBucketBoundaries(ToIntVector(value)); + } + if (key == "bucket_batch_sizes") { + (void)builder->SetBucketBatchSizes(ToIntVector(value)); + } + if (key == "element_length_function") { + (void)builder->SetElementLengthFunction(value.cast()); + } + if (key == "pad_info") { + PadInfo pad_info; + RETURN_IF_NOT_OK(ParsePadInfo(value, &pad_info)); + (void)builder->SetPadInfo(pad_info); + } + if (key == "pad_to_bucket_boundary") { + (void)builder->SetPadToBucketBoundary(ToBool(value)); + } + if (key == "drop_remainder") { + 
(void)builder->SetDropRemainder(ToBool(value)); + } + } + } + + std::shared_ptr op; + RETURN_IF_NOT_OK(builder->Build(&op)); + *ptr = op; + return Status::OK(); +} + Status DEPipeline::ParseBarrierOp(const py::dict &args, std::shared_ptr *ptr) { std::shared_ptr builder = std::make_shared(); // Right now barrier should only take num_rows_per_buffer = 1 @@ -801,6 +772,8 @@ Status DEPipeline::ParseTFReaderOp(const py::dict &args, std::shared_ptrSetColumnsToLoad(columns_to_load); } else if (key == "shuffle_files") { (void)builder->SetShuffleFiles(ToBool(value)); + } else if (key == "shuffle_global") { + (void)builder->SetShuffleGlobal(ToBool(value)); } else if (key == "schema_file_path" || key == "schema_json_string") { schema_exists = true; } else if (key == "num_samples") { @@ -856,9 +829,7 @@ Status DEPipeline::ParseImageFolderOp(const py::dict &args, std::shared_ptrSetNumSamples(ToInt(value)); - } else if (key == "num_parallel_workers") { + if (key == "num_parallel_workers") { (void)builder->SetNumWorkers(ToInt(value)); } else if (key == "sampler") { auto create = py::reinterpret_borrow(value).attr("create"); @@ -893,9 +864,7 @@ Status DEPipeline::ParseManifestOp(const py::dict &args, std::shared_ptrSetNumSamples(ToInt(value)); - } else if (key == "num_parallel_workers") { + if (key == "num_parallel_workers") { (void)builder->SetNumWorkers(ToInt(value)); } else if (key == "sampler") { auto create = py::reinterpret_borrow(value).attr("create"); @@ -922,6 +891,16 @@ Status DEPipeline::ParseVOCOp(const py::dict &args, std::shared_ptr * RETURN_STATUS_UNEXPECTED(err_msg); } + if (args["task"].is_none()) { + std::string err_msg = "Error: No task specified"; + RETURN_STATUS_UNEXPECTED(err_msg); + } + + if (args["mode"].is_none()) { + std::string err_msg = "Error: No mode specified"; + RETURN_STATUS_UNEXPECTED(err_msg); + } + std::shared_ptr builder = std::make_shared(); (void)builder->SetDir(ToString(args["dataset_dir"])); (void)builder->SetTask(ToString(args["task"])); 
@@ -930,9 +909,7 @@ Status DEPipeline::ParseVOCOp(const py::dict &args, std::shared_ptr * std::string key = py::str(arg.first); py::handle value = arg.second; if (!value.is_none()) { - if (key == "num_samples") { - (void)builder->SetNumSamples(ToInt(value)); - } else if (key == "num_parallel_workers") { + if (key == "num_parallel_workers") { (void)builder->SetNumWorkers(ToInt(value)); } else if (key == "sampler") { auto create = py::reinterpret_borrow(value).attr("create"); @@ -951,6 +928,47 @@ Status DEPipeline::ParseVOCOp(const py::dict &args, std::shared_ptr * return Status::OK(); } +Status DEPipeline::ParseCocoOp(const py::dict &args, std::shared_ptr *ptr) { + if (args["dataset_dir"].is_none()) { + std::string err_msg = "Error: No dataset path specified"; + RETURN_STATUS_UNEXPECTED(err_msg); + } + + if (args["annotation_file"].is_none()) { + std::string err_msg = "Error: No annotation_file specified"; + RETURN_STATUS_UNEXPECTED(err_msg); + } + + if (args["task"].is_none()) { + std::string err_msg = "Error: No task specified"; + RETURN_STATUS_UNEXPECTED(err_msg); + } + + std::shared_ptr builder = std::make_shared(); + (void)builder->SetDir(ToString(args["dataset_dir"])); + (void)builder->SetFile(ToString(args["annotation_file"])); + (void)builder->SetTask(ToString(args["task"])); + for (auto arg : args) { + std::string key = py::str(arg.first); + py::handle value = arg.second; + if (!value.is_none()) { + if (key == "num_parallel_workers") { + (void)builder->SetNumWorkers(ToInt(value)); + } else if (key == "sampler") { + auto create = py::reinterpret_borrow(value).attr("create"); + std::shared_ptr sampler = create().cast>(); + (void)builder->SetSampler(std::move(sampler)); + } else if (key == "decode") { + (void)builder->SetDecode(ToBool(value)); + } + } + } + std::shared_ptr op; + RETURN_IF_NOT_OK(builder->Build(&op)); + *ptr = op; + return Status::OK(); +} + Status DEPipeline::ParseCifar10Op(const py::dict &args, std::shared_ptr *ptr) { // Required arguments if 
(args["dataset_dir"].is_none()) { @@ -966,9 +984,7 @@ Status DEPipeline::ParseCifar10Op(const py::dict &args, std::shared_ptrSetNumSamples(ToInt(value)); - } else if (key == "num_parallel_workers") { + if (key == "num_parallel_workers") { (void)builder->SetNumWorkers(ToInt(value)); } else if (key == "sampler") { auto create = py::reinterpret_borrow(value).attr("create"); @@ -1001,9 +1017,7 @@ Status DEPipeline::ParseCifar100Op(const py::dict &args, std::shared_ptrSetNumSamples(ToInt(value)); - } else if (key == "num_parallel_workers") { + if (key == "num_parallel_workers") { (void)builder->SetNumWorkers(ToInt(value)); } else if (key == "sampler") { auto create = py::reinterpret_borrow(value).attr("create"); @@ -1039,10 +1053,12 @@ Status DEPipeline::ParseRandomDataOp(const py::dict &args, std::shared_ptr std::string key = py::str(arg.first); py::handle value = arg.second; if (!value.is_none()) { - if (key == "num_samples") { - (void)builder->SetNumSamples(ToInt(value)); - } else if (key == "num_parallel_workers") { + if (key == "num_parallel_workers") { (void)builder->SetNumWorkers(ToInt(value)); } else if (key == "sampler") { auto create = py::reinterpret_borrow(value).attr("create"); @@ -1121,8 +1135,6 @@ Status DEPipeline::ParseCelebAOp(const py::dict &args, std::shared_ptrSetDecode(ToBool(value)); } else if (key == "extensions") { (void)builder->SetExtensions(ToStringSet(value)); - } else if (key == "num_samples") { - (void)builder->SetNumSamples(ToInt(value)); } else if (key == "dataset_type") { (void)builder->SetDatasetType(ToString(value)); } @@ -1152,8 +1164,10 @@ Status DEPipeline::ParseTextFileOp(const py::dict &args, std::shared_ptrSetNumWorkers(ToInt(value)); } else if (key == "shuffle_files") { (void)builder->SetShuffleFiles(ToBool(value)); + } else if (key == "shuffle_global") { + (void)builder->SetShuffleGlobal(ToBool(value)); } else if (key == "num_samples") { - (void)builder->SetNumSamples(ToInt(value)); + (void)builder->SetTotalRows(ToInt(value)); 
} else if (key == "num_shards") { (void)builder->SetNumDevices(ToInt(value)); } else if (key == "shard_id") { @@ -1166,5 +1180,106 @@ Status DEPipeline::ParseTextFileOp(const py::dict &args, std::shared_ptr(value)) { + if (!p.second.is_none()) { + auto tp = py::reinterpret_borrow(p.second); + CHECK_FAIL_RETURN_UNEXPECTED(tp.size() == 2, "tuple in pad_info must be (list,int) or (list,float)"); + TensorShape shape = tp[0].is_none() ? TensorShape::CreateUnknownRankShape() : TensorShape(tp[0]); + std::shared_ptr pad_val = nullptr; + if (py::isinstance(tp[1])) { + std::string pad_val_string = tp[1].is_none() ? "" : ToString(tp[1]); + CHECK_FAIL_RETURN_UNEXPECTED( + Tensor::CreateTensor(&pad_val, std::vector{pad_val_string}, TensorShape::CreateScalar()), + "Cannot create pad_value Tensor"); + } else { + float pad_val_float = tp[1].is_none() ? 0 : ToFloat(tp[1]); + CHECK_FAIL_RETURN_UNEXPECTED(Tensor::CreateTensor(&pad_val, TensorImpl::kFlexible, TensorShape::CreateScalar(), + DataType(DataType::DE_FLOAT32)), + "Cannot create pad_value Tensor"); + pad_val->SetItemAt({}, pad_val_float); + } + (void)pad_info->insert({ToString(p.first), {shape, pad_val}}); + } else { // tuple is None + (void)pad_info->insert({ToString(p.first), {TensorShape({}), nullptr}}); + } + } + return Status::OK(); +} + +Status DEPipeline::ParseBuildVocabOp(const py::dict &args, std::shared_ptr *ptr) { + std::shared_ptr builder = std::make_shared(); + for (auto arg : args) { + std::string key = py::str(arg.first); + py::handle value = arg.second; + if (!value.is_none()) { + if (key == "freq_range") { + py::tuple tp = py::reinterpret_borrow(value); + if (!tp[0].is_none()) (void)builder->SetMinFreq(py::reinterpret_borrow(tp[0])); + if (!tp[1].is_none()) (void)builder->SetMaxFreq(py::reinterpret_borrow(tp[1])); + } else if (key == "top_k") { + builder->SetTopK(py::reinterpret_borrow(value)); + } else if (key == "columns") { + (void)builder->SetColumnNames(ToStringVector(value)); + } else if (key == 
"vocab") { + (void)builder->SetVocab(value.cast>()); + } else if (key == "num_parallel_workers") { + (void)builder->SetNumWorkers(ToInt(value)); + } else if (key == "special_first") { + (void)builder->SetSpecialFirst(ToBool(value)); + } else if (key == "special_tokens") { + (void)builder->SetSpecialTokens(ToStringVector(value)); + } + } + } + std::shared_ptr op; + RETURN_IF_NOT_OK(builder->Build(&op)); + *ptr = op; + return Status::OK(); +} + +Status DEPipeline::ParseClueOp(const py::dict &args, std::shared_ptr *ptr) { + std::shared_ptr builder = std::make_shared(); + if (!args["dataset_files"].is_none()) { + (void)builder->SetClueFilesList(ToStringVector(args["dataset_files"])); + } else { + RETURN_STATUS_UNEXPECTED("Error: dataset_files is missing"); + } + // Optional arguments + for (auto arg : args) { + std::string key = py::str(arg.first); + py::handle value = arg.second; + if (!value.is_none()) { + if (key == "num_parallel_workers") { + (void)builder->SetNumWorkers(ToInt(value)); + } else if (key == "shuffle_files") { + (void)builder->SetShuffleFiles(ToBool(value)); + } else if (key == "shuffle_global") { + (void)builder->SetShuffleGlobal(ToBool(value)); + } else if (key == "num_samples") { + (void)builder->SetNumSamples(ToInt(value)); + } else if (key == "num_shards") { + (void)builder->SetNumDevices(ToInt(value)); + } else if (key == "shard_id") { + (void)builder->SetDeviceId(ToInt(value)); + } else if (key == "cols_to_keyword") { + std::map map_dict; + for (auto p : py::reinterpret_borrow(value)) { + if (!p.second.is_none()) { + map_dict.insert({ToString(p.first), ToString(p.second)}); + } else { + map_dict.insert({ToString(p.first), ToString(p.first)}); + } + } + (void)builder->SetColsKeyMap(map_dict); + } + } + } + std::shared_ptr op; + RETURN_IF_NOT_OK(builder->Build(&op)); + *ptr = op; + return Status::OK(); +} } // namespace dataset } // namespace mindspore diff --git a/mindspore/ccsrc/dataset/api/de_pipeline.h 
b/mindspore/ccsrc/dataset/api/de_pipeline.h index 4ecfb080c1..d6127d5d44 100644 --- a/mindspore/ccsrc/dataset/api/de_pipeline.h +++ b/mindspore/ccsrc/dataset/api/de_pipeline.h @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -36,10 +37,10 @@ using DsOpPtr = std::shared_ptr; // enum for the dataset operator names enum OpName { - kStorage = 0, kShuffle, kMindrecord, kBatch, + kBucketBatch, kBarrier, kCache, kRepeat, @@ -58,11 +59,14 @@ enum OpName { kMnist, kManifest, kVoc, + kCoco, kCifar10, kCifar100, kCelebA, kRandomData, - kTextFile + kTextFile, + kBuildVocab, + kClue }; // The C++ binder class that we expose to the python script. @@ -100,14 +104,14 @@ class DEPipeline { int GetRepeatCount() const; - Status ParseStorageOp(const py::dict &args, std::shared_ptr *ptr); - Status ParseShuffleOp(const py::dict &args, std::shared_ptr *ptr); - Status CheckMindRecordPartitionInfo(const py::dict &args, std::vector *ptr); - Status ParseMindRecordOp(const py::dict &args, std::shared_ptr *ptr); + Status BuildMindrecordSamplerChain(const py::handle &handle, + std::vector> *operators, + int num_padded); + Status ParseMapOp(const py::dict &args, std::shared_ptr *ptr); Status ParseFilterOp(const py::dict &args, std::shared_ptr *ptr); @@ -118,6 +122,8 @@ class DEPipeline { Status ParseBatchOp(const py::dict &args, std::shared_ptr *ptr); + Status ParseBucketBatchByLengthOp(const py::dict &args, std::shared_ptr *ptr); + Status ParseBarrierOp(const py::dict &args, std::shared_ptr *ptr); Status ParseGeneratorOp(const py::dict &args, std::shared_ptr *ptr); @@ -142,6 +148,8 @@ class DEPipeline { Status ParseVOCOp(const py::dict &args, std::shared_ptr *ptr); + Status ParseCocoOp(const py::dict &args, std::shared_ptr *ptr); + Status ParseCifar10Op(const py::dict &args, std::shared_ptr *ptr); Status ParseCifar100Op(const py::dict &args, std::shared_ptr *ptr); @@ -160,14 +168,17 @@ class DEPipeline { Status ParseTextFileOp(const py::dict &args, std::shared_ptr 
*ptr); + Status ParseBuildVocabOp(const py::dict &args, std::shared_ptr *ptr); + + Status ParseClueOp(const py::dict &args, std::shared_ptr *ptr); + private: // Execution tree that links the dataset operators. std::shared_ptr tree_; std::unique_ptr iterator_; - // Validate required args passed to storage op. - Status ValidateArgStorageOp(const py::dict &args); + static Status ParsePadInfo(py::handle value, PadInfo *pad_info); int batch_size_; int repeat_num_; diff --git a/mindspore/ccsrc/dataset/api/python_bindings.cc b/mindspore/ccsrc/dataset/api/python_bindings.cc index 55918d8b43..51f2be49d5 100644 --- a/mindspore/ccsrc/dataset/api/python_bindings.cc +++ b/mindspore/ccsrc/dataset/api/python_bindings.cc @@ -16,8 +16,37 @@ #include #include "dataset/api/de_pipeline.h" -#include "dataset/kernels/no_op.h" +#include "dataset/engine/datasetops/source/cifar_op.h" +#include "dataset/engine/datasetops/source/clue_op.h" +#include "dataset/engine/datasetops/source/coco_op.h" +#include "dataset/engine/datasetops/source/image_folder_op.h" +#include "dataset/engine/datasetops/source/io_block.h" +#include "dataset/engine/datasetops/source/manifest_op.h" +#include "dataset/engine/datasetops/source/mindrecord_op.h" +#include "dataset/engine/datasetops/source/mnist_op.h" +#include "dataset/engine/datasetops/source/random_data_op.h" +#include "dataset/engine/datasetops/source/sampler/distributed_sampler.h" +#include "dataset/engine/datasetops/source/sampler/pk_sampler.h" +#include "dataset/engine/datasetops/source/sampler/python_sampler.h" +#include "dataset/engine/datasetops/source/sampler/random_sampler.h" +#include "dataset/engine/datasetops/source/sampler/sequential_sampler.h" +#include "dataset/engine/datasetops/source/sampler/subset_random_sampler.h" +#include "dataset/engine/datasetops/source/sampler/weighted_random_sampler.h" +#include "dataset/engine/datasetops/source/text_file_op.h" +#include "dataset/engine/datasetops/source/tf_reader_op.h" +#include 
"dataset/engine/datasetops/source/voc_op.h" +#include "dataset/engine/gnn/graph.h" +#include "dataset/engine/jagged_connector.h" +#include "dataset/kernels/data/concatenate_op.h" +#include "dataset/kernels/data/duplicate_op.h" +#include "dataset/kernels/data/fill_op.h" +#include "dataset/kernels/data/mask_op.h" #include "dataset/kernels/data/one_hot_op.h" +#include "dataset/kernels/data/pad_end_op.h" +#include "dataset/kernels/data/slice_op.h" +#include "dataset/kernels/data/to_float16_op.h" +#include "dataset/kernels/data/type_cast_op.h" +#include "dataset/kernels/image/bounding_box_augment_op.h" #include "dataset/kernels/image/center_crop_op.h" #include "dataset/kernels/image/cut_out_op.h" #include "dataset/kernels/image/decode_op.h" @@ -26,51 +55,51 @@ #include "dataset/kernels/image/normalize_op.h" #include "dataset/kernels/image/pad_op.h" #include "dataset/kernels/image/random_color_adjust_op.h" -#include "dataset/kernels/image/random_crop_decode_resize_op.h" #include "dataset/kernels/image/random_crop_and_resize_op.h" +#include "dataset/kernels/image/random_crop_and_resize_with_bbox_op.h" +#include "dataset/kernels/image/random_crop_decode_resize_op.h" #include "dataset/kernels/image/random_crop_op.h" +#include "dataset/kernels/image/random_crop_with_bbox_op.h" +#include "dataset/kernels/image/random_horizontal_flip_bbox_op.h" #include "dataset/kernels/image/random_horizontal_flip_op.h" #include "dataset/kernels/image/random_resize_op.h" #include "dataset/kernels/image/random_rotation_op.h" #include "dataset/kernels/image/random_vertical_flip_op.h" +#include "dataset/kernels/image/random_vertical_flip_with_bbox_op.h" #include "dataset/kernels/image/rescale_op.h" #include "dataset/kernels/image/resize_bilinear_op.h" #include "dataset/kernels/image/resize_op.h" #include "dataset/kernels/image/uniform_aug_op.h" -#include "dataset/kernels/data/type_cast_op.h" -#include "dataset/engine/datasetops/source/cifar_op.h" -#include 
"dataset/engine/datasetops/source/image_folder_op.h" -#include "dataset/engine/datasetops/source/io_block.h" -#include "dataset/engine/datasetops/source/mnist_op.h" -#include "dataset/engine/datasetops/source/manifest_op.h" -#include "dataset/engine/datasetops/source/mindrecord_op.h" -#include "dataset/engine/datasetops/source/random_data_op.h" -#include "dataset/engine/datasetops/source/sampler/distributed_sampler.h" -#include "dataset/engine/datasetops/source/sampler/pk_sampler.h" -#include "dataset/engine/datasetops/source/sampler/random_sampler.h" -#include "dataset/engine/datasetops/source/sampler/sequential_sampler.h" -#include "dataset/engine/datasetops/source/sampler/subset_sampler.h" -#include "dataset/engine/datasetops/source/sampler/subset_random_sampler.h" -#include "dataset/engine/datasetops/source/sampler/weighted_random_sampler.h" -#include "dataset/engine/datasetops/source/sampler/python_sampler.h" -#include "dataset/engine/datasetops/source/tf_reader_op.h" -#include "dataset/engine/jagged_connector.h" -#include "dataset/engine/datasetops/source/text_file_op.h" -#include "dataset/engine/datasetops/source/voc_op.h" -#include "dataset/engine/gnn/graph.h" -#include "dataset/kernels/data/to_float16_op.h" +#include "dataset/kernels/no_op.h" #include "dataset/text/kernels/jieba_tokenizer_op.h" +#include "dataset/text/kernels/lookup_op.h" +#include "dataset/text/kernels/ngram_op.h" +#include "dataset/text/kernels/to_number_op.h" #include "dataset/text/kernels/unicode_char_tokenizer_op.h" +#include "dataset/text/kernels/wordpiece_tokenizer_op.h" #include "dataset/text/vocab.h" -#include "dataset/text/kernels/lookup_op.h" #include "dataset/util/random.h" +#include "mindrecord/include/shard_distributed_sample.h" #include "mindrecord/include/shard_operator.h" #include "mindrecord/include/shard_pk_sample.h" #include "mindrecord/include/shard_sample.h" +#include "mindrecord/include/shard_sequential_sample.h" +#include 
"mindspore/ccsrc/dataset/text/kernels/truncate_sequence_pair_op.h" #include "pybind11/pybind11.h" #include "pybind11/stl.h" #include "pybind11/stl_bind.h" +#ifdef ENABLE_ICU4C +#include "dataset/text/kernels/basic_tokenizer_op.h" +#include "dataset/text/kernels/bert_tokenizer_op.h" +#include "dataset/text/kernels/case_fold_op.h" +#include "dataset/text/kernels/normalize_utf8_op.h" +#include "dataset/text/kernels/regex_replace_op.h" +#include "dataset/text/kernels/regex_tokenizer_op.h" +#include "dataset/text/kernels/unicode_script_tokenizer_op.h" +#include "dataset/text/kernels/whitespace_tokenizer_op.h" +#endif + namespace py = pybind11; namespace mindspore { @@ -143,51 +172,49 @@ void bindDatasetOps(py::module *m) { }); (void)py::class_>(*m, "CifarOp") - .def_static("get_num_rows", [](const std::string &dir, int64_t numSamples, bool isCifar10) { + .def_static("get_num_rows", [](const std::string &dir, bool isCifar10) { int64_t count = 0; - THROW_IF_ERROR(CifarOp::CountTotalRows(dir, numSamples, isCifar10, &count)); + THROW_IF_ERROR(CifarOp::CountTotalRows(dir, isCifar10, &count)); return count; }); (void)py::class_>(*m, "ImageFolderOp") - .def_static("get_num_rows_and_classes", [](const std::string &path, int64_t numSamples) { + .def_static("get_num_rows_and_classes", [](const std::string &path) { int64_t count = 0, num_classes = 0; - THROW_IF_ERROR( - ImageFolderOp::CountRowsAndClasses(path, numSamples, std::set{}, &count, &num_classes)); + THROW_IF_ERROR(ImageFolderOp::CountRowsAndClasses(path, std::set{}, &count, &num_classes)); return py::make_tuple(count, num_classes); }); (void)py::class_>(*m, "MindRecordOp") - .def_static("get_num_rows", - [](const std::vector &paths, bool load_dataset, const py::object &sampler) { - int64_t count = 0; - std::shared_ptr op; - if (py::hasattr(sampler, "_create_for_minddataset")) { - auto create = sampler.attr("_create_for_minddataset"); - op = create().cast>(); - } - THROW_IF_ERROR(MindRecordOp::CountTotalRows(paths, 
load_dataset, op, &count)); - return count; - }); + .def_static("get_num_rows", [](const std::vector &paths, bool load_dataset, const py::object &sampler, + const int64_t num_padded) { + int64_t count = 0; + std::shared_ptr op; + if (py::hasattr(sampler, "create_for_minddataset")) { + auto create = sampler.attr("create_for_minddataset"); + op = create().cast>(); + } + THROW_IF_ERROR(MindRecordOp::CountTotalRows(paths, load_dataset, op, &count, num_padded)); + return count; + }); (void)py::class_>(*m, "ManifestOp") .def_static("get_num_rows_and_classes", - [](const std::string &file, int64_t numSamples, const py::dict &dict, const std::string &usage) { + [](const std::string &file, const py::dict &dict, const std::string &usage) { int64_t count = 0, num_classes = 0; - THROW_IF_ERROR(ManifestOp::CountTotalRows(file, numSamples, dict, usage, &count, &num_classes)); + THROW_IF_ERROR(ManifestOp::CountTotalRows(file, dict, usage, &count, &num_classes)); return py::make_tuple(count, num_classes); }) - .def_static("get_class_indexing", - [](const std::string &file, int64_t numSamples, const py::dict &dict, const std::string &usage) { - std::map output_class_indexing; - THROW_IF_ERROR(ManifestOp::GetClassIndexing(file, numSamples, dict, usage, &output_class_indexing)); - return output_class_indexing; - }); + .def_static("get_class_indexing", [](const std::string &file, const py::dict &dict, const std::string &usage) { + std::map output_class_indexing; + THROW_IF_ERROR(ManifestOp::GetClassIndexing(file, dict, usage, &output_class_indexing)); + return output_class_indexing; + }); (void)py::class_>(*m, "MnistOp") - .def_static("get_num_rows", [](const std::string &dir, int64_t numSamples) { + .def_static("get_num_rows", [](const std::string &dir) { int64_t count = 0; - THROW_IF_ERROR(MnistOp::CountTotalRows(dir, numSamples, &count)); + THROW_IF_ERROR(MnistOp::CountTotalRows(dir, &count)); return count; }); @@ -201,20 +228,44 @@ void bindDatasetOps(py::module *m) { 
THROW_IF_ERROR(TextFileOp::CountAllFileRows(filenames, &count)); return count; }); + + (void)py::class_>(*m, "ClueOp") + .def_static("get_num_rows", [](const py::list &files) { + int64_t count = 0; + std::vector filenames; + for (auto file : files) { + file.is_none() ? (void)filenames.emplace_back("") : filenames.push_back(py::str(file)); + } + THROW_IF_ERROR(ClueOp::CountAllFileRows(filenames, &count)); + return count; + }); + (void)py::class_>(*m, "VOCOp") .def_static("get_num_rows", [](const std::string &dir, const std::string &task_type, const std::string &task_mode, const py::dict &dict, int64_t numSamples) { int64_t count = 0; - THROW_IF_ERROR(VOCOp::CountTotalRows(dir, task_type, task_mode, dict, numSamples, &count)); + THROW_IF_ERROR(VOCOp::CountTotalRows(dir, task_type, task_mode, dict, &count)); return count; }) .def_static("get_class_indexing", [](const std::string &dir, const std::string &task_type, - const std::string &task_mode, const py::dict &dict, int64_t numSamples) { + const std::string &task_mode, const py::dict &dict) { std::map output_class_indexing; - THROW_IF_ERROR(VOCOp::GetClassIndexing(dir, task_type, task_mode, dict, numSamples, &output_class_indexing)); + THROW_IF_ERROR(VOCOp::GetClassIndexing(dir, task_type, task_mode, dict, &output_class_indexing)); return output_class_indexing; }); + (void)py::class_>(*m, "CocoOp") + .def_static("get_class_indexing", + [](const std::string &dir, const std::string &file, const std::string &task) { + std::vector>> output_class_indexing; + THROW_IF_ERROR(CocoOp::GetClassIndexing(dir, file, task, &output_class_indexing)); + return output_class_indexing; + }) + .def_static("get_num_rows", [](const std::string &dir, const std::string &file, const std::string &task) { + int64_t count = 0; + THROW_IF_ERROR(CocoOp::CountTotalRows(dir, file, task, &count)); + return count; + }); } void bindTensor(py::module *m) { (void)py::class_(*m, "GlobalContext") @@ -227,12 +278,14 @@ void bindTensor(py::module *m) { 
.def("set_worker_connector_size", &ConfigManager::set_worker_connector_size) .def("set_op_connector_size", &ConfigManager::set_op_connector_size) .def("set_seed", &ConfigManager::set_seed) + .def("set_monitor_sampling_interval", &ConfigManager::set_monitor_sampling_interval) .def("get_rows_per_buffer", &ConfigManager::rows_per_buffer) .def("get_num_parallel_workers", &ConfigManager::num_parallel_workers) .def("get_worker_connector_size", &ConfigManager::worker_connector_size) .def("get_op_connector_size", &ConfigManager::op_connector_size) .def("get_seed", &ConfigManager::seed) - .def("load", [](ConfigManager &c, std::string s) { (void)c.LoadFile(s); }); + .def("get_monitor_sampling_interval", &ConfigManager::monitor_sampling_interval) + .def("load", [](ConfigManager &c, std::string s) { THROW_IF_ERROR(c.LoadFile(s)); }); (void)py::class_>(*m, "Tensor", py::buffer_protocol()) .def(py::init([](py::array arr) { @@ -300,6 +353,11 @@ void bindTensorOps1(py::module *m) { .def(py::init>, int32_t>(), py::arg("operations"), py::arg("NumOps") = UniformAugOp::kDefNumOps); + (void)py::class_>( + *m, "BoundingBoxAugmentOp", "Tensor operation to apply a transformation on a random choice of bounding boxes.") + .def(py::init, float>(), py::arg("transform"), + py::arg("ratio") = BoundingBoxAugmentOp::kDefRatio); + (void)py::class_>( *m, "ResizeBilinearOp", "Tensor operation to resize an image using " @@ -314,6 +372,11 @@ void bindTensorOps1(py::module *m) { (void)py::class_>( *m, "RandomHorizontalFlipOp", "Tensor operation to randomly flip an image horizontally.") .def(py::init(), py::arg("probability") = RandomHorizontalFlipOp::kDefProbability); + + (void)py::class_>( + *m, "RandomHorizontalFlipWithBBoxOp", + "Tensor operation to randomly flip an image horizontally, while flipping bounding boxes.") + .def(py::init(), py::arg("probability") = RandomHorizontalFlipWithBBoxOp::kDefProbability); } void bindTensorOps2(py::module *m) { @@ -321,6 +384,12 @@ void bindTensorOps2(py::module 
*m) { *m, "RandomVerticalFlipOp", "Tensor operation to randomly flip an image vertically.") .def(py::init(), py::arg("probability") = RandomVerticalFlipOp::kDefProbability); + (void)py::class_>( + *m, "RandomVerticalFlipWithBBoxOp", + "Tensor operation to randomly flip an image vertically" + " and adjust bounding boxes.") + .def(py::init(), py::arg("probability") = RandomVerticalFlipWithBBoxOp::kDefProbability); + (void)py::class_>(*m, "RandomCropOp", "Gives random crop of specified size " "Takes crop size") @@ -332,10 +401,84 @@ void bindTensorOps2(py::module *m) { py::arg("fillG") = RandomCropOp::kDefFillG, py::arg("fillB") = RandomCropOp::kDefFillB); (void)py::class_>(*m, "ChannelSwapOp").def(py::init<>()); + (void)py::class_>(*m, "RandomCropWithBBoxOp", + "Gives random crop of given " + "size + adjusts bboxes " + "Takes crop size") + .def(py::init(), + py::arg("cropHeight"), py::arg("cropWidth"), py::arg("padTop") = RandomCropWithBBoxOp::kDefPadTop, + py::arg("padBottom") = RandomCropWithBBoxOp::kDefPadBottom, + py::arg("padLeft") = RandomCropWithBBoxOp::kDefPadLeft, + py::arg("padRight") = RandomCropWithBBoxOp::kDefPadRight, + py::arg("borderType") = RandomCropWithBBoxOp::kDefBorderType, + py::arg("padIfNeeded") = RandomCropWithBBoxOp::kDefPadIfNeeded, + py::arg("fillR") = RandomCropWithBBoxOp::kDefFillR, py::arg("fillG") = RandomCropWithBBoxOp::kDefFillG, + py::arg("fillB") = RandomCropWithBBoxOp::kDefFillB); + (void)py::class_>( *m, "OneHotOp", "Tensor operation to apply one hot encoding. 
Takes number of classes.") .def(py::init()); + (void)py::class_>( + *m, "FillOp", "Tensor operation to return tensor filled with same value as input fill value.") + .def(py::init>()); + + (void)py::class_>(*m, "SliceOp", "Tensor slice operation.") + .def(py::init()) + .def(py::init([](const py::list &py_list) { + std::vector c_list; + for (auto l : py_list) { + if (!l.is_none()) { + c_list.push_back(py::reinterpret_borrow(l)); + } + } + return std::make_shared(c_list); + })) + .def(py::init([](const py::tuple &py_slice) { + if (py_slice.size() != 3) { + THROW_IF_ERROR(Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, "Wrong slice object")); + } + Slice c_slice; + if (!py_slice[0].is_none() && !py_slice[1].is_none() && !py_slice[2].is_none()) { + c_slice = Slice(py::reinterpret_borrow(py_slice[0]), py::reinterpret_borrow(py_slice[1]), + py::reinterpret_borrow(py_slice[2])); + } else if (py_slice[0].is_none() && py_slice[2].is_none()) { + c_slice = Slice(py::reinterpret_borrow(py_slice[1])); + } else if (!py_slice[0].is_none() && !py_slice[1].is_none()) { + c_slice = Slice(py::reinterpret_borrow(py_slice[0]), py::reinterpret_borrow(py_slice[1])); + } + + if (!c_slice.valid()) { + THROW_IF_ERROR(Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, "Wrong slice object")); + } + return std::make_shared(c_slice); + })); + + (void)py::enum_(*m, "RelationalOp", py::arithmetic()) + .value("EQ", RelationalOp::kEqual) + .value("NE", RelationalOp::kNotEqual) + .value("LT", RelationalOp::kLess) + .value("LE", RelationalOp::kLessEqual) + .value("GT", RelationalOp::kGreater) + .value("GE", RelationalOp::kGreaterEqual) + .export_values(); + + (void)py::class_>(*m, "MaskOp", + "Tensor mask operation using relational comparator") + .def(py::init, DataType>()); + + (void)py::class_>(*m, "DuplicateOp", "Duplicate tensor.") + .def(py::init<>()); + + (void)py::class_>( + *m, "TruncateSequencePairOp", "Tensor operation to truncate two tensors to a max_length") + 
.def(py::init()); + + (void)py::class_>(*m, "ConcatenateOp", + "Tensor operation concatenate tensors.") + .def(py::init, std::shared_ptr>(), py::arg("axis"), + py::arg("prepend").none(true), py::arg("append").none(true)); + (void)py::class_>( *m, "RandomRotationOp", "Tensor operation to apply RandomRotation." @@ -347,6 +490,10 @@ void bindTensorOps2(py::module *m) { py::arg("interpolation") = RandomRotationOp::kDefInterpolation, py::arg("expand") = RandomRotationOp::kDefExpand, py::arg("fillR") = RandomRotationOp::kDefFillR, py::arg("fillG") = RandomRotationOp::kDefFillG, py::arg("fillB") = RandomRotationOp::kDefFillB); + + (void)py::class_>( + *m, "PadEndOp", "Tensor operation to pad end of tensor with a pad value.") + .def(py::init>()); } void bindTensorOps3(py::module *m) { @@ -364,6 +511,20 @@ void bindTensorOps3(py::module *m) { py::arg("interpolation") = RandomCropAndResizeOp::kDefInterpolation, py::arg("maxIter") = RandomCropAndResizeOp::kDefMaxIter); + (void)py::class_>( + *m, "RandomCropAndResizeWithBBoxOp", + "Tensor operation to randomly crop an image (with BBoxes) and resize to a given size." + "Takes output height and width and" + "optional parameters for lower and upper bound for aspect ratio (h/w) and scale," + "interpolation mode, and max attempts to crop") + .def(py::init(), py::arg("targetHeight"), + py::arg("targetWidth"), py::arg("scaleLb") = RandomCropAndResizeWithBBoxOp::kDefScaleLb, + py::arg("scaleUb") = RandomCropAndResizeWithBBoxOp::kDefScaleUb, + py::arg("aspectLb") = RandomCropAndResizeWithBBoxOp::kDefAspectLb, + py::arg("aspectUb") = RandomCropAndResizeWithBBoxOp::kDefAspectUb, + py::arg("interpolation") = RandomCropAndResizeWithBBoxOp::kDefInterpolation, + py::arg("maxIter") = RandomCropAndResizeWithBBoxOp::kDefMaxIter); + (void)py::class_>( *m, "RandomColorAdjustOp", "Tensor operation to adjust an image's color randomly." 
@@ -418,9 +579,13 @@ void bindTensorOps4(py::module *m) { .def(py::init(), py::arg("padTop"), py::arg("padBottom"), py::arg("padLeft"), py::arg("padRight"), py::arg("borderTypes") = PadOp::kDefBorderType, py::arg("fillR") = PadOp::kDefFillR, py::arg("fillG") = PadOp::kDefFillG, py::arg("fillB") = PadOp::kDefFillB); + (void)py::class_>(*m, "ToNumberOp", + "TensorOp to convert strings to numbers.") + .def(py::init(), py::arg("data_type")) + .def(py::init(), py::arg("data_type")); } -void bindTensorOps5(py::module *m) { +void bindTokenizerOps(py::module *m) { (void)py::class_>(*m, "JiebaTokenizerOp", "") .def(py::init(), py::arg("hmm_path"), py::arg("mp_path"), py::arg("mode") = JiebaMode::kMix) @@ -433,6 +598,60 @@ void bindTensorOps5(py::module *m) { "Tensor operation to LookUp each word") .def(py::init, WordIdType>(), py::arg("vocab"), py::arg("unknown")) .def(py::init>(), py::arg("vocab")); + (void)py::class_>(*m, "NgramOp", "TensorOp performs ngram mapping") + .def(py::init &, int32_t, int32_t, const std::string &, const std::string &, + const std::string &>(), + py::arg("ngrams"), py::arg("l_pad_len"), py::arg("r_pad_len"), py::arg("l_pad_token"), py::arg("r_pad_token"), + py::arg("separator")); + (void)py::class_>( + *m, "WordpieceTokenizerOp", "Tokenize scalar token or 1-D tokens to subword tokens.") + .def(py::init &, const std::string &, const int &, const std::string &>(), + py::arg("vocab"), py::arg("suffix_indicator") = std::string(WordpieceTokenizerOp::kDefSuffixIndicator), + py::arg("max_bytes_per_token") = WordpieceTokenizerOp::kDefMaxBytesPerToken, + py::arg("unknown_token") = std::string(WordpieceTokenizerOp::kDefUnknownToken)); +} + +void bindDependIcuTokenizerOps(py::module *m) { +#ifdef ENABLE_ICU4C + (void)py::class_>( + *m, "WhitespaceTokenizerOp", "Tokenize a scalar tensor of UTF-8 string on ICU defined whitespaces.") + .def(py::init<>()); + (void)py::class_>( + *m, "UnicodeScriptTokenizerOp", "Tokenize a scalar tensor of UTF-8 string on 
Unicode script boundaries.") + .def(py::init<>()) + .def(py::init(), py::arg("keep_whitespace") = UnicodeScriptTokenizerOp::kDefKeepWhitespace); + (void)py::class_>( + *m, "CaseFoldOp", "Apply case fold operation on utf-8 string tensor") + .def(py::init<>()); + (void)py::class_>( + *m, "NormalizeUTF8Op", "Apply normalize operation on utf-8 string tensor.") + .def(py::init<>()) + .def(py::init(), py::arg("normalize_form") = NormalizeUTF8Op::kDefNormalizeForm); + (void)py::class_>( + *m, "RegexReplaceOp", "Replace utf-8 string tensor with 'replace' according to regular expression 'pattern'.") + .def(py::init(), py::arg("pattern"), py::arg("replace"), + py::arg("replace_all")); + (void)py::class_>( + *m, "RegexTokenizerOp", "Tokenize a scalar tensor of UTF-8 string by regex expression pattern.") + .def(py::init(), py::arg("delim_pattern"), py::arg("keep_delim_pattern")); + (void)py::class_>( + *m, "BasicTokenizerOp", "Tokenize a scalar tensor of UTF-8 string by specific rules.") + .def(py::init(), py::arg("lower_case") = BasicTokenizerOp::kDefLowerCase, + py::arg("keep_whitespace") = BasicTokenizerOp::kDefKeepWhitespace, + py::arg("normalization_form") = BasicTokenizerOp::kDefNormalizationForm, + py::arg("preserve_unused_token") = BasicTokenizerOp::kDefPreserveUnusedToken); + (void)py::class_>(*m, "BertTokenizerOp", + "Tokenizer used for Bert text process.") + .def(py::init &, const std::string &, const int &, const std::string &, bool, bool, + NormalizeForm, bool>(), + py::arg("vocab"), py::arg("suffix_indicator") = std::string(WordpieceTokenizerOp::kDefSuffixIndicator), + py::arg("max_bytes_per_token") = WordpieceTokenizerOp::kDefMaxBytesPerToken, + py::arg("unknown_token") = std::string(WordpieceTokenizerOp::kDefUnknownToken), + py::arg("lower_case") = BasicTokenizerOp::kDefLowerCase, + py::arg("keep_whitespace") = BasicTokenizerOp::kDefKeepWhitespace, + py::arg("normalization_form") = BasicTokenizerOp::kDefNormalizationForm, + py::arg("preserve_unused_token") = 
BasicTokenizerOp::kDefPreserveUnusedToken); +#endif } void bindSamplerOps(py::module *m) { @@ -449,32 +668,29 @@ void bindSamplerOps(py::module *m) { .def("add_child", [](std::shared_ptr self, std::shared_ptr child) { THROW_IF_ERROR(self->AddChild(child)); }); - (void)py::class_>(*m, "ShardOperator"); + (void)py::class_>(*m, "ShardOperator") + .def("add_child", [](std::shared_ptr self, + std::shared_ptr child) { self->SetChildOp(child); }); (void)py::class_>(*m, "DistributedSampler") - .def(py::init(), py::arg("numDev"), py::arg("devId"), py::arg("shuffle"), - py::arg("seed")); + .def(py::init()); (void)py::class_>(*m, "PKSampler") - .def(py::init(), py::arg("kVal"), py::arg("shuffle")); + .def(py::init()); (void)py::class_>(*m, "RandomSampler") - .def(py::init(), py::arg("replacement"), py::arg("reshuffle_each_epoch"), - py::arg("num_samples")) - .def(py::init(), py::arg("replacement"), py::arg("reshuffle_each_epoch")); + .def(py::init()); (void)py::class_>(*m, "SequentialSampler") - .def(py::init<>()); - - (void)py::class_>(*m, "SubsetSampler") - .def(py::init(), py::arg("start_index"), py::arg("subset_size")); + .def(py::init()); (void)py::class_>(*m, "SubsetRandomSampler") - .def(py::init>(), py::arg("indices")); + .def(py::init>()); (void)py::class_>( *m, "MindrecordSubsetRandomSampler") .def(py::init, uint32_t>(), py::arg("indices"), py::arg("seed") = GetSeed()); + (void)py::class_>( *m, "MindrecordPkSampler") .def(py::init([](int64_t kVal, std::string kColumn, bool shuffle) { @@ -486,12 +702,27 @@ void bindSamplerOps(py::module *m) { } })); + (void)py::class_>(*m, "MindrecordDistributedSampler") + .def(py::init()); + + (void)py::class_>( + *m, "MindrecordRandomSampler") + .def(py::init([](int64_t num_samples, bool replacement, bool reshuffle_each_epoch) { + return std::make_shared(GetSeed(), num_samples, replacement, reshuffle_each_epoch); + })); + + (void)py::class_>(*m, "MindrecordSequentialSampler") + .def(py::init([](int num_samples, int start_index) { + 
return std::make_shared(num_samples, start_index); + })); + (void)py::class_>(*m, "WeightedRandomSampler") - .def(py::init, int64_t, bool>(), py::arg("weights"), py::arg("numSamples"), - py::arg("replacement")); + .def(py::init, bool>()); (void)py::class_>(*m, "PythonSampler") - .def(py::init(), py::arg("pySampler")); + .def(py::init()); } void bindInfoObjects(py::module *m) { @@ -503,16 +734,18 @@ void bindInfoObjects(py::module *m) { void bindVocabObjects(py::module *m) { (void)py::class_>(*m, "Vocab") + .def(py::init<>()) .def_static("from_list", - [](const py::list &words) { + [](const py::list &words, const py::list &special_tokens, bool special_first) { std::shared_ptr v; - THROW_IF_ERROR(Vocab::BuildFromPyList(words, &v)); + THROW_IF_ERROR(Vocab::BuildFromPyList(words, special_tokens, special_first, &v)); return v; }) .def_static("from_file", - [](const std::string &path, const std::string &dlm, int32_t vocab_size) { + [](const std::string &path, const std::string &dlm, int32_t vocab_size, const py::list &special_tokens, + bool special_first) { std::shared_ptr v; - THROW_IF_ERROR(Vocab::BuildFromFile(path, dlm, vocab_size, &v)); + THROW_IF_ERROR(Vocab::BuildFromFile(path, dlm, vocab_size, special_tokens, special_first, &v)); return v; }) .def_static("from_dict", [](const py::dict &words) { @@ -529,10 +762,22 @@ void bindGraphData(py::module *m) { THROW_IF_ERROR(g_out->Init()); return g_out; })) - .def("get_nodes", - [](gnn::Graph &g, gnn::NodeType node_type, gnn::NodeIdType node_num) { + .def("get_all_nodes", + [](gnn::Graph &g, gnn::NodeType node_type) { + std::shared_ptr out; + THROW_IF_ERROR(g.GetAllNodes(node_type, &out)); + return out; + }) + .def("get_all_edges", + [](gnn::Graph &g, gnn::EdgeType edge_type) { std::shared_ptr out; - THROW_IF_ERROR(g.GetNodes(node_type, node_num, &out)); + THROW_IF_ERROR(g.GetAllEdges(edge_type, &out)); + return out; + }) + .def("get_nodes_from_edges", + [](gnn::Graph &g, std::vector edge_list) { + std::shared_ptr out; + 
THROW_IF_ERROR(g.GetNodesFromEdges(edge_list, &out)); return out; }) .def("get_all_neighbors", @@ -541,12 +786,38 @@ void bindGraphData(py::module *m) { THROW_IF_ERROR(g.GetAllNeighbors(node_list, neighbor_type, &out)); return out; }) + .def("get_sampled_neighbors", + [](gnn::Graph &g, std::vector node_list, std::vector neighbor_nums, + std::vector neighbor_types) { + std::shared_ptr out; + THROW_IF_ERROR(g.GetSampledNeighbors(node_list, neighbor_nums, neighbor_types, &out)); + return out; + }) + .def("get_neg_sampled_neighbors", + [](gnn::Graph &g, std::vector node_list, gnn::NodeIdType neighbor_num, + gnn::NodeType neg_neighbor_type) { + std::shared_ptr out; + THROW_IF_ERROR(g.GetNegSampledNeighbors(node_list, neighbor_num, neg_neighbor_type, &out)); + return out; + }) .def("get_node_feature", [](gnn::Graph &g, std::shared_ptr node_list, std::vector feature_types) { TensorRow out; THROW_IF_ERROR(g.GetNodeFeature(node_list, feature_types, &out)); + return out.getRow(); + }) + .def("graph_info", + [](gnn::Graph &g) { + py::dict out; + THROW_IF_ERROR(g.GraphInfo(&out)); return out; - }); + }) + .def("random_walk", [](gnn::Graph &g, std::vector node_list, std::vector meta_path, + float step_home_param, float step_away_param, gnn::NodeIdType default_node) { + std::shared_ptr out; + THROW_IF_ERROR(g.RandomWalk(node_list, meta_path, step_home_param, step_away_param, default_node, &out)); + return out; + }); } // This is where we externalize the C logic as python modules @@ -555,9 +826,9 @@ PYBIND11_MODULE(_c_dataengine, m) { (void)py::class_>(m, "DatasetOp"); (void)py::enum_(m, "OpName", py::arithmetic()) - .value("STORAGE", OpName::kStorage) .value("SHUFFLE", OpName::kShuffle) .value("BATCH", OpName::kBatch) + .value("BUCKETBATCH", OpName::kBucketBatch) .value("BARRIER", OpName::kBarrier) .value("MINDRECORD", OpName::kMindrecord) .value("CACHE", OpName::kCache) @@ -578,11 +849,14 @@ PYBIND11_MODULE(_c_dataengine, m) { .value("MNIST", OpName::kMnist) .value("MANIFEST", 
OpName::kManifest) .value("VOC", OpName::kVoc) + .value("COCO", OpName::kCoco) .value("CIFAR10", OpName::kCifar10) .value("CIFAR100", OpName::kCifar100) .value("RANDOMDATA", OpName::kRandomData) + .value("BUILDVOCAB", OpName::kBuildVocab) .value("CELEBA", OpName::kCelebA) - .value("TEXTFILE", OpName::kTextFile); + .value("TEXTFILE", OpName::kTextFile) + .value("CLUE", OpName::kClue); (void)py::enum_(m, "JiebaMode", py::arithmetic()) .value("DE_JIEBA_MIX", JiebaMode::kMix) @@ -590,6 +864,16 @@ PYBIND11_MODULE(_c_dataengine, m) { .value("DE_JIEBA_HMM", JiebaMode::kHmm) .export_values(); +#ifdef ENABLE_ICU4C + (void)py::enum_(m, "NormalizeForm", py::arithmetic()) + .value("DE_NORMALIZE_NONE", NormalizeForm::kNone) + .value("DE_NORMALIZE_NFC", NormalizeForm::kNfc) + .value("DE_NORMALIZE_NFKC", NormalizeForm::kNfkc) + .value("DE_NORMALIZE_NFD", NormalizeForm::kNfd) + .value("DE_NORMALIZE_NFKD", NormalizeForm::kNfkd) + .export_values(); +#endif + (void)py::enum_(m, "InterpolationMode", py::arithmetic()) .value("DE_INTER_LINEAR", InterpolationMode::kLinear) .value("DE_INTER_CUBIC", InterpolationMode::kCubic) @@ -609,12 +893,13 @@ PYBIND11_MODULE(_c_dataengine, m) { bindTensorOps2(&m); bindTensorOps3(&m); bindTensorOps4(&m); - bindTensorOps5(&m); + bindTokenizerOps(&m); bindSamplerOps(&m); bindDatasetOps(&m); bindInfoObjects(&m); bindVocabObjects(&m); bindGraphData(&m); + bindDependIcuTokenizerOps(&m); } } // namespace dataset } // namespace mindspore diff --git a/mindspore/ccsrc/dataset/core/CMakeLists.txt b/mindspore/ccsrc/dataset/core/CMakeLists.txt index 0b9f08d070..27b9f0e13b 100644 --- a/mindspore/ccsrc/dataset/core/CMakeLists.txt +++ b/mindspore/ccsrc/dataset/core/CMakeLists.txt @@ -11,6 +11,7 @@ add_library(core OBJECT data_type.cc global_context.cc tensor.cc + tensor_row.cc tensor_shape.cc ) add_dependencies(core mindspore::protobuf) diff --git a/mindspore/ccsrc/dataset/core/client.h b/mindspore/ccsrc/dataset/core/client.h index aa5e85f7de..a10cb4596e 100644 --- 
a/mindspore/ccsrc/dataset/core/client.h +++ b/mindspore/ccsrc/dataset/core/client.h @@ -27,6 +27,7 @@ #include "dataset/engine/dataset_iterator.h" #include "dataset/engine/datasetops/barrier_op.h" #include "dataset/engine/datasetops/batch_op.h" +#include "dataset/engine/datasetops/build_vocab_op.h" #include "dataset/engine/datasetops/dataset_op.h" #include "dataset/engine/datasetops/device_queue_op.h" #include "dataset/engine/datasetops/map_op.h" @@ -38,7 +39,6 @@ #include "dataset/engine/datasetops/shuffle_op.h" #include "dataset/engine/datasetops/source/generator_op.h" #include "dataset/engine/datasetops/source/mindrecord_op.h" -#include "dataset/engine/datasetops/source/storage_op.h" #include "dataset/engine/datasetops/source/tf_reader_op.h" #include "dataset/engine/datasetops/take_op.h" #include "dataset/engine/datasetops/zip_op.h" diff --git a/mindspore/ccsrc/dataset/core/config_manager.cc b/mindspore/ccsrc/dataset/core/config_manager.cc index 3f659555f4..a489b4a4ce 100644 --- a/mindspore/ccsrc/dataset/core/config_manager.cc +++ b/mindspore/ccsrc/dataset/core/config_manager.cc @@ -48,7 +48,7 @@ Status ConfigManager::FromJson(const nlohmann::json &j) { Status ConfigManager::LoadFile(const std::string &settingsFile) { Status rc; if (!Path(settingsFile).Exists()) { - RETURN_STATUS_UNEXPECTED("File is not found"); + RETURN_STATUS_UNEXPECTED("File is not found."); } // Some settings are mandatory, others are not (with default). If a setting // is optional it will set a default value if the config is missing from the file. 
@@ -59,14 +59,11 @@ Status ConfigManager::LoadFile(const std::string &settingsFile) { rc = FromJson(js); } catch (const nlohmann::json::type_error &e) { std::ostringstream ss; - ss << "Client settings failed to load:\n" << e.what(); + ss << "Client file failed to load:\n" << e.what(); std::string err_msg = ss.str(); RETURN_STATUS_UNEXPECTED(err_msg); } catch (const std::exception &err) { - std::ostringstream ss; - ss << "Client settings failed to load:\n" << err.what(); - std::string err_msg = ss.str(); - RETURN_STATUS_UNEXPECTED(err_msg); + RETURN_STATUS_UNEXPECTED("Client file failed to load."); } return rc; } @@ -88,5 +85,7 @@ void ConfigManager::set_op_connector_size(int32_t connector_size) { op_connector uint32_t ConfigManager::seed() const { return seed_; } void ConfigManager::set_seed(uint32_t seed) { seed_ = seed; } + +void ConfigManager::set_monitor_sampling_interval(uint32_t interval) { monitor_sampling_interval_ = interval; } } // namespace dataset } // namespace mindspore diff --git a/mindspore/ccsrc/dataset/core/config_manager.h b/mindspore/ccsrc/dataset/core/config_manager.h index 654d5f930c..807591daa1 100644 --- a/mindspore/ccsrc/dataset/core/config_manager.h +++ b/mindspore/ccsrc/dataset/core/config_manager.h @@ -111,12 +111,21 @@ class ConfigManager { // @param seed - The default seed to use void set_seed(uint32_t seed); + // setter function + // @param interval - The setting to apply to the config + void set_monitor_sampling_interval(uint32_t interval); + + // getter function + // @return The iterval of monitor sampling + int32_t monitor_sampling_interval() const { return monitor_sampling_interval_; } + private: int32_t rows_per_buffer_{kCfgRowsPerBuffer}; int32_t num_parallel_workers_{kCfgParallelWorkers}; int32_t worker_connector_size_{kCfgWorkerConnectorSize}; int32_t op_connector_size_{kCfgOpConnectorSize}; uint32_t seed_{kCfgDefaultSeed}; + uint32_t monitor_sampling_interval_{kCfgMonitorSamplingInterval}; // Private helper function that taks 
a nlohmann json format and populates the settings // @param j - The json nlohmann json info diff --git a/mindspore/ccsrc/dataset/core/constants.h b/mindspore/ccsrc/dataset/core/constants.h index 9c0e24acc6..34d2f2583c 100644 --- a/mindspore/ccsrc/dataset/core/constants.h +++ b/mindspore/ccsrc/dataset/core/constants.h @@ -47,9 +47,13 @@ constexpr uint32_t kCfgParallelWorkers = 4; constexpr uint32_t kCfgWorkerConnectorSize = 16; constexpr uint32_t kCfgOpConnectorSize = 16; constexpr uint32_t kCfgDefaultSeed = std::mt19937::default_seed; +constexpr uint32_t kCfgMonitorSamplingInterval = 10; // Invalid OpenCV type should not be from 0 to 7 (opencv4/opencv2/core/hal/interface.h) constexpr uint8_t kCVInvalidType = 255; + +using connection_id_type = int64_t; +using row_id_type = int64_t; } // namespace dataset } // namespace mindspore diff --git a/mindspore/ccsrc/dataset/core/data_type.cc b/mindspore/ccsrc/dataset/core/data_type.cc index 744c8c1ca0..71a510d88f 100644 --- a/mindspore/ccsrc/dataset/core/data_type.cc +++ b/mindspore/ccsrc/dataset/core/data_type.cc @@ -138,7 +138,7 @@ DataType DataType::FromNpArray(const py::array &arr) { return DataType(DataType::DE_FLOAT32); } else if (py::isinstance>(arr)) { return DataType(DataType::DE_FLOAT64); - } else if (arr.dtype().kind() == 'S') { + } else if (arr.dtype().kind() == 'S' || arr.dtype().kind() == 'U') { return DataType(DataType::DE_STRING); } else { MS_LOG(ERROR) << "Cannot convert from numpy type. 
Unknown data type is returned!"; diff --git a/mindspore/ccsrc/dataset/core/data_type.h b/mindspore/ccsrc/dataset/core/data_type.h index f1f0bb2ebb..a487f3300e 100644 --- a/mindspore/ccsrc/dataset/core/data_type.h +++ b/mindspore/ccsrc/dataset/core/data_type.h @@ -128,7 +128,9 @@ class DataType { // @tparam T // @return true or false template - bool IsCompatible() const; + bool IsCompatible() const { + return type_ == FromCType(); + } // returns true if the template type is the same as the Tensor type_ // @tparam T @@ -146,6 +148,9 @@ class DataType { return out; } + template + static DataType FromCType(); + // Convert from DataType to Pybind type // @return py::dtype AsNumpyType() const; @@ -191,68 +196,68 @@ class DataType { }; template <> -inline bool DataType::IsCompatible() const { - return type_ == DataType::DE_BOOL; +inline DataType DataType::FromCType() { + return DataType(DataType::DE_BOOL); } template <> -inline bool DataType::IsCompatible() const { - return type_ == DataType::DE_FLOAT64; +inline DataType DataType::FromCType() { + return DataType(DataType::DE_FLOAT64); } template <> -inline bool DataType::IsCompatible() const { - return type_ == DataType::DE_FLOAT32; +inline DataType DataType::FromCType() { + return DataType(DataType::DE_FLOAT32); } template <> -inline bool DataType::IsCompatible() const { - return type_ == DataType::DE_FLOAT16; +inline DataType DataType::FromCType() { + return DataType(DataType::DE_FLOAT16); } template <> -inline bool DataType::IsCompatible() const { - return type_ == DataType::DE_INT64; +inline DataType DataType::FromCType() { + return DataType(DataType::DE_INT64); } template <> -inline bool DataType::IsCompatible() const { - return type_ == DataType::DE_UINT64; +inline DataType DataType::FromCType() { + return DataType(DataType::DE_UINT64); } template <> -inline bool DataType::IsCompatible() const { - return type_ == DataType::DE_INT32; +inline DataType DataType::FromCType() { + return DataType(DataType::DE_INT32); } 
template <> -inline bool DataType::IsCompatible() const { - return type_ == DataType::DE_UINT32; +inline DataType DataType::FromCType() { + return DataType(DataType::DE_UINT32); } template <> -inline bool DataType::IsCompatible() const { - return type_ == DataType::DE_INT16; +inline DataType DataType::FromCType() { + return DataType(DataType::DE_INT16); } template <> -inline bool DataType::IsCompatible() const { - return type_ == DataType::DE_UINT16; +inline DataType DataType::FromCType() { + return DataType(DataType::DE_UINT16); } template <> -inline bool DataType::IsCompatible() const { - return type_ == DataType::DE_INT8; +inline DataType DataType::FromCType() { + return DataType(DataType::DE_INT8); } template <> -inline bool DataType::IsCompatible() const { - return type_ == DataType::DE_UINT8; +inline DataType DataType::FromCType() { + return DataType(DataType::DE_UINT8); } template <> -inline bool DataType::IsCompatible() const { - return type_ == DataType::DE_STRING; +inline DataType DataType::FromCType() { + return DataType(DataType::DE_STRING); } template <> diff --git a/mindspore/ccsrc/dataset/core/tensor.cc b/mindspore/ccsrc/dataset/core/tensor.cc index c986e07089..abab8cf3f4 100644 --- a/mindspore/ccsrc/dataset/core/tensor.cc +++ b/mindspore/ccsrc/dataset/core/tensor.cc @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -229,7 +230,12 @@ Status Tensor::CreateTensorFromNumpyString(std::shared_ptr *ptr, py::arr } arr.resize({arr.size()}); // flatten the py::array so we can iterate once std::vector strings; - std::for_each(arr.begin(), arr.end(), [&strings](const auto &s) { strings.emplace_back(py::cast(s)); }); + + if (arr.dtype().kind() == 'U') { + std::for_each(arr.begin(), arr.end(), [&strings](const auto &s) { strings.emplace_back(py::cast(s)); }); + } else { + std::for_each(arr.begin(), arr.end(), [&strings](const auto &s) { strings.emplace_back(py::cast(s)); }); + } arr.resize(shape); // resize arr back to the 
original shape @@ -306,6 +312,50 @@ Status Tensor::CreateTensor(std::shared_ptr *ptr, const dataengine::Byte return Status::OK(); } +Status Tensor::CreateTensor(std::shared_ptr *ptr, const std::string &file_path) { + std::ifstream fs; + fs.open(file_path, std::ios::binary | std::ios::in); + CHECK_FAIL_RETURN_UNEXPECTED(!fs.fail(), "Fail to open file: " + file_path); + int64_t num_bytes = fs.seekg(0, std::ios::end).tellg(); + CHECK_FAIL_RETURN_UNEXPECTED(fs.seekg(0, std::ios::beg).good(), "Fail to find size of file"); + RETURN_IF_NOT_OK( + Tensor::CreateTensor(ptr, TensorImpl::kFlexible, TensorShape{num_bytes}, DataType(DataType::DE_UINT8))); + int64_t written_bytes = fs.read(reinterpret_cast((*ptr)->GetMutableBuffer()), num_bytes).gcount(); + CHECK_FAIL_RETURN_UNEXPECTED(written_bytes == num_bytes && fs.good(), "Error in writing to tensor"); + fs.close(); + return Status::OK(); +} + +Status Tensor::CreateTensor(std::shared_ptr *ptr, const dataengine::BytesList &bytes_list, + const TensorShape &shape, const DataType &type, dsize_t pad_size) { + RETURN_IF_NOT_OK(Tensor::CreateTensor(ptr, TensorImpl::kFlexible, shape, type)); + + unsigned char *current_tensor_addr = (*ptr)->GetMutableBuffer(); + int64_t tensor_bytes_remaining = bytes_list.value_size() * pad_size; + + for (int i = 0; i < bytes_list.value_size(); i++) { + // read string data into tensor + const std::string ¤t_element = bytes_list.value(i); + int return_code = + memcpy_s(current_tensor_addr, tensor_bytes_remaining, common::SafeCStr(current_element), current_element.size()); + + CHECK_FAIL_RETURN_UNEXPECTED(return_code == 0, "memcpy_s failed when reading bytesList element into Tensor"); + + current_tensor_addr += current_element.size(); + tensor_bytes_remaining -= current_element.size(); + + // pad + int64_t chars_to_pad = pad_size - current_element.size(); + return_code = memset_s(current_tensor_addr, tensor_bytes_remaining, static_cast(' '), chars_to_pad); + CHECK_FAIL_RETURN_UNEXPECTED(return_code == 
0, "memcpy_s failed when padding Tensor"); + + current_tensor_addr += chars_to_pad; + tensor_bytes_remaining -= chars_to_pad; + } + + return Status::OK(); +} + // Memcpy the given strided array's used part to consecutive memory // Consider a 3-d array // A[(i * shape[1] + j) * shape[2] + k] = B[i][j][k] = C[i * strides[0] + j * strides[1] + k * strides[2]] @@ -539,11 +589,13 @@ Status Tensor::StartAddrOfIndex(std::vector ind, uchar **start_addr_of_ if (type() == DataType::DE_STRING) { RETURN_STATUS_UNEXPECTED("StartAddrOfIndex does not support string tensors yet."); } + dsize_t flat_ind; std::vector t_shape = shape().AsVector(); std::vector r(t_shape.begin() + ind.size(), t_shape.end()); *remaining = TensorShape(r); ind.resize(this->Rank(), 0); // same as -> while (ind.size() < this->Rank()) ind.push_back(0); + RETURN_IF_NOT_OK(shape_.ToFlatIndex(ind, &flat_ind)); // check if GetBuffer() returns null, we should flag this as an error, this sanity check will only // be true is the tensor failed to allocate memory. @@ -584,6 +636,39 @@ Status Tensor::InsertTensor(const std::vector &ind, const std::shared_p } } +Status Tensor::Concatenate(const std::vector &index, const std::shared_ptr &tensor) { + std::string err_msg; + err_msg += (index.size() != 1) ? "[Tensor] only supports 1d concatenation \n" : ""; + err_msg += (type() == DataType::DE_STRING) ? "[Tensor] Cannot batch tensors of type string\n" : ""; + err_msg += (!shape().known() || !tensor->shape().known()) ? "[Tensor] unknown shape\n" : ""; + + err_msg += + (index.at(0) + tensor->shape().NumOfElements() > this->shape().NumOfElements()) ? "[Tensor] incorrect index\n" : ""; + err_msg += tensor->type().SizeInBytes() != this->type().SizeInBytes() ? "[Tensor] incorrect datatype\n" : ""; + uchar *start_addr_of_ind = nullptr; + + TensorShape remaining_shape = tensor->shape(); + StartAddrOfIndex(index, &start_addr_of_ind, &remaining_shape); + err_msg += (start_addr_of_ind == nullptr) ? 
"Failed to create memory for Tensor.\n" : ""; + + if (!err_msg.empty()) { + MS_LOG(DEBUG) << "Insert tensor message: " << err_msg; + + RETURN_STATUS_UNEXPECTED(err_msg); + } else { + int ret_code = + memcpy_s(start_addr_of_ind, tensor->SizeInBytes(), tensor->GetMutableBuffer(), tensor->SizeInBytes()); + + if (ret_code == 0) { + return Status::OK(); + } else { + err_msg += "[Tensor] error in memcpy_s when inserting tensor\n"; + MS_LOG(DEBUG) << "Tensor message: " << err_msg; + RETURN_STATUS_UNEXPECTED(err_msg); + } + } +} + Status Tensor::ExpandDim(const dsize_t &axis) { if (axis > Rank()) { std::string err = "Axis is out of bound"; @@ -649,7 +734,7 @@ Status Tensor::GetItemAt(T *o, const std::vector &index) const { Status Tensor::GetItemAt(std::string_view *o, const std::vector &index) const { RETURN_UNEXPECTED_IF_NULL(data_); RETURN_UNEXPECTED_IF_NULL(o); - CHECK_FAIL_RETURN_UNEXPECTED(type_ == DataType::DE_STRING, "Type is not DE_STRING"); + CHECK_FAIL_RETURN_UNEXPECTED(type_ == DataType::DE_STRING, "Tensor type is not a string"); uchar *start = nullptr; offset_t length = 0; @@ -699,6 +784,8 @@ Status Tensor::GetDataAsNumpyStrings(py::array *data) { for (; itr != end(); itr++) { max = std::max((*itr).length(), max); } + // if all strings are empty, numpy stores a byte for each string |S1 + max = (max == 0 ? 
1 : max); uint64_t total_size = shape_.NumOfElements() * max; char *tmp_data = reinterpret_cast(data_allocator_->allocate(total_size)); if (tmp_data == nullptr) RETURN_STATUS_UNEXPECTED("Cannot create temp array."); @@ -708,8 +795,10 @@ Status Tensor::GetDataAsNumpyStrings(py::array *data) { itr = begin(); uint64_t i = 0; for (; itr != end(); itr++, i++) { - ret_code = memcpy_s(tmp_data + i * max, total_size, (*itr).data(), (*itr).length()); - CHECK_FAIL_RETURN_UNEXPECTED(ret_code == 0, "Failed to copy string data."); + if (!(*itr).empty()) { + ret_code = memcpy_s(tmp_data + i * max, total_size, (*itr).data(), (*itr).length()); + CHECK_FAIL_RETURN_UNEXPECTED(ret_code == 0, "Failed to copy string data."); + } } auto strides = shape_.Strides(); std::transform(strides.begin(), strides.end(), strides.begin(), [&max](const auto &s) { return s * max; }); @@ -847,6 +936,78 @@ Status Tensor::GetStringAt(dsize_t index, uchar **string_start, offset_t *length *length = offset_ptr[index + 1] - start - 1; // -1 to skip the \0 from the string length return Status::OK(); } +Status Tensor::CopyLastDimAt(const std::shared_ptr &src, const std::vector &index) { + CHECK_FAIL_RETURN_UNEXPECTED(src->type() == type_, "Source Tensor has a different type"); + CHECK_FAIL_RETURN_UNEXPECTED(index.back() == 0, "Last dim in index should be 0"); + + uint8_t type_size = type_.SizeInBytes(); + size_t len = std::min(src->shape()[-1], shape_[-1]) * type_size; + dsize_t src_flat_ind = 0, dst_flat_ind = 0; + RETURN_IF_NOT_OK(src->shape().ToFlatIndex(index, &src_flat_ind)); + RETURN_IF_NOT_OK(shape_.ToFlatIndex(index, &dst_flat_ind)); + + const unsigned char *src_addr = src->GetBuffer() + src_flat_ind * type_size; + unsigned char *dst_addr = GetMutableBuffer() + dst_flat_ind * type_size; + CHECK_FAIL_RETURN_UNEXPECTED(memcpy_s(dst_addr, len, src_addr, len) == 0, "memcpy error"); + return Status::OK(); +} +Status Tensor::Slice(std::shared_ptr *out, const std::vector &indices) { + 
CHECK_FAIL_RETURN_UNEXPECTED(shape_.Rank() == 1, "Currently Slice work with rank 1 tensors only."); + CHECK_FAIL_RETURN_UNEXPECTED(!indices.empty(), "Indices are empty, generated tensor would be empty."); + if (type_.IsNumeric()) { + return SliceNumeric(out, indices); + } else { + return SliceString(out, indices); + } +} +Status Tensor::SliceNumeric(std::shared_ptr *out, const std::vector &indices) { + RETURN_IF_NOT_OK( + CreateTensor(out, TensorImpl::kFlexible, TensorShape({static_cast(indices.size())}), type_)); + (*out)->GetMutableBuffer(); + dsize_t out_index = 0; + dsize_t dim_length = shape_[0]; + dsize_t type_size = type_.SizeInBytes(); + dsize_t src_start = HandleNeg(indices[0], dim_length); + uchar *dst_addr = (*out)->data_; + dsize_t count = 1; + + for (dsize_t i = 0; i < indices.size(); i++) { + dsize_t cur_index = HandleNeg(indices[i], dim_length); + CHECK_FAIL_RETURN_UNEXPECTED( + cur_index >= 0 && cur_index < dim_length, + "Index " + std::to_string(indices[i]) + " is out of bounds [0," + std::to_string(dim_length) + ")"); + if (i < indices.size() - 1) { + dsize_t next_index = HandleNeg(indices[i + 1], dim_length); + if (next_index == cur_index + 1) { + count++; + continue; + } + } + int return_code = memcpy_s(dst_addr + out_index * type_size, (*out)->SizeInBytes(), data_ + src_start * type_size, + count * type_size); + CHECK_FAIL_RETURN_UNEXPECTED(return_code == 0, "memcpy_s failed in SliceNumeric"); + out_index += count; + if (i < indices.size() - 1) { + src_start = HandleNeg(indices[i + 1], dim_length); // next index + } + count = 1; + } + return Status::OK(); +} +Status Tensor::SliceString(std::shared_ptr *out, const std::vector &indices) { + dsize_t dim_length = shape_[0]; + std::vector strings; + for (dsize_t index : indices) { + dsize_t cur_index = HandleNeg(index, dim_length); + CHECK_FAIL_RETURN_UNEXPECTED( + cur_index >= 0 && cur_index < dim_length, + "Index " + std::to_string(index) + " is out of bounds [0," + std::to_string(dim_length) + 
")"); + std::string_view sv; + GetItemAt(&sv, {cur_index}); + strings.emplace_back(sv); + } + return CreateTensor(out, strings); +} } // namespace dataset } // namespace mindspore diff --git a/mindspore/ccsrc/dataset/core/tensor.h b/mindspore/ccsrc/dataset/core/tensor.h index 5efd989fc9..a3dbb391e5 100644 --- a/mindspore/ccsrc/dataset/core/tensor.h +++ b/mindspore/ccsrc/dataset/core/tensor.h @@ -44,9 +44,6 @@ class Tensor; using CharAllocPtr = std::unique_ptr>; using TensorAllocPtr = std::shared_ptr>; // An allocator shared_ptr for Tensors -using TensorRow = std::vector>; // A row is a set of Tensor pointers -using TensorTable = std::vector; // The table of tensors is a vector of rows -using TensorQTable = std::deque; // A different flavour of tensor table, this one has queue functionality class Tensor { public: @@ -118,6 +115,16 @@ class Tensor { static Status CreateTensor(std::shared_ptr *, TensorImpl tensor_impl, const TensorShape &shape, DataType type, const unsigned char *data = nullptr); + /// Create a copy of the input tensor + /// \param out [out] output tensor to be generated + /// \param in [in] orginal tensor to be copied + /// \return Status + static Status CreateTensor(std::shared_ptr *out, const std::shared_ptr &in) { + const TensorAlloc *alloc = GlobalContext::Instance()->tensor_allocator(); + *out = std::allocate_shared(*alloc, in->shape(), in->type(), in->GetBuffer(), in->SizeInBytes()); + return Status::OK(); + } + // A static factory method to create a Tensor from a given py::array. 
// @param ptr output argument to hold the created Tensor // @param arr py::array @@ -135,9 +142,41 @@ class Tensor { static Status CreateTensor(std::shared_ptr *ptr, const std::vector &strings, const TensorShape &shape = TensorShape::CreateUnknownRankShape()); + // create tensor from protobuf bytelist with strings static Status CreateTensor(std::shared_ptr *ptr, const dataengine::BytesList &bytes_list, const TensorShape &shape); + // A static factory method to create a Tensor from a given list of numbers. + // @param ptr output argument to hold the created Tensor + // @param items elements of the tensor + // @param shape shape of the tensor + // @return Status Code + template + static Status CreateTensor(std::shared_ptr *ptr, const std::vector &items, + const TensorShape &shape_req = TensorShape::CreateUnknownRankShape()) { + DataType type = DataType::FromCType(); + auto items_ptr = reinterpret_cast(&items[0]); + TensorShape shape = shape_req; + if (!shape.known()) { + shape = TensorShape({static_cast(items.size())}); + } + return CreateTensor(ptr, TensorImpl::kFlexible, shape, type, items_ptr); + } + + // A static factory method to create a Tensor from a given number. 
+ // @param ptr output argument to hold the created Tensor + // @param item value + // @return Status Code + template + static Status CreateTensor(std::shared_ptr *ptr, const T &item) { + return CreateTensor(ptr, {item}, TensorShape::CreateScalar()); + } + // Create tensor from protobuf bytelist with uint8 or int8 types + static Status CreateTensor(std::shared_ptr *ptr, const dataengine::BytesList &bytes_list, + const TensorShape &shape, const DataType &type, dsize_t pad_size); + + static Status CreateTensor(std::shared_ptr *ptr, const std::string &path); + // Copy raw data of a array based on shape and strides to the destination pointer // @param dst Pointer to the destination array where the content is to be copied // @param src Pointer to the source of strided array to be copied @@ -260,11 +299,6 @@ class Tensor { // @return const unsigned char* const unsigned char *GetBuffer() const; - // Get the starting memory address for the data of the tensor. This potentially - // drives an allocation if the data area. - // @return unsigned char* - unsigned char *GetMutableBuffer(); - // Getter of the type // @return DataType type() const { return type_; } @@ -323,6 +357,22 @@ class Tensor { return ss.str(); } + // Handle negative indices. + static inline dsize_t HandleNeg(dsize_t index, dsize_t length) { return (index < 0) ? (index + length) : index; } + + // Slice tensor bases on the given indicies. Copy the sliced data into out tensor. Only rank1 tensors are supported. + // Based on the type of tensor, SliceNumeric or SliceString will be called + // @param out Tensor + // @param indices vector of indices + // @return Status error code + Status Slice(std::shared_ptr *out, const std::vector &indices); + + // Slice numeric tensors. 
+ Status SliceNumeric(std::shared_ptr *out, const std::vector &indices); + + // Slice string tensors + Status SliceString(std::shared_ptr *out, const std::vector &indices); + // Constructs numpy array from input tensor // @param data this data is the location of python data // @return Status code @@ -332,6 +382,9 @@ class Tensor { static Status GetBufferInfo(Tensor &t, py::buffer_info *out); + // Concatenate based on given tensor, can fill in current tensor with a smaller one, unlike InsertTensor + Status Concatenate(const std::vector &index, const std::shared_ptr &input); + // TensorIterator is a linear iterator that can be used to iterate over the elements of the Tensor // The order elements is as the memory layout (i.e., row-major) [[1,2,3],[4,5,6] --> 1,2,3,4,5,6 // @tparam T type of values in the Tensor Iterator @@ -518,6 +571,7 @@ class Tensor { // @return TensorIterator template TensorIterator begin() { + AllocateBuffer(SizeInBytes()); return TensorIterator(data_); } @@ -529,7 +583,18 @@ class Tensor { return TensorIterator(data_end_); } + // Copies the last dimension at `index` from Tensor `src` to this Tensor. + // @param src Tensor + // @param index vector to the start of the dimension. The last dim should be 0 + // @return Status + Status CopyLastDimAt(const std::shared_ptr &src, const std::vector &index); + protected: + // Get the starting memory address for the data of the tensor. This potentially + // drives an allocation if the data is null. 
+ // @return unsigned char* + unsigned char *GetMutableBuffer(); + // A function that prints Tensor recursively, first called by print // @param out // @param cur_dim diff --git a/mindspore/ccsrc/dataset/core/tensor_row.cc b/mindspore/ccsrc/dataset/core/tensor_row.cc new file mode 100644 index 0000000000..882f6728bf --- /dev/null +++ b/mindspore/ccsrc/dataset/core/tensor_row.cc @@ -0,0 +1,75 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include "dataset/core/tensor_row.h" + +namespace py = pybind11; +namespace mindspore { +namespace dataset { + +TensorRow::TensorRow() noexcept : id_(kDefaultRowId) {} + +TensorRow::TensorRow(size_type n, TensorRow::value_type t) noexcept : id_(kDefaultRowId), row_(n, t) {} + +TensorRow::TensorRow(const TensorRow::vector_type &v) : id_(kDefaultRowId), row_(v) {} + +TensorRow::TensorRow(row_id_type id, const std::initializer_list &lst) : id_(id), row_(lst) {} + +TensorRow::TensorRow(const TensorRow &tr) : id_(tr.id_), row_(tr.row_) {} + +TensorRow &TensorRow::operator=(const TensorRow &tr) { + if (this == &tr) { + return *this; + } + row_ = tr.row_; + id_ = tr.id_; + return *this; +} + +TensorRow &TensorRow::operator=(const std::initializer_list &lst) { + row_ = lst; + return *this; +} + +TensorRow::TensorRow(TensorRow::vector_type &&v) noexcept : id_(kDefaultRowId), row_(std::move(v)) {} + +TensorRow::TensorRow(row_id_type id, std::initializer_list &&lst) noexcept + : id_(id), row_(std::move(lst)) {} + +TensorRow::TensorRow(TensorRow &&tr) noexcept { + id_ = tr.id_; + row_ = std::move(tr.row_); +} + +TensorRow &TensorRow::operator=(TensorRow &&tr) noexcept { + if (this == &tr) { + return *this; + } + row_ = std::move(tr.row_); + id_ = tr.id_; + tr.id_ = kDefaultRowId; + return *this; +} + +TensorRow &TensorRow::operator=(std::initializer_list &&lst) noexcept { + row_ = std::move(lst); + return *this; +} + +} // namespace dataset +} // namespace mindspore diff --git a/mindspore/ccsrc/dataset/core/tensor_row.h b/mindspore/ccsrc/dataset/core/tensor_row.h new file mode 100644 index 0000000000..49bc61657c --- /dev/null +++ b/mindspore/ccsrc/dataset/core/tensor_row.h @@ -0,0 +1,131 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef DATASET_CORE_TENSOR_ROW_H_ +#define DATASET_CORE_TENSOR_ROW_H_ + +#include +#include +#include + +#include "dataset/core/tensor.h" + +namespace mindspore { +namespace dataset { + +class TensorRow; // A set of Tensor pointers with an id +using TensorTable = std::vector; // The table of tensors is a vector of rows +using TensorQTable = std::deque; // A different flavour of tensor table, this one has queue functionality + +class TensorRow { + public: + static constexpr row_id_type kDefaultRowId = -1; // Default row id + + // Type definitions + using size_type = dsize_t; + using value_type = std::shared_ptr; + using reference = std::shared_ptr &; + using const_reference = const std::shared_ptr &; + using vector_type = std::vector>; + using iterator = std::vector>::iterator; + using const_iterator = std::vector>::const_iterator; + + TensorRow() noexcept; + + TensorRow(size_type n, value_type t) noexcept; + + // Copy Constructors + explicit TensorRow(const vector_type &v); + + TensorRow(row_id_type id, const std::initializer_list &lst); + + TensorRow(const TensorRow &tr); + + TensorRow &operator=(const TensorRow &tr); + + TensorRow &operator=(const std::initializer_list &lst); + + // Move Constructors + explicit TensorRow(vector_type &&v) noexcept; + + TensorRow(row_id_type id, std::initializer_list &&lst) noexcept; + + TensorRow(TensorRow &&tr) noexcept; + + TensorRow &operator=(TensorRow &&tr) noexcept; + + TensorRow &operator=(std::initializer_list &&lst) noexcept; + + // Destructor + ~TensorRow() = default; + + // Functions to 
fetch/set id/vector + row_id_type getId() const { return id_; } + + void setId(row_id_type id) { id_ = id; } + + const vector_type &getRow() const { return row_; } + + // Wrapper functions to support vector operations + void emplace_back(value_type t) { row_.emplace_back(t); } + + void push_back(value_type t) { row_.push_back(t); } + + void clear() noexcept { row_.clear(); } + + size_type size() const noexcept { return row_.size(); } + + void reserve(size_type size) { row_.reserve(size); } + + void resize(size_type size) { row_.resize(size); } + + bool empty() { return row_.empty(); } + + void insert(iterator position, iterator first, iterator last) { row_.insert(position, first, last); } + + // Wrapper functions to support vector element access + reference at(size_type index) { return row_.at(index); } + + const_reference at(size_type index) const { return row_.at(index); } + + reference front() { return row_.front(); } + + const_reference front() const { return row_.front(); } + + reference back() { return row_.back(); } + + const_reference back() const { return row_.back(); } + + reference operator[](size_type index) { return row_[index]; } + + const_reference operator[](size_type index) const { return row_[index]; } + + // Wrapper functions to support vector iteration + iterator begin() { return row_.begin(); } + + const_iterator begin() const { return row_.begin(); } + + iterator end() { return row_.end(); } + + const_iterator end() const { return row_.end(); } + + protected: + row_id_type id_; + std::vector> row_; +}; +} // namespace dataset +} // namespace mindspore +#endif // DATASET_CORE_TENSOR_ROW_H_ diff --git a/mindspore/ccsrc/dataset/core/tensor_shape.h b/mindspore/ccsrc/dataset/core/tensor_shape.h index 6cfb007b56..c83e43cd7d 100644 --- a/mindspore/ccsrc/dataset/core/tensor_shape.h +++ b/mindspore/ccsrc/dataset/core/tensor_shape.h @@ -94,7 +94,7 @@ class TensorShape { // @return TensorShape PrependDim(dsize_t dim) const; - // Insert a new dim at the 
end of the shape. For example, <2,4> --> PrependDim(4) --> <2,4,4> + // Insert a new dim at the end of the shape. For example, <2,4> --> AppendDim(4) --> <2,4,4> // @param dim // @return TensorShape AppendDim(dsize_t dim) const; @@ -118,7 +118,10 @@ class TensorShape { bool operator!=(const TensorShape &rhs) const { return !(rhs == *this); } - dsize_t operator[](const dsize_t index) const { return raw_shape_[index]; } + dsize_t operator[](const dsize_t index) const { + if (index < 0) return raw_shape_[raw_shape_.size() + index]; + return raw_shape_[index]; + } // Return the Shape as a vector // @return diff --git a/mindspore/ccsrc/dataset/engine/CMakeLists.txt b/mindspore/ccsrc/dataset/engine/CMakeLists.txt index e7b5e682f3..66f95d0926 100644 --- a/mindspore/ccsrc/dataset/engine/CMakeLists.txt +++ b/mindspore/ccsrc/dataset/engine/CMakeLists.txt @@ -1,6 +1,7 @@ add_subdirectory(datasetops) add_subdirectory(opt) add_subdirectory(gnn) +add_subdirectory(perf) if (ENABLE_TDTQUE) add_subdirectory(tdt) endif () @@ -16,7 +17,7 @@ add_library(engine OBJECT target_include_directories(engine PRIVATE ${pybind11_INCLUDE_DIRS}) if (ENABLE_TDTQUE) - add_dependencies(engine engine-datasetops engine-datasetops-source engine-tdt engine-opt engine-gnn) + add_dependencies(engine engine-datasetops engine-datasetops-source engine-tdt engine-opt engine-gnn engine-perf) else() - add_dependencies(engine engine-datasetops engine-datasetops-source engine-opt engine-gnn) + add_dependencies(engine engine-datasetops engine-datasetops-source engine-opt engine-gnn engine-perf) endif () diff --git a/mindspore/ccsrc/dataset/engine/connector.h b/mindspore/ccsrc/dataset/engine/connector.h index 085b790ec5..cdce592c1b 100644 --- a/mindspore/ccsrc/dataset/engine/connector.h +++ b/mindspore/ccsrc/dataset/engine/connector.h @@ -152,6 +152,23 @@ class Connector { return out; } + // Get current size of connector. 
+ int32_t size() const { + int32_t size = 0; + for (int32_t i = 0; i < queues_.size(); ++i) { + size += queues_[i]->size(); + } + return size; + } + + int32_t capacity() const { + int32_t capacity = 0; + for (int32_t i = 0; i < queues_.size(); ++i) { + capacity += queues_[i]->capacity(); + } + return capacity; + } + // Register the internal resources with Task group for interruption service. // @param vg // @return diff --git a/mindspore/ccsrc/dataset/engine/data_buffer.cc b/mindspore/ccsrc/dataset/engine/data_buffer.cc index 4aed994d3c..32a70c259f 100644 --- a/mindspore/ccsrc/dataset/engine/data_buffer.cc +++ b/mindspore/ccsrc/dataset/engine/data_buffer.cc @@ -17,8 +17,6 @@ #include "dataset/util/allocator.h" #include "dataset/core/global_context.h" #include "dataset/core/tensor.h" -#include "dataset/engine/datasetops/source/storage_client.h" -#include "dataset/engine/datasetops/source/tf_buffer.h" namespace mindspore { namespace dataset { @@ -26,37 +24,6 @@ namespace dataset { // Description: This is the main constructor that is used for making a buffer DataBuffer::DataBuffer(int32_t id, BufferFlags flags) : buffer_id_(id), tensor_table_(nullptr), buffer_flags_(flags) {} -// Name: CreateDataBuffer() -// Description: A static factory method to create the appropriate type of derived class -// buffer. Returns the base class reference for DataBuffer. -Status DataBuffer::CreateDataBuffer( - int32_t id, // In: The id for the new buffer - std::shared_ptr storage_client, // In: The storage client that is related to this buffer type - std::unique_ptr *ptr) { - std::unique_ptr new_data_buffer; - try { - DatasetType ds_type = storage_client->schema()->dataset_type(); - switch (ds_type) { - case DatasetType::kTf: { - // This type of buffer is for TF record data. 
- // Allocate derived class version for a TF buffers - new_data_buffer = std::make_unique(id, kDeBFlagNone, storage_client); - break; - } - default: { - std::string errMsg("Invalid buffer type"); - RETURN_STATUS_UNEXPECTED(errMsg); - } - } - } catch (std::bad_alloc &e) { - return Status(StatusCode::kOutOfMemory, __LINE__, __FILE__, e.what()); - } catch (std::exception &e) { - RETURN_STATUS_UNEXPECTED(e.what()); - } - *ptr = std::move(new_data_buffer); - return Status::OK(); -} - // Name: print() // Description: A function that prints info about the DataBuffer (base class version) void DataBuffer::Print(std::ostream &out, // In: The output stream to print to @@ -98,7 +65,7 @@ Status DataBuffer::GetTensor(std::shared_ptr *ptr, int32_t row_id, int32 // Remove me!! Callers should fetch rows via pop Status DataBuffer::GetRow(int32_t row_id, TensorRow *ptr) const { - if (row_id < tensor_table_->size()) { + if (tensor_table_ && !tensor_table_->empty() && row_id < tensor_table_->size()) { *ptr = tensor_table_->at(row_id); } else { std::string err_msg = "rowId for mTensorTable out of range: " + std::to_string(row_id); diff --git a/mindspore/ccsrc/dataset/engine/data_buffer.h b/mindspore/ccsrc/dataset/engine/data_buffer.h index 0053d8894d..2ab0783519 100644 --- a/mindspore/ccsrc/dataset/engine/data_buffer.h +++ b/mindspore/ccsrc/dataset/engine/data_buffer.h @@ -25,12 +25,10 @@ #include "dataset/util/status.h" #include "dataset/core/constants.h" #include "dataset/core/tensor.h" +#include "dataset/core/tensor_row.h" namespace mindspore { namespace dataset { -// Forward declares -class StorageClient; - // The DataBuffer class is a base class that will represent the data for n values based // on a unique row id for each row of data. 
// There can be different types of DataBuffers to abstract over how the data is stored @@ -52,14 +50,6 @@ class DataBuffer { // Destructor virtual ~DataBuffer(); - // Name: CreateDataBuffer() - // Description: A factory method to create the appropriate type of derived class - // buffer. Returns the base class reference for DataBuffer. - static Status CreateDataBuffer( - int32_t id, // In: The id for the new buffer - std::shared_ptr, // In: The StorageClient is used to choose the buffer type to create - std::unique_ptr *); - // Name: print() // Description: A function that prints info about the DataBuffer (base class version) virtual void Print(std::ostream &out, // In: The output stream to print to diff --git a/mindspore/ccsrc/dataset/engine/dataset_iterator.cc b/mindspore/ccsrc/dataset/engine/dataset_iterator.cc index 011e60cc24..7eb38785aa 100644 --- a/mindspore/ccsrc/dataset/engine/dataset_iterator.cc +++ b/mindspore/ccsrc/dataset/engine/dataset_iterator.cc @@ -83,7 +83,19 @@ Status IteratorBase::FetchNextTensorRow(TensorRow *out_row) { } // Constructor of the DatasetIterator -DatasetIterator::DatasetIterator(std::shared_ptr exe_tree) : IteratorBase(), root_(exe_tree->root()) {} +DatasetIterator::DatasetIterator(std::shared_ptr exe_tree) + : IteratorBase(), + root_(exe_tree->root()), + tracing_(nullptr), + cur_batch_num_(0), + cur_connector_size_(0), + cur_connector_capacity_(0) { + std::shared_ptr node; + Status s = exe_tree->GetProfilingManager()->GetTracingNode(kDatasetIteratorTracingName, &node); + if (s.IsOk()) { + tracing_ = std::dynamic_pointer_cast(node); + } +} DatasetIterator::~DatasetIterator() = default; @@ -101,6 +113,10 @@ Status DatasetIterator::FetchNextTensorRow(TensorRow *out_row) { // Check if we need to get a new DataBuffer to iterate. 
if (curr_buffer_ == nullptr || curr_buffer_->NumRows() == 0) { + if (tracing_ != nullptr) { + cur_connector_size_ = root_->ConnectorSize(); + cur_connector_capacity_ = root_->ConnectorCapacity(); + } RETURN_IF_NOT_OK(root_->GetNextBuffer(&curr_buffer_)); // Since GetNextBuffer was used rather than GetNextInput(), it means we need to manually @@ -121,6 +137,8 @@ Status DatasetIterator::FetchNextTensorRow(TensorRow *out_row) { } eof_handled_ = true; curr_buffer_.reset(); // explicitly free the eof buffer + // Set tree to Finished state + root_->Tree()->SetFinished(); return Status::OK(); } @@ -131,13 +149,18 @@ Status DatasetIterator::FetchNextTensorRow(TensorRow *out_row) { // flow of an eof up the pipeline by itself. eof_handled_ = true; curr_buffer_.reset(); // explicitly free the eof buffer + // Set tree to Finished state + root_->Tree()->SetFinished(); return Status::OK(); } } // If we got this far, now it's time to pop that next row for return to caller RETURN_IF_NOT_OK(curr_buffer_->PopRow(out_row)); - + if (tracing_ != nullptr) { + cur_batch_num_++; + tracing_->Record(CONNECTOR_DEPTH, cur_connector_capacity_, cur_batch_num_, cur_connector_size_); + } return Status::OK(); } diff --git a/mindspore/ccsrc/dataset/engine/dataset_iterator.h b/mindspore/ccsrc/dataset/engine/dataset_iterator.h index ddd4883a86..ada2b0ffb6 100644 --- a/mindspore/ccsrc/dataset/engine/dataset_iterator.h +++ b/mindspore/ccsrc/dataset/engine/dataset_iterator.h @@ -24,6 +24,7 @@ #include "dataset/core/tensor.h" #include "dataset/engine/datasetops/dataset_op.h" #include "dataset/engine/execution_tree.h" +#include "dataset/engine/perf/dataset_iterator_tracing.h" namespace mindspore { namespace dataset { @@ -52,7 +53,7 @@ class IteratorBase { // messages are encountered (such as eoe or eof), then an empty TensorRow is returned back. // @return Status - The error code return // @note The position of a Tensor/column might be different from the initial column order - // in the storageOp. 
User must be aware that MapOp, ZipOps, and others might change + // in corresponding Dataset Op. User must be aware that MapOp, ZipOps, and others might change // the column ordering. virtual Status FetchNextTensorRow(TensorRow *out_row); @@ -109,6 +110,10 @@ class DatasetIterator : public IteratorBase { private: std::shared_ptr root_; // saves the root of the executionTree TensorRow device_queue_row_; + std::shared_ptr tracing_; // trace profiling data + int32_t cur_batch_num_; // current batch number,used for profiling + int32_t cur_connector_size_; // current connector size of root op,used for profiling + int32_t cur_connector_capacity_; // current connector capacity of root op, used for profiling }; // The ChildIterator derived class is for fetching rows from intermediate nodes of execution tree. diff --git a/mindspore/ccsrc/dataset/engine/datasetops/CMakeLists.txt b/mindspore/ccsrc/dataset/engine/datasetops/CMakeLists.txt index 70065df5f4..ed57421030 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/CMakeLists.txt +++ b/mindspore/ccsrc/dataset/engine/datasetops/CMakeLists.txt @@ -8,6 +8,7 @@ add_library(engine-datasetops OBJECT pipeline_op.cc barrier_op.cc batch_op.cc + bucket_batch_by_length_op.cc device_queue_op.cc map_op.cc project_op.cc @@ -19,5 +20,6 @@ add_library(engine-datasetops OBJECT zip_op.cc concat_op.cc filter_op.cc + build_vocab_op.cc ) diff --git a/mindspore/ccsrc/dataset/engine/datasetops/batch_op.cc b/mindspore/ccsrc/dataset/engine/datasetops/batch_op.cc index 374128eb21..60643c90ba 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/batch_op.cc +++ b/mindspore/ccsrc/dataset/engine/datasetops/batch_op.cc @@ -23,6 +23,7 @@ #include "dataset/engine/data_buffer.h" #include "dataset/engine/db_connector.h" #include "dataset/engine/opt/pass.h" +#include "dataset/kernels/data/data_utils.h" using float16 = Eigen::half; @@ -53,7 +54,7 @@ Status BatchOp::Builder::SanityCheck() { BatchOp::BatchOp(int32_t batch_size, bool drop, bool pad, int32_t 
op_queue_size, int32_t num_workers, const std::vector &cols_to_map, py::function batch_size_func, py::function batch_map_func, - std::map> pad_map) + PadInfo pad_map) : ParallelOp(num_workers, op_queue_size), start_batch_size_(batch_size), drop_(drop), @@ -75,10 +76,6 @@ Status BatchOp::operator()() { std::unique_ptr table = std::make_unique(); child_iterator_ = std::make_unique(this, 0, 0); RETURN_IF_NOT_OK(child_iterator_->FetchNextTensorRow(&new_row)); - for (const auto &t : new_row) { - CHECK_FAIL_RETURN_UNEXPECTED(t->type().IsNumeric(), - "[Batch ERROR] Batch does not support Tensor of type string yet."); - } RETURN_IF_NOT_OK(DatasetOp::AssignColMapFromChild()); // must come after the first fetch above int32_t cur_batch_size = 0; RETURN_IF_NOT_OK(GetBatchSize(&cur_batch_size, CBatchInfo(0, 0, 0))); @@ -134,49 +131,57 @@ void BatchOp::Print(std::ostream &out, bool show_all) const { } } -Status BatchOp::BatchRows(const std::unique_ptr *source_table, - const std::unique_ptr *dest_table, size_t batch_size) { - if ((*source_table)->size() < batch_size || (*source_table)->size() == 0) { - RETURN_STATUS_UNEXPECTED("[Internal Batch ERROR] Insufficient rows in source_table\n"); +Status BatchOp::BatchRows(const std::unique_ptr *src, const std::unique_ptr *dest, + dsize_t batch_size) { + if ((*src)->size() != batch_size) { + RETURN_STATUS_UNEXPECTED("[Internal Batch ERROR] Source table size does not match the batch_size"); } - TensorRow row = std::move((*source_table)->front()); - (*source_table)->pop_front(); + if (batch_size == 1) { - for (std::shared_ptr tensor : row) { + TensorRow row = std::move((*src)->front()); + (*src)->pop_front(); + (*dest)->push_back(row); + for (const auto &tensor : (*dest)->front()) { RETURN_IF_NOT_OK(tensor->ExpandDim(0)); } - (*dest_table)->push_back(row); - } else { // batch_size > 1 - std::vector row_shapes; - TensorRow batched_row; - for (size_t i = 0; i < row.size(); i++) { // Handle the first row popped - 
row_shapes.push_back(row[i]->shape()); - std::shared_ptr ts; - RETURN_IF_NOT_OK(Tensor::CreateTensor( - &ts, TensorImpl::kFlexible, row[i]->shape().PrependDim(static_cast(batch_size)), row[i]->type())); - batched_row.emplace_back(ts); - RETURN_IF_NOT_OK(batched_row[i]->InsertTensor(std::vector(1, 0), row[i])); // {j} = 0 - } - for (size_t j = 1; j < batch_size; j++) { // Handle the rest of the rows - row = std::move((*source_table)->front()); - (*source_table)->pop_front(); - for (size_t i = 0; i < row.size(); i++) { - if (row[i]->shape() == row_shapes[i]) { // check the newly popped rows have the same dim as the first - RETURN_IF_NOT_OK(batched_row[i]->InsertTensor(std::vector(1, j), row[i])); + return Status::OK(); + } + + TensorRow batched_row; + auto num_columns = (*src)->front().size(); + for (size_t i = 0; i < num_columns; i++) { + std::shared_ptr first_tensor = (*src)->at(0).at(i); // first row, column i + TensorShape first_shape = first_tensor->shape(); + DataType first_type = first_tensor->type(); + TensorShape new_shape = first_shape.PrependDim(static_cast(batch_size)); + + std::shared_ptr new_tensor; + if (first_type.IsNumeric()) { // numeric tensor + RETURN_IF_NOT_OK(Tensor::CreateTensor(&new_tensor, TensorImpl::kFlexible, new_shape, first_type)); + dsize_t j = 0; + for (auto row : **src) { + std::shared_ptr old_tensor = row.at(i); // row j, column i + if (old_tensor->shape() == first_shape) { // check the newly popped rows have the same dim as the first + RETURN_IF_NOT_OK(new_tensor->InsertTensor({j++}, old_tensor)); } else { - std::string column_name; - for (auto itr : column_name_id_map_) { - if (static_cast(itr.second) == i) { - column_name = itr.first; - break; - } - } - RETURN_STATUS_UNEXPECTED("[Batch ERROR] Inconsistent TensorShapes of Column " + column_name); + RETURN_STATUS_UNEXPECTED("[Batch ERROR] Inconsistent TensorShapes of Column " + std::to_string(i)); } } + } else { // handle string column differently + std::vector strings; + for 
(dsize_t j = 0; j < batch_size; j++) { + std::shared_ptr old_tensor = (*src)->at(j).at(i); + for (auto itr = old_tensor->begin(); itr != old_tensor->end(); itr++) { + strings.emplace_back(*itr); + } + } + RETURN_IF_NOT_OK(Tensor::CreateTensor(&new_tensor, strings, new_shape)); } - (*dest_table)->emplace_back(batched_row); + batched_row.emplace_back(new_tensor); } + + (*dest)->emplace_back(batched_row); + return Status::OK(); } @@ -202,8 +207,8 @@ Status BatchOp::WorkerEntry(int32_t workerId) { Status BatchOp::MakeBatchedBuffer(std::pair, CBatchInfo> table_pair, std::unique_ptr *db) { RETURN_UNEXPECTED_IF_NULL(table_pair.first); - if (!pyfunc_column_names_.empty()) RETURN_IF_NOT_OK(MapColumns(&table_pair)); // pass it through pyfunc - if (pad_) RETURN_IF_NOT_OK(PadColumns(&table_pair)); // do padding if needed + if (!pyfunc_column_names_.empty()) RETURN_IF_NOT_OK(MapColumns(&table_pair)); // pass it through pyfunc + if (pad_) RETURN_IF_NOT_OK(PadColumns(&table_pair.first, pad_info_, column_name_id_map_)); // do padding if needed (*db) = std::make_unique(table_pair.second.batch_num_, DataBuffer::kDeBFlagNone); std::unique_ptr dest_table = std::make_unique(); RETURN_IF_NOT_OK(BatchRows(&table_pair.first, &dest_table, table_pair.first->size())); @@ -333,74 +338,27 @@ Status BatchOp::InvokeBatchMapFunc(TensorBatchTable *input, TensorBatchTable *ou return Status(StatusCode::kOK); } -Status BatchOp::PadTensor(std::shared_ptr src, std::shared_ptr *dst, - const std::vector &pad_shape, float pad_val) { - CHECK_FAIL_RETURN_UNEXPECTED(src != nullptr && dst != nullptr, "tensor can't be nullptr"); - if (src->Rank() == 0 || src->shape().AsVector() == pad_shape) { - (*dst) = src; // if no padding, copy the pointer - } else { - CHECK_FAIL_RETURN_UNEXPECTED(src->Rank() == pad_shape.size(), "Pad to diff rank not allowed"); - RETURN_IF_NOT_OK(Tensor::CreateTensor(dst, TensorImpl::kFlexible, TensorShape(pad_shape), src->type())); - auto tensor_type = src->type().value(); - if (pad_val 
== 0) { // if pad with zero, don't care what type it is - RETURN_IF_NOT_OK((*dst)->Zero()); - } else if (tensor_type == DataType::DE_INT8) { - RETURN_IF_NOT_OK((*dst)->Fill(pad_val)); - } else if (tensor_type == DataType::DE_BOOL) { - RETURN_IF_NOT_OK((*dst)->Fill(pad_val)); - } else if (tensor_type == DataType::DE_UINT8) { - RETURN_IF_NOT_OK((*dst)->Fill(pad_val)); - } else if (tensor_type == DataType::DE_INT16) { - RETURN_IF_NOT_OK((*dst)->Fill(pad_val)); - } else if (tensor_type == DataType::DE_FLOAT16) { - RETURN_IF_NOT_OK((*dst)->Fill(static_cast(pad_val))); - } else if (tensor_type == DataType::DE_UINT16) { - RETURN_IF_NOT_OK((*dst)->Fill(pad_val)); - } else if (tensor_type == DataType::DE_INT32) { - RETURN_IF_NOT_OK((*dst)->Fill(pad_val)); - } else if (tensor_type == DataType::DE_UINT32) { - RETURN_IF_NOT_OK((*dst)->Fill(pad_val)); - } else if (tensor_type == DataType::DE_INT64) { - RETURN_IF_NOT_OK((*dst)->Fill(pad_val)); - } else if (tensor_type == DataType::DE_UINT64) { - RETURN_IF_NOT_OK((*dst)->Fill(pad_val)); - } else if (tensor_type == DataType::DE_FLOAT32) { - RETURN_IF_NOT_OK((*dst)->Fill(pad_val)); - } else if (tensor_type == DataType::DE_FLOAT64) { - RETURN_IF_NOT_OK((*dst)->Fill(pad_val)); - } else { - RETURN_STATUS_UNEXPECTED("Incorrect/Unknown tensor type"); - } - std::vector cur_ind(src->Rank(), 0), src_s(src->Rank(), 1), dst_s(src->Rank(), 1); - for (dsize_t i = src->Rank() - 2; i >= 0; i--) { - src_s[i] = src->shape()[i + 1] * src_s[i + 1]; - dst_s[i] = pad_shape[i + 1] * dst_s[i + 1]; - } - RETURN_IF_NOT_OK(PadHelper(src, *dst, cur_ind, src_s, dst_s, 0)); - } - return Status::OK(); -} // namespace dataset - -Status BatchOp::PadColumns(std::pair, CBatchInfo> *table_pair) { - RETURN_UNEXPECTED_IF_NULL(table_pair); // placeholder for now, might need this in the future - CHECK_FAIL_RETURN_UNEXPECTED(table_pair->first->front().size() == column_name_id_map_.size(), - "col_name_map mismatch"); - std::vector pad_vals(column_name_id_map_.size(), 0); 
// value to pad each column's tensor with, default 0 +Status BatchOp::PadColumns(std::unique_ptr *table, const PadInfo &pad_info, + const std::unordered_map &column_name_id_map) { + RETURN_UNEXPECTED_IF_NULL(table); // placeholder for now, might need this in the future + CHECK_FAIL_RETURN_UNEXPECTED((*table)->front().size() == column_name_id_map.size(), "col_name_map mismatch"); + std::vector> pad_vals(column_name_id_map.size(), + 0); // value to pad each column's tensor with, default 0 std::set pad_cols; // padded_shape provided by user, maximum shapes of current batch of tensors - std::vector> pad_shapes(column_name_id_map_.size()), max_shapes(column_name_id_map_.size()); - RETURN_IF_NOT_OK(UnpackPadInfo(&pad_cols, &pad_vals, &pad_shapes)); + std::vector> pad_shapes(column_name_id_map.size()), max_shapes(column_name_id_map.size()); + RETURN_IF_NOT_OK(UnpackPadInfo(pad_info, column_name_id_map, &pad_cols, &pad_vals, &pad_shapes)); // init each shape in max_shape to {-1,-1...} init each unspecified shape in pad_shape to -1 as well for (size_t col_id : pad_cols) { - max_shapes[col_id] = std::vector(table_pair->first->front()[col_id]->Rank(), -1); + max_shapes[col_id] = std::vector((*table)->front()[col_id]->Rank(), -1); if (pad_shapes[col_id].empty()) pad_shapes[col_id] = max_shapes[col_id]; // fill pad shape with -1 CHECK_FAIL_RETURN_UNEXPECTED(pad_shapes[col_id].size() == max_shapes[col_id].size(), "wrong rank in pad_shape"); } // calculate maximum shape for each column that needs to be padded - for (const TensorRow &row : *(table_pair->first)) { // iterator each row in a batch - for (size_t col_id : pad_cols) { // iterator each tensor in a row + for (const TensorRow &row : **table) { // iterator each row in a batch + for (size_t col_id : pad_cols) { // iterator each tensor in a row CHECK_FAIL_RETURN_UNEXPECTED(row[col_id]->Rank() == max_shapes[col_id].size(), "Tensor to be padded together need to have the same rank"); for (size_t dim = 0; dim < 
row[col_id]->Rank(); dim++) { // pick the largest number in each dimension @@ -417,27 +375,29 @@ Status BatchOp::PadColumns(std::pair, CBatchInfo> } // call pad on each tensor that needs to be padded - for (TensorRow &row : *(table_pair->first)) { + for (TensorRow &row : **table) { for (size_t col_id : pad_cols) { std::shared_ptr pad_tensor; - RETURN_IF_NOT_OK(PadTensor(row[col_id], &pad_tensor, pad_shapes[col_id], pad_vals[col_id])); + RETURN_IF_NOT_OK(PadEnd(row[col_id], &pad_tensor, pad_shapes[col_id], pad_vals[col_id])); row[col_id] = pad_tensor; } } return Status::OK(); } -Status BatchOp::UnpackPadInfo(std::set *pad_cols, std::vector *pad_vals, +Status BatchOp::UnpackPadInfo(const PadInfo &pad_info, + const std::unordered_map &column_name_id_map, + std::set *pad_cols, std::vector> *pad_vals, std::vector> *pad_shapes) { - if (pad_info_.empty()) { // if pad_info empty, pad every columns automatically - for (dsize_t col_id = 0; col_id < column_name_id_map_.size(); col_id++) { + if (pad_info.empty()) { // if pad_info empty, pad every columns automatically + for (dsize_t col_id = 0; col_id < column_name_id_map.size(); col_id++) { pad_cols->insert(col_id); } } else { - for (auto p : pad_info_) { - CHECK_FAIL_RETURN_UNEXPECTED(column_name_id_map_.find(p.first) != column_name_id_map_.end(), - "no column exists with name:" + p.first); - dsize_t col_id = static_cast(column_name_id_map_[p.first]); + for (const auto &p : pad_info) { + auto location = column_name_id_map.find(p.first); + CHECK_FAIL_RETURN_UNEXPECTED(location != column_name_id_map.end(), "no column exists with name:" + p.first); + auto col_id = static_cast(location->second); CHECK_FAIL_RETURN_UNEXPECTED(col_id < pad_vals->size() && col_id < pad_shapes->size(), "col_id out of bound"); pad_cols->insert(col_id); (*pad_vals)[col_id] = p.second.second; // set pad values @@ -447,29 +407,6 @@ Status BatchOp::UnpackPadInfo(std::set *pad_cols, std::vector *p return Status::OK(); } -Status 
BatchOp::PadHelper(std::shared_ptr src, std::shared_ptr dst, std::vector cur_ind, - const std::vector &src_s, const std::vector &dst_s, size_t cur_dim) { - if (cur_dim == src->Rank() - 1) { // if this is the last dimension, copy the data - uint8_t type_size = src->type().SizeInBytes(); - size_t len = std::min(src->shape()[cur_dim], dst->shape()[cur_dim]) * type_size; - dsize_t src_flat_ind = 0, dst_flat_ind = 0; - for (size_t i = 0; i < src->Rank(); i++) { - src_flat_ind += src_s[i] * cur_ind[i]; - dst_flat_ind += dst_s[i] * cur_ind[i]; - } - unsigned char *src_addr = src->GetMutableBuffer() + src_flat_ind * type_size; - unsigned char *dst_addr = dst->GetMutableBuffer() + dst_flat_ind * type_size; - CHECK_FAIL_RETURN_UNEXPECTED(memcpy_s(dst_addr, len, src_addr, len) == 0, "memcpy error"); - } else { // not the last dimension, keep doing recursion - dsize_t min_ind = std::min(dst->shape()[cur_dim], src->shape()[cur_dim]); - for (dsize_t i = 0; i < min_ind; i++) { - cur_ind[cur_dim] = i; - RETURN_IF_NOT_OK(PadHelper(src, dst, cur_ind, src_s, dst_s, cur_dim + 1)); - } - } - return Status::OK(); -} - // Visitor accept method for NodePass Status BatchOp::Accept(NodePass *p, bool *modified) { // Downcast shared pointer then call visitor diff --git a/mindspore/ccsrc/dataset/engine/datasetops/batch_op.h b/mindspore/ccsrc/dataset/engine/datasetops/batch_op.h index 1a862acd0b..28df5e7e81 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/batch_op.h +++ b/mindspore/ccsrc/dataset/engine/datasetops/batch_op.h @@ -36,8 +36,9 @@ namespace mindspore { namespace dataset { class DataBuffer; -using TensorBatch = std::vector>; +using TensorBatch = TensorRow; using TensorBatchTable = std::vector; +using PadInfo = std::map>>; class BatchOp : public ParallelOp { public: @@ -66,7 +67,7 @@ class BatchOp : public ParallelOp { return *this; } - Builder &SetPaddingMap(const std::map> &pad_map, bool pad = true) { + Builder &SetPaddingMap(const PadInfo &pad_map, bool pad = true) { 
builder_pad_ = pad; builder_pad_map_ = pad_map; return *this; @@ -119,7 +120,7 @@ class BatchOp : public ParallelOp { int32_t builder_num_workers_; int32_t builder_op_connector_size_; std::vector builder_cols_to_map_; - std::map> builder_pad_map_; + PadInfo builder_pad_map_; py::function builder_batch_size_func_; py::function builder_batch_map_func_; }; @@ -150,8 +151,7 @@ class BatchOp : public ParallelOp { // @param int32_t rows_per_buf // @param int32_t num_workers BatchOp(int32_t batch_size, bool drop, bool pad, int32_t op_queue_size, int32_t num_workers, - const std::vector &, py::function batch_size_func, py::function batch_map_func, - std::map> pad_map); + const std::vector &, py::function batch_size_func, py::function batch_map_func, PadInfo pad_map); // BatchOp destructor ~BatchOp() {} @@ -183,34 +183,33 @@ class BatchOp : public ParallelOp { // @return Status - The error code return Status operator()() override; - // Pad input tensor according pad_shape, need to have same rank. - // @param std::shared_ptr src - tensor to pad from - // @param std::shared_ptr *dst - return tensor padded - // @param std::vector pad_shape - shape to pad to - // @param float pad_val - value to pad with - // @return - The error code return - Status PadTensor(std::shared_ptr src, std::shared_ptr *dst, const std::vector &pad_shape, - float pad_val); - // Base-class override for NodePass visitor acceptor. // @param p - Pointer to the NodePass to be accepted. // @param modified - Whether this node visit modified the pipeline. // @return - Status of the node visit. Status Accept(NodePass *p, bool *modified) override; - private: - // recursive helper function. This function could be very expensive if called on a multi-dimensional tensor - // it is only meant to be called by PadTensor. 
- // @tparam T - type of tensor and fill value - // @param std::shared_ptr src - Tensor to pad from - // @param std::shared_ptr* dst - Tensor to pad to, return value - // @param std::vector cur_ind - recursion helper - // @param T pad_val - value to pad tensor with - // @param size_t cur_dim - recursion helper + // Op name getter + // @return Name of the current Op + std::string Name() const override { return "BatchOp"; } + + // batch the rows in src table then put it to dest table + // @param const std::unique_ptr *src - table that has the rows for batching + // @param const std::unique_ptr *dest - dest_table to hold batched rows + // @param int32_t size - batch_size + // @param const std::unordered_map& column_name_id_map - column names to index mapping // @return Status - The error code return - Status PadHelper(std::shared_ptr src, std::shared_ptr dst, std::vector cur_ind, - const std::vector &src_s, const std::vector &dst_s, size_t cur_dim = 0); + static Status BatchRows(const std::unique_ptr *src, const std::unique_ptr *dest, + dsize_t batch_size); + // @param table + // @param const PadInfo &pad_info pad info + // @param const std::unordered_map& column_name_id_map - column names to index mapping + // @return Status - The error code return + static Status PadColumns(std::unique_ptr *table, const PadInfo &pad_info, + const std::unordered_map &column_name_id_map); + + private: // Worker thread for doing the memcpy of batch // @param int32_t param workerId // @return Status - The error code return @@ -220,28 +219,21 @@ class BatchOp : public ParallelOp { // @return Status - The error code return Status MakeBatchedBuffer(std::pair, CBatchInfo> table_pair, std::unique_ptr *db); - - // batch the rows in src table then put it to dest table - // @param const std::unique_ptr *src - table that has the rows for batching - // @param const std::unique_ptr *dest - dest_table to hold batched rows - // @param int32_t size - batch_size - // @return Status - The error code 
return - Status BatchRows(const std::unique_ptr *src, const std::unique_ptr *dest, size_t size); - // Function that calls pyfunc to perform map on batch // @param (std::pair, batch_stats> *table_pair - contains un-batched tensor // @return Status - The error code return Status MapColumns(std::pair, CBatchInfo> *table_pair); + // @param const PadInfo &pad_info pad info to unpack + // @param const std::unordered_map& column_name_id_map - column names to index mapping // @param std::set *cols, col ids to perform pad on // @param std::vector *vals, default padding value for each column // @param std::vector> *shapes, padding shape specified by user // @return Status - The error code return - Status UnpackPadInfo(std::set *cols, std::vector *vals, std::vector> *shapes); - - // @param table_pair - // @return Status - The error code return - Status PadColumns(std::pair, CBatchInfo> *table_pair); + static Status UnpackPadInfo(const PadInfo &pad_info, + const std::unordered_map &column_name_id_map, + std::set *pad_cols, std::vector> *pad_vals, + std::vector> *pad_shapes); // the number of thread pulling from the mOutConnector of the Op below // @return int32_t, 1 @@ -264,11 +256,11 @@ class BatchOp : public ParallelOp { Status InvokeBatchMapFunc(TensorTable *input, TensorTable *output, CBatchInfo info); int32_t start_batch_size_; - bool drop_; // bool for whether to drop remainder or not - bool pad_; // bool for whether to perform padding on tensor - std::vector pyfunc_column_names_; // Name of the columns to perform map op on - std::map> pad_info_; // column names to perform padding on - std::unique_ptr child_iterator_; // child iterator for fetching TensorRows 1 by 1 + bool drop_; // bool for whether to drop remainder or not + bool pad_; // bool for whether to perform padding on tensor + std::vector pyfunc_column_names_; // Name of the columns to perform map op on + PadInfo pad_info_; // column names to perform padding on + std::unique_ptr child_iterator_; // child 
iterator for fetching TensorRows 1 by 1 QueueList, CBatchInfo>> worker_queues_; // internal queue for syncing worker py::function batch_size_func_; // Function pointer of batch size function py::function batch_map_func_; // Function pointer of per batch map function diff --git a/mindspore/ccsrc/dataset/engine/datasetops/bucket_batch_by_length_op.cc b/mindspore/ccsrc/dataset/engine/datasetops/bucket_batch_by_length_op.cc new file mode 100644 index 0000000000..def2ea0fee --- /dev/null +++ b/mindspore/ccsrc/dataset/engine/datasetops/bucket_batch_by_length_op.cc @@ -0,0 +1,241 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "dataset/engine/datasetops/bucket_batch_by_length_op.h" + +#include +#include +#include +#include +#include + +#include "pybind11/numpy.h" +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" +#include "dataset/core/pybind_support.h" +#include "dataset/core/config_manager.h" +#include "dataset/core/tensor.h" +#include "dataset/core/tensor_shape.h" +#include "dataset/engine/dataset_iterator.h" +#include "dataset/engine/datasetops/parallel_op.h" +#include "dataset/engine/opt/pass.h" +#include "dataset/util/status.h" + +namespace py = pybind11; +namespace mindspore { +namespace dataset { +BucketBatchByLengthOp::Builder::Builder(std::vector length_dependent_columns, + std::vector bucket_boundaries, std::vector bucket_batch_sizes) + : builder_length_dependent_columns_(length_dependent_columns), + builder_bucket_boundaries_(bucket_boundaries), + builder_bucket_batch_sizes_(bucket_batch_sizes), + builder_pad_info_({}), + builder_pad_to_bucket_boundary_(false), + builder_drop_remainder_(false) { + std::shared_ptr config_manager = GlobalContext::config_manager(); + builder_op_connector_size_ = config_manager->op_connector_size(); +} + +Status BucketBatchByLengthOp::Builder::SanityCheck() { + std::string error_message; + + if (builder_length_dependent_columns_.empty()) { + error_message += "At least 1 column must be specified for element length calculation.\n"; + } + + if (builder_bucket_boundaries_.empty()) { + error_message += "At least 1 bucket boundary must be specified.\n"; + } + + if (builder_bucket_batch_sizes_.size() != builder_bucket_boundaries_.size() + 1) { + error_message += "There must be exactly one bucket batch size specified for each bucket boundary.\n"; + } + + CHECK_FAIL_RETURN_UNEXPECTED(error_message.empty(), error_message); + + return Status::OK(); +} + +Status BucketBatchByLengthOp::Builder::Build(std::shared_ptr *new_bucket_batch_by_length_op) { + RETURN_IF_NOT_OK(SanityCheck()); + + // insert 0 for the first bucket + 
builder_bucket_boundaries_.insert(builder_bucket_boundaries_.begin(), 0); + + *new_bucket_batch_by_length_op = std::make_shared( + builder_length_dependent_columns_, builder_bucket_boundaries_, builder_bucket_batch_sizes_, + builder_element_length_function_, builder_pad_info_, builder_pad_to_bucket_boundary_, builder_drop_remainder_, + builder_op_connector_size_); + + return Status::OK(); +} + +BucketBatchByLengthOp::BucketBatchByLengthOp(std::vector length_dependent_columns, + std::vector bucket_boundaries, + std::vector bucket_batch_sizes, + py::function element_length_function, PadInfo pad_info, + bool pad_to_bucket_boundary, bool drop_remainder, + int32_t op_connector_size) + : PipelineOp(op_connector_size), + length_dependent_columns_(length_dependent_columns), + bucket_boundaries_(bucket_boundaries), + bucket_batch_sizes_(bucket_batch_sizes), + element_length_function_(element_length_function), + pad_info_(pad_info), + pad_to_bucket_boundary_(pad_to_bucket_boundary), + drop_remainder_(drop_remainder), + batch_count_(0) { + for (int i = 0; i < bucket_batch_sizes_.size(); i++) { + buckets_.push_back(std::make_unique()); + } +} + +Status BucketBatchByLengthOp::EoeReceived(int32_t) { + state_ = OpState::kDeOpIdle; + return Status::OK(); +} + +void BucketBatchByLengthOp::Print(std::ostream &out, bool show_all) const { out << "BucketBatchByLengthOp\n"; } + +Status BucketBatchByLengthOp::operator()() { + TaskManager::FindMe()->Post(); + + TensorRow current_row; + child_iterator_ = std::make_unique(this, 0, 0); + RETURN_IF_NOT_OK(child_iterator_->FetchNextTensorRow(¤t_row)); + RETURN_IF_NOT_OK(AssignColMapFromChild()); + while (!child_iterator_->eof_handled()) { + while (!current_row.empty()) { + int32_t element_length; + RETURN_IF_NOT_OK(ObtainElementLength(&element_length, current_row)); + + int bucket_index = bucket_boundaries_.size() - 1; + while (element_length < bucket_boundaries_[bucket_index]) { + bucket_index--; + } + + 
buckets_[bucket_index]->push_back(current_row); + + if (buckets_[bucket_index]->size() == bucket_batch_sizes_[bucket_index]) { + RETURN_IF_NOT_OK(PadAndBatchBucket(bucket_index, bucket_batch_sizes_[bucket_index])); + } + + RETURN_IF_NOT_OK(child_iterator_->FetchNextTensorRow(¤t_row)); + } + + // got EOE, do what we need to do with remainders in each bucket + if (!drop_remainder_) { + for (int i = 0; i < bucket_boundaries_.size(); i++) { + if (!buckets_[i]->empty()) { + RETURN_IF_NOT_OK(PadAndBatchBucket(i, buckets_[i]->size())); + } + } + } + + // need to send EOE manually since we set state to idle in EoeRecieved() + std::unique_ptr eoe_buffer = std::make_unique(0, DataBuffer::kDeBFlagEOE); + RETURN_IF_NOT_OK(out_connector_->Add(0, std::move(eoe_buffer))); + + RETURN_IF_NOT_OK(child_iterator_->FetchNextTensorRow(¤t_row)); + } + + return Status::OK(); +} + +Status BucketBatchByLengthOp::ObtainElementLength(int32_t *out_element_length, TensorRow element) { + // call pyfunc here if given pyfunc, otherwise return 0th dimension of shape of + // the single column specified in length_dependent_columns_ + if (element_length_function_) { + py::gil_scoped_acquire gil_acquire; + if (Py_IsInitialized() == 0) { + return Status(StatusCode::kPythonInterpreterFailure, "Python Interpreter is finalized"); + } + try { + size_t number_of_arguments = length_dependent_columns_.size(); + py::tuple input_arguments(number_of_arguments); + for (size_t i = 0; i < number_of_arguments; i++) { + py::array argument_value; + int32_t column_index = column_name_id_map_[length_dependent_columns_[i]]; + RETURN_IF_NOT_OK(element[column_index]->GetDataAsNumpy(&argument_value)); + input_arguments[i] = argument_value; + } + + py::object length = element_length_function_(*input_arguments); + *out_element_length = length.cast(); + if (*out_element_length < 0) { + return Status(StatusCode::kPyFuncException, "Element length function should return a non negative integer."); + } + } catch (const 
py::error_already_set &e) { + return Status(StatusCode::kPyFuncException, e.what()); + } catch (const py::cast_error &e) { + return Status(StatusCode::kPyFuncException, "Count not cast output of element length function to int32_t."); + } + } else { + *out_element_length = element[0]->shape()[0]; + } + + return Status::OK(); +} + +Status BucketBatchByLengthOp::PadAndBatchBucket(int32_t bucket_index, int32_t batch_size) { + std::unique_ptr *bucket = &buckets_[bucket_index]; + + PadInfo pad_info_copy = pad_info_; + if (pad_to_bucket_boundary_) { + for (auto &pair : pad_info_copy) { + std::vector pad_shape = pair.second.first.AsVector(); + + for (size_t i = 0; i < pad_shape.size(); i++) { + if (pad_shape[i] == TensorShape::kDimUnknown) { + if (bucket_index + 1 >= bucket_boundaries_.size()) { + std::string error_message = "Requested to pad to bucket boundary, element falls in last bucket"; + return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, error_message); + } + + pad_shape[i] = bucket_boundaries_[bucket_index + 1] - 1; + } + } + + pair.second.first = TensorShape(pad_shape); + } + } + + // PadColumns will change the data in bucket + RETURN_IF_NOT_OK(BatchOp::PadColumns(bucket, pad_info_copy, column_name_id_map_)); + + std::unique_ptr batched_bucket = std::make_unique(); + RETURN_IF_NOT_OK(BatchOp::BatchRows(bucket, &batched_bucket, batch_size)); + (*bucket)->clear(); + + std::unique_ptr batched_buffer = std::make_unique(batch_count_, DataBuffer::kDeBFlagNone); + batched_buffer->set_tensor_table(std::move(batched_bucket)); + RETURN_IF_NOT_OK(out_connector_->Add(0, std::move(batched_buffer))); + + batch_count_++; + + return Status::OK(); +} + +Status BucketBatchByLengthOp::Reset() { + batch_count_ = 0; + + for (int i = 0; i < buckets_.size(); i++) { + buckets_[i] = std::make_unique(); + } + + return Status::OK(); +} +} // namespace dataset +} // namespace mindspore diff --git a/mindspore/ccsrc/dataset/engine/datasetops/bucket_batch_by_length_op.h 
b/mindspore/ccsrc/dataset/engine/datasetops/bucket_batch_by_length_op.h new file mode 100644 index 0000000000..bf0bcb0e78 --- /dev/null +++ b/mindspore/ccsrc/dataset/engine/datasetops/bucket_batch_by_length_op.h @@ -0,0 +1,155 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef DATASET_ENGINE_DATASETOPS_BUCKET_BATCH_BY_LENGTH_OP_H_ +#define DATASET_ENGINE_DATASETOPS_BUCKET_BATCH_BY_LENGTH_OP_H_ + +#include +#include +#include +#include +#include + +#include "dataset/core/config_manager.h" +#include "dataset/core/tensor.h" +#include "dataset/engine/dataset_iterator.h" +#include "dataset/engine/datasetops/batch_op.h" +#include "dataset/engine/datasetops/pipeline_op.h" +#include "dataset/util/status.h" + +namespace mindspore { +namespace dataset { +class DataBuffer; + +class BucketBatchByLengthOp : public PipelineOp { + public: + class Builder { + public: + Builder(std::vector length_dependent_columns, std::vector bucket_boundaries, + std::vector bucket_batch_sizes); + + ~Builder() = default; + + Builder &SetLengthDependentColumns(std::vector length_dependent_columns) { + builder_length_dependent_columns_ = length_dependent_columns; + return *this; + } + + Builder &SetBucketBoundaries(std::vector bucket_boundaries) { + builder_bucket_boundaries_ = bucket_boundaries; + return *this; + } + + Builder &SetBucketBatchSizes(std::vector bucket_batch_sizes) { + builder_bucket_batch_sizes_ = 
bucket_batch_sizes; + return *this; + } + + Builder &SetElementLengthFunction(py::function element_length_function) { + builder_element_length_function_ = element_length_function; + return *this; + } + + Builder &SetPadInfo(PadInfo pad_info) { + builder_pad_info_ = pad_info; + return *this; + } + + Builder &SetPadToBucketBoundary(bool pad_to_bucket_boundary) { + builder_pad_to_bucket_boundary_ = pad_to_bucket_boundary; + return *this; + } + + Builder &SetDropRemainder(bool drop_remainder) { + builder_drop_remainder_ = drop_remainder; + return *this; + } + + Builder &SetOpConnectorSize(int32_t op_connector_size) { + builder_op_connector_size_ = op_connector_size; + return *this; + } + + Status Build(std::shared_ptr *new_bucket_batch_by_length_op); + + private: + Status SanityCheck(); + + std::vector builder_length_dependent_columns_; + std::vector builder_bucket_boundaries_; + std::vector builder_bucket_batch_sizes_; + py::function builder_element_length_function_; + PadInfo builder_pad_info_; + bool builder_pad_to_bucket_boundary_; + bool builder_drop_remainder_; + int32_t builder_op_connector_size_; + }; + + BucketBatchByLengthOp(std::vector length_dependent_columns, std::vector bucket_boundaries, + std::vector bucket_batch_sizes, py::function element_length_function, PadInfo pad_info, + bool pad_to_bucket_boundary, bool drop_remainder, int32_t op_connector_size); + + // Destructor + ~BucketBatchByLengthOp() = default; + + // Might need to batch remaining buckets after receiving eoe, so override this method. 
+ // @param int32_t workerId + // @return Status - The error code returned + Status EoeReceived(int32_t) override; + + // A print method typically used for debugging + // @param out - The output stream to write output to + // @param show_all - A bool to control if you want to show all info or just a summary + void Print(std::ostream &out, bool show_all) const override; + + // << Stream output operator overload + // @notes This allows you to write the debug print info using stream operators + // @param out - reference to the output stream being overloaded + // @param sO - reference to the BucketBatchByLengthOp to display + // @return - the output stream must be returned + friend std::ostream &operator<<(std::ostream &out, const BucketBatchByLengthOp &bo) { + bo.Print(out, false); + return out; + } + + // Main loop of batch + // @return Status - The error code returned + Status operator()() override; + + // Function that is called by ResetOp at the end of every epoch + // @return Status - The error code returned + Status Reset() override; + + private: + Status ObtainElementLength(int32_t *out_element_length, TensorRow element); + + Status PadAndBatchBucket(int32_t bucket_index, int32_t batch_size); + + std::vector length_dependent_columns_; + std::vector bucket_boundaries_; + std::vector bucket_batch_sizes_; + py::function element_length_function_; + PadInfo pad_info_; + bool pad_to_bucket_boundary_; + bool drop_remainder_; + + int32_t batch_count_; + std::unique_ptr child_iterator_; + std::vector> buckets_; +}; +} // namespace dataset +} // namespace mindspore + +#endif // DATASET_ENGINE_DATASETOPS_BUCKET_BATCH_BY_LENGTH_OP_H_ diff --git a/mindspore/ccsrc/dataset/engine/datasetops/build_vocab_op.cc b/mindspore/ccsrc/dataset/engine/datasetops/build_vocab_op.cc new file mode 100644 index 0000000000..f99804ec9b --- /dev/null +++ b/mindspore/ccsrc/dataset/engine/datasetops/build_vocab_op.cc @@ -0,0 +1,207 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * 
Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dataset/engine/datasetops/build_vocab_op.h" + +#include +#include +#include +#include +#include +#include "dataset/core/config_manager.h" + +namespace mindspore { +namespace dataset { + +BuildVocabOp::BuildVocabOp(std::shared_ptr vocab, std::vector col_names, + std::pair freq_r, int64_t top_k, const std::vector &tokens, + bool prepend, int32_t num_workers, int32_t op_conn_size) + : ParallelOp(num_workers, op_conn_size), + interval_(op_conn_size * num_workers), + vocab_(vocab), + col_names_(col_names), + freq_range_(freq_r), + top_k_(top_k), + special_tokens_(tokens), + special_first_(prepend) { + // init two queues for thread sync + distributor_queue_ = std::make_unique>(num_workers * op_conn_size); + collector_queue_ = + std::make_unique>>>(num_workers * op_conn_size); +} + +Status BuildVocabOp::WorkerEntry(int32_t worker_id) { + TaskManager::FindMe()->Post(); + TensorRow new_row; + RETURN_IF_NOT_OK(distributor_queue_->PopFront(&new_row)); + std::unique_ptr> wrkr_map = + std::make_unique>(); + int32_t row_cnt = 0; + while (!new_row.empty()) { + for (int32_t col : col_ids_) { + CHECK_FAIL_RETURN_UNEXPECTED(!new_row[col]->type().IsNumeric(), "from_dataset only works on string columns"); + for (auto itr = new_row[col]->begin(); itr != new_row[col]->end(); itr++) { + (*wrkr_map)[std::string(*itr)] += 1; + } + } + row_cnt++; // row is processed by this point + if ((row_cnt % interval_ == 0) && ((row_cnt 
/ interval_) % num_workers_ == worker_id) && (!wrkr_map->empty())) { + RETURN_IF_NOT_OK(collector_queue_->Add(std::move(wrkr_map))); + wrkr_map = std::make_unique>(); + } + RETURN_IF_NOT_OK(distributor_queue_->PopFront(&new_row)); + } + // clean up + if (!wrkr_map->empty()) { + RETURN_IF_NOT_OK(collector_queue_->Add(std::move(wrkr_map))); + } + // empty map as quit signal + RETURN_IF_NOT_OK(collector_queue_->Add(std::make_unique>())); + return Status::OK(); +} + +Status BuildVocabOp::operator()() { + // launch the collector thread + RETURN_UNEXPECTED_IF_NULL(tree_); + RETURN_IF_NOT_OK(distributor_queue_->Register(tree_->AllTasks())); + RETURN_IF_NOT_OK(collector_queue_->Register(tree_->AllTasks())); + // launch worker threads and collector thread + RETURN_IF_NOT_OK( + tree_->LaunchWorkers(num_workers_, std::bind(&BuildVocabOp::WorkerEntry, this, std::placeholders::_1))); + RETURN_IF_NOT_OK(tree_->AllTasks()->CreateAsyncTask("collector", std::bind(&BuildVocabOp::CollectorThread, this))); + TaskManager::FindMe()->Post(); + child_iterator_ = std::make_unique(this, 0, 0); + TensorRow new_row; + RETURN_IF_NOT_OK(child_iterator_->FetchNextTensorRow(&new_row)); + RETURN_IF_NOT_OK(AssignColMapFromChild()); + if (!col_names_.empty()) { + col_ids_.reserve(col_names_.size()); + for (std::string col : col_names_) { + auto itr = column_name_id_map_.find(col); + CHECK_FAIL_RETURN_UNEXPECTED(itr != column_name_id_map_.end(), col + " column doesn't exist"); + col_ids_.push_back(itr->second); + } + } else { + col_ids_.reserve(column_name_id_map_.size()); + for (const auto &p : column_name_id_map_) { + col_ids_.push_back(p.second); + } + } + bool eoe_warning = false; // give out warning if receive more than 1 eoe + while (child_iterator_->eof_handled() == false) { + while (new_row.empty() == false) { + RETURN_IF_NOT_OK(distributor_queue_->EmplaceBack(new_row)); + RETURN_IF_NOT_OK(child_iterator_->FetchNextTensorRow(&new_row)); + } + CHECK_FAIL_RETURN_UNEXPECTED(!eoe_warning, "no op 
should be after from_dataset (repeat detected)"); + eoe_warning = true; + } + + // tell all workers to quit + for (int32_t wrkr_id = 0; wrkr_id < num_workers_; wrkr_id++) { + RETURN_IF_NOT_OK(distributor_queue_->EmplaceBack(TensorRow())); + } + return Status::OK(); +} + +Status BuildVocabOp::CollectorThread() { + TaskManager::FindMe()->Post(); + int32_t num_quited_worker = 0; + std::unique_ptr> wrkr_map; + while (num_quited_worker != num_workers_) { + RETURN_IF_NOT_OK(collector_queue_->PopFront(&wrkr_map)); + RETURN_UNEXPECTED_IF_NULL(wrkr_map); + if (!wrkr_map->empty()) { + for (const auto &wd : *wrkr_map) word_cnt_[wd.first] += wd.second; + } else { + ++num_quited_worker; + } + } // all frequencies are obtained + CHECK_FAIL_RETURN_UNEXPECTED(!word_cnt_.empty(), "word_cnt is empty"); + std::vector words; + // make sure enough is reserved, this will become a partially sorted list eventually + words.reserve(wrkr_map->size()); + + for (auto it = word_cnt_.begin(); it != word_cnt_.end();) { + if (it->second >= freq_range_.first && it->second <= freq_range_.second) { + words.push_back(it->first); + it++; + } else { + it = word_cnt_.erase(it); + } + } + std::string err_msg; + + for (const std::string &sp_tk : special_tokens_) { + // if a special word exists in dataset, warn user about this + err_msg += (word_cnt_.find(sp_tk) != word_cnt_.end() ? 
sp_tk + "\t" : ""); + } + + CHECK_FAIL_RETURN_UNEXPECTED(err_msg.empty(), "These specials words are already in the dataset: " + err_msg + "."); + + int64_t num_words = std::min(static_cast(words.size()), top_k_); + if (num_words == 0) { + MS_LOG(WARNING) << "No word falls in the frequency range: (" << freq_range_.first << "," << freq_range_.second + << ") vocab would be empty (except for special tokens)."; + } + + // this would take the top-k most frequent words + std::partial_sort(words.begin(), words.begin() + num_words, words.end(), + [this](const std::string &w1, const std::string &w2) { + int64_t f1 = word_cnt_[w1], f2 = word_cnt_[w2]; + return f1 == f2 ? w1 < w2 : f1 > f2; + }); + + if (special_first_) { + for (const std::string &sp_tk : special_tokens_) vocab_->append_word(sp_tk); + } + + for (int64_t i = 0; i < num_words; i++) { + vocab_->append_word(words[i]); + } + + if (!special_first_) { + for (const std::string &sp_tk : special_tokens_) vocab_->append_word(sp_tk); + } + + RETURN_IF_NOT_OK(out_connector_->Add(0, std::make_unique(0, DataBuffer::kDeBFlagEOE))); + RETURN_IF_NOT_OK(out_connector_->Add(0, std::make_unique(0, DataBuffer::kDeBFlagEOF))); + // then use std::nth_element to partial sort + return Status::OK(); +} + +Status BuildVocabOp::Builder::Build(std::shared_ptr *op) { + CHECK_FAIL_RETURN_UNEXPECTED(builder_num_workers_ > 0, "builder num_workers need to be greater than 0"); + CHECK_FAIL_RETURN_UNEXPECTED(builder_top_k_ > 0, "top_k needs to be positive number"); + CHECK_FAIL_RETURN_UNEXPECTED(builder_max_freq_ >= builder_min_freq_ && builder_min_freq_ >= 0, + "frequency range [a,b] should be 0 <= a <= b (a,b are inclusive)"); + (*op) = std::make_shared( + builder_vocab_, builder_col_names_, std::make_pair(builder_min_freq_, builder_max_freq_), builder_top_k_, + builder_speical_tokens_, builder_special_first_, builder_num_workers_, builder_connector_size_); + return Status::OK(); +} + +BuildVocabOp::Builder::Builder() + : 
builder_top_k_(std::numeric_limits::max()), + builder_min_freq_(0), + builder_max_freq_(std::numeric_limits::max()), + builder_special_first_(true) { + std::shared_ptr cfg = GlobalContext::config_manager(); + builder_num_workers_ = cfg->num_parallel_workers(); + builder_connector_size_ = cfg->op_connector_size(); +} +} // namespace dataset +} // namespace mindspore diff --git a/mindspore/ccsrc/dataset/engine/datasetops/build_vocab_op.h b/mindspore/ccsrc/dataset/engine/datasetops/build_vocab_op.h new file mode 100644 index 0000000000..bf358c48c6 --- /dev/null +++ b/mindspore/ccsrc/dataset/engine/datasetops/build_vocab_op.h @@ -0,0 +1,174 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef DATASET_ENGINE_DATASETOPS_BUILD_VOCAB_OP_H_ +#define DATASET_ENGINE_DATASETOPS_BUILD_VOCAB_OP_H_ + +#include +#include +#include +#include +#include + +#include "dataset/core/tensor.h" +#include "dataset/engine/dataset_iterator.h" +#include "dataset/engine/datasetops/parallel_op.h" +#include "dataset/text/vocab.h" +#include "dataset/util/queue.h" +#include "dataset/util/status.h" + +namespace mindspore { +namespace dataset { +class BuildVocabOp : public ParallelOp { + public: + class Builder { + public: + Builder(); + + // Destructor. + ~Builder() = default; + + // Setter method + // @param int32_t size + // @return Builder setter method returns reference to the builder. 
+ Builder &SetOpConnectorSize(int32_t size) { + builder_connector_size_ = size; + return *this; + } + + // Setter method + // @param int32_t num_workers + // @return Builder setter method returns reference to the builder. + Builder &SetNumWorkers(int32_t num_workers) { + builder_num_workers_ = num_workers; + return *this; + } + + // Setter method + // @param int64_t top_k + // @return Builder setter method returns reference to the builder. + Builder &SetTopK(int64_t top_k) { + builder_top_k_ = top_k; + return *this; + } + + // Setter method + // @param int64_t min_freq + // @return Builder setter method returns reference to the builder. + Builder &SetMinFreq(int64_t min_freq) { + builder_min_freq_ = min_freq; + return *this; + } + + // Setter method + // @param int64_t max_freq + // @return Builder setter method returns reference to the builder. + Builder &SetMaxFreq(int64_t max_freq) { + builder_max_freq_ = max_freq; + return *this; + } + + // set columns names + // @param const std::vector & col_names - name of columns to get words + // @return Builder & reference to builder class object + Builder &SetColumnNames(const std::vector &col_names) { + builder_col_names_ = col_names; + return *this; + } + + // set special tokens + // @param const std::vector & col_names - name of columns to get words + // @return Builder & reference to builder class object + Builder &SetSpecialTokens(const std::vector &tokens) { + builder_speical_tokens_ = tokens; + return *this; + } + + // set vocab object + Builder &SetVocab(std::shared_ptr vocab) { + builder_vocab_ = vocab; + return *this; + } + + // set special tokens first (or last) + Builder &SetSpecialFirst(bool prepend) { + builder_special_first_ = prepend; + return *this; + } + + // The builder "build" method creates the final object. 
+ // @param std::shared_ptr *op - DatasetOp + // @return - The error code return + Status Build(std::shared_ptr *op); + + private: + int32_t builder_num_workers_; + int32_t builder_connector_size_; + int64_t builder_min_freq_; + int64_t builder_max_freq_; + bool builder_special_first_; + std::vector builder_col_names_; + std::vector builder_speical_tokens_; + std::shared_ptr builder_vocab_; + int64_t builder_top_k_; + }; + + BuildVocabOp(std::shared_ptr vocab, std::vector col_names, std::pair freq_range, + int64_t top_k, const std::vector &tokens, bool prepend, int32_t num_workers, + int32_t op_connector_size); + + ~BuildVocabOp() = default; + + Status WorkerEntry(int32_t worker_id) override; + + // collect the work product from each worker + Status CollectorThread(); + + Status EofReceived(int32_t) override { return Status::OK(); } + + Status EoeReceived(int32_t) override { return Status::OK(); } + + Status operator()() override; + + // Getter + // @return the number of workers + int32_t num_producers() const override { return 1; } + + // Getter + // @return the number of threads consuming from the previous Connector + int32_t num_consumers() const override { return 1; } + + Status Reset() override { RETURN_STATUS_UNEXPECTED("Reset shouldn't be called in BuildVocabOp"); } + + private: + const int32_t interval_; + bool special_first_; + std::shared_ptr vocab_; + std::vector col_names_; + std::vector col_ids_; + std::vector special_tokens_; + // pair = {min_f, max_f} + // make sure that 0<= min_f < max_f <= int32_max in the builder + std::pair freq_range_; + + int64_t top_k_; // every thing means top_k_ == int32_max + std::unique_ptr child_iterator_; // child iterator for fetching TensorRows 1 by 1 + std::unique_ptr> distributor_queue_; // master thread assigns each worker TensorRow via this + std::unique_ptr>>> collector_queue_; + std::unordered_map word_cnt_; +}; +} // namespace dataset +} // namespace mindspore +#endif // 
DATASET_ENGINE_DATASETOPS_BUILD_VOCAB_OP_H_ diff --git a/mindspore/ccsrc/dataset/engine/datasetops/concat_op.h b/mindspore/ccsrc/dataset/engine/datasetops/concat_op.h index 9afadab39a..0fb8ec8362 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/concat_op.h +++ b/mindspore/ccsrc/dataset/engine/datasetops/concat_op.h @@ -40,7 +40,7 @@ class ConcatOp : public PipelineOp { ~Builder() = default; // The builder "build" method creates the final object. - // @return shared_ptr to the new StorageOp object + // @return shared_ptr to the new ConcatOp object Status Build(std::shared_ptr *); private: @@ -81,6 +81,10 @@ class ConcatOp : public PipelineOp { // before providing their own implementations. Status PrepareNodePostAction() override; + // Op name getter + // @return Name of the current Op + std::string Name() const override { return "ConcatOp"; } + private: Status Verify(int32_t id, const std::unique_ptr &buf); diff --git a/mindspore/ccsrc/dataset/engine/datasetops/dataset_op.cc b/mindspore/ccsrc/dataset/engine/datasetops/dataset_op.cc index 9ee6e706aa..bf991ea7d9 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/dataset_op.cc +++ b/mindspore/ccsrc/dataset/engine/datasetops/dataset_op.cc @@ -20,6 +20,7 @@ #include #include #include +#include #include "dataset/engine/execution_tree.h" #include "dataset/engine/datasetops/device_queue_op.h" @@ -38,6 +39,7 @@ DatasetOp::DatasetOp(int32_t op_connector_size) tree_(nullptr), state_(OpState::kDeOpIdle), op_ctrl_flags_(kDeOpNone), + out_connector_(nullptr), first_fetch_(true) { // The operator starts out with an invalid operator id. The only way to // get it out of invalid state is to assign the operator to an execution tree. @@ -67,8 +69,45 @@ Status DatasetOp::AddChild(std::shared_ptr child) { return Status::OK(); } +Status DatasetOp::RemoveChild(std::shared_ptr child) { + if (operator_id_ == kInvalidOperatorId) { + std::string err_msg( + "Cannot remove child node. 
Tree node connections can only" + "be made if the node belongs to a tree."); + RETURN_STATUS_UNEXPECTED(err_msg); + } + + // disallow relationships with other trees + if (tree_ != child->tree_) { + std::string err_msg( + "Cannot remove child node. Tree node connections can only be made if both nodes belong to the same tree."); + RETURN_STATUS_UNEXPECTED(err_msg); + } + + child_.erase(std::remove(child_.begin(), child_.end(), child), child_.end()); + child->RemoveParent(this); + return Status::OK(); +} + +Status DatasetOp::InsertAsParent(std::shared_ptr to_add) { + for (auto &prev_parent : this->parent_) { + RETURN_IF_NOT_OK(prev_parent->RemoveChild(shared_from_this())); + RETURN_IF_NOT_OK(prev_parent->AddChild(to_add)); + } + RETURN_IF_NOT_OK(to_add->AddChild(shared_from_this())); + if (tree_->root()->id() == this->id()) { + tree_->AssignRoot(to_add); + } + return Status::OK(); +} + // Adds a parent operator to this operator -void DatasetOp::AddParent(const DatasetOp *parent) { parent_.push_back(parent); } +void DatasetOp::AddParent(DatasetOp *parent) { parent_.push_back(parent); } + +// Removes a parent operator from this operator +void DatasetOp::RemoveParent(DatasetOp *parent) { + parent_.erase(std::remove(parent_.begin(), parent_.end(), parent), parent_.end()); +} // Getter function to get a shared pointer to our childAdds a operator to become our child. 
std::shared_ptr DatasetOp::child(int32_t child_index) const { diff --git a/mindspore/ccsrc/dataset/engine/datasetops/dataset_op.h b/mindspore/ccsrc/dataset/engine/datasetops/dataset_op.h index 315dc27219..973b5be962 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/dataset_op.h +++ b/mindspore/ccsrc/dataset/engine/datasetops/dataset_op.h @@ -51,7 +51,7 @@ class DatasetOp : public std::enable_shared_from_this { }; // Flags that control operator runtime behaviours - enum OpState { kDeOpRunning = 0, kDeOpIdle = 1 }; + enum OpState { kDeOpRunning = 0, kDeOpIdle = 1, kDeOpTerminated }; // Constructor // @param op_connector_size - The size for the output connector of this operator. @@ -64,10 +64,19 @@ class DatasetOp : public std::enable_shared_from_this { // @param child - shared pointer to the child to add. Status AddChild(std::shared_ptr child); + // Remove a operator from our children. + // @param child - shared pointer to the child to remove. + Status RemoveChild(std::shared_ptr child); + // Getter function to get a shared pointer to our child // @param child_index - An operator can have n children. Indicates choose which child to return. std::shared_ptr child(int32_t child_index) const; + // Inserts a operator as the parent current op. + // Inserted op will become the sole parent of the current op. + // The existing parent of the current op will be transferred to the inserted op. 
+ Status InsertAsParent(std::shared_ptr to_add); + // Creates the connector within this operator // @param num_producers - number of threads that write into this connector // @param num_consumers - number of threads that read from this connector @@ -211,8 +220,36 @@ class DatasetOp : public std::enable_shared_from_this { // @return - the column name map as a string std::string ColumnNameMapAsString() const; + // Getter function + // @return connector size of current op + int32_t ConnectorSize() const { + if (!inlined()) { + return out_connector_->size(); + } + // Return child connector size for inlined op + return ChildOpConnectorSize(); + } + + // Getter function + // @return connector size of current op + int32_t ConnectorCapacity() const { + if (!inlined()) { + return out_connector_->capacity(); + } + // Return child connector capacity for inlined op + return ChildOpConnectorCapacity(); + } + + // Getter function + // @return connector size of child op + int32_t ChildOpConnectorSize(int32_t child_index = 0) const { return child_[child_index]->ConnectorSize(); } + + // Getter function + // @return connector capacity of child op + int32_t ChildOpConnectorCapacity(int32_t child_index = 0) const { return child_[child_index]->ConnectorCapacity(); } + // Children Getter - // @return Vector or Children + // @return Vector of Children std::vector> Children() const { return child_; } // Base method for NodePass visit. @@ -221,11 +258,24 @@ class DatasetOp : public std::enable_shared_from_this { // @return Statue of the node visit virtual Status Accept(NodePass *p, bool *modified); + // Op name getter + // @return Name of the current Op + virtual std::string Name() const { return "DatasetOp"; } + + // Execution Tree getter + // @return Pointer to the ExecutionTree the current op belongs to, no ownership + ExecutionTree *Tree() { return tree_; } + protected: // Adds a parent operator to this operator // @notes External callers do not have access to this function. 
// @param parent - The parent node to add - void AddParent(const DatasetOp *parent); + void AddParent(DatasetOp *parent); + + // Removes a parent operator from this operator + // @notes External callers do not have access to this function. + // @param parent - The parent node to remove + void RemoveParent(DatasetOp *parent); // A helper function for providing an assignment of the column name map. // This grabs the map from child 0 and assigns it into this op. @@ -234,7 +284,7 @@ class DatasetOp : public std::enable_shared_from_this { Status AssignColMapFromChild(); std::vector> child_; // Child nodes - std::vector parent_; // Parent nodes. No ownership and read-only + std::vector parent_; // Parent nodes. No ownership int32_t oc_queue_size_; // Capacity for each out_connector_ int32_t operator_id_; // Generated id for the node ExecutionTree *tree_; // Back pointer to our tree. diff --git a/mindspore/ccsrc/dataset/engine/datasetops/device_queue_op.cc b/mindspore/ccsrc/dataset/engine/datasetops/device_queue_op.cc index bcdb58db24..84bad9db1a 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/device_queue_op.cc +++ b/mindspore/ccsrc/dataset/engine/datasetops/device_queue_op.cc @@ -13,18 +13,20 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include "dataset/engine/datasetops/device_queue_op.h" + #include #include #include - #include "dataset/core/config_manager.h" #include "dataset/core/global_context.h" +#include "dataset/engine/datasetops/device_queue_op.h" #include "dataset/engine/data_buffer.h" #include "dataset/engine/dataset_iterator.h" +#include "dataset/engine/opt/pass.h" +#include "dataset/engine/perf/profiling.h" +#include "dataset/engine/perf/device_queue_tracing.h" #include "dataset/util/status.h" #include "dataset/util/task_manager.h" -#include "dataset/engine/opt/pass.h" namespace mindspore { namespace dataset { @@ -97,7 +99,19 @@ Status DeviceQueueOp::SendDataToAscend() { MS_LOG(INFO) << "Device queue, sending data to Ascend."; int64_t total_batch = 0; bool is_break_loop = false; - + double batch_start_time, end_time; + int32_t batch_cost, tdt_cost; + int32_t connector_size = 0; + int32_t connector_capacity; + std::shared_ptr profiling_node; + bool isProfilingEnable = tree_->GetProfilingManager()->IsProfilingEnable(); + if (isProfilingEnable) { + std::shared_ptr node; + RETURN_IF_NOT_OK(tree_->GetProfilingManager()->GetTracingNode(kDeviceQueueTracingName, &node)); + profiling_node = std::dynamic_pointer_cast(node); + batch_start_time = ProfilingTime::GetCurMilliSecond(); + connector_capacity = ChildOpConnectorCapacity(); + } std::unique_ptr current_buffer; RETURN_IF_NOT_OK(GetNextInput(¤t_buffer)); @@ -107,20 +121,43 @@ Status DeviceQueueOp::SendDataToAscend() { TensorRow currRow; for (int row_id = 0; row_id < current_buffer->NumRows() && !is_break_loop; row_id++) { RETURN_IF_NOT_OK(current_buffer->GetRow(row_id, &currRow)); - auto status = tdtInstancePtr->hostPush(currRow, true, channel_name_); + auto status = tdtInstancePtr->hostPush(currRow, true, channel_name_, isProfilingEnable, tdt_cost); if (status == TdtStatus::FAILED) { return Status(StatusCode::kTDTPushFailure, "TDT Push Failed"); } + + if (isProfilingEnable) { + end_time = ProfilingTime::GetCurMilliSecond(); + // record 
push tdt time + profiling_node->Record(TIME, TDT_PUSH_TIME, total_batch + 1, tdt_cost); + batch_cost = (int32_t)(end_time - batch_start_time); + // record batch time + profiling_node->Record(TIME, BATCH_TIME, total_batch + 1, batch_cost); + // record pipeline time + profiling_node->Record(TIME, PIPELINE_TIME, total_batch + 1, batch_cost - tdt_cost); + batch_start_time = end_time; + // record connector depth + profiling_node->Record(CONNECTOR_DEPTH, connector_capacity, total_batch + 1, connector_size); + } total_batch++; if (num_batch_ > 0 && total_batch == num_batch_) { is_break_loop = true; } } + if (isProfilingEnable) { + connector_size = ChildOpConnectorSize(); + connector_capacity = ChildOpConnectorCapacity(); + } RETURN_IF_NOT_OK(GetNextInput(¤t_buffer)); } + if (isProfilingEnable) { + connector_size = ChildOpConnectorSize(); + connector_capacity = ChildOpConnectorCapacity(); + } RETURN_IF_NOT_OK(GetNextInput(¤t_buffer)); } + tree_->SetFinished(); MS_LOG(INFO) << "Device queue total batch is " << total_batch << ", number of batches is " << num_batch_ << "."; return Status::OK(); @@ -195,13 +232,17 @@ Status DeviceQueueOp::RetryPushGPUData(const std::vector &data_size, con while (!GpuBufferMgr::GetInstance().IsClosed() && !TaskManager::FindMe()->Interrupted()) { RETURN_IF_NOT_OK(MallocForGPUData(&items, curr_row)); - auto ret = GpuBufferMgr::GetInstance().Push(handle, items, WAIT_TIME); + BlockQueueStatus_T ret = GpuBufferMgr::GetInstance().Push(handle, items, WAIT_TIME); if (ret) { for (int i = 0; i < items.size(); i++) { free(items[i].data_ptr_); } - MS_LOG(WARNING) << "Retry pushing data..."; - continue; + if (ret == BlockQueueStatus_T::ERROR_INPUT) { + return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, "invalid input Data, please check it."); + } else { + MS_LOG(WARNING) << "Retry pushing data..."; + continue; + } } else { break; } @@ -217,7 +258,7 @@ Status DeviceQueueOp::MallocForGPUData(std::vector *items, return 
Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, "memory malloc failed."); } (void)memset_s(sub_item.data_ptr_, sub_item.data_len_, 0, sub_item.data_len_); - unsigned char *column_data = curr_row[i]->GetMutableBuffer(); + const unsigned char *column_data = curr_row[i]->GetBuffer(); if (memcpy_s(sub_item.data_ptr_, sub_item.data_len_, column_data, static_cast(curr_row[i++]->SizeInBytes())) != 0) { MS_LOG(ERROR) << "memcpy_s failed!"; diff --git a/mindspore/ccsrc/dataset/engine/datasetops/device_queue_op.h b/mindspore/ccsrc/dataset/engine/datasetops/device_queue_op.h index ebbcd16cc3..a854004593 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/device_queue_op.h +++ b/mindspore/ccsrc/dataset/engine/datasetops/device_queue_op.h @@ -25,11 +25,11 @@ #ifdef ENABLE_TDTQUE #include "dataset/engine/tdt/tdt_plugin.h" - #endif #ifdef ENABLE_GPUQUE #include "device/gpu/gpu_buffer_mgr.h" +using mindspore::device::BlockQueueStatus_T; using mindspore::device::GpuBufferMgr; #endif @@ -140,6 +140,10 @@ class DeviceQueueOp : public PipelineOp { // @return - Status of the node visit. Status Accept(NodePass *p, bool *modified) override; + // Op name getter + // @return Name of the current Op + std::string Name() const override { return "DeviceQueueOp"; } + private: // Name: checkExceptions(DataBuffer); // Description: Check whether the dataBuffer meets the condition for performing DeviceQueueOp diff --git a/mindspore/ccsrc/dataset/engine/datasetops/filter_op.h b/mindspore/ccsrc/dataset/engine/datasetops/filter_op.h index cd6c01da90..36f70cb82f 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/filter_op.h +++ b/mindspore/ccsrc/dataset/engine/datasetops/filter_op.h @@ -127,6 +127,10 @@ class FilterOp : public ParallelOp { // @return - Status of the node visit. 
Status Accept(NodePass *p, bool *modified) override; + // Op name getter + // @return Name of the current Op + std::string Name() const override { return "FilterOp"; } + private: // predicate_func python callable which returns a boolean value. py::function predicate_func_; diff --git a/mindspore/ccsrc/dataset/engine/datasetops/map_op.cc b/mindspore/ccsrc/dataset/engine/datasetops/map_op.cc index 008ff09c99..9918260201 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/map_op.cc +++ b/mindspore/ccsrc/dataset/engine/datasetops/map_op.cc @@ -54,19 +54,20 @@ Status MapOp::Builder::sanityCheck() const { Status MapOp::Builder::Build(std::shared_ptr *ptr) { RETURN_IF_NOT_OK(sanityCheck()); *ptr = std::make_shared(std::move(build_in_col_names_), std::move(build_out_col_names_), - std::move(build_tensor_funcs_), build_num_workers_, build_op_connector_size_, - build_perf_mode_); + std::move(build_tensor_funcs_), std::move(build_col_order_), build_num_workers_, + build_op_connector_size_, build_perf_mode_); return Status::OK(); } // Constructor of MapOp MapOp::MapOp(const std::vector &in_col_names, const std::vector &out_col_names, - std::vector> tensor_funcs, int32_t num_workers, int32_t op_connector_size, - bool perf_mode) + std::vector> tensor_funcs, const std::vector &columns_order, + int32_t num_workers, int32_t op_connector_size, bool perf_mode) : ParallelOp(num_workers, op_connector_size), tfuncs_(std::move(tensor_funcs)), in_columns_(in_col_names), out_columns_(out_col_names), + columns_order_(columns_order), perf_mode_(perf_mode) { // If caller didn't specify the out_col_names, assume they are same as the in_columns. 
if (out_columns_.empty() || out_columns_[0].empty()) { diff --git a/mindspore/ccsrc/dataset/engine/datasetops/map_op.h b/mindspore/ccsrc/dataset/engine/datasetops/map_op.h index f903881ca2..4d7ffd1204 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/map_op.h +++ b/mindspore/ccsrc/dataset/engine/datasetops/map_op.h @@ -93,6 +93,13 @@ class MapOp : public ParallelOp { return *this; } + // Setter method. + // @return Builder setter method returns reference to the builder. + Builder &SetColOrder(const std::vector &col_order_) { + build_col_order_ = col_order_; + return *this; + } + // Setter method. // @return Builder setter method returns reference to the builder. Builder &SetNumWorkers(int32_t num_workers) { @@ -123,6 +130,7 @@ class MapOp : public ParallelOp { std::vector build_in_col_names_; std::vector build_out_col_names_; std::vector> build_tensor_funcs_; + std::vector build_col_order_; int32_t build_num_workers_; int32_t build_op_connector_size_; bool build_perf_mode_; // Default true. @@ -137,11 +145,12 @@ class MapOp : public ParallelOp { // @param in_col_names A list of input column names (should match the input/output \p tensorFuncs). // @param out_col_names A list of output column names (should match the input/output \p tensorFuncs). // @param tensor_funcs A list of TensorOp pointers for MapOp to apply to each data. + // @param columns_order names A full list of column names (should match the whole dataset view post \p tensorFuncs). // @param num_workers The number of worker threads. // @param op_connector_size The size of each queue in the connector. 
MapOp(const std::vector &in_col_names, const std::vector &out_col_names, - std::vector> tensor_funcs, int32_t num_workers, int32_t op_connector_size, - bool perf_mode); + std::vector> tensor_funcs, const std::vector &columns_order, + int32_t num_workers, int32_t op_connector_size, bool perf_mode); // Destructor ~MapOp() = default; @@ -177,6 +186,14 @@ class MapOp : public ParallelOp { // @return - Status of the node visit. Status Accept(NodePass *p, bool *modified) override; + // Op name getter + // @return Name of the current Op + std::string Name() const override { return "MapOp"; } + + // Columns order getter + // @return The post map columns order + std::vector const &ColumnsOrder() const { return columns_order_; } + private: // Local queues where worker threads can pop from. // Popping directly from the Connector can block if the previous designated threads haven't pop. @@ -198,6 +215,9 @@ class MapOp : public ParallelOp { // Indices of the columns to process. std::vector to_process_indices_; + // Variable to store the column_order of all columns post tensorOps + std::vector columns_order_; + // Performance mode is when the main thread creates local queues, pulls databuffers from the previous // op's Connector and distributes them to the local queues. Workers pull from the local queues. // If this flag is false, each worker pulls directly from the Connector. This use less resources diff --git a/mindspore/ccsrc/dataset/engine/datasetops/project_op.h b/mindspore/ccsrc/dataset/engine/datasetops/project_op.h index 3940b9adc7..ced0f9e5a9 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/project_op.h +++ b/mindspore/ccsrc/dataset/engine/datasetops/project_op.h @@ -40,7 +40,7 @@ class ProjectOp : public PipelineOp { ~Builder() = default; // The builder "build" method creates the final object. - // @return shared_ptr to the new StorageOp object. + // @return shared_ptr to the new ProjectOp object. 
Status Build(std::shared_ptr *); private: @@ -107,6 +107,10 @@ class ProjectOp : public PipelineOp { // @return - Status of the node visit. Status Accept(NodePass *p, bool *modified) override; + // Op name getter + // @return Name of the current Op + std::string Name() const override { return "ProjectOp"; } + private: std::vector columns_to_project_; std::vector projected_column_indices_; diff --git a/mindspore/ccsrc/dataset/engine/datasetops/rename_op.h b/mindspore/ccsrc/dataset/engine/datasetops/rename_op.h index 2bd4875fda..eaca20ccc8 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/rename_op.h +++ b/mindspore/ccsrc/dataset/engine/datasetops/rename_op.h @@ -67,7 +67,7 @@ class RenameOp : public PipelineOp { } // The builder "build" method creates the ZipOp dataset Operator. - // @return shared_ptr to the new StorageOp object + // @return shared_ptr to the new RenameOp object Status Build(std::shared_ptr *); private: @@ -116,6 +116,10 @@ class RenameOp : public PipelineOp { // @return - Status of the node visit. Status Accept(NodePass *p, bool *modified) override; + // Op name getter + // @return Name of the current Op + std::string Name() const override { return "RenameOp"; } + protected: // Rename core functionality Status RenameColumns(); diff --git a/mindspore/ccsrc/dataset/engine/datasetops/repeat_op.h b/mindspore/ccsrc/dataset/engine/datasetops/repeat_op.h index 718bc1922b..bba85c3bb5 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/repeat_op.h +++ b/mindspore/ccsrc/dataset/engine/datasetops/repeat_op.h @@ -42,7 +42,7 @@ class RepeatOp : public PipelineOp { ~Builder() = default; // The builder "build" method creates the final object. - // @return shared_ptr to the new StorageOp object + // @return shared_ptr to the new RepeatOp object Status Build(std::shared_ptr *); private: @@ -124,6 +124,10 @@ class RepeatOp : public PipelineOp { // @return - Status of the node visit. 
Status Accept(NodePass *p, bool *modified) override; + // Op name getter + // @return Name of the current Op + std::string Name() const override { return "RepeatOp"; } + private: int32_t max_repeats_; // The number of repeats that the user requested int32_t repeat_count_; // A counter for the current number of executed repeats diff --git a/mindspore/ccsrc/dataset/engine/datasetops/shuffle_op.h b/mindspore/ccsrc/dataset/engine/datasetops/shuffle_op.h index baabad758c..14b1e4511e 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/shuffle_op.h +++ b/mindspore/ccsrc/dataset/engine/datasetops/shuffle_op.h @@ -101,7 +101,7 @@ class ShuffleOp : public PipelineOp { } // The builder "build" method creates the final object. - // @return shared_ptr to the new StorageOp object + // @return shared_ptr to the new ShuffleOp object Status Build(std::shared_ptr *); private: @@ -161,6 +161,10 @@ class ShuffleOp : public PipelineOp { // @return - Status of the node visit. Status Accept(NodePass *p, bool *modified) override; + // Op name getter + // @return Name of the current Op + std::string Name() const override { return "ShuffleOp"; } + private: // Private function to add a new row to the shuffle buffer. // @return Status - The error code return diff --git a/mindspore/ccsrc/dataset/engine/datasetops/skip_op.h b/mindspore/ccsrc/dataset/engine/datasetops/skip_op.h index 40db770642..4cb658b2a7 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/skip_op.h +++ b/mindspore/ccsrc/dataset/engine/datasetops/skip_op.h @@ -37,7 +37,7 @@ class SkipOp : public PipelineOp { ~Builder() = default; // The builder "build" method creates the final object. - // @return shared_ptr to the new StorageOp object + // @return shared_ptr to the new SkipOp object Status Build(std::shared_ptr *); private: @@ -80,6 +80,10 @@ class SkipOp : public PipelineOp { // @return - Status of the node visit. 
Status Accept(NodePass *p, bool *modified) override; + // Op name getter + // @return Name of the current Op + std::string Name() const override { return "SkipOp"; } + private: int32_t max_skips_; // The number of skips that the user requested int32_t skip_count_; // A counter for the current number of executed skips diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/CMakeLists.txt b/mindspore/ccsrc/dataset/engine/datasetops/source/CMakeLists.txt index a1d0b22f15..b78ddcd87b 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/CMakeLists.txt +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/CMakeLists.txt @@ -5,17 +5,15 @@ add_library(engine-datasetops-source OBJECT generator_op.cc io_block.cc mindrecord_op.cc - storage_client.cc - storage_op.cc - tf_buffer.cc - tf_client.cc tf_reader_op.cc image_folder_op.cc mnist_op.cc voc_op.cc + coco_op.cc manifest_op.cc cifar_op.cc random_data_op.cc celeba_op.cc text_file_op.cc + clue_op.cc ) \ No newline at end of file diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/celeba_op.cc b/mindspore/ccsrc/dataset/engine/datasetops/source/celeba_op.cc index 8f8c57b012..4b32201d6d 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/celeba_op.cc +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/celeba_op.cc @@ -26,7 +26,7 @@ namespace mindspore { namespace dataset { -CelebAOp::Builder::Builder() : builder_decode_(false), builder_sampler_(nullptr), builder_num_samples_(0) { +CelebAOp::Builder::Builder() : builder_decode_(false), builder_sampler_(nullptr) { std::shared_ptr cfg = GlobalContext::config_manager(); builder_num_workers_ = cfg->num_parallel_workers(); builder_rows_per_buffer_ = cfg->rows_per_buffer(); @@ -38,7 +38,9 @@ Status CelebAOp::Builder::Build(std::shared_ptr *op) { MS_LOG(DEBUG) << "Celeba dataset type is " << builder_dataset_type_.c_str() << "."; RETURN_IF_NOT_OK(SanityCheck()); if (builder_sampler_ == nullptr) { - builder_sampler_ = std::make_shared(); + const 
int64_t num_samples = 0; + const int64_t start_index = 0; + builder_sampler_ = std::make_shared(start_index, num_samples); } builder_schema_ = std::make_unique(); @@ -47,10 +49,9 @@ Status CelebAOp::Builder::Build(std::shared_ptr *op) { // label is like this:0 1 0 0 1...... RETURN_IF_NOT_OK( builder_schema_->AddColumn(ColDescriptor("attr", DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 1))); - *op = - std::make_shared(builder_num_workers_, builder_rows_per_buffer_, builder_dir_, builder_op_connector_size_, - builder_decode_, builder_dataset_type_, builder_extensions_, std::move(builder_schema_), - std::move(builder_sampler_), builder_num_samples_); + *op = std::make_shared(builder_num_workers_, builder_rows_per_buffer_, builder_dir_, + builder_op_connector_size_, builder_decode_, builder_dataset_type_, + builder_extensions_, std::move(builder_schema_), std::move(builder_sampler_)); if (*op == nullptr) { return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, "CelebAOp is null"); } @@ -68,7 +69,7 @@ Status CelebAOp::Builder::SanityCheck() { CelebAOp::CelebAOp(int32_t num_workers, int32_t rows_per_buffer, const std::string &dir, int32_t queue_size, bool decode, const std::string &dataset_type, const std::set &exts, - std::unique_ptr schema, std::shared_ptr sampler, int64_t num_samples) + std::unique_ptr schema, std::shared_ptr sampler) : ParallelOp(num_workers, queue_size), rows_per_buffer_(rows_per_buffer), folder_path_(dir), @@ -77,8 +78,6 @@ CelebAOp::CelebAOp(int32_t num_workers, int32_t rows_per_buffer, const std::stri data_schema_(std::move(schema)), sampler_(std::move(sampler)), num_rows_in_attr_file_(0), - num_rows_exact_(0), - num_samples_(num_samples), dataset_type_(dataset_type) { // Set the column name map (base class field) for (int32_t index = 0; index < data_schema_->NumColumns(); index++) { @@ -202,13 +201,6 @@ Status CelebAOp::ParseImageAttrInfo() { RETURN_IF_NOT_OK(attr_info_queue_->PopFront(&image_infos)); while (!image_infos.empty() 
&& needMoreData) { for (uint32_t index = 0; index < image_infos.size(); index++) { - if (num_samples_ != 0 && image_labels_vec_.size() >= num_samples_) { - MS_LOG(WARNING) << "Image number(" << image_labels_vec_.size() << " is more than" - << " rows num eval attr file(" << num_rows_in_attr_file_ << ") or num samples(" << num_samples_ - << ")."; - needMoreData = false; - break; - } std::string image_info = image_infos[index]; std::vector split = Split(image_info); std::pair> image_labels; @@ -239,14 +231,13 @@ Status CelebAOp::ParseImageAttrInfo() { RETURN_IF_NOT_OK(attr_info_queue_->PopFront(&image_infos)); } - num_rows_exact_ = image_labels_vec_.size(); - num_samples_ = (num_samples_ == 0 || num_samples_ > num_rows_exact_) ? num_rows_exact_ : num_samples_; - if (num_rows_exact_ == 0) { + num_rows_ = image_labels_vec_.size(); + if (num_rows_ == 0) { RETURN_STATUS_UNEXPECTED( "There is no valid data matching the dataset API CelebADataset.Please check file path or dataset API " "validation first."); } - MS_LOG(DEBUG) << "Celeba dataset rows number is " << num_rows_exact_ << "."; + MS_LOG(DEBUG) << "Celeba dataset rows number is " << num_rows_ << "."; return Status::OK(); } @@ -268,33 +259,11 @@ std::vector CelebAOp::Split(const std::string &line) { return split; } -// Derived from RandomAccessOp -Status CelebAOp::GetNumSamples(int64_t *num) const { - if (num == nullptr || num_samples_ == 0) { - RETURN_STATUS_UNEXPECTED( - "There is no valid data matching the dataset API CelebADataset.Please check file path or dataset API " - "validation first."); - } - (*num) = num_samples_; - return Status::OK(); -} - -Status CelebAOp::GetNumRowsInDataset(int64_t *num) const { - if (num == nullptr || num_rows_exact_ == 0) { - RETURN_STATUS_UNEXPECTED( - "There is no valid data matching the dataset API CelebADataset.Please check file path or dataset API " - "validation first."); - } - - *num = num_rows_exact_; - return Status::OK(); -} - // Main logic, Register Queue with TaskGroup, 
launch all threads and do the functor's work Status CelebAOp::operator()() { RETURN_IF_NOT_OK(LaunchThreadsAndInitOp()); std::unique_ptr data_buffer; - RETURN_IF_NOT_OK(sampler_->GetNextBuffer(&data_buffer)); + RETURN_IF_NOT_OK(sampler_->GetNextSample(&data_buffer)); RETURN_IF_NOT_OK(AddIOBlock(&data_buffer)); return Status::OK(); } @@ -310,9 +279,8 @@ Status CelebAOp::AddIOBlock(std::unique_ptr *data_buffer) { RETURN_IF_NOT_OK((*data_buffer)->PopRow(&sample_row)); std::shared_ptr sample_ids = sample_row[0]; for (auto itr = sample_ids->begin(); itr != sample_ids->end(); ++itr) { - if ((*itr) >= num_rows_exact_) { - MS_LOG(WARNING) << "Sample Id (" << *itr << ") is out of bounds, skipping. Max id is " << num_rows_exact_ - << "."; + if ((*itr) >= num_rows_) { + MS_LOG(WARNING) << "Sample Id (" << *itr << ") is out of bounds, skipping. Max id is " << num_rows_ << "."; continue; } keys.push_back(*itr); @@ -323,7 +291,7 @@ Status CelebAOp::AddIOBlock(std::unique_ptr *data_buffer) { keys.clear(); } } - RETURN_IF_NOT_OK(sampler_->GetNextBuffer(data_buffer)); + RETURN_IF_NOT_OK(sampler_->GetNextSample(data_buffer)); } if (!keys.empty()) { @@ -345,7 +313,7 @@ Status CelebAOp::AddIOBlock(std::unique_ptr *data_buffer) { io_block_queues_[(buff_count++) % num_workers_]->Add(std::make_unique(IOBlock::kDeIoBlockFlagEoe))); RETURN_IF_NOT_OK(wp_.Wait()); // Master thread goes to sleep after it has made all the IOBlocks wp_.Clear(); - RETURN_IF_NOT_OK(sampler_->GetNextBuffer(data_buffer)); + RETURN_IF_NOT_OK(sampler_->GetNextSample(data_buffer)); } } } @@ -381,7 +349,7 @@ Status CelebAOp::LoadBuffer(const std::vector &keys, std::unique_ptr deq = std::make_unique(); for (const auto &key : keys) { TensorRow row; - RETURN_IF_NOT_OK(LoadTensorRow(image_labels_vec_[key], &row)); + RETURN_IF_NOT_OK(LoadTensorRow(key, image_labels_vec_[key], &row)); deq->push_back(std::move(row)); } @@ -389,25 +357,14 @@ Status CelebAOp::LoadBuffer(const std::vector &keys, std::unique_ptr> &image_label, 
TensorRow *row) { +Status CelebAOp::LoadTensorRow(row_id_type row_id, const std::pair> &image_label, + TensorRow *row) { std::shared_ptr image; std::shared_ptr label; Path path(folder_path_); Path image_path = path / image_label.first; - std::ifstream handle(image_path.toString(), std::ios::binary | std::ios::in); - if (handle.fail()) { - std::string err_msg = "Fail to open file: " + image_path.toString(); - return Status(StatusCode::kFileNotExist, __LINE__, __FILE__, err_msg); - } - - (void)handle.seekg(0, std::ios::end); - int64_t num_elements = handle.tellg(); - (void)handle.seekg(0, std::ios::beg); - RETURN_IF_NOT_OK(Tensor::CreateTensor(&image, data_schema_->column(0).tensorImpl(), - TensorShape(std::vector(1, num_elements)), - data_schema_->column(0).type())); - (void)handle.read(reinterpret_cast(image->GetMutableBuffer()), num_elements); + RETURN_IF_NOT_OK(Tensor::CreateTensor(&image, image_path.toString())); if (decode_ == true) { Status rc = Decode(image, &image); if (rc.IsError()) { @@ -430,7 +387,7 @@ Status CelebAOp::LoadTensorRow(const std::pair } label->Squeeze(); - (*row) = {std::move(image), std::move(label)}; + (*row) = TensorRow(row_id, {std::move(image), std::move(label)}); return Status::OK(); } @@ -446,13 +403,13 @@ void CelebAOp::Print(std::ostream &out, bool show_all) const { // Call the super class for displaying any common detailed info ParallelOp::Print(out, show_all); // Then show any custom derived-internal stuff - out << "\nNumber of rows:" << num_rows_exact_ << "\nceleba dir: " << folder_path_ << "\n\n"; + out << "\nNumber of rows:" << num_rows_ << "\nceleba dir: " << folder_path_ << "\n\n"; } } // Reset Sampler and wakeup Master thread (functor) Status CelebAOp::Reset() { - RETURN_IF_NOT_OK(sampler_->Reset()); + RETURN_IF_NOT_OK(sampler_->ResetSampler()); wp_.Set(); // wake up master thread after reset is done return Status::OK(); } diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/celeba_op.h 
b/mindspore/ccsrc/dataset/engine/datasetops/source/celeba_op.h index e0055441ef..f4b5d040ca 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/celeba_op.h +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/celeba_op.h @@ -108,14 +108,6 @@ class CelebAOp : public ParallelOp, RandomAccessOp { return *this; } - // Setter method - // @param int64_t num_samples - // @return Builder setter method returns reference to the builder. - Builder &SetNumSamples(int64_t num_samples) { - builder_num_samples_ = num_samples; - return *this; - } - // Setter method // @param const std::string dataset_type: type to be read // @return Builder setter method returns reference to the builder. @@ -141,7 +133,6 @@ class CelebAOp : public ParallelOp, RandomAccessOp { std::set builder_extensions_; std::shared_ptr builder_sampler_; std::unique_ptr builder_schema_; - int64_t builder_num_samples_; std::string builder_dataset_type_; }; @@ -153,7 +144,7 @@ class CelebAOp : public ParallelOp, RandomAccessOp { // @param std::unique_ptr sampler - sampler tells CelebAOp what to read CelebAOp(int32_t num_workers, int32_t rows_per_buffer, const std::string &dir, int32_t queue_size, bool decode, const std::string &dataset_type, const std::set &exts, std::unique_ptr schema, - std::shared_ptr sampler, int64_t num_samples); + std::shared_ptr sampler); ~CelebAOp() override = default; @@ -163,16 +154,6 @@ class CelebAOp : public ParallelOp, RandomAccessOp { // @return Status - The error code return Status operator()() override; - // Method derived from RandomAccess Op, enable Sampler to get numRows - // @param int64_t num - to return numRows - // @return Status - The error code return - Status GetNumSamples(int64_t *num) const override; - - // Method derived from RandomAccess Op, enable Sampler to get numRows - // @param int64_t num - to return numRows - // @return Status - The error code return - Status GetNumRowsInDataset(int64_t *num) const override; - // Worker thread pulls a number of 
IOBlock from IOBlock Queue, make a buffer and push it to Connector // @param int32_t worker_id - id of each worker // @return Status - The error code return @@ -188,6 +169,10 @@ class CelebAOp : public ParallelOp, RandomAccessOp { // @return Status - The error code return Status AddIOBlock(std::unique_ptr *data_buffer); + // Op name getter + // @return Name of the current Op + std::string Name() const { return "CelebAOp"; } + private: // Called first when function is called // @return @@ -212,10 +197,12 @@ class CelebAOp : public ParallelOp, RandomAccessOp { Status LoadBuffer(const std::vector &keys, std::unique_ptr *db); // Load a tensor row according to a pair + // @param row_id_type row_id - id for this tensor row // @param std::pair - > // @param TensorRow row - image & label read into this tensor row // @return Status - The error code return - Status LoadTensorRow(const std::pair> &image_label, TensorRow *row); + Status LoadTensorRow(row_id_type row_id, const std::pair> &image_label, + TensorRow *row); // Check if need read according to dataset type // @return bool - if need read @@ -233,11 +220,9 @@ class CelebAOp : public ParallelOp, RandomAccessOp { std::shared_ptr sampler_; std::unique_ptr>> attr_info_queue_; int64_t num_rows_in_attr_file_; // rows number specified in attr file - int64_t num_rows_exact_; // exact rows number,maybe is less than rows_num_in_attr_file_ QueueList> io_block_queues_; WaitPost wp_; std::vector>> image_labels_vec_; - int64_t num_samples_; std::string dataset_type_; std::ifstream partition_file_; }; diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/cifar_op.cc b/mindspore/ccsrc/dataset/engine/datasetops/source/cifar_op.cc index d0a17b56f9..ad87e394eb 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/cifar_op.cc +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/cifar_op.cc @@ -35,7 +35,7 @@ constexpr uint32_t kCifarImageChannel = 3; constexpr uint32_t kCifarBlockImageNum = 5; constexpr uint32_t 
kCifarImageSize = kCifarImageHeight * kCifarImageWidth * kCifarImageChannel; -CifarOp::Builder::Builder() : num_samples_(0), sampler_(nullptr) { +CifarOp::Builder::Builder() : sampler_(nullptr) { std::shared_ptr cfg = GlobalContext::config_manager(); num_workers_ = cfg->num_parallel_workers(); rows_per_buffer_ = cfg->rows_per_buffer(); @@ -46,7 +46,9 @@ CifarOp::Builder::Builder() : num_samples_(0), sampler_(nullptr) { Status CifarOp::Builder::Build(std::shared_ptr *ptr) { RETURN_IF_NOT_OK(SanityCheck()); if (sampler_ == nullptr) { - sampler_ = std::make_shared(); + const int64_t num_samples = 0; + const int64_t start_index = 0; + sampler_ = std::make_shared(start_index, num_samples); } schema_ = std::make_unique(); TensorShape scalar = TensorShape::CreateScalar(); @@ -62,7 +64,7 @@ Status CifarOp::Builder::Build(std::shared_ptr *ptr) { ColDescriptor("fine_label", DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 0, &another_scalar))); } - *ptr = std::make_shared(cifar_type_, num_workers_, rows_per_buffer_, dir_, op_connect_size_, num_samples_, + *ptr = std::make_shared(cifar_type_, num_workers_, rows_per_buffer_, dir_, op_connect_size_, std::move(schema_), std::move(sampler_)); return Status::OK(); } @@ -76,16 +78,13 @@ Status CifarOp::Builder::SanityCheck() { } CifarOp::CifarOp(CifarType type, int32_t num_works, int32_t rows_per_buf, const std::string &file_dir, - int32_t queue_size, int64_t num_samples, std::unique_ptr data_schema, - std::shared_ptr sampler) + int32_t queue_size, std::unique_ptr data_schema, std::shared_ptr sampler) : ParallelOp(num_works, queue_size), cifar_type_(type), rows_per_buffer_(rows_per_buf), folder_path_(file_dir), - num_samples_(num_samples), data_schema_(std::move(data_schema)), sampler_(std::move(sampler)), - num_rows_(0), row_cnt_(0), buf_cnt_(0) { // set the column name map (base class field) @@ -101,7 +100,7 @@ CifarOp::CifarOp(CifarType type, int32_t num_works, int32_t rows_per_buf, const Status CifarOp::operator()() { 
RETURN_IF_NOT_OK(LaunchThreadsAndInitOp()); std::unique_ptr sampler_buffer; - RETURN_IF_NOT_OK(sampler_->GetNextBuffer(&sampler_buffer)); + RETURN_IF_NOT_OK(sampler_->GetNextSample(&sampler_buffer)); while (true) { // each iterator is 1 epoch std::vector keys; keys.reserve(rows_per_buffer_); @@ -112,15 +111,14 @@ Status CifarOp::operator()() { for (auto itr = sample_ids->begin(); itr != sample_ids->end(); itr++) { keys.push_back(*itr); row_cnt_++; - if ((*itr) >= num_rows_) continue; // index out of bound, skipping - if (row_cnt_ >= num_samples_) break; // enough row read, break for loop + if ((*itr) >= num_rows_) continue; // index out of bound, skipping if (row_cnt_ % rows_per_buffer_ == 0) { RETURN_IF_NOT_OK(io_block_queues_[buf_cnt_++ % num_workers_]->Add( std::make_unique(IOBlock(keys, IOBlock::kDeIoBlockNone)))); keys.clear(); } } - RETURN_IF_NOT_OK(sampler_->GetNextBuffer(&sampler_buffer)); + RETURN_IF_NOT_OK(sampler_->GetNextSample(&sampler_buffer)); } if (keys.empty() == false) { RETURN_IF_NOT_OK(io_block_queues_[(buf_cnt_++) % num_workers_]->Add( @@ -141,7 +139,7 @@ Status CifarOp::operator()() { io_block_queues_[(buf_cnt_++) % num_workers_]->Add(std::make_unique(IOBlock::kDeIoBlockFlagEoe))); RETURN_IF_NOT_OK(wp_.Wait()); // Master thread goes to sleep after it has made all the IOBlocks wp_.Clear(); - RETURN_IF_NOT_OK(sampler_->GetNextBuffer(&sampler_buffer)); + RETURN_IF_NOT_OK(sampler_->GetNextSample(&sampler_buffer)); } } } @@ -197,7 +195,7 @@ Status CifarOp::LoadTensorRow(uint64_t index, TensorRow *trow) { std::shared_ptr fine_label; std::shared_ptr ori_image = cifar_image_label_pairs_[index].first; std::shared_ptr copy_image = - std::make_shared(ori_image->shape(), ori_image->type(), ori_image->GetMutableBuffer()); + std::make_shared(ori_image->shape(), ori_image->type(), ori_image->GetBuffer()); RETURN_IF_NOT_OK(Tensor::CreateTensor(&label, data_schema_->column(1).tensorImpl(), data_schema_->column(1).shape(), data_schema_->column(1).type(), 
reinterpret_cast(&cifar_image_label_pairs_[index].second[0]))); @@ -205,9 +203,9 @@ Status CifarOp::LoadTensorRow(uint64_t index, TensorRow *trow) { RETURN_IF_NOT_OK(Tensor::CreateTensor( &fine_label, data_schema_->column(2).tensorImpl(), data_schema_->column(2).shape(), data_schema_->column(2).type(), reinterpret_cast(&cifar_image_label_pairs_[index].second[1]))); - (*trow) = {copy_image, std::move(label), std::move(fine_label)}; + (*trow) = TensorRow(index, {copy_image, std::move(label), std::move(fine_label)}); } else { - (*trow) = {copy_image, std::move(label)}; + (*trow) = TensorRow(index, {copy_image, std::move(label)}); } return Status::OK(); @@ -243,7 +241,7 @@ void CifarOp::Print(std::ostream &out, bool show_all) const { // Reset Sampler and wakeup Master thread (functor) Status CifarOp::Reset() { - RETURN_IF_NOT_OK(sampler_->Reset()); + RETURN_IF_NOT_OK(sampler_->ResetSampler()); row_cnt_ = 0; wp_.Set(); // wake up master thread after reset is done return Status::OK(); @@ -255,30 +253,6 @@ Status CifarOp::InitSampler() { return Status::OK(); } -// Derived from RandomAccessOp -Status CifarOp::GetNumSamples(int64_t *num) const { - if (num == nullptr || num_rows_ == 0) { - std::string api = cifar_type_ == kCifar10 ? "Cifar10Dataset" : "Cifar100Dataset"; - std::string err_msg = "There is no valid data matching the dataset API " + api + - ".Please check file path or dataset API validation first."; - RETURN_STATUS_UNEXPECTED(err_msg); - } - (*num) = num_samples_; - return Status::OK(); -} - -// Derived from RandomAccessOp -Status CifarOp::GetNumRowsInDataset(int64_t *num) const { - if (num == nullptr || num_rows_ == 0) { - std::string api = cifar_type_ == kCifar10 ? 
"Cifar10Dataset" : "Cifar100Dataset"; - std::string err_msg = "There is no valid data matching the dataset API " + api + - ".Please check file path or dataset API validation first."; - RETURN_STATUS_UNEXPECTED(err_msg); - } - (*num) = num_rows_; - return Status::OK(); -} - Status CifarOp::ReadCifarBlockDataAsync() { TaskManager::FindMe()->Post(); RETURN_IF_NOT_OK(GetCifarFiles()); @@ -392,11 +366,15 @@ Status CifarOp::ParseCifarData() { RETURN_IF_NOT_OK(Tensor::CreateTensor(&image_tensor, data_schema_->column(0).tensorImpl(), TensorShape({kCifarImageHeight, kCifarImageWidth, kCifarImageChannel}), data_schema_->column(0).type())); - for (int ch = 0; ch < kCifarImageChannel; ++ch) { - for (int pix = 0; pix < kCifarImageHeight * kCifarImageWidth; ++pix) { - (image_tensor->GetMutableBuffer())[pix * kCifarImageChannel + ch] = block[cur_block_index++]; + auto itr = image_tensor->begin(); + uint32_t total_pix = kCifarImageHeight * kCifarImageWidth; + for (int pix = 0; pix < total_pix; ++pix) { + for (int ch = 0; ch < kCifarImageChannel; ++ch) { + *itr = block[cur_block_index + ch * total_pix + pix]; + itr++; } } + cur_block_index += total_pix * kCifarImageChannel; cifar_image_label_pairs_.emplace_back(std::make_pair(image_tensor, labels)); } RETURN_IF_NOT_OK(cifar_raw_data_block_->PopFront(&block)); @@ -404,7 +382,6 @@ Status CifarOp::ParseCifarData() { } cifar_image_label_pairs_.shrink_to_fit(); num_rows_ = cifar_image_label_pairs_.size(); - num_samples_ = (num_samples_ == 0 || num_samples_ > num_rows_) ? num_rows_ : num_samples_; if (num_rows_ == 0) { std::string api = cifar_type_ == kCifar10 ? 
"Cifar10Dataset" : "Cifar100Dataset"; std::string err_msg = "There is no valid data matching the dataset API " + api + @@ -432,11 +409,11 @@ Status CifarOp::GetClassIds(std::map> *cls_ids) co return Status::OK(); } -Status CifarOp::CountTotalRows(const std::string &dir, int64_t numSamples, bool isCIFAR10, int64_t *count) { +Status CifarOp::CountTotalRows(const std::string &dir, bool isCIFAR10, int64_t *count) { // the logic of counting the number of samples is copied from ReadCifar100Block() and ReadCifar10Block() std::shared_ptr op; *count = 0; - RETURN_IF_NOT_OK(Builder().SetCifarDir(dir).SetNumSamples(numSamples).SetCifarType(isCIFAR10).Build(&op)); + RETURN_IF_NOT_OK(Builder().SetCifarDir(dir).SetCifarType(isCIFAR10).Build(&op)); RETURN_IF_NOT_OK(op->GetCifarFiles()); if (op->cifar_type_ == kCifar10) { constexpr int64_t num_cifar10_records = 10000; @@ -448,7 +425,6 @@ Status CifarOp::CountTotalRows(const std::string &dir, int64_t numSamples, bool } *count = *count + num_cifar10_records; } - *count = *count < numSamples || numSamples == 0 ? *count : numSamples; return Status::OK(); } else { int64_t num_cifar100_records = 0; @@ -458,7 +434,11 @@ Status CifarOp::CountTotalRows(const std::string &dir, int64_t numSamples, bool std::string err_msg = "Invalid cifar100 file path"; RETURN_STATUS_UNEXPECTED(err_msg); } - std::string file_name(file.substr(pos + 1)); + std::string file_name; + if (file.size() > 0) + file_name = file.substr(pos + 1); + else + RETURN_STATUS_UNEXPECTED("Invalid string length!"); if (file_name.find("test") != std::string::npos) { num_cifar100_records = 10000; } else if (file_name.find("train") != std::string::npos) { @@ -470,7 +450,7 @@ Status CifarOp::CountTotalRows(const std::string &dir, int64_t numSamples, bool RETURN_STATUS_UNEXPECTED(err_msg); } } - *count = num_cifar100_records < numSamples || numSamples == 0 ? 
num_cifar100_records : numSamples; + *count = num_cifar100_records; return Status::OK(); } } diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/cifar_op.h b/mindspore/ccsrc/dataset/engine/datasetops/source/cifar_op.h index ade0998c30..62c20ac401 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/cifar_op.h +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/cifar_op.h @@ -73,14 +73,6 @@ class CifarOp : public ParallelOp, public RandomAccessOp { return *this; } - // Setter method - // @param uint64_t num_samples - // @return Builder setter method returns reference to the builder. - Builder &SetNumSamples(uint64_t num_samples) { - num_samples_ = num_samples; - return *this; - } - // Setter method // @param std::shared_ptr sampler // @return Builder setter method returns reference to the builder. @@ -121,7 +113,6 @@ class CifarOp : public ParallelOp, public RandomAccessOp { private: std::string dir_; int32_t num_workers_; - uint64_t num_samples_; int32_t rows_per_buffer_; int32_t op_connect_size_; std::shared_ptr sampler_; @@ -137,7 +128,7 @@ class CifarOp : public ParallelOp, public RandomAccessOp { // @param uint32_t - queueSize - connector queue size // @param std::unique_ptr sampler - sampler tells ImageFolderOp what to read CifarOp(CifarType type, int32_t num_works, int32_t rows_per_buf, const std::string &file_dir, int32_t queue_size, - int64_t num_samples, std::unique_ptr data_schema, std::shared_ptr sampler); + std::unique_ptr data_schema, std::shared_ptr sampler); // Destructor. 
~CifarOp() = default; @@ -152,16 +143,6 @@ class CifarOp : public ParallelOp, public RandomAccessOp { // @return Status - The error code return Status operator()() override; - // Method derived from RandomAccess Op, enable Sampler to get numRows - // @param uint64_t num - to return numRows - // @return Status - The error code return - Status GetNumSamples(int64_t *num) const override; - - // Method derived from RandomAccess Op, enable Sampler to get total numRows in dataset - // @param uint64_t num - to return numRows - // @return Status - The error code return - Status GetNumRowsInDataset(int64_t *num) const override; - // A print method typically used for debugging // @param out // @param show_all @@ -169,11 +150,14 @@ class CifarOp : public ParallelOp, public RandomAccessOp { // Function to count the number of samples in the CIFAR dataset // @param dir path to the CIFAR directory - // @param numSamples maximum number of samples requested // @param isCIFAR10 true if CIFAR10 and false if CIFAR100 - // @param count output arg that will hold the minimum of the actual dataset size and numSamples + // @param count output arg that will hold the actual dataset size // @return - static Status CountTotalRows(const std::string &dir, int64_t numSamples, bool isCIFAR10, int64_t *count); + static Status CountTotalRows(const std::string &dir, bool isCIFAR10, int64_t *count); + + // Op name getter + // @return Name of the current Op + std::string Name() const override { return "CifarOp"; } private: // Initialize Sampler, calls sampler->Init() within @@ -227,10 +211,8 @@ class CifarOp : public ParallelOp, public RandomAccessOp { CifarType cifar_type_; int32_t rows_per_buffer_; std::string folder_path_; - int64_t num_samples_; std::unique_ptr data_schema_; std::shared_ptr sampler_; - int64_t num_rows_; int64_t row_cnt_; int64_t buf_cnt_; diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/clue_op.cc b/mindspore/ccsrc/dataset/engine/datasetops/source/clue_op.cc new file 
mode 100644 index 0000000000..e92ca0d26c --- /dev/null +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/clue_op.cc @@ -0,0 +1,553 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "dataset/engine/datasetops/source/clue_op.h" + +#include +#include +#include +#include +#include + +#include "dataset/core/config_manager.h" +#include "dataset/util/task_manager.h" +#include "dataset/engine/jagged_connector.h" +#include "dataset/engine/execution_tree.h" +#include "dataset/engine/datasetops/source/io_block.h" +#include "dataset/util/random.h" + +namespace mindspore { +namespace dataset { +ClueOp::Builder::Builder() + : builder_device_id_(0), + builder_num_devices_(1), + builder_num_samples_(0), + builder_shuffle_files_(false), + builder_shuffle_global_(false) { + std::shared_ptr config_manager = GlobalContext::config_manager(); + builder_num_workers_ = config_manager->num_parallel_workers(); + builder_op_connector_size_ = config_manager->op_connector_size(); + builder_rows_per_buffer_ = config_manager->rows_per_buffer(); + builder_worker_connector_size_ = config_manager->worker_connector_size(); +} + +Status ClueOp::Builder::ValidateInputs() const { + std::string err; + err += builder_num_workers_ <= 0 ? "Number of parallel workers should be greater than 0\n" : ""; + err += (builder_device_id_ >= builder_num_devices_ || builder_num_devices_ < 1) ? 
"Wrong sharding configs\n" : ""; + return err.empty() ? Status::OK() : Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, err); +} + +Status ClueOp::Builder::Build(std::shared_ptr *op) { + RETURN_IF_NOT_OK(ValidateInputs()); + + // Throttle the number of workers if we have more workers than files! + if (static_cast(builder_num_workers_) > builder_clue_files_list_.size()) { + builder_num_workers_ = builder_clue_files_list_.size(); + MS_LOG(WARNING) << "ClueOp operator parallelism reduced to " << builder_num_workers_ << " workers."; + } + + ColKeyMap ck_map; + for (auto &p : builder_cols_to_keyword_) { + ck_map.insert({p.first, split(p.second, '/')}); + } + + std::shared_ptr clue_op = std::make_shared( + builder_num_workers_, builder_rows_per_buffer_, builder_num_samples_, builder_worker_connector_size_, ck_map, + builder_clue_files_list_, builder_op_connector_size_, builder_shuffle_files_, builder_shuffle_global_, + builder_num_devices_, builder_device_id_); + RETURN_IF_NOT_OK(clue_op->Init()); + *op = std::move(clue_op); + + return Status::OK(); +} + +std::vector ClueOp::Builder::split(const std::string &s, char delim) { + std::vector res; + std::stringstream ss(s); + std::string item; + + while (getline(ss, item, delim)) { + res.push_back(item); + } + return res; +} + +ClueOp::ClueOp(int32_t num_workers, int64_t rows_per_buffer, int64_t num_samples, int32_t worker_connector_size, + ColKeyMap cols_to_keyword, std::vector clue_files_list, int32_t op_connector_size, + bool shuffle_files, bool shuffle_global, int32_t num_device, int32_t device_id) + : ParallelOp(num_workers, op_connector_size), + rows_per_buffer_(rows_per_buffer), + num_rows_per_shard_(0), + all_num_rows_(0), + num_samples_(num_samples), + filename_index_(std::make_unique()), + clue_files_list_(std::move(clue_files_list)), + load_jagged_connector_(true), + cols_to_keyword_(cols_to_keyword), + shuffle_files_(shuffle_files), + shuffle_global_(shuffle_global), + finished_reading_dataset_(false), + 
num_devices_(num_device), + device_id_(device_id), + load_io_block_queue_(true) { + worker_connector_size_ = worker_connector_size; +} + +Status ClueOp::Init() { + RETURN_IF_NOT_OK(filename_index_->insert(clue_files_list_)); + + int32_t safe_queue_size = static_cast(std::ceil(clue_files_list_.size() / num_workers_) + 1); + io_block_queues_.Init(num_workers_, safe_queue_size); + + // Set the column name mapping (base class field) + int count = 0; + for (auto &p : cols_to_keyword_) { + column_name_id_map_[p.first] = count; + count++; + } + + RETURN_IF_NOT_OK(ParallelOp::CreateWorkerConnector(worker_connector_size_)); + jagged_buffer_connector_ = std::make_unique(num_workers_, 1, worker_connector_size_); + + return Status::OK(); +} + +Status ClueOp::Reset() { + load_jagged_connector_ = true; + load_io_block_queue_ = true; + + RETURN_IF_NOT_OK(ParallelOp::Reset()); + NotifyToFillIOBlockQueue(); + return Status::OK(); +} + +Status ClueOp::LoadTensor(const std::string &line, std::unique_ptr *tensor_table, int64_t row) { + TensorRow tRow(1, nullptr); + (*tensor_table)->push_back(std::move(tRow)); + + std::shared_ptr tensor; + RETURN_IF_NOT_OK(Tensor::CreateTensor(&tensor, {line}, TensorShape::CreateScalar())); + (**tensor_table)[row][0] = std::move(tensor); + return Status::OK(); +} + +Status ClueOp::GetValue(const nlohmann::json &js, std::vector key_chain, std::shared_ptr *t) { + nlohmann::json cursor = js; + for (int i = 0; i < key_chain.size(); i++) { + if (cursor.find(key_chain[i]) != cursor.end()) { + cursor = cursor[key_chain[i]]; + } else { + RETURN_STATUS_UNEXPECTED("Failed to find key: " + key_chain[i]); + } + } + std::string final_str = key_chain.back(); + switch (cursor.type()) { + case nlohmann::detail::value_t::string: + RETURN_IF_NOT_OK(Tensor::CreateTensor(t, {cursor.get()}, TensorShape::CreateScalar())); + break; + + case nlohmann::detail::value_t::number_integer: + RETURN_IF_NOT_OK( + Tensor::CreateTensor(t, TensorImpl::kFlexible, 
TensorShape::CreateScalar(), DataType(DataType::DE_INT32))); + (*t)->SetItemAt({0}, cursor.get()); + break; + case nlohmann::detail::value_t::number_unsigned: + RETURN_IF_NOT_OK( + Tensor::CreateTensor(t, TensorImpl::kFlexible, TensorShape::CreateScalar(), DataType(DataType::DE_INT32))); + (*t)->SetItemAt({0}, cursor.get()); + break; + case nlohmann::detail::value_t::number_float: + RETURN_IF_NOT_OK( + Tensor::CreateTensor(t, TensorImpl::kFlexible, TensorShape::CreateScalar(), DataType(DataType::DE_FLOAT32))); + (*t)->SetItemAt({0}, cursor.get()); + break; + case nlohmann::detail::value_t::array: + RETURN_IF_NOT_OK(Tensor::CreateTensor(t, {cursor.get>()}, TensorShape::CreateScalar())); + break; + default: + break; + } + return Status::OK(); +} + +Status ClueOp::LoadFile(const std::string &file, const int64_t start_offset, const int64_t end_offset, + const int32_t worker_id) { + std::ifstream handle(file); + if (!handle.is_open()) { + RETURN_STATUS_UNEXPECTED("Failed to open file " + file); + } + + int64_t rows_each_buffer = 0; + int64_t rows_total = 0; + std::string line; + std::unique_ptr cur_buffer = std::make_unique(0, DataBuffer::BufferFlags::kDeBFlagNone); + std::unique_ptr tensor_table = std::make_unique(); + + while (getline(handle, line)) { + if (line.empty()) { + continue; + } + // If read to the end offset of this file, break. + if (rows_total >= end_offset) { + break; + } + // Skip line before start offset. 
+ if (rows_total < start_offset) { + rows_total++; + continue; + } + + try { + nlohmann::json js = nlohmann::json::parse(line); + int cols_count = cols_to_keyword_.size(); + TensorRow tRow(cols_count, nullptr); + tensor_table->push_back(std::move(tRow)); + + int cout = 0; + for (auto &p : cols_to_keyword_) { + std::shared_ptr tensor; + RETURN_IF_NOT_OK(GetValue(js, p.second, &tensor)); + (*tensor_table)[rows_each_buffer][cout] = std::move(tensor); + cout++; + } + } catch (const std::exception &err) { + // Catch any exception and convert to Status return code + RETURN_STATUS_UNEXPECTED("Failed to load json file"); + } + + // RETURN_IF_NOT_OK(LoadTensor(line, &tensor_table, rows_each_buffer)); + rows_each_buffer++; + rows_total++; + if (rows_each_buffer == rows_per_buffer_) { + cur_buffer->set_tensor_table(std::move(tensor_table)); + RETURN_IF_NOT_OK(jagged_buffer_connector_->Add(worker_id, std::move(cur_buffer))); + + cur_buffer = std::make_unique(0, DataBuffer::BufferFlags::kDeBFlagNone); + tensor_table = std::make_unique(); + rows_each_buffer = 0; + } + } + + if (rows_each_buffer > 0) { + cur_buffer->set_tensor_table(std::move(tensor_table)); + RETURN_IF_NOT_OK(jagged_buffer_connector_->Add(worker_id, std::move(cur_buffer))); + } + return Status::OK(); +} + +Status ClueOp::operator()() { + RETURN_IF_NOT_OK(CalculateNumRowsPerShard()); + + // launch one thread, responsible for filling IoBlockQueue + RETURN_IF_NOT_OK(tree_->LaunchWorkers(1, std::bind(&ClueOp::WaitToFillIOBlockQueue, this))); + + RETURN_IF_NOT_OK(tree_->LaunchWorkers(num_workers_, std::bind(&ClueOp::WorkerEntry, this, std::placeholders::_1))); + + // must be called after launching workers. 
+ TaskManager::FindMe()->Post(); + RETURN_IF_NOT_OK(io_block_queue_wait_post_.Register(tree_->AllTasks())); + NotifyToFillIOBlockQueue(); + + while (!finished_reading_dataset_) { + int64_t buffer_id = 0; + int32_t workers_done = 0; + int64_t rows_read = 0; + load_io_block_queue_ = true; + + while (workers_done < num_workers_) { + std::unique_ptr buffer; + RETURN_IF_NOT_OK(jagged_buffer_connector_->Pop(0, &buffer)); + if (buffer->eoe()) { + workers_done++; + } else if (num_samples_ == 0 || rows_read < num_samples_) { + if ((num_samples_ > 0) && (rows_read + buffer->NumRows() > num_samples_)) { + int64_t rowsToRemove = buffer->NumRows() - (num_samples_ - rows_read); + RETURN_IF_NOT_OK(buffer->SliceOff(rowsToRemove)); + } + rows_read += buffer->NumRows(); + buffer->set_id(buffer_id++); + RETURN_IF_NOT_OK(out_connector_->Add(0, std::move(buffer))); + } else { + // end of epoch + load_jagged_connector_ = false; + load_io_block_queue_ = false; + } + } + + std::unique_ptr eoe_buffer = std::make_unique(0, DataBuffer::kDeBFlagEOE); + RETURN_IF_NOT_OK(out_connector_->Add(0, std::move(eoe_buffer))); + + if (!BitTest(op_ctrl_flags_, kDeOpRepeated) || BitTest(op_ctrl_flags_, kDeOpLastRepeat)) { + finished_reading_dataset_ = true; + NotifyToFillIOBlockQueue(); + } else { + jagged_buffer_connector_->DoReset(); + buffer_id = 0; + } + } + std::unique_ptr eof_buffer = std::make_unique(0, DataBuffer::kDeBFlagEOF); + RETURN_IF_NOT_OK(out_connector_->Add(0, std::move(eof_buffer))); + + RETURN_IF_NOT_OK(PostEndOfData()); + return Status::OK(); +} + +Status ClueOp::WorkerEntry(int32_t worker_id) { + TaskManager::FindMe()->Post(); + std::unique_ptr io_block; + RETURN_IF_NOT_OK(PopIoBlockQueue(worker_id, &io_block)); + while (!io_block->eof()) { + if (!io_block->eoe()) { + if (load_jagged_connector_) { + std::string filename; + RETURN_IF_NOT_OK(io_block->GetFilename(&filename, *filename_index_)); + int64_t start_offset = io_block->GetStartOffset(); + int64_t end_offset = 
io_block->GetEndOffset(); + RETURN_IF_NOT_OK(LoadFile(filename, start_offset, end_offset, worker_id)); + } + } else { + std::unique_ptr eoe_buffer = std::make_unique(0, DataBuffer::kDeBFlagEOE); + RETURN_IF_NOT_OK(jagged_buffer_connector_->Add(worker_id, std::move(eoe_buffer))); + } + + RETURN_IF_NOT_OK(PopIoBlockQueue(worker_id, &io_block)); + } + return Status::OK(); +} + +// A print method typically used for debugging +void ClueOp::Print(std::ostream &out, bool show_all) const { + // Always show the id and name as first line regardless if this summary or detailed print + out << "(" << std::setw(2) << operator_id_ << ") :"; + if (!show_all) { + // Call the super class for displaying any common 1-liner info + ParallelOp::Print(out, show_all); + // Then show any custom derived-internal 1-liner info for this op + out << "\n"; + } else { + // Call the super class for displaying any common detailed info + ParallelOp::Print(out, show_all); + // Then show any custom derived-internal stuff + out << "\nRows per buffer: " << rows_per_buffer_ << "\nSample count: " << num_samples_ + << "\nDevice id: " << device_id_ << "\nNumber of devices: " << num_devices_ + << "\nShuffle files: " << ((shuffle_files_) ? 
"yes" : "no") << "\nClue files list:\n"; + for (int i = 0; i < clue_files_list_.size(); ++i) { + out << " " << clue_files_list_[i]; + } + out << "\n\n"; + } +} + +// Pops an element from a queue in io_block_queues +Status ClueOp::PopIoBlockQueue(int32_t index, std::unique_ptr *out_block) { + RETURN_IF_NOT_OK(io_block_queues_[index]->PopFront(out_block)); + + return Status::OK(); +} + +// Pushes an element to a queue in io_block_queues +Status ClueOp::PushIoBlockQueue(int32_t index, std::unique_ptr &&io_block) { + RETURN_IF_NOT_OK(io_block_queues_[index]->Add(std::move(io_block))); + + return Status::OK(); +} + +static void ShuffleKeys(std::vector *i_keys, uint32_t seed) { + std::mt19937 rng(seed); + std::shuffle(i_keys->begin(), i_keys->end(), rng); +} + +Status ClueOp::WaitToFillIOBlockQueue() { + // must be called first if called by worker spanwed by taskgroup + TaskManager::FindMe()->Post(); + + std::vector i_keys; + if (shuffle_files_) { + for (auto it = filename_index_->begin(); it != filename_index_->end(); ++it) { + i_keys.push_back(it.key()); + } + } + uint32_t seed = 0; + while (true) { + RETURN_IF_NOT_OK(io_block_queue_wait_post_.Wait()); + io_block_queue_wait_post_.Clear(); + + if (finished_reading_dataset_) { + break; + } + + if (shuffle_files_) { + ShuffleKeys(&i_keys, num_devices_ == 1 ? 
GetSeed() : ++seed); + } + RETURN_IF_NOT_OK(FillIOBlockQueue(i_keys)); + } + return Status::OK(); +} + +Status ClueOp::FillIOBlockQueue(const std::vector &i_keys) { + int32_t queue_index = 0; + int64_t pre_count = 0; + int64_t start_offset = 0; + int64_t end_offset = 0; + bool finish = false; + while (!finish) { + std::vector> file_index; + if (!i_keys.empty()) { + for (auto it = i_keys.begin(); it != i_keys.end(); ++it) { + { + if (!load_io_block_queue_) { + break; + } + } + file_index.emplace_back(std::pair((*filename_index_)[*it], *it)); + } + } else { + for (auto it = filename_index_->begin(); it != filename_index_->end(); ++it) { + { + if (!load_io_block_queue_) { + break; + } + } + file_index.emplace_back(std::pair(it.value(), it.key())); + } + } + for (auto file_info : file_index) { + if (NeedPushFileToBlockQueue(file_info.first, &start_offset, &end_offset, pre_count)) { + auto ioBlock = + std::make_unique(file_info.second, start_offset, end_offset, IOBlock::kDeIoBlockNone); + RETURN_IF_NOT_OK(PushIoBlockQueue(queue_index, std::move(ioBlock))); + queue_index = (queue_index + 1) % num_workers_; + } + + pre_count += filename_numrows_[file_info.first]; + } + + if (pre_count < (static_cast(device_id_) + 1) * num_rows_per_shard_) { + finish = false; + } else { + finish = true; + } + } + + RETURN_IF_NOT_OK(PostEndOfEpoch(queue_index)); + return Status::OK(); +} + +void ClueOp::NotifyToFillIOBlockQueue() { io_block_queue_wait_post_.Set(); } + +bool ClueOp::NeedPushFileToBlockQueue(const std::string &file_name, int64_t *start_offset, int64_t *end_offset, + const int64_t &pre_count) { + *start_offset = 0; + *end_offset = 0; + bool push = false; + int64_t start_index = device_id_ * num_rows_per_shard_; + if (device_id_ + 1 < 0) { + MS_LOG(ERROR) << "Device id is invalid"; + return false; + } + + int64_t end_index = (static_cast(device_id_) + 1) * num_rows_per_shard_; + if (pre_count <= start_index && pre_count + filename_numrows_[file_name] > start_index) { + 
*start_offset = start_index - pre_count; + push = true; + if (pre_count < end_index && pre_count + filename_numrows_[file_name] >= end_index) { + *end_offset = end_index - pre_count; + } else { + *end_offset = filename_numrows_[file_name]; + } + } + + if (pre_count >= start_index && pre_count < end_index) { + *start_offset = 0; + push = true; + if (pre_count + filename_numrows_[file_name] >= end_index) { + *end_offset = end_index - pre_count; + } else { + *end_offset = filename_numrows_[file_name]; + } + } + + return push; +} + +// Pushes a control indicator onto the IOBlockQueue for each worker to consume. When the worker +// pops this control indicator, it will wait until the next epoch starts and then resume execution. +Status ClueOp::PostEndOfEpoch(int32_t queue_index) { + for (int i = 0; i < num_workers_; ++i) { + std::unique_ptr eoe = std::make_unique(IOBlock::kDeIoBlockFlagEoe); + RETURN_IF_NOT_OK(PushIoBlockQueue((queue_index + i) % num_workers_, std::move(eoe))); + } + + return Status::OK(); +} + +Status ClueOp::CalculateNumRowsPerShard() { + for (auto it = filename_index_->begin(); it != filename_index_->end(); ++it) { + int64_t count = CountTotalRows(it.value()); + filename_numrows_[it.value()] = count; + all_num_rows_ += count; + } + if (all_num_rows_ == 0) { + RETURN_STATUS_UNEXPECTED( + "There is no valid data matching the dataset API CLUEDataset. 
Please check file path or dataset API " + "validation first."); + } + + num_rows_per_shard_ = static_cast(std::ceil(all_num_rows_ * 1.0 / num_devices_)); + MS_LOG(DEBUG) << "Number rows per shard is " << num_rows_per_shard_; + return Status::OK(); +} + +int64_t ClueOp::CountTotalRows(const std::string &file) { + std::ifstream handle(file); + if (!handle.is_open()) { + MS_LOG(ERROR) << "Failed to open file: " << file; + return 0; + } + + std::string line; + int64_t count = 0; + while (getline(handle, line)) { + if (!line.empty()) { + count++; + } + } + + return count; +} + +// Pushes a control indicator onto the IOBlockQueue for each worker to consume. +// When the worker pops this control indicator, it will shut itself down gracefully. +Status ClueOp::PostEndOfData() { + for (int i = 0; i < num_workers_; ++i) { + std::unique_ptr eof = std::make_unique(IOBlock::kDeIoBlockFlagEof); + RETURN_IF_NOT_OK(PushIoBlockQueue(i, std::move(eof))); + } + + return Status::OK(); +} + +Status ClueOp::CountAllFileRows(const std::vector &files, int64_t *count) { + std::shared_ptr op; + *count = 0; + RETURN_IF_NOT_OK(Builder().SetClueFilesList(files).Build(&op)); + for (auto file : files) { + *count += op->CountTotalRows(file); + } + return Status::OK(); +} +} // namespace dataset +} // namespace mindspore diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/clue_op.h b/mindspore/ccsrc/dataset/engine/datasetops/source/clue_op.h new file mode 100644 index 0000000000..b6a797d3f4 --- /dev/null +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/clue_op.h @@ -0,0 +1,287 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef DATASET_ENGINE_DATASETOPS_SOURCE_CLUE_OP_H_ +#define DATASET_ENGINE_DATASETOPS_SOURCE_CLUE_OP_H_ + +#include +#include +#include +#include +#include +#include + +#include "dataset/util/auto_index.h" +#include "dataset/engine/datasetops/parallel_op.h" +#include "dataset/engine/datasetops/source/io_block.h" + +namespace mindspore { +namespace dataset { +using StringIndex = AutoIndexObj; +using ColKeyMap = std::map>; + +class JaggedConnector; + +class ClueOp : public ParallelOp { + public: + class Builder { + public: + // Builder constructor. Creates the builder object. + // @note No default args + // @return This is a constructor. + Builder(); + + // Default destructor + ~Builder() = default; + + // Checks if the inputs of the builder is valid. + // @return Status - the error code returned. + Status ValidateInputs() const; + + // Create the final object. + // @param op - dataset op. + // @return - the error code return. + Status Build(std::shared_ptr *op); + + // Setter method. + // @return Builder - setter method returns reference to the builder. + Builder &SetNumWorkers(int32_t num_workers) { + builder_num_workers_ = num_workers; + return *this; + } + + // Setter method. + // @return Builder - setter method returns reference to the builder. + Builder &SetOpConnectorSize(int32_t op_connector_size) { + builder_op_connector_size_ = op_connector_size; + return *this; + } + + // Setter method. + // @return Builder - setter method returns reference to the builder. 
+ Builder &SetRowsPerBuffer(int64_t rows_per_buffer) { + builder_rows_per_buffer_ = rows_per_buffer; + return *this; + } + + // Setter method. + // @return Builder - setter method returns reference to the builder. + Builder &SetNumDevices(int64_t num_dev) { + builder_num_devices_ = num_dev; + return *this; + } + + // Setter method. + // @return Builder - setter method returns reference to the builder. + Builder &SetDeviceId(int64_t dev_id) { + builder_device_id_ = dev_id; + return *this; + } + + // Setter method. + // @return Builder - setter method returns reference to the builder. + Builder &SetClueFilesList(const std::vector &files_list) { + builder_clue_files_list_ = files_list; + return *this; + } + + // Setter method. + // @return Builder - setter method returns reference to the builder. + Builder &SetShuffleFiles(bool shuffle_files) { + builder_shuffle_files_ = shuffle_files; + return *this; + } + + // Setter method. + // @return Builder - setter method returns reference to the builder. + Builder &SetShuffleGlobal(bool shuffle_global) { + builder_shuffle_global_ = shuffle_global; + return *this; + } + + // Setter method. + // @return Builder - setter method returns reference to the builder. + Builder &SetNumSamples(int64_t num_samples) { + builder_num_samples_ = num_samples; + return *this; + } + + // Setter method. + // @return Builder - setter method returns reference to the builder. 
+ Builder &SetColsKeyMap(const std::map &cols_to_key) { + builder_cols_to_keyword_ = cols_to_key; + return *this; + } + + // Split string based on a character delimiter + // @return - the a string vector + std::vector split(const std::string &s, char delim); + + private: + int32_t builder_device_id_; + int32_t builder_num_devices_; + int32_t builder_num_workers_; + int32_t builder_op_connector_size_; + int64_t builder_rows_per_buffer_; + int64_t builder_num_samples_; + int32_t builder_worker_connector_size_; + std::vector builder_clue_files_list_; + bool builder_shuffle_files_; + bool builder_shuffle_global_; + std::map builder_cols_to_keyword_; + }; + + // Constructor of ClueOp + // @param shuffle_global - whether or not to shuffle the entire dataset. + ClueOp(int32_t num_workers, int64_t rows_per_buffer, int64_t num_samples, int32_t worker_connector_size, + ColKeyMap cols_to_keyword, std::vector clue_files_list, int32_t op_connector_size, + bool shuffle_files, bool shuffle_global, int32_t num_devices, int32_t device_id); + + // Default destructor + ~ClueOp() = default; + + // A print method typically used for debugging + // @param out - The output stream to write output to + // @param show_all - A bool to control if you want to show all info or just a summary + void Print(std::ostream &out, bool show_all) const override; + + // Instantiates the internal queues and connectors + // @return Status - the error code returned + Status Init(); + + // Class functor operator () override. + // All dataset operators operate by launching a thread (see ExecutionTree). This class functor will + // provide the master loop that drives the logic for performing the work + // @return Status - the error code returned. + Status operator()() override; + + // Overrides base class reset method. Cleans up any state info from it's previous execution + // reinitializes itself so that it can be executed again, as if it was just created. + // @return Status - the error code returned. 
+ Status Reset() override; + + // Get total rows in files. + // @param files - all clue files. + // @param count - number of rows. + // @return Status - the error coed returned. + static Status CountAllFileRows(const std::vector &files, int64_t *count); + + // File names getter + // @return Vector of the input file names + std::vector FileNames() { return clue_files_list_; } + + // Global shuffle flag getter + // @return Bool - whether this Op requires global shuffle + bool RequireGlobalShuffle() { return shuffle_global_; } + + private: + // The entry point for when workers are launched. + // @param worker_id - the id of the worker that is executing this function. + // @return Status - the error code returned. + Status WorkerEntry(int32_t worker_id) override; + + // Parses a single row and puts the data into a tensor table. + // @param line - the content of the row. + // @param tensor_table - the tensor table to put the parsed data in. + // @param row - the id of the row filled in the tensor table. + // @return Status - the error code returned. + Status LoadTensor(const std::string &line, std::unique_ptr *tensor_table, int64_t row); + + // Reads a clue file and loads the data into multiple buffers. + // @param file - the file to read. + // @param start_offset - the start offset of file. + // @param end_offset - the end offset of file. + // @param worker_id - the id of the worker that is executing this function. + // @return Status - the error code returned. + Status LoadFile(const std::string &file, const int64_t start_offset, const int64_t end_offset, + const int32_t worker_id); + + // Pops an element from a queue in IOBlockQueue. + // @param index - the index of the queue to pop from. + // @param out_block - the popped element. + // @return Status - the error code returned. + Status PopIoBlockQueue(int32_t index, std::unique_ptr *out_block); + + // Pushes an element to a queue in IOBlockQueue. + // @param index - the index of the queue to push to. 
+ // @param io_block - the element to push onto the queue. + // @return Status - the error code returned. + Status PushIoBlockQueue(int32_t index, std::unique_ptr &&io_block); + + // Called asynchronously by another thread. Will wait until notified to fill the IOBlockQueue. + // @return Status - the error code returned. + Status WaitToFillIOBlockQueue(); + + // Fill the IOBlockQueue. + // @para i_keys - keys of file to fill to the IOBlockQueue + // @return Status - the error code returned. + Status FillIOBlockQueue(const std::vector &i_keys); + + // Notifies the thread which called FillIoBlockQueue to resume execution + void NotifyToFillIOBlockQueue(); + + // Select file and push it to the block queue. + // @param file_name - File name. + // @param start_file - If file contains the first sample of data. + // @param end_file - If file contains the end sample of data. + // @param pre_count - Total rows of previous files. + // @return Status - the error code returned. + bool NeedPushFileToBlockQueue(const std::string &file_name, int64_t *start_offset, int64_t *end_offset, + const int64_t &pre_count); + + // Pushes a control indicator onto the IOBlockQueue for each worker to consume. When the worker + // pops this control indicator, it will wait until the next epoch starts and then resume execution. + // @return Status - the error code returned. + Status PostEndOfEpoch(int32_t queue_index); + + // Calculate number of rows in each shard. + // @return Status - the error code returned. + Status CalculateNumRowsPerShard(); + + // Count number of rows in each file. + // @param filename - clue file name. + // @return int64_t - the total number of rows in file. + int64_t CountTotalRows(const std::string &file); + + // Pushes a control indicator onto the IOBlockQueue for each worker to consume. + // When the worker pops this control indicator, it will shut itself down gracefully. + // @return Status - the error code returned. 
+ Status PostEndOfData(); + + // @return Status - the error code returned. + Status GetValue(const nlohmann::json &js, std::vector key_chain, std::shared_ptr *t); + + int32_t device_id_; + bool shuffle_files_; + bool shuffle_global_; + bool finished_reading_dataset_; + int32_t num_devices_; + int64_t rows_per_buffer_; + bool load_io_block_queue_; + int64_t num_rows_per_shard_; + int64_t all_num_rows_; + int64_t num_samples_; + std::map filename_numrows_; + std::unique_ptr filename_index_; + std::vector clue_files_list_; + WaitPost io_block_queue_wait_post_; + std::unique_ptr jagged_buffer_connector_; + QueueList> io_block_queues_; + bool load_jagged_connector_; + ColKeyMap cols_to_keyword_; +}; +} // namespace dataset +} // namespace mindspore +#endif // DATASET_ENGINE_DATASETOPS_SOURCE_CLUE_OP_H_ diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/coco_op.cc b/mindspore/ccsrc/dataset/engine/datasetops/source/coco_op.cc new file mode 100644 index 0000000000..8d352bbd6c --- /dev/null +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/coco_op.cc @@ -0,0 +1,631 @@ +/** + * Copyright 2019 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "dataset/engine/datasetops/source/coco_op.h" + +#include +#include +#include +#include "common/utils.h" +#include "dataset/core/config_manager.h" +#include "dataset/core/tensor_shape.h" +#include "dataset/engine/datasetops/source/sampler/sequential_sampler.h" +#include "dataset/engine/db_connector.h" +#include "dataset/engine/execution_tree.h" + +namespace mindspore { +namespace dataset { +const char kColumnImage[] = "image"; +const char kJsonImages[] = "images"; +const char kJsonImagesFileName[] = "file_name"; +const char kJsonId[] = "id"; +const char kJsonAnnotations[] = "annotations"; +const char kJsonAnnoSegmentation[] = "segmentation"; +const char kJsonAnnoCounts[] = "counts"; +const char kJsonAnnoSegmentsInfo[] = "segments_info"; +const char kJsonAnnoIscrowd[] = "iscrowd"; +const char kJsonAnnoBbox[] = "bbox"; +const char kJsonAnnoArea[] = "area"; +const char kJsonAnnoImageId[] = "image_id"; +const char kJsonAnnoNumKeypoints[] = "num_keypoints"; +const char kJsonAnnoKeypoints[] = "keypoints"; +const char kJsonAnnoCategoryId[] = "category_id"; +const char kJsonCategories[] = "categories"; +const char kJsonCategoriesIsthing[] = "isthing"; +const char kJsonCategoriesName[] = "name"; +const float kDefaultPadValue = -1.0; +const unsigned int kPadValueZero = 0; + +CocoOp::Builder::Builder() : builder_decode_(false), builder_sampler_(nullptr) { + std::shared_ptr cfg = GlobalContext::config_manager(); + builder_num_workers_ = cfg->num_parallel_workers(); + builder_rows_per_buffer_ = cfg->rows_per_buffer(); + builder_op_connector_size_ = cfg->op_connector_size(); + builder_task_type_ = TaskType::Detection; +} + +Status CocoOp::Builder::Build(std::shared_ptr *ptr) { + RETURN_IF_NOT_OK(SanityCheck()); + if (builder_sampler_ == nullptr) { + const int64_t num_samples = 0; + const int64_t start_index = 0; + builder_sampler_ = std::make_shared(start_index, num_samples); + } + builder_schema_ = std::make_unique(); + RETURN_IF_NOT_OK(builder_schema_->AddColumn( 
+ ColDescriptor(std::string(kColumnImage), DataType(DataType::DE_UINT8), TensorImpl::kFlexible, 1))); + switch (builder_task_type_) { + case TaskType::Detection: + RETURN_IF_NOT_OK(builder_schema_->AddColumn( + ColDescriptor(std::string(kJsonAnnoBbox), DataType(DataType::DE_FLOAT32), TensorImpl::kFlexible, 1))); + RETURN_IF_NOT_OK(builder_schema_->AddColumn( + ColDescriptor(std::string(kJsonAnnoCategoryId), DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 1))); + RETURN_IF_NOT_OK(builder_schema_->AddColumn( + ColDescriptor(std::string(kJsonAnnoIscrowd), DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 1))); + break; + case TaskType::Stuff: + RETURN_IF_NOT_OK(builder_schema_->AddColumn( + ColDescriptor(std::string(kJsonAnnoSegmentation), DataType(DataType::DE_FLOAT32), TensorImpl::kFlexible, 1))); + RETURN_IF_NOT_OK(builder_schema_->AddColumn( + ColDescriptor(std::string(kJsonAnnoIscrowd), DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 1))); + break; + case TaskType::Keypoint: + RETURN_IF_NOT_OK(builder_schema_->AddColumn( + ColDescriptor(std::string(kJsonAnnoKeypoints), DataType(DataType::DE_FLOAT32), TensorImpl::kFlexible, 1))); + RETURN_IF_NOT_OK(builder_schema_->AddColumn( + ColDescriptor(std::string(kJsonAnnoNumKeypoints), DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 1))); + break; + case TaskType::Panoptic: + RETURN_IF_NOT_OK(builder_schema_->AddColumn( + ColDescriptor(std::string(kJsonAnnoBbox), DataType(DataType::DE_FLOAT32), TensorImpl::kFlexible, 1))); + RETURN_IF_NOT_OK(builder_schema_->AddColumn( + ColDescriptor(std::string(kJsonAnnoCategoryId), DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 1))); + RETURN_IF_NOT_OK(builder_schema_->AddColumn( + ColDescriptor(std::string(kJsonAnnoIscrowd), DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 1))); + RETURN_IF_NOT_OK(builder_schema_->AddColumn( + ColDescriptor(std::string(kJsonAnnoArea), DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 1))); + break; + default: + 
RETURN_STATUS_UNEXPECTED("Invalid task type"); + } + *ptr = std::make_shared(builder_task_type_, builder_dir_, builder_file_, builder_num_workers_, + builder_rows_per_buffer_, builder_op_connector_size_, builder_decode_, + std::move(builder_schema_), std::move(builder_sampler_)); + return Status::OK(); +} + +Status CocoOp::Builder::SanityCheck() { + Path dir(builder_dir_); + Path file(builder_file_); + std::string err_msg; + err_msg += dir.IsDirectory() == false ? "Coco image folder path is invalid or not set\n" : ""; + err_msg += file.Exists() == false ? "Coco annotation json path is invalid or not set\n" : ""; + err_msg += builder_num_workers_ <= 0 ? "Num of parallel workers is set to 0 or negative\n" : ""; + return err_msg.empty() ? Status::OK() : Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, err_msg); +} + +CocoOp::CocoOp(const TaskType &task_type, const std::string &image_folder_path, const std::string &annotation_path, + int32_t num_workers, int32_t rows_per_buffer, int32_t queue_size, bool decode, + std::unique_ptr data_schema, std::shared_ptr sampler) + : ParallelOp(num_workers, queue_size), + decode_(decode), + row_cnt_(0), + buf_cnt_(0), + task_type_(task_type), + image_folder_path_(image_folder_path), + annotation_path_(annotation_path), + rows_per_buffer_(rows_per_buffer), + sampler_(std::move(sampler)), + data_schema_(std::move(data_schema)) { + // Set the column name map (base class field) + for (int32_t i = 0; i < data_schema_->NumColumns(); ++i) { + column_name_id_map_[data_schema_->column(i).name()] = i; + } + io_block_queues_.Init(num_workers_, queue_size); +} + +Status CocoOp::TraverseSampleIds(const std::shared_ptr &sample_ids, std::vector *keys) { + for (auto itr = sample_ids->begin(); itr != sample_ids->end(); ++itr) { + if ((*itr) > num_rows_) continue; + keys->push_back(*itr); + row_cnt_++; + if (row_cnt_ % rows_per_buffer_ == 0) { + RETURN_IF_NOT_OK(io_block_queues_[buf_cnt_++ % num_workers_]->Add( + 
std::make_unique(IOBlock(*keys, IOBlock::kDeIoBlockNone)))); + keys->clear(); + } + } + return Status::OK(); +} + +Status CocoOp::operator()() { + RETURN_IF_NOT_OK(LaunchThreadsAndInitOp()); + std::unique_ptr sampler_buffer; + RETURN_IF_NOT_OK(sampler_->GetNextSample(&sampler_buffer)); + while (true) { + std::vector keys; + keys.reserve(rows_per_buffer_); + while (sampler_buffer->eoe() == false) { + std::shared_ptr sample_ids; + RETURN_IF_NOT_OK(sampler_buffer->GetTensor(&sample_ids, 0, 0)); + if (sample_ids->type() != DataType(DataType::DE_INT64)) { + RETURN_STATUS_UNEXPECTED("Sampler Tensor isn't int64"); + } + RETURN_IF_NOT_OK(TraverseSampleIds(sample_ids, &keys)); + RETURN_IF_NOT_OK(sampler_->GetNextSample(&sampler_buffer)); + } + if (keys.empty() == false) { + RETURN_IF_NOT_OK(io_block_queues_[(buf_cnt_++) % num_workers_]->Add( + std::make_unique(IOBlock(keys, IOBlock::kDeIoBlockNone)))); + } + if (!BitTest(op_ctrl_flags_, kDeOpRepeated) || BitTest(op_ctrl_flags_, kDeOpLastRepeat)) { + std::unique_ptr eoe_block = std::make_unique(IOBlock::kDeIoBlockFlagEoe); + std::unique_ptr eof_block = std::make_unique(IOBlock::kDeIoBlockFlagEof); + RETURN_IF_NOT_OK(io_block_queues_[(buf_cnt_++) % num_workers_]->Add(std::move(eoe_block))); + RETURN_IF_NOT_OK(io_block_queues_[(buf_cnt_++) % num_workers_]->Add(std::move(eof_block))); + for (int32_t i = 0; i < num_workers_; i++) { + RETURN_IF_NOT_OK( + io_block_queues_[i]->Add(std::make_unique(std::vector(), IOBlock::kDeIoBlockNone))); + } + return Status::OK(); + } else { + RETURN_IF_NOT_OK( + io_block_queues_[(buf_cnt_++) % num_workers_]->Add(std::make_unique(IOBlock::kDeIoBlockFlagEoe))); + RETURN_IF_NOT_OK(wp_.Wait()); + wp_.Clear(); + RETURN_IF_NOT_OK(sampler_->GetNextSample(&sampler_buffer)); + } + } +} + +void CocoOp::Print(std::ostream &out, bool show_all) const { + // Always show the id and name as first line regardless if this summary or detailed print + out << "(" << std::setw(2) << operator_id_ << ") :"; + if 
(!show_all) { + // Call the super class for displaying any common 1-liner info + ParallelOp::Print(out, show_all); + // Then show any custom derived-internal 1-liner info for this op + out << "\n"; + } else { + // Call the super class for displaying any common detailed info + ParallelOp::Print(out, show_all); + // Then show any custom derived-internal stuff + out << "\nNumber of rows: " << num_rows_ << "\nCOCO Directory: " << image_folder_path_ << "\n\n"; + } +} + +Status CocoOp::Reset() { + RETURN_IF_NOT_OK(sampler_->ResetSampler()); + row_cnt_ = 0; + wp_.Set(); + return Status::OK(); +} + +Status CocoOp::LoadTensorRow(row_id_type row_id, const std::string &image_id, TensorRow *trow) { + std::shared_ptr image, coordinate; + auto itr = coordinate_map_.find(image_id); + if (itr == coordinate_map_.end()) RETURN_STATUS_UNEXPECTED("Invalid image_id found :" + image_id); + + std::string kImageFile = image_folder_path_ + image_id; + RETURN_IF_NOT_OK(ReadImageToTensor(kImageFile, data_schema_->column(0), &image)); + + auto bboxRow = itr->second; + std::vector bbox_row; + dsize_t bbox_row_num = static_cast(bboxRow.size()); + dsize_t bbox_column_num = 0; + for (auto bbox : bboxRow) { + if (static_cast(bbox.size()) > bbox_column_num) { + bbox_column_num = static_cast(bbox.size()); + } + } + + for (auto bbox : bboxRow) { + bbox_row.insert(bbox_row.end(), bbox.begin(), bbox.end()); + dsize_t pad_len = bbox_column_num - static_cast(bbox.size()); + if (pad_len > 0) { + for (dsize_t i = 0; i < pad_len; i++) { + bbox_row.push_back(kDefaultPadValue); + } + } + } + + std::vector bbox_dim = {bbox_row_num, bbox_column_num}; + RETURN_IF_NOT_OK(Tensor::CreateTensor(&coordinate, data_schema_->column(1).tensorImpl(), TensorShape(bbox_dim), + data_schema_->column(1).type(), + reinterpret_cast(&bbox_row[0]))); + if (task_type_ == TaskType::Detection) { + RETURN_IF_NOT_OK(LoadDetectionTensorRow(row_id, image_id, image, coordinate, trow)); + } else if (task_type_ == TaskType::Stuff || 
task_type_ == TaskType::Keypoint) { + RETURN_IF_NOT_OK(LoadSimpleTensorRow(row_id, image_id, image, coordinate, trow)); + } else if (task_type_ == TaskType::Panoptic) { + RETURN_IF_NOT_OK(LoadMixTensorRow(row_id, image_id, image, coordinate, trow)); + } else { + RETURN_STATUS_UNEXPECTED("Invalid task type."); + } + + return Status::OK(); +} + +// When task is Detection, user can get data with four columns: +// column ["image"] with datatype=uint8 +// column ["bbox"] with datatype=float32 +// column ["category_id"] with datatype=uint32 +// column ["iscrowd"] with datatype=uint32 +// By the way, column ["iscrowd"] is used for some testcases, like fasterRcnn. +// If "iscrowd" is not existed, user will get default value 0. +Status CocoOp::LoadDetectionTensorRow(row_id_type row_id, const std::string &image_id, std::shared_ptr image, + std::shared_ptr coordinate, TensorRow *trow) { + std::shared_ptr category_id, iscrowd; + std::vector category_id_row; + std::vector iscrowd_row; + auto itr_item = simple_item_map_.find(image_id); + if (itr_item == simple_item_map_.end()) RETURN_STATUS_UNEXPECTED("Invalid image_id found :" + image_id); + + std::vector annotation = itr_item->second; + for (int64_t i = 0; i < annotation.size(); i++) { + if (i % 2 == 0) { + category_id_row.push_back(annotation[i]); + } else if (i % 2 == 1) { + iscrowd_row.push_back(annotation[i]); + } + } + RETURN_IF_NOT_OK(Tensor::CreateTensor( + &category_id, data_schema_->column(2).tensorImpl(), TensorShape({static_cast(category_id_row.size()), 1}), + data_schema_->column(2).type(), reinterpret_cast(&category_id_row[0]))); + + RETURN_IF_NOT_OK(Tensor::CreateTensor( + &iscrowd, data_schema_->column(3).tensorImpl(), TensorShape({static_cast(iscrowd_row.size()), 1}), + data_schema_->column(3).type(), reinterpret_cast(&iscrowd_row[0]))); + (*trow) = TensorRow(row_id, {std::move(image), std::move(coordinate), std::move(category_id), std::move(iscrowd)}); + return Status::OK(); +} + +// When task is 
"Stuff"/"Keypoint", user can get data with three columns: +// column ["image"] with datatype=uint8 +// column ["segmentation"]/["keypoints"] with datatype=float32 +// column ["iscrowd"]/["num_keypoints"] with datatype=uint32 +Status CocoOp::LoadSimpleTensorRow(row_id_type row_id, const std::string &image_id, std::shared_ptr image, + std::shared_ptr coordinate, TensorRow *trow) { + std::shared_ptr item; + std::vector item_queue; + auto itr_item = simple_item_map_.find(image_id); + if (itr_item == simple_item_map_.end()) RETURN_STATUS_UNEXPECTED("Invalid image_id found :" + image_id); + + item_queue = itr_item->second; + std::vector bbox_dim = {static_cast(item_queue.size()), 1}; + RETURN_IF_NOT_OK(Tensor::CreateTensor(&item, data_schema_->column(2).tensorImpl(), TensorShape(bbox_dim), + data_schema_->column(2).type(), + reinterpret_cast(&item_queue[0]))); + (*trow) = TensorRow(row_id, {std::move(image), std::move(coordinate), std::move(item)}); + return Status::OK(); +} + +// When task is "Panoptic", user can get data with five columns: +// column ["image"] with datatype=uint8 +// column ["bbox"] with datatype=float32 +// column ["category_id"] with datatype=uint32 +// column ["iscrowd"] with datatype=uint32 +// column ["area"] with datattype=uint32 +Status CocoOp::LoadMixTensorRow(row_id_type row_id, const std::string &image_id, std::shared_ptr image, + std::shared_ptr coordinate, TensorRow *trow) { + std::shared_ptr category_id, iscrowd, area; + std::vector category_id_row; + std::vector iscrowd_row; + std::vector area_row; + auto itr_item = simple_item_map_.find(image_id); + if (itr_item == simple_item_map_.end()) RETURN_STATUS_UNEXPECTED("Invalid image_id found :" + image_id); + + std::vector annotation = itr_item->second; + for (int64_t i = 0; i < annotation.size(); i++) { + if (i % 3 == 0) { + category_id_row.push_back(annotation[i]); + } else if (i % 3 == 1) { + iscrowd_row.push_back(annotation[i]); + } else if (i % 3 == 2) { + 
area_row.push_back(annotation[i]); + } + } + + RETURN_IF_NOT_OK(Tensor::CreateTensor( + &category_id, data_schema_->column(2).tensorImpl(), TensorShape({static_cast(category_id_row.size()), 1}), + data_schema_->column(2).type(), reinterpret_cast(&category_id_row[0]))); + + RETURN_IF_NOT_OK(Tensor::CreateTensor( + &iscrowd, data_schema_->column(3).tensorImpl(), TensorShape({static_cast(iscrowd_row.size()), 1}), + data_schema_->column(3).type(), reinterpret_cast(&iscrowd_row[0]))); + + RETURN_IF_NOT_OK(Tensor::CreateTensor( + &area, data_schema_->column(4).tensorImpl(), TensorShape({static_cast(area_row.size()), 1}), + data_schema_->column(4).type(), reinterpret_cast(&area_row[0]))); + (*trow) = TensorRow( + row_id, {std::move(image), std::move(coordinate), std::move(category_id), std::move(iscrowd), std::move(area)}); + return Status::OK(); +} + +Status CocoOp::LoadBuffer(const std::vector &keys, std::unique_ptr *db) { + std::unique_ptr deq = std::make_unique(); + TensorRow trow; + for (const int64_t &key : keys) { + RETURN_IF_NOT_OK(this->LoadTensorRow(key, image_ids_[key], &trow)); + deq->push_back(std::move(trow)); + } + (*db)->set_tensor_table(std::move(deq)); + return Status::OK(); +} + +Status CocoOp::WorkerEntry(int32_t worker_id) { + TaskManager::FindMe()->Post(); + int64_t buffer_id = worker_id; + std::unique_ptr io_block; + RETURN_IF_NOT_OK(io_block_queues_[worker_id]->PopFront(&io_block)); + while (io_block != nullptr) { + if (io_block->eoe() == true) { + RETURN_IF_NOT_OK(out_connector_->Add(worker_id, std::make_unique(0, DataBuffer::kDeBFlagEOE))); + buffer_id = worker_id; + } else if (io_block->eof() == true) { + RETURN_IF_NOT_OK(out_connector_->Add(worker_id, (std::make_unique(0, DataBuffer::kDeBFlagEOF)))); + } else { + std::vector keys; + RETURN_IF_NOT_OK(io_block->GetKeys(&keys)); + if (keys.empty() == true) return Status::OK(); + std::unique_ptr db = std::make_unique(buffer_id, DataBuffer::kDeBFlagNone); + RETURN_IF_NOT_OK(LoadBuffer(keys, &db)); + 
RETURN_IF_NOT_OK(out_connector_->Add(worker_id, std::move(db))); + buffer_id += num_workers_; + } + RETURN_IF_NOT_OK(io_block_queues_[worker_id]->PopFront(&io_block)); + } + RETURN_STATUS_UNEXPECTED("Unexpected nullptr received in worker"); +} + +template +Status CocoOp::SearchNodeInJson(nlohmann::json input_tree, std::string node_name, T *output_node) { + auto node = input_tree.find(node_name); + if (node == input_tree.end()) RETURN_STATUS_UNEXPECTED("Invalid node found in json : " + node_name); + (*output_node) = *node; + return Status::OK(); +} + +Status CocoOp::ParseAnnotationIds() { + std::ifstream in(annotation_path_); + nlohmann::json js; + in >> js; + + std::vector image_que; + nlohmann::json image_list; + RETURN_IF_NOT_OK(SearchNodeInJson(js, std::string(kJsonImages), &image_list)); + RETURN_IF_NOT_OK(ImageColumnLoad(image_list, &image_que)); + if (task_type_ == TaskType::Detection || task_type_ == TaskType::Panoptic) { + nlohmann::json node_categories; + RETURN_IF_NOT_OK(SearchNodeInJson(js, std::string(kJsonCategories), &node_categories)); + RETURN_IF_NOT_OK(CategoriesColumnLoad(node_categories)); + } + nlohmann::json annotations_list; + RETURN_IF_NOT_OK(SearchNodeInJson(js, std::string(kJsonAnnotations), &annotations_list)); + for (auto annotation : annotations_list) { + int32_t image_id = 0, id = 0; + std::string file_name; + RETURN_IF_NOT_OK(SearchNodeInJson(annotation, std::string(kJsonAnnoImageId), &image_id)); + auto itr_file = image_index_.find(image_id); + if (itr_file == image_index_.end()) + RETURN_STATUS_UNEXPECTED("Invalid image id of annotations : " + std::to_string(image_id)); + file_name = itr_file->second; + switch (task_type_) { + case TaskType::Detection: + RETURN_IF_NOT_OK(SearchNodeInJson(annotation, std::string(kJsonId), &id)); + RETURN_IF_NOT_OK(DetectionColumnLoad(annotation, file_name, id)); + break; + case TaskType::Stuff: + RETURN_IF_NOT_OK(SearchNodeInJson(annotation, std::string(kJsonId), &id)); + 
RETURN_IF_NOT_OK(StuffColumnLoad(annotation, file_name, id)); + break; + case TaskType::Keypoint: + RETURN_IF_NOT_OK(SearchNodeInJson(annotation, std::string(kJsonId), &id)); + RETURN_IF_NOT_OK(KeypointColumnLoad(annotation, file_name, id)); + break; + case TaskType::Panoptic: + RETURN_IF_NOT_OK(PanopticColumnLoad(annotation, file_name, image_id)); + break; + default: + RETURN_STATUS_UNEXPECTED("Invalid task type"); + } + } + for (auto img : image_que) { + if (coordinate_map_.find(img) != coordinate_map_.end()) image_ids_.push_back(img); + } + num_rows_ = image_ids_.size(); + return Status::OK(); +} + +Status CocoOp::ImageColumnLoad(nlohmann::json image_tree, std::vector *image_vec) { + if (image_tree.size() == 0) { + RETURN_STATUS_UNEXPECTED("No images found in " + annotation_path_); + } + for (auto img : image_tree) { + std::string file_name; + int32_t id = 0; + RETURN_IF_NOT_OK(SearchNodeInJson(img, std::string(kJsonImagesFileName), &file_name)); + RETURN_IF_NOT_OK(SearchNodeInJson(img, std::string(kJsonId), &id)); + + image_index_[id] = file_name; + image_vec->push_back(file_name); + } + return Status::OK(); +} + +Status CocoOp::DetectionColumnLoad(nlohmann::json annotation_tree, const std::string &image_file, + const int32_t &unique_id) { + std::vector bbox; + nlohmann::json node_bbox; + uint32_t category_id = 0, iscrowd = 0; + RETURN_IF_NOT_OK(SearchNodeInJson(annotation_tree, std::string(kJsonAnnoBbox), &node_bbox)); + RETURN_IF_NOT_OK(SearchNodeInJson(annotation_tree, std::string(kJsonAnnoCategoryId), &category_id)); + auto search_category = category_set_.find(category_id); + if (search_category == category_set_.end()) + RETURN_STATUS_UNEXPECTED("category_id can't find in categories where category_id: " + std::to_string(category_id)); + auto node_iscrowd = annotation_tree.find(kJsonAnnoIscrowd); + if (node_iscrowd != annotation_tree.end()) iscrowd = *node_iscrowd; + bbox.insert(bbox.end(), node_bbox.begin(), node_bbox.end()); + 
coordinate_map_[image_file].push_back(bbox); + simple_item_map_[image_file].push_back(category_id); + simple_item_map_[image_file].push_back(iscrowd); + return Status::OK(); +} + +Status CocoOp::StuffColumnLoad(nlohmann::json annotation_tree, const std::string &image_file, + const int32_t &unique_id) { + uint32_t iscrowd = 0; + std::vector bbox; + RETURN_IF_NOT_OK(SearchNodeInJson(annotation_tree, std::string(kJsonAnnoIscrowd), &iscrowd)); + simple_item_map_[image_file].push_back(iscrowd); + nlohmann::json segmentation; + RETURN_IF_NOT_OK(SearchNodeInJson(annotation_tree, std::string(kJsonAnnoSegmentation), &segmentation)); + if (iscrowd == 0) { + for (auto item : segmentation) { + if (bbox.size() > 0) bbox.clear(); + bbox.insert(bbox.end(), item.begin(), item.end()); + coordinate_map_[image_file].push_back(bbox); + } + } else if (iscrowd == 1) { + nlohmann::json segmentation_count; + RETURN_IF_NOT_OK(SearchNodeInJson(segmentation, std::string(kJsonAnnoCounts), &segmentation_count)); + bbox.insert(bbox.end(), segmentation_count.begin(), segmentation_count.end()); + coordinate_map_[image_file].push_back(bbox); + } + return Status::OK(); +} + +Status CocoOp::KeypointColumnLoad(nlohmann::json annotation_tree, const std::string &image_file, + const int32_t &unique_id) { + auto itr_num_keypoint = annotation_tree.find(kJsonAnnoNumKeypoints); + if (itr_num_keypoint == annotation_tree.end()) + RETURN_STATUS_UNEXPECTED("No num_keypoint found in annotations where id: " + std::to_string(unique_id)); + simple_item_map_[image_file].push_back(*itr_num_keypoint); + auto itr_keypoint = annotation_tree.find(kJsonAnnoKeypoints); + if (itr_keypoint == annotation_tree.end()) + RETURN_STATUS_UNEXPECTED("No keypoint found in annotations where id: " + std::to_string(unique_id)); + coordinate_map_[image_file].push_back(*itr_keypoint); + return Status::OK(); +} + +Status CocoOp::PanopticColumnLoad(nlohmann::json annotation_tree, const std::string &image_file, + const int32_t &image_id) { + 
auto itr_segments = annotation_tree.find(kJsonAnnoSegmentsInfo); + if (itr_segments == annotation_tree.end()) + RETURN_STATUS_UNEXPECTED("No segments_info found in annotations where image_id: " + std::to_string(image_id)); + for (auto info : *itr_segments) { + std::vector bbox; + uint32_t category_id = 0; + auto itr_bbox = info.find(kJsonAnnoBbox); + if (itr_bbox == info.end()) + RETURN_STATUS_UNEXPECTED("No bbox found in segments_info where image_id: " + std::to_string(image_id)); + bbox.insert(bbox.end(), itr_bbox->begin(), itr_bbox->end()); + coordinate_map_[image_file].push_back(bbox); + + RETURN_IF_NOT_OK(SearchNodeInJson(info, std::string(kJsonAnnoCategoryId), &category_id)); + auto search_category = category_set_.find(category_id); + if (search_category == category_set_.end()) + RETURN_STATUS_UNEXPECTED("category_id can't find in categories where category_id: " + + std::to_string(category_id)); + auto itr_iscrowd = info.find(kJsonAnnoIscrowd); + if (itr_iscrowd == info.end()) + RETURN_STATUS_UNEXPECTED("No iscrowd found in segments_info where image_id: " + std::to_string(image_id)); + auto itr_area = info.find(kJsonAnnoArea); + if (itr_area == info.end()) + RETURN_STATUS_UNEXPECTED("No area found in segments_info where image_id: " + std::to_string(image_id)); + simple_item_map_[image_file].push_back(category_id); + simple_item_map_[image_file].push_back(*itr_iscrowd); + simple_item_map_[image_file].push_back(*itr_area); + } + return Status::OK(); +} + +Status CocoOp::CategoriesColumnLoad(nlohmann::json categories_tree) { + if (categories_tree.size() == 0) RETURN_STATUS_UNEXPECTED("No categories found in " + annotation_path_); + for (auto category : categories_tree) { + int32_t id = 0; + std::string name; + std::vector label_info; + auto itr_id = category.find(kJsonId); + if (itr_id == category.end()) RETURN_STATUS_UNEXPECTED("No id found in categories of " + annotation_path_); + id = *itr_id; + label_info.push_back(id); + category_set_.insert(id); + + auto 
itr_name = category.find(kJsonCategoriesName); + if (itr_name == category.end()) + RETURN_STATUS_UNEXPECTED("No name found in categories where id: " + std::to_string(id)); + name = *itr_name; + + if (task_type_ == TaskType::Panoptic) { + auto itr_isthing = category.find(kJsonCategoriesIsthing); + if (itr_isthing == category.end()) + RETURN_STATUS_UNEXPECTED("No isthing found in categories of " + annotation_path_); + label_info.push_back(*itr_isthing); + } + label_index_.emplace_back(std::make_pair(name, label_info)); + } + return Status::OK(); +} + +Status CocoOp::InitSampler() { + RETURN_IF_NOT_OK(sampler_->HandshakeRandomAccessOp(this)); + return Status::OK(); +} + +Status CocoOp::LaunchThreadsAndInitOp() { + if (tree_ == nullptr) { + RETURN_STATUS_UNEXPECTED("tree_ not set"); + } + RETURN_IF_NOT_OK(io_block_queues_.Register(tree_->AllTasks())); + RETURN_IF_NOT_OK(wp_.Register(tree_->AllTasks())); + RETURN_IF_NOT_OK(tree_->LaunchWorkers(num_workers_, std::bind(&CocoOp::WorkerEntry, this, std::placeholders::_1))); + TaskManager::FindMe()->Post(); + RETURN_IF_NOT_OK(this->ParseAnnotationIds()); + RETURN_IF_NOT_OK(this->InitSampler()); + return Status::OK(); +} + +Status CocoOp::ReadImageToTensor(const std::string &path, const ColDescriptor &col, std::shared_ptr *tensor) { + RETURN_IF_NOT_OK(Tensor::CreateTensor(tensor, path)); + + if (decode_ == true) { + Status rc = Decode(*tensor, tensor); + if (rc.IsError()) { + RETURN_STATUS_UNEXPECTED("fail to decode file: " + path); + } + } + return Status::OK(); +} + +Status CocoOp::CountTotalRows(const std::string &dir, const std::string &file, const std::string &task, + int64_t *count) { + std::shared_ptr op; + RETURN_IF_NOT_OK(Builder().SetDir(dir).SetFile(file).SetTask(task).Build(&op)); + RETURN_IF_NOT_OK(op->ParseAnnotationIds()); + *count = static_cast(op->image_ids_.size()); + return Status::OK(); +} + +Status CocoOp::GetClassIndexing(const std::string &dir, const std::string &file, const std::string &task, + 
std::vector>> *output_class_indexing) { + std::shared_ptr op; + RETURN_IF_NOT_OK(Builder().SetDir(dir).SetFile(file).SetTask(task).Build(&op)); + RETURN_IF_NOT_OK(op->ParseAnnotationIds()); + *output_class_indexing = op->label_index_; + return Status::OK(); +} +} // namespace dataset +} // namespace mindspore diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/coco_op.h b/mindspore/ccsrc/dataset/engine/datasetops/source/coco_op.h new file mode 100644 index 0000000000..f5abeed72e --- /dev/null +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/coco_op.h @@ -0,0 +1,330 @@ +/** + * Copyright 2019 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef DATASET_ENGINE_DATASETOPS_SOURCE_COCO_OP_H_ +#define DATASET_ENGINE_DATASETOPS_SOURCE_COC0_OP_H_ + +#include +#include +#include +#include +#include +#include + +#include "dataset/core/tensor.h" +#include "dataset/engine/data_buffer.h" +#include "dataset/engine/data_schema.h" +#include "dataset/engine/datasetops/parallel_op.h" +#include "dataset/engine/datasetops/source/io_block.h" +#include "dataset/engine/datasetops/source/sampler/sampler.h" +#include "dataset/kernels/image/image_utils.h" +#include "dataset/util/path.h" +#include "dataset/util/queue.h" +#include "dataset/util/status.h" +#include "dataset/util/wait_post.h" + +namespace mindspore { +namespace dataset { +// Forward declares +template +class Queue; + +using CoordinateRow = std::vector>; + +class CocoOp : public ParallelOp, public RandomAccessOp { + public: + enum class TaskType { Detection = 0, Stuff = 1, Panoptic = 2, Keypoint = 3 }; + + class Builder { + public: + // Constructor for Builder class of ImageFolderOp + // @param uint32_t numWrks - number of parallel workers + // @param dir - directory folder got ImageNetFolder + Builder(); + + // Destructor. + ~Builder() = default; + + // Setter method. + // @param const std::string & build_dir + // @return Builder setter method returns reference to the builder. + Builder &SetDir(const std::string &build_dir) { + builder_dir_ = build_dir; + return *this; + } + + // Setter method. + // @param const std::string & build_file + // @return Builder setter method returns reference to the builder. + Builder &SetFile(const std::string &build_file) { + builder_file_ = build_file; + return *this; + } + + // Setter method. + // @param const std::string & task_type + // @return Builder setter method returns reference to the builder. 
+ Builder &SetTask(const std::string &task_type) { + if (task_type == "Detection") { + builder_task_type_ = TaskType::Detection; + } else if (task_type == "Stuff") { + builder_task_type_ = TaskType::Stuff; + } else if (task_type == "Panoptic") { + builder_task_type_ = TaskType::Panoptic; + } else if (task_type == "Keypoint") { + builder_task_type_ = TaskType::Keypoint; + } + return *this; + } + + // Setter method. + // @param int32_t num_workers + // @return Builder setter method returns reference to the builder. + Builder &SetNumWorkers(int32_t num_workers) { + builder_num_workers_ = num_workers; + return *this; + } + + // Setter method. + // @param int32_t op_connector_size + // @return Builder setter method returns reference to the builder. + Builder &SetOpConnectorSize(int32_t op_connector_size) { + builder_op_connector_size_ = op_connector_size; + return *this; + } + + // Setter method. + // @param int32_t rows_per_buffer + // @return Builder setter method returns reference to the builder. + Builder &SetRowsPerBuffer(int32_t rows_per_buffer) { + builder_rows_per_buffer_ = rows_per_buffer; + return *this; + } + + // Setter method. + // @param std::shared_ptr sampler + // @return Builder setter method returns reference to the builder. + Builder &SetSampler(std::shared_ptr sampler) { + builder_sampler_ = std::move(sampler); + return *this; + } + + // Setter method. + // @param bool do_decode + // @return Builder setter method returns reference to the builder. + Builder &SetDecode(bool do_decode) { + builder_decode_ = do_decode; + return *this; + } + + // Check validity of input args + // @return = The error code return + Status SanityCheck(); + + // The builder "Build" method creates the final object. 
+ // @param std::shared_ptr *op - DatasetOp + // @return - The error code return + Status Build(std::shared_ptr *op); + + private: + bool builder_decode_; + std::string builder_dir_; + std::string builder_file_; + TaskType builder_task_type_; + int32_t builder_num_workers_; + int32_t builder_op_connector_size_; + int32_t builder_rows_per_buffer_; + std::shared_ptr builder_sampler_; + std::unique_ptr builder_schema_; + }; + + // Constructor + // @param TaskType task_type - task type of Coco + // @param std::string image_folder_path - image folder path of Coco + // @param std::string annotation_path - annotation json path of Coco + // @param int32_t num_workers - number of workers reading images in parallel + // @param int32_t rows_per_buffer - number of images (rows) in each buffer + // @param int32_t queue_size - connector queue size + // @param int64_t num_samples - number of samples to read + // @param bool decode - whether to decode images + // @param std::unique_ptr data_schema - the schema of the Coco dataset + // @param std::shared_ptr sampler - sampler tells CocoOp what to read + CocoOp(const TaskType &task_type, const std::string &image_folder_path, const std::string &annotation_path, + int32_t num_workers, int32_t rows_per_buffer, int32_t queue_size, bool decode, + std::unique_ptr data_schema, std::shared_ptr sampler); + + // Destructor + ~CocoOp() = default; + + // Worker thread pulls a number of IOBlock from IOBlock Queue, make a buffer and push it to Connector + // @param int32_t workerId - id of each worker + // @return Status - The error code return + Status WorkerEntry(int32_t worker_id) override; + + // Main Loop of CocoOp + // Master thread: Fill IOBlockQueue, then goes to sleep + // Worker thread: pulls IOBlock from IOBlockQueue, work on it the put buffer to mOutConnector + // @return Status - The error code return + Status operator()() override; + + // A print method typically used for debugging + // @param out + // @param show_all + void 
Print(std::ostream &out, bool show_all) const override; + + // @param const std::string &dir - Coco image dir path + // @param const std::string &file - Coco json file path + // @param const std::string &task - task mode of Coco task + // @param int64_t numSamples - samples number of CocoDataset + // @param int64_t *count - output rows number of CocoDataset + static Status CountTotalRows(const std::string &dir, const std::string &task_type, const std::string &task_mode, + int64_t *count); + + // @param const std::string &dir - Coco image dir path + // @param const std::string &file - Coco json file path + // @param const std::string &task - task mode of Coco task + // @param int64_t numSamples - samples number of CocoDataset + // @param std::map *output_class_indexing - output class index of CocoDataset + static Status GetClassIndexing(const std::string &dir, const std::string &task_type, const std::string &task_mode, + std::vector>> *output_class_indexing); + + private: + // Initialize Sampler, calls sampler->Init() within + // @return Status - The error code return + Status InitSampler(); + + // Load a tensor row according to image id + // @param row_id_type row_id - id for this tensor row + // @param std::string image_id - image id + // @param TensorRow row - image & target read into this tensor row + // @return Status - The error code return + Status LoadTensorRow(row_id_type row_id, const std::string &image_id, TensorRow *row); + + // Load a tensor row with vector which a vector to a tensor + // @param row_id_type row_id - id for this tensor row + // @param const std::string &image_id - image is + // @param std::shared_ptr image - image tensor + // @param std::shared_ptr coordinate - coordinate tensor + // @param TensorRow row - image & target read into this tensor row + // @return Status - The error code return + Status LoadDetectionTensorRow(row_id_type row_id, const std::string &image_id, std::shared_ptr image, + std::shared_ptr coordinate, TensorRow 
*trow); + + // Load a tensor row with vector which a vector to a tensor + // @param row_id_type row_id - id for this tensor row + // @param const std::string &image_id - image is + // @param std::shared_ptr image - image tensor + // @param std::shared_ptr coordinate - coordinate tensor + // @param TensorRow row - image & target read into this tensor row + // @return Status - The error code return + Status LoadSimpleTensorRow(row_id_type row_id, const std::string &image_id, std::shared_ptr image, + std::shared_ptr coordinate, TensorRow *trow); + + // Load a tensor row with vector which a vector to multi-tensor + // @param row_id_type row_id - id for this tensor row + // @param const std::string &image_id - image is + // @param std::shared_ptr image - image tensor + // @param std::shared_ptr coordinate - coordinate tensor + // @param TensorRow row - image & target read into this tensor row + // @return Status - The error code return + Status LoadMixTensorRow(row_id_type row_id, const std::string &image_id, std::shared_ptr image, + std::shared_ptr coordinate, TensorRow *trow); + + // @param const std::string &path - path to the image file + // @param const ColDescriptor &col - contains tensor implementation and datatype + // @param std::shared_ptr tensor - return + // @return Status - The error code return + Status ReadImageToTensor(const std::string &path, const ColDescriptor &col, std::shared_ptr *tensor); + + // @param const std::vector &keys - keys in ioblock + // @param std::unique_ptr db + // @return Status - The error code return + Status LoadBuffer(const std::vector &keys, std::unique_ptr *db); + + // Read annotation from Annotation folder + // @return Status - The error code return + Status ParseAnnotationIds(); + + // @param const std::shared_ptr &sample_ids - sample ids of tensor + // @param std::vector *keys - image id + // @return Status - The error code return + Status TraverseSampleIds(const std::shared_ptr &sample_ids, std::vector *keys); + + // Called 
first when function is called + // @return Status - The error code return + Status LaunchThreadsAndInitOp(); + + // Reset dataset state + // @return Status - The error code return + Status Reset() override; + + // @param nlohmann::json image_tree - image tree of json + // @param std::vector *image_vec - image id list of json + // @return Status - The error code return + Status ImageColumnLoad(nlohmann::json image_tree, std::vector *image_vec); + + // @param nlohmann::json categories_tree - categories tree of json + // return Status - The error code return + Status CategoriesColumnLoad(nlohmann::json categories_tree); + + // @param nlohmann::json categories_tree - categories tree of json + // @param const std::string &image_file - current image name in annotation + // @param const int32_t &id - current unique id of annotation + // @return Status - The error code return + Status DetectionColumnLoad(nlohmann::json annotation_tree, const std::string &image_file, const int32_t &id); + + // @param nlohmann::json categories_tree - categories tree of json + // @param const std::string &image_file - current image name in annotation + // @param const int32_t &id - current unique id of annotation + // @return Status - The error code return + Status StuffColumnLoad(nlohmann::json annotation_tree, const std::string &image_file, const int32_t &id); + + // @param nlohmann::json categories_tree - categories tree of json + // @param const std::string &image_file - current image name in annotation + // @param const int32_t &id - current unique id of annotation + // @return Status - The error code return + Status KeypointColumnLoad(nlohmann::json annotation_tree, const std::string &image_file, const int32_t &id); + + // @param nlohmann::json categories_tree - categories tree of json + // @param const std::string &image_file - current image name in annotation + // @param const int32_t &image_id - current unique id of annotation + // @return Status - The error code return + Status 
PanopticColumnLoad(nlohmann::json annotation_tree, const std::string &image_file, const int32_t &image_id); + + template + Status SearchNodeInJson(nlohmann::json input_tree, std::string node_name, T *output_node); + + bool decode_; + int64_t row_cnt_; + int64_t buf_cnt_; + std::string image_folder_path_; + std::string annotation_path_; + TaskType task_type_; + int32_t rows_per_buffer_; + std::shared_ptr sampler_; + std::unique_ptr data_schema_; + + WaitPost wp_; + std::vector image_ids_; + std::map image_index_; + QueueList> io_block_queues_; + std::vector>> label_index_; + std::map coordinate_map_; + std::map> simple_item_map_; + std::set category_set_; +}; +} // namespace dataset +} // namespace mindspore +#endif // DATASET_ENGINE_DATASETOPS_SOURCE_Coco_OP_H_ diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/generator_op.cc b/mindspore/ccsrc/dataset/engine/datasetops/source/generator_op.cc index fe0763c8b7..d316524c04 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/generator_op.cc +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/generator_op.cc @@ -25,7 +25,7 @@ namespace mindspore { namespace dataset { GeneratorOp::Builder::Builder() { - // Some arguments to the StorageOp constructor have a default argument that is taken + // Some arguments to the GeneratorOp constructor have a default argument that is taken // from the client config. build_buffer_size_ = kCfgRowsPerBuffer; build_op_connector_size_ = kCfgOpConnectorSize; diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/generator_op.h b/mindspore/ccsrc/dataset/engine/datasetops/source/generator_op.h index afeff29b86..82b395d6de 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/generator_op.h +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/generator_op.h @@ -72,7 +72,7 @@ class GeneratorOp : public PipelineOp { } // The builder "build" method creates the final object. 
- // @return shared_ptr to the new StorageOp object + // @return shared_ptr to the new GeneratorOp object Status Build(std::shared_ptr *); private: @@ -127,6 +127,10 @@ class GeneratorOp : public PipelineOp { // @return - Status of the node visit. Status Accept(NodePass *p, bool *modified) override; + // Op name getter + // @return Name of the current Op + std::string Name() const override { return "GeneratorOp"; } + private: py::function generator_function_; std::vector column_names_; diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/image_folder_op.cc b/mindspore/ccsrc/dataset/engine/datasetops/source/image_folder_op.cc index ce8fef7404..5cdfa8bb76 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/image_folder_op.cc +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/image_folder_op.cc @@ -26,8 +26,7 @@ namespace mindspore { namespace dataset { -ImageFolderOp::Builder::Builder() - : builder_decode_(false), builder_recursive_(false), builder_num_samples_(0), builder_sampler_(nullptr) { +ImageFolderOp::Builder::Builder() : builder_decode_(false), builder_recursive_(false), builder_sampler_(nullptr) { std::shared_ptr cfg = GlobalContext::config_manager(); builder_num_workers_ = cfg->num_parallel_workers(); builder_rows_per_buffer_ = cfg->rows_per_buffer(); @@ -37,7 +36,9 @@ ImageFolderOp::Builder::Builder() Status ImageFolderOp::Builder::Build(std::shared_ptr *ptr) { RETURN_IF_NOT_OK(SanityCheck()); if (builder_sampler_ == nullptr) { - builder_sampler_ = std::make_shared(); + const int64_t num_samples = 0; // default num samples of 0 means to sample entire set of data + const int64_t start_index = 0; + builder_sampler_ = std::make_shared(start_index, num_samples); } builder_schema_ = std::make_unique(); TensorShape scalar = TensorShape::CreateScalar(); @@ -46,9 +47,9 @@ Status ImageFolderOp::Builder::Build(std::shared_ptr *ptr) { RETURN_IF_NOT_OK(builder_schema_->AddColumn( ColDescriptor("label", DataType(DataType::DE_INT32), 
TensorImpl::kFlexible, 0, &scalar))); *ptr = std::make_shared(builder_num_workers_, builder_rows_per_buffer_, builder_dir_, - builder_op_connector_size_, builder_num_samples_, builder_recursive_, - builder_decode_, builder_extensions_, builder_labels_to_read_, - std::move(builder_schema_), std::move(builder_sampler_)); + builder_op_connector_size_, builder_recursive_, builder_decode_, + builder_extensions_, builder_labels_to_read_, std::move(builder_schema_), + std::move(builder_sampler_)); return Status::OK(); } @@ -61,20 +62,18 @@ Status ImageFolderOp::Builder::SanityCheck() { } ImageFolderOp::ImageFolderOp(int32_t num_wkrs, int32_t rows_per_buffer, std::string file_dir, int32_t queue_size, - int64_t num_samples, bool recursive, bool do_decode, const std::set &exts, + bool recursive, bool do_decode, const std::set &exts, const std::map &map, std::unique_ptr data_schema, std::shared_ptr sampler) : ParallelOp(num_wkrs, queue_size), rows_per_buffer_(rows_per_buffer), folder_path_(file_dir), - num_samples_(num_samples), recursive_(recursive), decode_(do_decode), extensions_(exts), class_index_(map), data_schema_(std::move(data_schema)), sampler_(std::move(sampler)), - num_rows_(0), row_cnt_(0), buf_cnt_(0), sampler_ind_(0), @@ -117,7 +116,11 @@ Status ImageFolderOp::PrescanMasterEntry(const std::string &filedir) { } image_label_pairs_.shrink_to_fit(); num_rows_ = image_label_pairs_.size(); - num_samples_ = (num_samples_ == 0 || num_samples_ > num_rows_) ? 
num_rows_ : num_samples_; + if (num_rows_ == 0) { + RETURN_STATUS_UNEXPECTED( + "There is no valid data matching the dataset API ImageFolderDatasetV2.Please check file path or dataset " + "API validation first."); + } // free memory of two queues used for pre-scan folder_name_queue_->Reset(); image_name_queue_->Reset(); @@ -128,7 +131,7 @@ Status ImageFolderOp::PrescanMasterEntry(const std::string &filedir) { Status ImageFolderOp::operator()() { RETURN_IF_NOT_OK(LaunchThreadsAndInitOp()); std::unique_ptr sampler_buffer; - RETURN_IF_NOT_OK(sampler_->GetNextBuffer(&sampler_buffer)); + RETURN_IF_NOT_OK(sampler_->GetNextSample(&sampler_buffer)); while (true) { // each iterator is 1 epoch std::vector keys; keys.reserve(rows_per_buffer_); @@ -138,8 +141,7 @@ Status ImageFolderOp::operator()() { std::shared_ptr sample_ids = sample_row[0]; if (sample_ids->type() != DataType(DataType::DE_INT64)) RETURN_STATUS_UNEXPECTED("Sampler Tensor isn't int64"); for (auto itr = sample_ids->begin(); itr != sample_ids->end(); ++itr) { - if ((*itr) >= num_rows_) continue; // index out of bound, skipping - if (row_cnt_ >= num_samples_) break; // enough row read, break for loop + if ((*itr) >= num_rows_) continue; // index out of bound, skipping keys.push_back(*itr); row_cnt_++; if (row_cnt_ % rows_per_buffer_ == 0) { @@ -148,7 +150,7 @@ Status ImageFolderOp::operator()() { keys.clear(); } } - RETURN_IF_NOT_OK(sampler_->GetNextBuffer(&sampler_buffer)); + RETURN_IF_NOT_OK(sampler_->GetNextSample(&sampler_buffer)); } if (keys.empty() == false) { RETURN_IF_NOT_OK( @@ -169,7 +171,7 @@ Status ImageFolderOp::operator()() { io_block_queues_[(buf_cnt_++) % num_workers_]->Add(std::make_unique(IOBlock::kDeIoBlockFlagEoe))); RETURN_IF_NOT_OK(wp_.Wait()); // Master thread goes to sleep after it has made all the IOBlocks wp_.Clear(); - RETURN_IF_NOT_OK(sampler_->GetNextBuffer(&sampler_buffer)); + RETURN_IF_NOT_OK(sampler_->GetNextSample(&sampler_buffer)); } } } @@ -202,23 +204,13 @@ Status 
ImageFolderOp::WorkerEntry(int32_t worker_id) { } // Load 1 TensorRow (image,label) using 1 ImageLabelPair. 1 function call produces 1 TensorTow in a DataBuffer -Status ImageFolderOp::LoadTensorRow(ImageLabelPair pairPtr, TensorRow *trow) { +Status ImageFolderOp::LoadTensorRow(row_id_type row_id, ImageLabelPair pairPtr, TensorRow *trow) { std::shared_ptr image, label; RETURN_IF_NOT_OK(Tensor::CreateTensor(&label, data_schema_->column(1).tensorImpl(), data_schema_->column(1).shape(), data_schema_->column(1).type(), reinterpret_cast(&pairPtr->second))); - std::ifstream fs; - fs.open(folder_path_ + (pairPtr->first), std::ios::binary | std::ios::in); - if (fs.fail()) { - RETURN_STATUS_UNEXPECTED("Fail to open file: " + pairPtr->first); - } - int64_t num_elements = fs.seekg(0, std::ios::end).tellg(); - (void)fs.seekg(0, std::ios::beg); - RETURN_IF_NOT_OK(Tensor::CreateTensor(&image, data_schema_->column(0).tensorImpl(), - TensorShape(std::vector(1, num_elements)), - data_schema_->column(0).type(), nullptr)); - (void)fs.read(reinterpret_cast(image->GetMutableBuffer()), num_elements); - fs.close(); + RETURN_IF_NOT_OK(Tensor::CreateTensor(&image, folder_path_ + (pairPtr->first))); + if (decode_ == true) { Status rc = Decode(image, &image); if (rc.IsError()) { @@ -226,7 +218,7 @@ Status ImageFolderOp::LoadTensorRow(ImageLabelPair pairPtr, TensorRow *trow) { RETURN_STATUS_UNEXPECTED(err); } } - (*trow) = {std::move(image), std::move(label)}; + (*trow) = TensorRow(row_id, {std::move(image), std::move(label)}); return Status::OK(); } @@ -235,7 +227,7 @@ Status ImageFolderOp::LoadBuffer(const std::vector &keys, std::unique_p std::unique_ptr deq = std::make_unique(); TensorRow trow; for (const int64_t &key : keys) { - RETURN_IF_NOT_OK(this->LoadTensorRow(image_label_pairs_[key], &trow)); + RETURN_IF_NOT_OK(this->LoadTensorRow(key, image_label_pairs_[key], &trow)); deq->push_back(std::move(trow)); } (*db)->set_tensor_table(std::move(deq)); @@ -260,7 +252,7 @@ void 
ImageFolderOp::Print(std::ostream &out, bool show_all) const { // Reset Sampler and wakeup Master thread (functor) Status ImageFolderOp::Reset() { - RETURN_IF_NOT_OK(sampler_->Reset()); + RETURN_IF_NOT_OK(sampler_->ResetSampler()); row_cnt_ = 0; wp_.Set(); // wake up master thread after reset is done return Status::OK(); @@ -272,28 +264,6 @@ Status ImageFolderOp::InitSampler() { return Status::OK(); } -// Derived from RandomAccessOp -Status ImageFolderOp::GetNumSamples(int64_t *num) const { - if (num == nullptr || num_samples_ == 0) { - RETURN_STATUS_UNEXPECTED( - "There is no valid data matching the dataset API ImageFolderDatasetV2.Please check file path or dataset API " - "validation first."); - } - (*num) = num_samples_; - return Status::OK(); -} - -// Derived from RandomAccessOp -Status ImageFolderOp::GetNumRowsInDataset(int64_t *num) const { - if (num == nullptr || num_rows_ == 0) { - RETURN_STATUS_UNEXPECTED( - "There is no valid data matching the dataset API ImageFolderDatasetV2.Please check file path or dataset API " - "validation first."); - } - (*num) = num_rows_; - return Status::OK(); -} - // Derived from RandomAccessOp Status ImageFolderOp::GetClassIds(std::map> *cls_ids) const { if (cls_ids == nullptr || !cls_ids->empty() || image_label_pairs_.empty()) { @@ -353,9 +323,7 @@ Status ImageFolderOp::PrescanWorkerEntry(int32_t worker_id) { // if mRecursive == false, don't go into folder of folders Status ImageFolderOp::RecursiveWalkFolder(Path *dir) { std::shared_ptr dir_itr = Path::DirIterator::OpenDirectory(dir); - if (dir_itr == nullptr) { - RETURN_STATUS_UNEXPECTED("Error encountered when indexing files"); - } + RETURN_UNEXPECTED_IF_NULL(dir_itr); while (dir_itr->hasNext()) { Path subdir = dir_itr->next(); if (subdir.IsDirectory()) { @@ -389,9 +357,7 @@ Status ImageFolderOp::startAsyncWalk() { } Status ImageFolderOp::LaunchThreadsAndInitOp() { - if (tree_ == nullptr) { - RETURN_STATUS_UNEXPECTED("tree_ not set"); - } + RETURN_UNEXPECTED_IF_NULL(tree_); 
// Registers QueueList and individual Queues for interrupt services RETURN_IF_NOT_OK(io_block_queues_.Register(tree_->AllTasks())); RETURN_IF_NOT_OK(folder_name_queue_->Register(tree_->AllTasks())); @@ -413,16 +379,14 @@ Status ImageFolderOp::LaunchThreadsAndInitOp() { return Status::OK(); } -Status ImageFolderOp::CountRowsAndClasses(const std::string &path, const int64_t &num_samples, - const std::set &exts, int64_t *num_rows, int64_t *num_classes, - int64_t dev_id, int64_t num_dev) { +Status ImageFolderOp::CountRowsAndClasses(const std::string &path, const std::set &exts, int64_t *num_rows, + int64_t *num_classes, int64_t dev_id, int64_t num_dev) { Path dir(path); std::string err_msg = ""; int64_t row_cnt = 0; err_msg += (dir.Exists() == false || dir.IsDirectory() == false) ? "unable to open dir " + path : ""; err_msg += (num_classes == nullptr || num_rows == nullptr) ? "num_class/num_rows is null\n" : ""; err_msg += (dev_id >= num_dev || num_dev <= 0) ? "invalid sharding config\n" : ""; - err_msg += num_samples < 0 ? "num_samples can't be negative! set it to 0 to use all samples\n" : ""; if (err_msg.empty() == false) { RETURN_STATUS_UNEXPECTED(err_msg); } @@ -441,10 +405,6 @@ Status ImageFolderOp::CountRowsAndClasses(const std::string &path, const int64_t while (dir_itr->hasNext()) { if (exts.empty() || exts.find(subdir.Extension()) != exts.end()) { ++row_cnt; - if (row_cnt == num_samples * num_dev) { - (*num_rows) = (row_cnt / num_dev) + (row_cnt % num_dev == 0 ? 
0 : 1); - return Status::OK(); - } } } foldernames.pop(); diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/image_folder_op.h b/mindspore/ccsrc/dataset/engine/datasetops/source/image_folder_op.h index 72d47224fb..e1d578e034 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/image_folder_op.h +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/image_folder_op.h @@ -107,14 +107,6 @@ class ImageFolderOp : public ParallelOp, public RandomAccessOp { return *this; } - // Setter method - // @param int64_t num_samples - // @return Builder setter method returns reference to the builder. - Builder &SetNumSamples(int64_t num_samples) { - builder_num_samples_ = num_samples; - return *this; - } - // Setter method // @param std::shared_ptr sampler // @return Builder setter method returns reference to the builder. @@ -153,7 +145,6 @@ class ImageFolderOp : public ParallelOp, public RandomAccessOp { bool builder_recursive_; std::string builder_dir_; int32_t builder_num_workers_; - int64_t builder_num_samples_; int32_t builder_rows_per_buffer_; int32_t builder_op_connector_size_; std::set builder_extensions_; @@ -169,10 +160,9 @@ class ImageFolderOp : public ParallelOp, public RandomAccessOp { // @param int32_t queue_size - connector queue size // @param std::set exts - set of file extensions to read, if empty, read everything under the dir // @param td::unique_ptr sampler - sampler tells ImageFolderOp what to read - ImageFolderOp(int32_t num_wkrs, int32_t rows_per_buffer, std::string file_dir, int32_t queue_size, - int64_t num_samples, bool recursive, bool do_decode, const std::set &exts, - const std::map &map, std::unique_ptr, - std::shared_ptr sampler); + ImageFolderOp(int32_t num_wkrs, int32_t rows_per_buffer, std::string file_dir, int32_t queue_size, bool recursive, + bool do_decode, const std::set &exts, const std::map &map, + std::unique_ptr, std::shared_ptr sampler); // Destructor. 
~ImageFolderOp() = default; @@ -198,16 +188,6 @@ class ImageFolderOp : public ParallelOp, public RandomAccessOp { // @return Status - The error code return Status operator()() override; - // Method derived from RandomAccess Op, enable Sampler to get numRows - // @param int64_t num - to return numRows - // @return Status - The error code return - Status GetNumSamples(int64_t *num) const override; - - // Method derived from RandomAccess Op, enable Sampler to get total numRows in dataset - // @param int64_t num - to return numRows - // @return Status - The error code return - Status GetNumRowsInDataset(int64_t *num) const override; - // Method derived from RandomAccess Op, enable Sampler to get all ids for each class // @param (std::map> * map - key label, val all ids for this class // @return Status - The error code return @@ -218,12 +198,11 @@ class ImageFolderOp : public ParallelOp, public RandomAccessOp { // @param show_all void Print(std::ostream &out, bool show_all) const override; - // This function is a hack! It is to return the num_class and num_rows the old storageOp does. The result + // This function is a hack! It is to return the num_class and num_rows. The result // returned by this function may not be consistent with what image_folder_op is going to return // user this at your own risk! - static Status CountRowsAndClasses(const std::string &path, const int64_t &num_samples, - const std::set &exts, int64_t *num_rows, int64_t *num_classes, - int64_t dev_id = 0, int64_t num_dev = 1); + static Status CountRowsAndClasses(const std::string &path, const std::set &exts, int64_t *num_rows, + int64_t *num_classes, int64_t dev_id = 0, int64_t num_dev = 1); // Base-class override for NodePass visitor acceptor. // @param p - Pointer to the NodePass to be accepted. @@ -231,16 +210,21 @@ class ImageFolderOp : public ParallelOp, public RandomAccessOp { // @return - Status of the node visit. 
Status Accept(NodePass *p, bool *modified) override; + // Op name getter + // @return Name of the current Op + std::string Name() const override { return "ImageFolderOp"; } + private: // Initialize Sampler, calls sampler->Init() within // @return Status - The error code return Status InitSampler(); // Load a tensor row according to a pair + // @param row_id_type row_id - id for this tensor row // @param ImageLabelPair pair - // @param TensorRow row - image & label read into this tensor row // @return Status - The error code return - Status LoadTensorRow(ImageLabelPair pair, TensorRow *row); + Status LoadTensorRow(row_id_type row_id, ImageLabelPair pair, TensorRow *row); // @param const std::vector &keys - keys in ioblock // @param std::unique_ptr db @@ -266,14 +250,12 @@ class ImageFolderOp : public ParallelOp, public RandomAccessOp { int32_t rows_per_buffer_; std::string folder_path_; // directory of image folder - int64_t num_samples_; bool recursive_; bool decode_; std::set extensions_; // extensions allowed std::map class_index_; std::unique_ptr data_schema_; std::shared_ptr sampler_; - int64_t num_rows_; // total number of images in ImageFolder int64_t row_cnt_; int64_t buf_cnt_; int64_t sampler_ind_; diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/io_block.cc b/mindspore/ccsrc/dataset/engine/datasetops/source/io_block.cc index 9f45e2179f..0963f1a67a 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/io_block.cc +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/io_block.cc @@ -72,8 +72,9 @@ Status FilenameBlock::GetFilename(std::string *out_filename, const AutoIndexObj< RETURN_IF_NOT_OK(IOBlock::GetKey(&fetched_key)); // Do an index lookup using that key to get the filename. 
- auto it = index.Search(fetched_key); - if (it != index.end()) { + auto r = index.Search(fetched_key); + if (r.second) { + auto &it = r.first; *out_filename = it.value(); } else { RETURN_STATUS_UNEXPECTED("Could not find filename from index"); diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/manifest_op.cc b/mindspore/ccsrc/dataset/engine/datasetops/source/manifest_op.cc index 5892b10701..0762f36d5a 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/manifest_op.cc +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/manifest_op.cc @@ -29,7 +29,7 @@ namespace mindspore { namespace dataset { -ManifestOp::Builder::Builder() : builder_sampler_(nullptr), builder_num_samples_(0), builder_decode_(false) { +ManifestOp::Builder::Builder() : builder_sampler_(nullptr), builder_decode_(false) { std::shared_ptr cfg = GlobalContext::config_manager(); builder_num_workers_ = cfg->num_parallel_workers(); builder_rows_per_buffer_ = cfg->rows_per_buffer(); @@ -39,16 +39,18 @@ ManifestOp::Builder::Builder() : builder_sampler_(nullptr), builder_num_samples_ Status ManifestOp::Builder::Build(std::shared_ptr *ptr) { RETURN_IF_NOT_OK(SanityCheck()); if (builder_sampler_ == nullptr) { - builder_sampler_ = std::make_shared(); + const int64_t num_samples = 0; + const int64_t start_index = 0; + builder_sampler_ = std::make_shared(start_index, num_samples); } builder_schema_ = std::make_unique(); RETURN_IF_NOT_OK( builder_schema_->AddColumn(ColDescriptor("image", DataType(DataType::DE_UINT8), TensorImpl::kFlexible, 1))); RETURN_IF_NOT_OK( builder_schema_->AddColumn(ColDescriptor("label", DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 1))); - *ptr = std::make_shared( - builder_num_workers_, builder_rows_per_buffer_, builder_file_, builder_op_connector_size_, builder_num_samples_, - builder_decode_, builder_labels_to_read_, std::move(builder_schema_), std::move(builder_sampler_), builder_usage_); + *ptr = std::make_shared(builder_num_workers_, 
builder_rows_per_buffer_, builder_file_, + builder_op_connector_size_, builder_decode_, builder_labels_to_read_, + std::move(builder_schema_), std::move(builder_sampler_), builder_usage_); return Status::OK(); } @@ -59,9 +61,9 @@ Status ManifestOp::Builder::SanityCheck() { return err_msg.empty() ? Status::OK() : Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, err_msg); } -ManifestOp::ManifestOp(int32_t num_works, int32_t rows_per_buffer, std::string file, int32_t queue_size, - int64_t num_samples, bool decode, const std::map &class_index, - std::unique_ptr data_schema, std::shared_ptr sampler, std::string usage) +ManifestOp::ManifestOp(int32_t num_works, int32_t rows_per_buffer, std::string file, int32_t queue_size, bool decode, + const std::map &class_index, std::unique_ptr data_schema, + std::shared_ptr sampler, std::string usage) : ParallelOp(num_works, queue_size), rows_per_buffer_(rows_per_buffer), io_block_pushed_(0), @@ -71,8 +73,6 @@ ManifestOp::ManifestOp(int32_t num_works, int32_t rows_per_buffer, std::string f file_(file), class_index_(class_index), sampler_(std::move(sampler)), - num_samples_(num_samples), - num_rows_(0), decode_(decode), usage_(usage), buf_cnt_(0) { @@ -88,7 +88,7 @@ ManifestOp::ManifestOp(int32_t num_works, int32_t rows_per_buffer, std::string f Status ManifestOp::operator()() { RETURN_IF_NOT_OK(LaunchThreadsAndInitOp()); std::unique_ptr sampler_buffer; - RETURN_IF_NOT_OK(sampler_->GetNextBuffer(&sampler_buffer)); + RETURN_IF_NOT_OK(sampler_->GetNextSample(&sampler_buffer)); return AddIoBlock(&sampler_buffer); } @@ -101,8 +101,7 @@ Status ManifestOp::AddIoBlock(std::unique_ptr *sampler_buffer) { RETURN_IF_NOT_OK((*sampler_buffer)->PopRow(&sample_row)); std::shared_ptr sample_ids = sample_row[0]; for (auto itr = sample_ids->begin(); itr != sample_ids->end(); ++itr) { - if ((*itr) >= num_rows_) continue; // index out of bound, skipping - if (row_cnt_ >= num_samples_) break; // enough row read, break for loop + if ((*itr) >= 
num_rows_) continue; // index out of bound, skipping keys.push_back(*itr); row_cnt_++; if (row_cnt_ % rows_per_buffer_ == 0) { @@ -111,7 +110,7 @@ Status ManifestOp::AddIoBlock(std::unique_ptr *sampler_buffer) { keys.clear(); } } - RETURN_IF_NOT_OK(sampler_->GetNextBuffer(sampler_buffer)); + RETURN_IF_NOT_OK(sampler_->GetNextSample(sampler_buffer)); } if (keys.empty() == false) { RETURN_IF_NOT_OK(io_block_queues_[(buf_cnt_++) % num_workers_]->Add( @@ -132,7 +131,7 @@ Status ManifestOp::AddIoBlock(std::unique_ptr *sampler_buffer) { io_block_queues_[(buf_cnt_++) % num_workers_]->Add(std::make_unique(IOBlock::kDeIoBlockFlagEoe))); RETURN_IF_NOT_OK(wp_.Wait()); // Master thread goes to sleep after it has made all the IOBlocks wp_.Clear(); - RETURN_IF_NOT_OK(sampler_->GetNextBuffer(sampler_buffer)); + RETURN_IF_NOT_OK(sampler_->GetNextSample(sampler_buffer)); } } } @@ -183,7 +182,8 @@ Status ManifestOp::WorkerEntry(int32_t worker_id) { } // Load 1 TensorRow (image,label) using 1 ImageLabelPair. 
1 function call produces 1 TensorTow in a DataBuffer -Status ManifestOp::LoadTensorRow(const std::pair> &data, TensorRow *trow) { +Status ManifestOp::LoadTensorRow(row_id_type row_id, const std::pair> &data, + TensorRow *trow) { std::shared_ptr image; std::shared_ptr label; std::vector label_index(data.second.size()); @@ -199,23 +199,7 @@ Status ManifestOp::LoadTensorRow(const std::paircolumn(1).type(), reinterpret_cast(&label_index[0]))); } - std::ifstream fs; - fs.open(data.first, std::ios::binary | std::ios::in); - if (!fs.is_open()) { - RETURN_STATUS_UNEXPECTED("Fail to open file: " + data.first); - } - - int64_t num_elements = fs.seekg(0, std::ios::end).tellg(); - (void)fs.seekg(0, std::ios::beg); - RETURN_IF_NOT_OK(Tensor::CreateTensor(&image, data_schema_->column(0).tensorImpl(), - TensorShape(std::vector(1, num_elements)), - data_schema_->column(0).type(), nullptr)); - (void)fs.read(reinterpret_cast(image->GetMutableBuffer()), num_elements); - if (fs.fail()) { - fs.close(); - RETURN_STATUS_UNEXPECTED("Fail to read file: " + data.first); - } - fs.close(); + RETURN_IF_NOT_OK(Tensor::CreateTensor(&image, data.first)); if (decode_ == true) { Status rc = Decode(image, &image); if (rc.IsError()) { @@ -223,7 +207,7 @@ Status ManifestOp::LoadTensorRow(const std::pair &keys, std::unique_ptr< std::unique_ptr deq = std::make_unique(); for (const auto &key : keys) { TensorRow trow; - RETURN_IF_NOT_OK(LoadTensorRow(image_labelname_[static_cast(key)], &trow)); + RETURN_IF_NOT_OK(LoadTensorRow(key, image_labelname_[static_cast(key)], &trow)); deq->push_back(std::move(trow)); } (*db)->set_tensor_table(std::move(deq)); @@ -257,7 +241,7 @@ void ManifestOp::Print(std::ostream &out, bool show_all) const { // Reset Sampler and wakeup Master thread (functor) Status ManifestOp::Reset() { - RETURN_IF_NOT_OK(sampler_->Reset()); + RETURN_IF_NOT_OK(sampler_->ResetSampler()); row_cnt_ = 0; wp_.Set(); // wake up master thread after reset is done return Status::OK(); @@ -269,28 +253,6 
@@ Status ManifestOp::InitSampler() { return Status::OK(); } -// Derived from RandomAccessOp -Status ManifestOp::GetNumSamples(int64_t *num) const { - if (num == nullptr || num_rows_ == 0) { - RETURN_STATUS_UNEXPECTED( - "There is no valid data matching the dataset API ManifestDataset.Please check file path or dataset API " - "validation first."); - } - (*num) = num_samples_; - return Status::OK(); -} - -// Derived from RandomAccessOp -Status ManifestOp::GetNumRowsInDataset(int64_t *num) const { - if (num == nullptr || num_rows_ == 0) { - RETURN_STATUS_UNEXPECTED( - "There is no valid data matching the dataset API ManifestDataset.Please check file path or dataset API " - "validation first."); - } - (*num) = num_rows_; - return Status::OK(); -} - // Derived from RandomAccessOp Status ManifestOp::GetClassIds(std::map> *cls_ids) const { if (cls_ids == nullptr || !cls_ids->empty() || image_labelname_.empty()) { @@ -408,7 +370,6 @@ Status ManifestOp::CountDatasetInfo() { } num_rows_ = static_cast(image_labelname_.size()); - num_samples_ = (num_samples_ == 0 || num_samples_ > num_rows_) ? 
num_rows_ : num_samples_; if (num_rows_ == 0) { RETURN_STATUS_UNEXPECTED( "There is no valid data matching the dataset API ManifestDataset.Please check file path or dataset API " @@ -417,8 +378,8 @@ Status ManifestOp::CountDatasetInfo() { return Status::OK(); } -Status ManifestOp::CountTotalRows(const std::string &file, int64_t numSamples, const py::dict &dict, - const std::string &usage, int64_t *count, int64_t *numClasses) { +Status ManifestOp::CountTotalRows(const std::string &file, const py::dict &dict, const std::string &usage, + int64_t *count, int64_t *numClasses) { // the logic of counting the number of samples is copied from ParseManifestFile() std::map map; for (auto p : dict) { @@ -428,17 +389,15 @@ Status ManifestOp::CountTotalRows(const std::string &file, int64_t numSamples, c std::shared_ptr op; *count = 0; - RETURN_IF_NOT_OK( - Builder().SetManifestFile(file).SetNumSamples(numSamples).SetClassIndex(map).SetUsage(usage).Build(&op)); + RETURN_IF_NOT_OK(Builder().SetManifestFile(file).SetClassIndex(map).SetUsage(usage).Build(&op)); RETURN_IF_NOT_OK(op->ParseManifestFile()); *numClasses = static_cast(op->label_index_.size()); *count = static_cast(op->image_labelname_.size()); - *count = (*count < numSamples || numSamples == 0) ? 
*count : numSamples; return Status::OK(); } -Status ManifestOp::GetClassIndexing(const std::string &file, int64_t numSamples, const py::dict &dict, - const std::string &usage, std::map *output_class_indexing) { +Status ManifestOp::GetClassIndexing(const std::string &file, const py::dict &dict, const std::string &usage, + std::map *output_class_indexing) { std::map input_class_indexing; for (auto p : dict) { (void)input_class_indexing.insert(std::pair(py::reinterpret_borrow(p.first), @@ -449,12 +408,7 @@ Status ManifestOp::GetClassIndexing(const std::string &file, int64_t numSamples, *output_class_indexing = input_class_indexing; } else { std::shared_ptr op; - RETURN_IF_NOT_OK(Builder() - .SetManifestFile(file) - .SetNumSamples(numSamples) - .SetClassIndex(input_class_indexing) - .SetUsage(usage) - .Build(&op)); + RETURN_IF_NOT_OK(Builder().SetManifestFile(file).SetClassIndex(input_class_indexing).SetUsage(usage).Build(&op)); RETURN_IF_NOT_OK(op->ParseManifestFile()); RETURN_IF_NOT_OK(op->CountDatasetInfo()); uint32_t count = 0; diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/manifest_op.h b/mindspore/ccsrc/dataset/engine/datasetops/source/manifest_op.h index e015496acc..edfdbb51ae 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/manifest_op.h +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/manifest_op.h @@ -86,14 +86,6 @@ class ManifestOp : public ParallelOp, public RandomAccessOp { return *this; } - // Setter method - // @param int64_t num_samples - // @return Builder setter method returns reference to the builder. - Builder &SetNumSamples(int64_t num_samples) { - builder_num_samples_ = num_samples; - return *this; - } - // Setter method // @param std::shared_ptr sampler // @return Builder setter method returns reference to the builder. 
@@ -129,7 +121,6 @@ class ManifestOp : public ParallelOp, public RandomAccessOp { private: std::shared_ptr builder_sampler_; - int64_t builder_num_samples_; bool builder_decode_; std::string builder_file_; @@ -147,8 +138,8 @@ class ManifestOp : public ParallelOp, public RandomAccessOp { // @param std::string - file list of Manifest // @param int32_t queue_size - connector queue size // @param td::unique_ptr sampler - sampler tells ImageFolderOp what to read - ManifestOp(int32_t num_works, int32_t rows_per_buffer, std::string file, int32_t queue_size, int64_t num_samples, - bool decode, const std::map &class_index, std::unique_ptr data_schema, + ManifestOp(int32_t num_works, int32_t rows_per_buffer, std::string file, int32_t queue_size, bool decode, + const std::map &class_index, std::unique_ptr data_schema, std::shared_ptr sampler, std::string usage); // Destructor. ~ManifestOp() = default; @@ -164,16 +155,6 @@ class ManifestOp : public ParallelOp, public RandomAccessOp { // @return Status - The error code return Status operator()() override; - // Method derived from RandomAccess Op, enable Sampler to get numRows - // @param int64_t num - to return numRows - // @return Status - The error code return - Status GetNumSamples(int64_t *num) const override; - - // Method derived from RandomAccess Op, enable Sampler to get total number of Rows in dataset - // @param int64_t num - to return numRows - // @return Status - The error code return - Status GetNumRowsInDataset(int64_t *num) const override; - // Method derived from RandomAccess Op, enable Sampler to get all ids for each class // @param (std::map> * map - key label, val all ids for this class // @return Status - The error code return @@ -184,12 +165,16 @@ class ManifestOp : public ParallelOp, public RandomAccessOp { // @param show_all void Print(std::ostream &out, bool show_all) const override; - static Status CountTotalRows(const std::string &file, int64_t numSamples, const py::dict &dict, - const std::string 
&usage, int64_t *count, int64_t *numClasses); + static Status CountTotalRows(const std::string &file, const py::dict &dict, const std::string &usage, int64_t *count, + int64_t *numClasses); // Get str-to-int mapping from label name to index - static Status GetClassIndexing(const std::string &file, int64_t numSamples, const py::dict &dict, - const std::string &usage, std::map *output_class_indexing); + static Status GetClassIndexing(const std::string &file, const py::dict &dict, const std::string &usage, + std::map *output_class_indexing); + + // Op name getter + // @return Name of the current Op + std::string Name() const override { return "ManifestOp"; } private: // Initialize Sampler, calls sampler->Init() within @@ -202,10 +187,12 @@ class ManifestOp : public ParallelOp, public RandomAccessOp { Status AddIoBlock(std::unique_ptr *sampler_buffer); // Load a tensor row according to a pair + // @param row_id_type row_id - id for this tensor row // @param std::pair> - > // @param TensorRow row - image & label read into this tensor row // @return Status - The error code return - Status LoadTensorRow(const std::pair> &data, TensorRow *row); + Status LoadTensorRow(row_id_type row_id, const std::pair> &data, + TensorRow *row); // @param const std::vector &keys - keys in ioblock // @param std::unique_ptr db @@ -240,8 +227,6 @@ class ManifestOp : public ParallelOp, public RandomAccessOp { std::string file_; // file that store the information of images std::map class_index_; std::shared_ptr sampler_; - int64_t num_samples_; - int64_t num_rows_; bool decode_; std::string usage_; int64_t buf_cnt_; diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/mindrecord_op.cc b/mindspore/ccsrc/dataset/engine/datasetops/source/mindrecord_op.cc index 358dd07872..0f762386af 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/mindrecord_op.cc +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/mindrecord_op.cc @@ -44,7 +44,7 @@ using mindrecord::ShardReader; 
MindRecordOp::Builder::Builder() : build_dataset_file_({}) { // Some arguments to the MindRecordOp constructor have a default argument that is taken // from the client config. - // The user may choose to change these values for the construction of the StorageOp by + // The user may choose to change these values for the construction of the MindRecordOp by // using the various builder set methods. std::shared_ptr cfg = GlobalContext::config_manager(); @@ -53,6 +53,8 @@ MindRecordOp::Builder::Builder() : build_dataset_file_({}) { build_op_connector_queue_size_ = cfg->op_connector_size(); build_block_reader_ = false; builder_num_workers_ = 0; + build_num_padded_ = 0; + build_sample_ = nullptr; } // The builder "build" method creates the final object. @@ -63,24 +65,57 @@ Status MindRecordOp::Builder::Build(std::shared_ptr *ptr) { return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, "Building a MindRecordOp that has not provided a file."); } - + mindrecord::json sample_json; + if (build_num_padded_ > 0) { + sample_json = ToJson(build_sample_); + } new_mind_record_op = std::make_shared( build_num_mind_record_workers_, build_rows_per_buffer_, build_dataset_file_, build_load_dataset_, - build_op_connector_queue_size_, build_columns_to_load_, build_operators_, build_block_reader_); + build_op_connector_queue_size_, build_columns_to_load_, build_operators_, build_block_reader_, build_num_padded_, + sample_json, build_sample_bytes_); RETURN_IF_NOT_OK(new_mind_record_op->Init()); - *ptr = std::move(new_mind_record_op); return Status::OK(); } Status MindRecordOp::Builder::SanityCheck() const { return Status::OK(); } +mindrecord::json MindRecordOp::Builder::ToJson(const py::handle &obj) { + if (obj.is_none()) { + return nullptr; + } + if (py::isinstance(obj)) { + return obj.cast(); + } + if (py::isinstance(obj)) { + return obj.cast(); + } + if (py::isinstance(obj)) { // also catch py::bytes + return obj.cast(); + } + if (py::isinstance(obj)) { + auto out = 
mindrecord::json::object(); + for (const py::handle &key : obj) { + if (py::isinstance(obj[key])) { + build_sample_bytes_[py::str(key).cast()] = obj[key].cast(); + } else { + out[py::str(key).cast()] = ToJson(obj[key]); + } + } + return out; + } + MS_LOG(ERROR) << "Python object convert to json failed, object is: " << py::cast(obj); + return mindrecord::json(); +} + // Constructor of the MindRecordOp. MindRecordOp::MindRecordOp(int32_t num_mind_record_workers, int32_t rows_per_buffer, std::vector dataset_file, bool load_dataset, int32_t op_connector_queue_size, const std::vector &columns_to_load, - const std::vector> &operators, const bool &block_reader) + const std::vector> &operators, const bool &block_reader, + int64_t num_padded, const mindrecord::json &sample_json, + const std::map &sample_bytes) : ParallelOp(num_mind_record_workers, op_connector_queue_size), rows_per_buffer_(rows_per_buffer), dataset_file_(dataset_file), @@ -89,11 +124,14 @@ MindRecordOp::MindRecordOp(int32_t num_mind_record_workers, int32_t rows_per_buf operators_(operators), num_mind_record_workers_(num_mind_record_workers), block_reader_(block_reader), + num_rows_(0), buffers_needed_(0), buf_cnt_(0), - num_rows_(0), ended_worker_(0), - buffer_water_mark_(0) { + buffer_water_mark_(0), + num_padded_(num_padded), + sample_json_(sample_json), + sample_bytes_(sample_bytes) { io_blk_queues_.Init(num_workers_, op_connector_queue_size); if (!block_reader_) return; for (int32_t i = 0; i < num_workers_; ++i) { @@ -105,7 +143,7 @@ MindRecordOp::MindRecordOp(int32_t num_mind_record_workers, int32_t rows_per_buf Status MindRecordOp::Init() { shard_reader_ = std::make_unique(); auto rc = shard_reader_->Open(dataset_file_, load_dataset_, num_mind_record_workers_, columns_to_load_, operators_, - block_reader_); + block_reader_, num_padded_); CHECK_FAIL_RETURN_UNEXPECTED(rc == MSRStatus::SUCCESS, "MindRecordOp init failed. 
Error message: " + ErrnoToMessage(rc)); @@ -162,10 +200,6 @@ Status MindRecordOp::Init() { column_name_id_map_[columns_to_load_[i]] = i; } - num_rows_ = shard_reader_->GetNumRows(); - // Compute how many buffers we would need to accomplish rowsPerBuffer - buffers_needed_ = (num_rows_ + rows_per_buffer_ - 1) / rows_per_buffer_; - return Status::OK(); } @@ -262,20 +296,30 @@ Status MindRecordOp::GetBufferFromReader(std::unique_ptr *fetched_bu std::unique_ptr tensor_table = std::make_unique(); for (int32_t i = 0; i < rows_per_buffer_; ++i) { ShardTuple tupled_buffer; + mindrecord::TaskType task_type = mindrecord::TaskType::kCommonTask; if (block_reader_) { if (i >= block_buffer_[buffer_id % num_workers_]->size()) break; tupled_buffer = block_buffer_[buffer_id % num_workers_]->at(i); } else { int32_t row_id = buffer_id * rows_per_buffer_ + i; - tupled_buffer = shard_reader_->GetNextById(row_id, worker_id); + auto rc = shard_reader_->GetNextById(row_id, worker_id); + task_type = rc.first; + tupled_buffer = rc.second; + if (task_type == mindrecord::TaskType::kPaddedTask) { + TensorRow tensor_row; + RETURN_IF_NOT_OK(LoadTensorRow(&tensor_row, {}, mindrecord::json(), task_type)); + tensor_table->push_back(std::move(tensor_row)); + } if (tupled_buffer.empty()) break; } - for (const auto &tupled_row : tupled_buffer) { - std::vector columns_blob = std::get<0>(tupled_row); - mindrecord::json columns_json = std::get<1>(tupled_row); - TensorRow tensor_row; - RETURN_IF_NOT_OK(LoadTensorRow(&tensor_row, columns_blob, columns_json)); - tensor_table->push_back(std::move(tensor_row)); + if (task_type == mindrecord::TaskType::kCommonTask) { + for (const auto &tupled_row : tupled_buffer) { + std::vector columns_blob = std::get<0>(tupled_row); + mindrecord::json columns_json = std::get<1>(tupled_row); + TensorRow tensor_row; + RETURN_IF_NOT_OK(LoadTensorRow(&tensor_row, columns_blob, columns_json, task_type)); + tensor_table->push_back(std::move(tensor_row)); + } } } @@ -285,7 +329,7 @@ 
Status MindRecordOp::GetBufferFromReader(std::unique_ptr *fetched_bu } Status MindRecordOp::LoadTensorRow(TensorRow *tensor_row, const std::vector &columns_blob, - const mindrecord::json &columns_json) { + const mindrecord::json &columns_json, const mindrecord::TaskType task_type) { for (uint32_t i_col = 0; i_col < columns_to_load_.size(); i_col++) { auto column_name = columns_to_load_[i_col]; @@ -298,11 +342,39 @@ Status MindRecordOp::LoadTensorRow(TensorRow *tensor_row, const std::vector column_shape; // Get column data - auto has_column = shard_reader_->GetShardColumn()->GetColumnValueByName( - column_name, columns_blob, columns_json, &data, &data_ptr, &n_bytes, &column_data_type, &column_data_type_size, - &column_shape); - if (has_column == MSRStatus::FAILED) { - RETURN_STATUS_UNEXPECTED("Failed to retrieve data from mindrecord reader."); + auto shard_column = shard_reader_->GetShardColumn(); + if (num_padded_ > 0 && task_type == mindrecord::TaskType::kPaddedTask) { + auto rc = + shard_column->GetColumnTypeByName(column_name, &column_data_type, &column_data_type_size, &column_shape); + if (rc.first != MSRStatus::SUCCESS) { + RETURN_STATUS_UNEXPECTED("Failed to retrieve data type."); + } + if (rc.second == mindrecord::ColumnInRaw) { + auto has_column = shard_column->GetColumnFromJson(column_name, sample_json_, &data_ptr, &n_bytes); + if (has_column == MSRStatus::FAILED) { + RETURN_STATUS_UNEXPECTED("Failed to retrieve raw data from padding sample."); + } + } else if (rc.second == mindrecord::ColumnInBlob) { + if (sample_bytes_.find(column_name) == sample_bytes_.end()) { + RETURN_STATUS_UNEXPECTED("Failed to retrieve blob data from padding sample."); + } + std::string ss(sample_bytes_[column_name]); + n_bytes = ss.size(); + data_ptr = std::make_unique(n_bytes); + std::copy(ss.begin(), ss.end(), data_ptr.get()); + } else { + RETURN_STATUS_UNEXPECTED("Retrieved data type is unknown."); + } + if (data == nullptr) { + data = reinterpret_cast(data_ptr.get()); + } + } 
else { + auto has_column = + shard_column->GetColumnValueByName(column_name, columns_blob, columns_json, &data, &data_ptr, &n_bytes, + &column_data_type, &column_data_type_size, &column_shape); + if (has_column == MSRStatus::FAILED) { + RETURN_STATUS_UNEXPECTED("Failed to retrieve data from mindrecord reader."); + } } std::shared_ptr tensor; @@ -335,7 +407,8 @@ Status MindRecordOp::FetchBlockBuffer(const int32_t &buffer_id) { } for (int32_t i = 0; i < rows_per_buffer_; i++) { // Block reader does NOT care about argument - ShardTuple tuple_buffer = shard_reader_->GetNextById(i, i); + auto rc = shard_reader_->GetNextById(i, i); + ShardTuple tuple_buffer = rc.second; if (tuple_buffer.empty()) break; block_buffer_[buffer_id % num_workers_]->push_back(std::move(tuple_buffer)); } @@ -349,11 +422,8 @@ Status MindRecordOp::FetchBlockBuffer(const int32_t &buffer_id) { Status MindRecordOp::operator()() { RETURN_IF_NOT_OK(LaunchThreadAndInitOp()); num_rows_ = shard_reader_->GetNumRows(); - - buffers_needed_ = num_rows_ / rows_per_buffer_; - if (num_rows_ % rows_per_buffer_ != 0) { - buffers_needed_++; - } + // Compute how many buffers we would need to accomplish rowsPerBuffer + buffers_needed_ = (num_rows_ + rows_per_buffer_ - 1) / rows_per_buffer_; while (true) { // each iterator is 1 epoch for (int32_t i = 0; i < buffers_needed_; ++i) { @@ -418,9 +488,9 @@ Status MindRecordOp::LaunchThreadAndInitOp() { } Status MindRecordOp::CountTotalRows(const std::vector dataset_path, bool load_dataset, - const std::shared_ptr &op, int64_t *count) { + const std::shared_ptr &op, int64_t *count, int64_t num_padded) { std::unique_ptr shard_reader = std::make_unique(); - MSRStatus rc = shard_reader->CountTotalRows(dataset_path, load_dataset, op, count); + MSRStatus rc = shard_reader->CountTotalRows(dataset_path, load_dataset, op, count, num_padded); if (rc == MSRStatus::FAILED) { RETURN_STATUS_UNEXPECTED("MindRecordOp count total rows failed."); } diff --git 
a/mindspore/ccsrc/dataset/engine/datasetops/source/mindrecord_op.h b/mindspore/ccsrc/dataset/engine/datasetops/source/mindrecord_op.h index 251b4f9130..b704240aaa 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/mindrecord_op.h +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/mindrecord_op.h @@ -104,10 +104,22 @@ class MindRecordOp : public ParallelOp { return *this; } + Builder &SetNumToPadSamples(int64_t num_padded) { + build_num_padded_ = num_padded; + return *this; + } + + Builder &SetPaddedSample(const py::handle &sample) { + build_sample_ = sample; + return *this; + } + Status SanityCheck() const; static int32_t num_mind_record_workers() { return kDefaultMindRecordWorkers; } + mindrecord::json ToJson(const py::handle &obj); + private: static constexpr int32_t kDefaultMindRecordWorkers = 4; // The builder saves all MindRecordOp construction arguments internally. @@ -121,6 +133,9 @@ class MindRecordOp : public ParallelOp { std::vector build_columns_to_load_; std::vector> build_operators_; bool build_block_reader_; + int64_t build_num_padded_; + py::handle build_sample_; + std::map build_sample_bytes_; }; // Constructor of the MindRecordOp. 
@@ -133,7 +148,9 @@ class MindRecordOp : public ParallelOp { // @param operators - ShardOperators for Shuffle, Category, Sample MindRecordOp(int32_t num_mind_record_workers, int32_t rows_per_buffer, std::vector dataset_file, bool load_dataset, int32_t op_connector_queue_size, const std::vector &columns_to_load, - const std::vector> &operators, const bool &block_reader); + const std::vector> &operators, const bool &block_reader, + int64_t num_padded_, const mindrecord::json &sample_json, + const std::map &sample_bytes_); // Destructor ~MindRecordOp() override; @@ -178,7 +195,7 @@ class MindRecordOp : public ParallelOp { int32_t num_rows() const { return num_rows_; } static Status CountTotalRows(const std::vector dataset_path, bool load_dataset, - const std::shared_ptr &op, int64_t *count); + const std::shared_ptr &op, int64_t *count, int64_t num_padded); // Getter method int32_t rows_per_buffer() const { return rows_per_buffer_; } @@ -201,6 +218,10 @@ class MindRecordOp : public ParallelOp { // @return - Status of the node visit. 
Status Accept(NodePass *p, bool *modified) override; + // Op name getter + // @return Name of the current Op + std::string Name() const override { return "MindRecordOp"; } + private: Status GetBufferFromReader(std::unique_ptr *fetched_buffer, int64_t buffer_id, int32_t worker_id); @@ -209,7 +230,7 @@ class MindRecordOp : public ParallelOp { // @param columns_blob - the blob data received from the reader // @param columns_json - the data for fields received from the reader Status LoadTensorRow(TensorRow *tensor_row, const std::vector &columns_blob, - const mindrecord::json &columns_json); + const mindrecord::json &columns_json, const mindrecord::TaskType task_type); Status FetchBlockBuffer(const int32_t &buffer_id); @@ -226,6 +247,10 @@ class MindRecordOp : public ParallelOp { std::atomic ended_worker_; std::atomic buffer_water_mark_; + int64_t num_padded_; + mindrecord::json sample_json_; + std::map sample_bytes_; + std::unique_ptr data_schema_; // Data schema for column typing std::vector columns_blob_; // Blob Columns to load from dataset std::vector columns_blob_index_; // Blob Columns to load from dataset diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/mnist_op.cc b/mindspore/ccsrc/dataset/engine/datasetops/source/mnist_op.cc index 53c32b1904..eacd9daf75 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/mnist_op.cc +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/mnist_op.cc @@ -31,7 +31,7 @@ const int32_t kMnistLabelFileMagicNumber = 2049; const int32_t kMnistImageRows = 28; const int32_t kMnistImageCols = 28; -MnistOp::Builder::Builder() : builder_num_samples_(0), builder_sampler_(nullptr) { +MnistOp::Builder::Builder() : builder_sampler_(nullptr) { std::shared_ptr cfg = GlobalContext::config_manager(); builder_num_workers_ = cfg->num_parallel_workers(); builder_rows_per_buffer_ = cfg->rows_per_buffer(); @@ -41,7 +41,9 @@ MnistOp::Builder::Builder() : builder_num_samples_(0), builder_sampler_(nullptr) Status 
MnistOp::Builder::Build(std::shared_ptr *ptr) { RETURN_IF_NOT_OK(SanityCheck()); if (builder_sampler_ == nullptr) { - builder_sampler_ = std::make_shared(); + const int64_t num_samples = 0; + const int64_t start_index = 0; + builder_sampler_ = std::make_shared(start_index, num_samples); } builder_schema_ = std::make_unique(); RETURN_IF_NOT_OK( @@ -49,9 +51,8 @@ Status MnistOp::Builder::Build(std::shared_ptr *ptr) { TensorShape scalar = TensorShape::CreateScalar(); RETURN_IF_NOT_OK(builder_schema_->AddColumn( ColDescriptor("label", DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 0, &scalar))); - *ptr = - std::make_shared(builder_num_workers_, builder_rows_per_buffer_, builder_dir_, builder_op_connector_size_, - builder_num_samples_, std::move(builder_schema_), std::move(builder_sampler_)); + *ptr = std::make_shared(builder_num_workers_, builder_rows_per_buffer_, builder_dir_, + builder_op_connector_size_, std::move(builder_schema_), std::move(builder_sampler_)); return Status::OK(); } @@ -60,17 +61,14 @@ Status MnistOp::Builder::SanityCheck() { std::string err_msg; err_msg += dir.IsDirectory() == false ? "MNIST path is invalid or not set\n" : ""; err_msg += builder_num_workers_ <= 0 ? "Number of parallel workers is set to 0 or negative\n" : ""; - err_msg += builder_num_samples_ < 0 ? "Number of samples is set to negative\n" : ""; return err_msg.empty() ? 
Status::OK() : Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, err_msg); } MnistOp::MnistOp(int32_t num_workers, int32_t rows_per_buffer, std::string folder_path, int32_t queue_size, - int64_t num_samples, std::unique_ptr data_schema, std::shared_ptr sampler) + std::unique_ptr data_schema, std::shared_ptr sampler) : ParallelOp(num_workers, queue_size), buf_cnt_(0), row_cnt_(0), - num_rows_(0), - num_samples_(num_samples), folder_path_(folder_path), rows_per_buffer_(rows_per_buffer), sampler_(std::move(sampler)), @@ -84,8 +82,7 @@ MnistOp::MnistOp(int32_t num_workers, int32_t rows_per_buffer, std::string folde Status MnistOp::TraversalSampleIds(const std::shared_ptr &sample_ids, std::vector *keys) { for (auto itr = sample_ids->begin(); itr != sample_ids->end(); ++itr) { - if ((*itr) >= num_rows_) continue; // index out of bound, skipping - if (row_cnt_ >= num_samples_) break; // enough row read, break for loop + if ((*itr) >= num_rows_) continue; // index out of bound, skipping keys->push_back(*itr); row_cnt_++; if (row_cnt_ % rows_per_buffer_ == 0) { @@ -101,7 +98,7 @@ Status MnistOp::TraversalSampleIds(const std::shared_ptr &sample_ids, st Status MnistOp::operator()() { RETURN_IF_NOT_OK(LaunchThreadsAndInitOp()); std::unique_ptr sampler_buffer; - RETURN_IF_NOT_OK(sampler_->GetNextBuffer(&sampler_buffer)); + RETURN_IF_NOT_OK(sampler_->GetNextSample(&sampler_buffer)); while (true) { // each iterator is 1 epoch std::vector keys; keys.reserve(rows_per_buffer_); @@ -112,7 +109,7 @@ Status MnistOp::operator()() { RETURN_STATUS_UNEXPECTED("Sampler Tensor isn't UINT64"); } RETURN_IF_NOT_OK(TraversalSampleIds(sample_ids, &keys)); - RETURN_IF_NOT_OK(sampler_->GetNextBuffer(&sampler_buffer)); + RETURN_IF_NOT_OK(sampler_->GetNextSample(&sampler_buffer)); } if (keys.empty() == false) { RETURN_IF_NOT_OK(io_block_queues_[(buf_cnt_++) % num_workers_]->Add( @@ -133,7 +130,7 @@ Status MnistOp::operator()() { io_block_queues_[(buf_cnt_++) % 
num_workers_]->Add(std::make_unique(IOBlock::kDeIoBlockFlagEoe))); RETURN_IF_NOT_OK(wp_.Wait()); // Master thread goes to sleep after it has made all the IOBlocks wp_.Clear(); - RETURN_IF_NOT_OK(sampler_->GetNextBuffer(&sampler_buffer)); + RETURN_IF_NOT_OK(sampler_->GetNextSample(&sampler_buffer)); } } } @@ -165,15 +162,15 @@ Status MnistOp::WorkerEntry(int32_t worker_id) { } // Load 1 TensorRow (image,label) using 1 MnistLabelPair. -Status MnistOp::LoadTensorRow(const MnistLabelPair &mnist_pair, TensorRow *trow) { +Status MnistOp::LoadTensorRow(row_id_type row_id, const MnistLabelPair &mnist_pair, TensorRow *trow) { std::shared_ptr image, label; int32_t l = mnist_pair.second; // make a copy of cached tensor RETURN_IF_NOT_OK(Tensor::CreateTensor(&image, data_schema_->column(0).tensorImpl(), mnist_pair.first->shape(), - mnist_pair.first->type(), mnist_pair.first->GetMutableBuffer())); + mnist_pair.first->type(), mnist_pair.first->GetBuffer())); RETURN_IF_NOT_OK(Tensor::CreateTensor(&label, data_schema_->column(1).tensorImpl(), data_schema_->column(1).shape(), data_schema_->column(1).type(), reinterpret_cast(&l))); - (*trow) = {std::move(image), std::move(label)}; + (*trow) = TensorRow(row_id, {std::move(image), std::move(label)}); return Status::OK(); } @@ -182,7 +179,7 @@ Status MnistOp::LoadBuffer(const std::vector &keys, std::unique_ptr deq = std::make_unique(); TensorRow trow; for (const int64_t &key : keys) { - RETURN_IF_NOT_OK(this->LoadTensorRow(image_label_pairs_[key], &trow)); + RETURN_IF_NOT_OK(this->LoadTensorRow(key, image_label_pairs_[key], &trow)); deq->push_back(std::move(trow)); } (*db)->set_tensor_table(std::move(deq)); @@ -207,7 +204,7 @@ void MnistOp::Print(std::ostream &out, bool show_all) const { // Reset Sampler and wakeup Master thread (functor) Status MnistOp::Reset() { - RETURN_IF_NOT_OK(sampler_->Reset()); + RETURN_IF_NOT_OK(sampler_->ResetSampler()); row_cnt_ = 0; wp_.Set(); // wake up master thread after reset is done return Status::OK(); 
@@ -219,17 +216,6 @@ Status MnistOp::InitSampler() { return Status::OK(); } -// Derived from RandomAccessOp -Status MnistOp::GetNumSamples(int64_t *num) const { - if (num == nullptr || num_rows_ == 0) { - RETURN_STATUS_UNEXPECTED( - "There is no valid data matching the dataset API MnistDataset.Please check file path or dataset API " - "validation first."); - } - (*num) = num_samples_; - return Status::OK(); -} - // Derived from RandomAccessOp Status MnistOp::GetClassIds(std::map> *cls_ids) const { if (cls_ids == nullptr || !cls_ids->empty() || image_label_pairs_.empty()) { @@ -364,7 +350,11 @@ Status MnistOp::ParseMnistData() { } image_label_pairs_.shrink_to_fit(); num_rows_ = image_label_pairs_.size(); - num_samples_ = (num_samples_ == 0 || num_samples_ > num_rows_) ? num_rows_ : num_samples_; + if (num_rows_ == 0) { + RETURN_STATUS_UNEXPECTED( + "There is no valid data matching the dataset API MnistDataset.Please check file path or dataset API " + "validation first."); + } return Status::OK(); } @@ -414,11 +404,11 @@ Status MnistOp::LaunchThreadsAndInitOp() { return Status::OK(); } -Status MnistOp::CountTotalRows(const std::string &dir, int64_t numSamples, int64_t *count) { +Status MnistOp::CountTotalRows(const std::string &dir, int64_t *count) { // the logic of counting the number of samples is copied from ParseMnistData() and uses CheckReader() std::shared_ptr op; *count = 0; - RETURN_IF_NOT_OK(Builder().SetDir(dir).SetNumSamples(numSamples).Build(&op)); + RETURN_IF_NOT_OK(Builder().SetDir(dir).Build(&op)); RETURN_IF_NOT_OK(op->WalkAllFiles()); @@ -440,19 +430,6 @@ Status MnistOp::CountTotalRows(const std::string &dir, int64_t numSamples, int64 label_reader.close(); } - *count = (numSamples == 0 || *count < numSamples) ? 
*count : numSamples; - - return Status::OK(); -} - -// Derived from RandomAccessOp -Status MnistOp::GetNumRowsInDataset(int64_t *num) const { - if (num == nullptr || num_rows_ == 0) { - RETURN_STATUS_UNEXPECTED( - "There is no valid data matching the dataset API MnistDataset.Please check file path or dataset API " - "validation first."); - } - (*num) = num_rows_; return Status::OK(); } } // namespace dataset diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/mnist_op.h b/mindspore/ccsrc/dataset/engine/datasetops/source/mnist_op.h index 397a51710e..909ac22124 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/mnist_op.h +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/mnist_op.h @@ -78,14 +78,6 @@ class MnistOp : public ParallelOp, public RandomAccessOp { return *this; } - // Setter method - // @param int64_t num_samples - // @return Builder setter method returns reference to the builder. - Builder &SetNumSamples(int64_t num_samples) { - builder_num_samples_ = num_samples; - return *this; - } - // Setter method // @param std::shared_ptr sampler // @return Builder setter method returns reference to the builder. 
@@ -114,7 +106,6 @@ class MnistOp : public ParallelOp, public RandomAccessOp { private: std::string builder_dir_; int32_t builder_num_workers_; - int64_t builder_num_samples_; int32_t builder_rows_per_buffer_; int32_t builder_op_connector_size_; std::shared_ptr builder_sampler_; @@ -126,11 +117,10 @@ class MnistOp : public ParallelOp, public RandomAccessOp { // @param int32_t rows_per_buffer - number of images (rows) in each buffer // @param std::string folder_path - dir directory of mnist // @param int32_t queue_size - connector queue size - // @param int64_t num_samples - number of samples to read // @param std::unique_ptr data_schema - the schema of the mnist dataset // @param td::unique_ptr sampler - sampler tells MnistOp what to read MnistOp(int32_t num_workers, int32_t rows_per_buffer, std::string folder_path, int32_t queue_size, - int64_t num_samples, std::unique_ptr data_schema, std::shared_ptr sampler); + std::unique_ptr data_schema, std::shared_ptr sampler); // Destructor. ~MnistOp() = default; @@ -146,16 +136,6 @@ class MnistOp : public ParallelOp, public RandomAccessOp { // @return Status - The error code return Status operator()() override; - // Method derived from RandomAccess Op, enable Sampler to get numRows - // @param int64_t num - to return numRows - // @return Status - The error code return - Status GetNumSamples(int64_t *num) const override; - - // Method derived from RandomAccess Op, enable Sampler to get total numRows in dataset - // @param int64_t num - to return numRows - // @return Status - The error code return - Status GetNumRowsInDataset(int64_t *num) const override; - // Method derived from RandomAccess Op, enable Sampler to get all ids for each class // @param (std::map> * map - key label, val all ids for this class // @return Status - The error code return @@ -167,11 +147,14 @@ class MnistOp : public ParallelOp, public RandomAccessOp { void Print(std::ostream &out, bool show_all) const override; // Function to count the number of 
samples in the MNIST dataset - // @param dir path to the MNSIT directory - // @param numSamples maximum number of samples requested + // @param dir path to the MNIST directory // @param count output arg that will hold the minimum of the actual dataset size and numSamples // @return - static Status CountTotalRows(const std::string &dir, int64_t numSamples, int64_t *count); + static Status CountTotalRows(const std::string &dir, int64_t *count); + + // Op name getter + // @return Name of the current Op + std::string Name() const override { return "MnistOp"; } private: // Initialize Sampler, calls sampler->Init() within @@ -179,10 +162,11 @@ class MnistOp : public ParallelOp, public RandomAccessOp { Status InitSampler(); // Load a tensor row according to a pair + // @param row_id_type row_id - id for this tensor row // @param ImageLabelPair pair - // @param TensorRow row - image & label read into this tensor row // @return Status - The error code return - Status LoadTensorRow(const MnistLabelPair &mnist_pair, TensorRow *row); + Status LoadTensorRow(row_id_type row_id, const MnistLabelPair &mnist_pair, TensorRow *row); // @param const std::vector &keys - keys in ioblock // @param std::unique_ptr db @@ -244,9 +228,7 @@ class MnistOp : public ParallelOp, public RandomAccessOp { int64_t buf_cnt_; int64_t row_cnt_; - int64_t num_rows_; // total number of images in Mnist WaitPost wp_; - int64_t num_samples_; std::string folder_path_; // directory of image folder int32_t rows_per_buffer_; std::shared_ptr sampler_; diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/random_data_op.h b/mindspore/ccsrc/dataset/engine/datasetops/source/random_data_op.h index 92d05d7318..48cfb0be51 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/random_data_op.h +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/random_data_op.h @@ -189,6 +189,10 @@ class RandomDataOp : public ParallelOp { */ int64_t GetTotalRows() const { return total_rows_; } + // Op name getter + // 
@return Name of the current Op + std::string Name() const override { return "RandomDataOp"; } + private: /** * The entry point code for when workers are launched diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/CMakeLists.txt b/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/CMakeLists.txt index 152b887ef4..5209d9ba4a 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/CMakeLists.txt +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/CMakeLists.txt @@ -8,6 +8,5 @@ add_library(engine-datasetops-source-sampler OBJECT sampler.cc sequential_sampler.cc subset_random_sampler.cc - subset_sampler.cc weighted_random_sampler.cc ) diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/distributed_sampler.cc b/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/distributed_sampler.cc index d4e5a732db..226647df14 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/distributed_sampler.cc +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/distributed_sampler.cc @@ -23,8 +23,9 @@ namespace mindspore { namespace dataset { -DistributedSampler::DistributedSampler(int64_t num_dev, int64_t dev_id, bool shuffle, uint32_t seed) - : Sampler(), +DistributedSampler::DistributedSampler(int64_t num_samples, int64_t num_dev, int64_t dev_id, bool shuffle, + uint32_t seed) + : Sampler(num_samples, std::numeric_limits::max()), cnt_(0), seed_(seed == std::numeric_limits::max() ? GetSeed() : seed), device_id_(dev_id), @@ -32,6 +33,11 @@ DistributedSampler::DistributedSampler(int64_t num_dev, int64_t dev_id, bool shu shuffle_(shuffle) {} Status DistributedSampler::InitSampler() { + // Special value of 0 for num_samples means that the user wants to sample the entire set of data. + // If the user asked to sample more rows than exists in the dataset, adjust the num_samples accordingly. 
+ if (num_samples_ == 0 || num_samples_ > num_rows_) { + num_samples_ = num_rows_; + } CHECK_FAIL_RETURN_UNEXPECTED(num_samples_ > 0, "num_samples <= 0\n"); CHECK_FAIL_RETURN_UNEXPECTED(num_rows_ > 0, "num_rows <= 0\n"); CHECK_FAIL_RETURN_UNEXPECTED(device_id_ < num_devices_ && device_id_ >= 0 && num_rows_ > 0 && num_samples_ > 0, @@ -49,21 +55,21 @@ Status DistributedSampler::InitSampler() { return Status::OK(); } -Status DistributedSampler::GetNextBuffer(std::unique_ptr *out_buffer) { +Status DistributedSampler::GetNextSample(std::unique_ptr *out_buffer) { if (cnt_ > samples_per_buffer_) { RETURN_STATUS_UNEXPECTED("Distributed Sampler Error"); } else if (cnt_ == samples_per_buffer_) { (*out_buffer) = std::make_unique(0, DataBuffer::kDeBFlagEOE); } else { if (HasChildSampler()) { - RETURN_IF_NOT_OK(child_[0]->GetNextBuffer(&child_ids_)); + RETURN_IF_NOT_OK(child_[0]->GetNextSample(&child_ids_)); } (*out_buffer) = std::make_unique(cnt_, DataBuffer::kDeBFlagNone); std::shared_ptr sample_ids; RETURN_IF_NOT_OK(CreateSamplerTensor(&sample_ids, samples_per_buffer_)); - int64_t *id_ptr = reinterpret_cast(sample_ids->GetMutableBuffer()); - while (cnt_ < samples_per_buffer_) { + auto id_ptr = sample_ids->begin(); + while (cnt_ < samples_per_buffer_ && id_ptr != sample_ids->end()) { int64_t sampled_id = (num_devices_ * cnt_ + device_id_) % num_rows_; if (shuffle_) { sampled_id = shuffle_vec_[static_cast(sampled_id)]; @@ -83,7 +89,7 @@ Status DistributedSampler::GetNextBuffer(std::unique_ptr *out_buffer return Status::OK(); } -Status DistributedSampler::Reset() { +Status DistributedSampler::ResetSampler() { CHECK_FAIL_RETURN_UNEXPECTED(cnt_ == samples_per_buffer_, "ERROR Reset() called early/late"); cnt_ = 0; @@ -94,7 +100,7 @@ Status DistributedSampler::Reset() { } if (HasChildSampler()) { - RETURN_IF_NOT_OK(child_[0]->Reset()); + RETURN_IF_NOT_OK(child_[0]->ResetSampler()); } return Status::OK(); diff --git 
a/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/distributed_sampler.h b/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/distributed_sampler.h index 29b5cda0da..7083580c6c 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/distributed_sampler.h +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/distributed_sampler.h @@ -27,10 +27,11 @@ namespace mindspore { namespace dataset { class DistributedSampler : public Sampler { public: - // @param int64_t numDev - // @param int64_t devId + // @param num_samples + // @param int64_t num_dev + // @param int64_t dev_id // @param bool shuffle - DistributedSampler(int64_t num_dev, int64_t dev_id, bool shuffle = true, + DistributedSampler(int64_t num_samples, int64_t num_dev, int64_t dev_id, bool shuffle, uint32_t seed = std::numeric_limits::max()); // default destructor @@ -39,14 +40,14 @@ class DistributedSampler : public Sampler { // @param std::unique_ptr * pBuffer // @param int32_t workerId // @return - The error code return - Status GetNextBuffer(std::unique_ptr *out_buffer) override; + Status GetNextSample(std::unique_ptr *out_buffer) override; // Init sampler, called by base class or python Status InitSampler() override; // for next epoch of sampleIds // @return - The error code return - Status Reset() override; + Status ResetSampler() override; void Print(std::ostream &out, bool show_all) const override; diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/pk_sampler.cc b/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/pk_sampler.cc index 72c2cc1874..92a880d599 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/pk_sampler.cc +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/pk_sampler.cc @@ -20,12 +20,11 @@ namespace mindspore { namespace dataset { -PKSampler::PKSampler(int64_t val, bool shuffle, int64_t samples_per_buffer) - : Sampler(samples_per_buffer), +PKSampler::PKSampler(int64_t num_samples, int64_t val, 
bool shuffle, int64_t samples_per_buffer) + : Sampler(num_samples, samples_per_buffer), shuffle_(shuffle), seed_(GetSeed()), next_id_(0), - num_pk_samples_(0), samples_per_class_(val) {} Status PKSampler::InitSampler() { @@ -36,35 +35,46 @@ Status PKSampler::InitSampler() { } } rnd_.seed(seed_++); - num_pk_samples_ = samples_per_class_ * static_cast(labels_.size()); - samples_per_buffer_ = (samples_per_buffer_ > num_pk_samples_) ? num_pk_samples_ : samples_per_buffer_; - num_samples_ = num_pk_samples_; + + // The special handshake gives the list of classes and id's, but it did not set the num_rows_ to + // capture the total number of possible sample ids. + // Compute that here for this case to find the total number of samples that are available to return. + // (in this case, samples per class * total classes). + num_rows_ = samples_per_class_ * static_cast(labels_.size()); + + // The user may have chosen to sample less than the total amount. + // Special value of 0 for num_samples means that the user wants to sample the entire set of data. + // If the user asked to sample more rows than exists in the dataset, adjust the num_samples accordingly. + if (num_samples_ == 0 || num_samples_ > num_rows_) { + num_samples_ = num_rows_; + } + + samples_per_buffer_ = (samples_per_buffer_ > num_samples_) ? 
num_samples_ : samples_per_buffer_; if (shuffle_ == true) { std::shuffle(labels_.begin(), labels_.end(), rnd_); } else { std::sort(labels_.begin(), labels_.end()); } - CHECK_FAIL_RETURN_UNEXPECTED(num_pk_samples_ > 0, "num_class or K (num samples per class) is not positive"); + CHECK_FAIL_RETURN_UNEXPECTED(num_samples_ > 0, "num_class or K (num samples per class) is not positive"); return Status::OK(); } -Status PKSampler::GetNextBuffer(std::unique_ptr *out_buffer) { - if (next_id_ > num_pk_samples_ || num_pk_samples_ == 0) { +Status PKSampler::GetNextSample(std::unique_ptr *out_buffer) { + if (next_id_ > num_samples_ || num_samples_ == 0) { RETURN_STATUS_UNEXPECTED("Index out of bound in PKSampler"); - } else if (next_id_ == num_pk_samples_) { + } else if (next_id_ == num_samples_) { (*out_buffer) = std::make_unique(0, DataBuffer::kDeBFlagEOE); } else { if (HasChildSampler()) { - RETURN_IF_NOT_OK(child_[0]->GetNextBuffer(&child_ids_)); + RETURN_IF_NOT_OK(child_[0]->GetNextSample(&child_ids_)); } (*out_buffer) = std::make_unique(next_id_, DataBuffer::kDeBFlagNone); std::shared_ptr sample_ids; - int64_t last_id = - (samples_per_buffer_ + next_id_ > num_pk_samples_) ? num_pk_samples_ : samples_per_buffer_ + next_id_; + int64_t last_id = (samples_per_buffer_ + next_id_ > num_samples_) ? 
num_samples_ : samples_per_buffer_ + next_id_; RETURN_IF_NOT_OK(CreateSamplerTensor(&sample_ids, last_id - next_id_)); - int64_t *id_ptr = reinterpret_cast(sample_ids->GetMutableBuffer()); - while (next_id_ < last_id) { + auto id_ptr = sample_ids->begin(); + while (next_id_ < last_id && id_ptr != sample_ids->end()) { int64_t cls_id = next_id_++ / samples_per_class_; const std::vector &samples = label_to_ids_[labels_[cls_id]]; int64_t rnd_ind = std::uniform_int_distribution(0, samples.size() - 1)(rnd_); @@ -84,13 +94,13 @@ Status PKSampler::GetNextBuffer(std::unique_ptr *out_buffer) { return Status::OK(); } -Status PKSampler::Reset() { - CHECK_FAIL_RETURN_UNEXPECTED(next_id_ == num_pk_samples_, "ERROR Reset() called early/late"); +Status PKSampler::ResetSampler() { + CHECK_FAIL_RETURN_UNEXPECTED(next_id_ == num_samples_, "ERROR Reset() called early/late"); next_id_ = 0; rnd_.seed(seed_++); if (HasChildSampler()) { - RETURN_IF_NOT_OK(child_[0]->Reset()); + RETURN_IF_NOT_OK(child_[0]->ResetSampler()); } return Status::OK(); diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/pk_sampler.h b/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/pk_sampler.h index 14f598a9ce..7b1423326a 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/pk_sampler.h +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/pk_sampler.h @@ -28,10 +28,11 @@ namespace mindspore { namespace dataset { class PKSampler : public Sampler { // NOT YET FINISHED public: - // @param int64_t kVal + // @param num_samples - the number of samples to draw. 
value of 0 means to take the full amount + // @param int64_t val // @param bool shuffle - shuffle all classIds or not, if true, classes may be 5,1,4,3,2 // @param int64_t samplesPerBuffer - Num of Sampler Ids to fetch via 1 GetNextBuffer call - explicit PKSampler(int64_t val, bool shuffle = false, + explicit PKSampler(int64_t num_samples, int64_t val, bool shuffle, int64_t samples_per_buffer = std::numeric_limits::max()); // default destructor @@ -40,10 +41,11 @@ class PKSampler : public Sampler { // NOT YET FINISHED // @param std::unique_ptr *out_buffer) override; + Status GetNextSample(std::unique_ptr *out_buffer) override; - // first handshake between StorageOp and Sampler - // @param op - StorageOp pointer, pass in so Sampler can call GetNumSamples() and get ClassIds() + // first handshake between leaf source op and Sampler. This func will determine the amount of data + // in the dataset that we can sample from. + // @param op - leaf op pointer, pass in so Sampler can ask it about how much data there is // @return Status HandshakeRandomAccessOp(const RandomAccessOp *op) override; @@ -52,13 +54,12 @@ class PKSampler : public Sampler { // NOT YET FINISHED // for next epoch of sampleIds // @return - The error code return - Status Reset() override; + Status ResetSampler() override; private: bool shuffle_; uint32_t seed_; int64_t next_id_; - int64_t num_pk_samples_; int64_t samples_per_class_; std::mt19937 rnd_; std::vector labels_; diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/python_sampler.cc b/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/python_sampler.cc index ca999e31a5..af4aa20bb2 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/python_sampler.cc +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/python_sampler.cc @@ -20,15 +20,15 @@ namespace mindspore { namespace dataset { -PythonSampler::PythonSampler(py::object py_sampler_instance, int64_t samples_per_buffer) - : 
Sampler(samples_per_buffer), py_sampler_instance(py_sampler_instance), need_to_reset_(false) {} +PythonSampler::PythonSampler(int64_t num_samples, py::object py_sampler_instance, int64_t samples_per_buffer) + : Sampler(num_samples, samples_per_buffer), py_sampler_instance(py_sampler_instance), need_to_reset_(false) {} -Status PythonSampler::GetNextBuffer(std::unique_ptr *out_buffer) { +Status PythonSampler::GetNextSample(std::unique_ptr *out_buffer) { if (need_to_reset_) { (*out_buffer) = std::make_unique(0, DataBuffer::kDeBFlagEOE); } else { if (HasChildSampler()) { - RETURN_IF_NOT_OK(child_[0]->GetNextBuffer(&child_ids_)); + RETURN_IF_NOT_OK(child_[0]->GetNextSample(&child_ids_)); } std::shared_ptr sample_ids; @@ -65,6 +65,11 @@ Status PythonSampler::GetNextBuffer(std::unique_ptr *out_buffer) { Status PythonSampler::InitSampler() { CHECK_FAIL_RETURN_UNEXPECTED(num_rows_ > 0, "ERROR num_rows_ should be greater than 0"); + // Special value of 0 for num_samples means that the user wants to sample the entire set of data. + // If the user asked to sample more rows than exists in the dataset, adjust the num_samples accordingly. 
+ if (num_samples_ == 0 || num_samples_ > num_rows_) { + num_samples_ = num_rows_; + } { py::gil_scoped_acquire gil_acquire; if (Py_IsInitialized() == 0) { @@ -79,7 +84,7 @@ Status PythonSampler::InitSampler() { return Status::OK(); } -Status PythonSampler::Reset() { +Status PythonSampler::ResetSampler() { CHECK_FAIL_RETURN_UNEXPECTED(need_to_reset_, "ERROR Reset() called not at end of an epoch"); need_to_reset_ = false; py::gil_scoped_acquire gil_acquire; @@ -93,7 +98,7 @@ Status PythonSampler::Reset() { } if (HasChildSampler()) { - RETURN_IF_NOT_OK(child_[0]->Reset()); + RETURN_IF_NOT_OK(child_[0]->ResetSampler()); } return Status::OK(); diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/python_sampler.h b/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/python_sampler.h index b8734fee6a..49ff12878d 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/python_sampler.h +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/python_sampler.h @@ -26,8 +26,11 @@ namespace dataset { class PythonSampler : public Sampler { public: // Constructor - // @param int64_t samplesPerBuffer - Num of Sampler Ids to fetch via 1 GetNextBuffer call - explicit PythonSampler(py::object py_sampler_instance, + // @param num_samples - the number of samples to draw. Value of 0 means to sample all of the + // data from the dataset. + // @param py_sampler_instance - the python instance of the sampler + // @param int64_t samples_per_buffer - Num of Sampler Ids to fetch via 1 GetNextBuffer call + explicit PythonSampler(int64_t num_samples, py::object py_sampler_instance, int64_t samples_per_buffer = std::numeric_limits::max()); // Destructor. 
@@ -39,13 +42,13 @@ class PythonSampler : public Sampler { // for next epoch of sampleIds // @return - The error code return - Status Reset() override; + Status ResetSampler() override; // Op calls this to get next Buffer that contains all the sampleIds - // @param std::unique_ptr pBuffer - Buffer to be returned to StorageOp + // @param std::unique_ptr pBuffer - Buffer to be returned to corresponding Dataset Op // @param int32_t workerId - not meant to be used // @return - The error code return - Status GetNextBuffer(std::unique_ptr *out_buffer) override; + Status GetNextSample(std::unique_ptr *out_buffer) override; private: bool need_to_reset_; // Whether Reset() should be called before calling GetNextBuffer() diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/random_sampler.cc b/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/random_sampler.cc index 0de55e0fb4..b3dfaad7f7 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/random_sampler.cc +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/random_sampler.cc @@ -22,31 +22,30 @@ namespace mindspore { namespace dataset { -RandomSampler::RandomSampler(bool replacement, bool reshuffle_each_epoch, int64_t num_samples, +RandomSampler::RandomSampler(int64_t num_samples, bool replacement, bool reshuffle_each_epoch, int64_t samples_per_buffer) - : Sampler(samples_per_buffer), + : Sampler(num_samples, samples_per_buffer), seed_(GetSeed()), replacement_(replacement), - user_num_samples_(num_samples), next_id_(0), reshuffle_each_epoch_(reshuffle_each_epoch), dist(nullptr) {} -Status RandomSampler::GetNextBuffer(std::unique_ptr *out_buffer) { +Status RandomSampler::GetNextSample(std::unique_ptr *out_buffer) { if (next_id_ > num_samples_) { RETURN_STATUS_UNEXPECTED("RandomSampler Internal Error"); } else if (next_id_ == num_samples_) { (*out_buffer) = std::make_unique(0, DataBuffer::kDeBFlagEOE); } else { if (HasChildSampler()) { - 
RETURN_IF_NOT_OK(child_[0]->GetNextBuffer(&child_ids_)); + RETURN_IF_NOT_OK(child_[0]->GetNextSample(&child_ids_)); } (*out_buffer) = std::make_unique(next_id_, DataBuffer::kDeBFlagNone); std::shared_ptr sampleIds; int64_t last_id = std::min(samples_per_buffer_ + next_id_, num_samples_); RETURN_IF_NOT_OK(CreateSamplerTensor(&sampleIds, last_id - next_id_)); - int64_t *id_ptr = reinterpret_cast(sampleIds->GetMutableBuffer()); + auto id_ptr = sampleIds->begin(); for (int64_t i = 0; i < (last_id - next_id_); i++) { int64_t sampled_id = 0; @@ -70,31 +69,29 @@ Status RandomSampler::GetNextBuffer(std::unique_ptr *out_buffer) { } Status RandomSampler::InitSampler() { - CHECK_FAIL_RETURN_UNEXPECTED(num_rows_ > 0, "num_rows needs to be positive."); - + // Special value of 0 for num_samples means that the user wants to sample the entire set of data. + // If the user asked to sample more rows than exists in the dataset, adjust the num_samples accordingly. + if (num_samples_ == 0 || num_samples_ > num_rows_) { + num_samples_ = num_rows_; + } + CHECK_FAIL_RETURN_UNEXPECTED(num_samples_ > 0 && num_rows_ > 0, "both num_samples & num_rows need to be positive"); + samples_per_buffer_ = samples_per_buffer_ > num_samples_ ? num_samples_ : samples_per_buffer_; rnd_.seed(seed_); if (replacement_ == false) { - num_samples_ = std::min(num_samples_, num_rows_); - num_samples_ = std::min(num_samples_, user_num_samples_); - shuffled_ids_.reserve(num_rows_); for (int64_t i = 0; i < num_rows_; i++) { shuffled_ids_.push_back(i); } std::shuffle(shuffled_ids_.begin(), shuffled_ids_.end(), rnd_); } else { - num_samples_ = std::min(num_samples_, user_num_samples_); dist = std::make_unique>(0, num_rows_ - 1); } - CHECK_FAIL_RETURN_UNEXPECTED(num_samples_ > 0, "num_samples needs to be positive."); - samples_per_buffer_ = samples_per_buffer_ > num_samples_ ? 
num_samples_ : samples_per_buffer_; - return Status::OK(); } -Status RandomSampler::Reset() { +Status RandomSampler::ResetSampler() { CHECK_FAIL_RETURN_UNEXPECTED(next_id_ == num_samples_, "ERROR Reset() called early/late"); next_id_ = 0; @@ -109,7 +106,7 @@ Status RandomSampler::Reset() { } if (HasChildSampler()) { - RETURN_IF_NOT_OK(child_[0]->Reset()); + RETURN_IF_NOT_OK(child_[0]->ResetSampler()); } return Status::OK(); @@ -119,7 +116,6 @@ void RandomSampler::Print(std::ostream &out, bool show_all) const { out << "(sampler): RandomSampler\n"; if (show_all) { - out << "user_num_samples_: " << user_num_samples_ << '\n'; out << "num_samples_: " << num_samples_ << '\n'; out << "next_id_: " << next_id_ << '\n'; } diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/random_sampler.h b/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/random_sampler.h index 352751dbb8..b1c54eb98c 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/random_sampler.h +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/random_sampler.h @@ -27,11 +27,11 @@ namespace dataset { class RandomSampler : public Sampler { public: // Constructor + // @param int64_t num_samples - number samples to draw // @param bool replacement - put he id back / or not after a sample - // @param int64_t numSamples - number samples to draw - // @param int64_t samplesPerBuffer - Num of Sampler Ids to fetch via 1 GetNextBuffer call - explicit RandomSampler(bool replacement = false, bool reshuffle_each_epoch = true, - int64_t num_samples = std::numeric_limits::max(), + // @param reshuffle_each_epoch - T/F to reshuffle after epoch + // @param int64_t samples_per_buffer - Num of Sampler Ids to fetch via 1 GetNextBuffer call + explicit RandomSampler(int64_t num_samples, bool replacement, bool reshuffle_each_epoch, int64_t samples_per_buffer = std::numeric_limits::max()); // Destructor. 
@@ -41,21 +41,20 @@ class RandomSampler : public Sampler { // @param std::unique_ptr pBuffer - Buffer to be returned to StorageOp // @param int32_t workerId - not meant to be used // @return - The error code return - Status GetNextBuffer(std::unique_ptr *out_buffer) override; + Status GetNextSample(std::unique_ptr *out_buffer) override; // meant to be called by base class or python Status InitSampler() override; // for next epoch of sampleIds // @return - The error code return - Status Reset() override; + Status ResetSampler() override; virtual void Print(std::ostream &out, bool show_all) const; private: uint32_t seed_; bool replacement_; - int64_t user_num_samples_; std::vector shuffled_ids_; // only used for NO REPLACEMENT int64_t next_id_; std::mt19937 rnd_; diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/sampler.cc b/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/sampler.cc index 600d8c576b..3f737c167c 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/sampler.cc +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/sampler.cc @@ -19,8 +19,21 @@ namespace mindspore { namespace dataset { -Sampler::Sampler(int64_t samples_per_buffer) - : DatasetOp(0), num_rows_(0), num_samples_(0), samples_per_buffer_(samples_per_buffer), col_desc_(nullptr) {} +Status RandomAccessOp::GetNumRowsInDataset(int64_t *num) const { + // The sampler base class itself does not compute it's own num_rows_ value. + // Instead, this value is computed by the derived leaf op during it's own initialization + // after it has interacted with it's storage layers. + // Here, it is just a getter method to return the value. However, it is invalid if there is + // not a value set for this count, so generate a failure if that is the case. 
+ if (num == nullptr || num_rows_ == 0) { + RETURN_STATUS_UNEXPECTED("RandomAccessOp has not computed it's num rows yet."); + } + (*num) = num_rows_; + return Status::OK(); +} + +Sampler::Sampler(int64_t num_samples, int64_t samples_per_buffer) + : num_rows_(0), num_samples_(num_samples), samples_per_buffer_(samples_per_buffer), col_desc_(nullptr) {} Status Sampler::HandshakeRandomAccessOp(const RandomAccessOp *op) { std::shared_ptr child_sampler; @@ -36,10 +49,10 @@ Status Sampler::HandshakeRandomAccessOp(const RandomAccessOp *op) { } CHECK_FAIL_RETURN_UNEXPECTED(op != nullptr, "RandomAccessOp is nullptr\n"); - RETURN_IF_NOT_OK(op->GetNumSamples(&num_samples_)); + + // If there's a child sampler, set the row count to be it's sample count if (HasChildSampler()) { - int64_t child_num_samples = child_sampler->num_samples(); - num_rows_ = child_num_samples; + num_rows_ = child_sampler->num_samples_; } else { RETURN_IF_NOT_OK(op->GetNumRowsInDataset(&num_rows_)); } @@ -80,7 +93,7 @@ Status Sampler::GetAllIdsThenReset(py::array *data) { std::shared_ptr sample_ids; // A call to derived class to get sample ids wrapped inside a buffer - RETURN_IF_NOT_OK(GetNextBuffer(&db)); + RETURN_IF_NOT_OK(GetNextSample(&db)); // Get the only tensor inside the buffer that contains the actual SampleIds for the entire epoch RETURN_IF_NOT_OK(db->GetTensor(&sample_ids, 0, 0)); // check this buffer is not a ctrl buffer @@ -97,15 +110,15 @@ Status Sampler::GetAllIdsThenReset(py::array *data) { } } // perform error checking! 
Next buffer supposed to be EOE since last one already contains all ids for current epoch - RETURN_IF_NOT_OK(GetNextBuffer(&db)); + RETURN_IF_NOT_OK(GetNextSample(&db)); CHECK_FAIL_RETURN_UNEXPECTED(db->eoe(), "ERROR Non EOE received"); // Reset Sampler since this is the end of the epoch - RETURN_IF_NOT_OK(Reset()); + RETURN_IF_NOT_OK(ResetSampler()); return Status::OK(); } Status Sampler::SetNumSamples(int64_t num_samples) { - CHECK_FAIL_RETURN_UNEXPECTED(num_samples > 0, "num_samples is negative or 0"); + CHECK_FAIL_RETURN_UNEXPECTED(num_samples >= 0, "num_samples is negative"); num_samples_ = num_samples; return Status::OK(); } @@ -116,7 +129,7 @@ Status Sampler::SetNumRowsInDataset(int64_t num_rows) { return Status::OK(); } -Status Sampler::AddChild(std::shared_ptr child) { +Status Sampler::AddChild(std::shared_ptr child) { if (child == nullptr) { return Status::OK(); } diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/sampler.h b/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/sampler.h index 936a80bb38..34c3cb7935 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/sampler.h +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/sampler.h @@ -33,25 +33,12 @@ namespace dataset { // must inherit from if those leaf operator wish to support sampling. class RandomAccessOp { public: - // Sampler get numRows from StorageOp - // @param int64_t num - return number of rows, normally num of samples - // @return - The error code return - virtual Status GetNumSamples(int64_t *num_samples) const { - // CI complains num_samples not used if the following line is not added - CHECK_FAIL_RETURN_UNEXPECTED(num_samples != nullptr, "num_samples == nullptr"); - RETURN_STATUS_UNEXPECTED("function GetNumSamples needs to overridden to support this sampler"); - } - - // Sampler get number of rows in the dataset! 
+ // Sampler get number of rows in the dataset // @param int64_t num - return number of rows for this dataset // @return - The error code return - virtual Status GetNumRowsInDataset(int64_t *num_rows) const { - // CI complains num_rows not used if the following line is not added - CHECK_FAIL_RETURN_UNEXPECTED(num_rows != nullptr, "num_rows == nullptr"); - RETURN_STATUS_UNEXPECTED("function GetNumRowsInDataset needs to overridden to support this sampler"); - } + Status GetNumRowsInDataset(int64_t *num_rows) const; - // sampler gets label , imageIds from storageOp, this function is unique to PK + // sampler gets label , imageIds from corresponding Dataset Op, this function is unique to PK // @param std::map> * map // @return - The error code return virtual Status GetClassIds(std::map> *map) const { @@ -60,12 +47,22 @@ class RandomAccessOp { // default destructor virtual ~RandomAccessOp() = default; + + protected: + // The amount of rows in the dataset itself. This is the before-sampling value, the + // total count of rows. A sampler may choose to sample less than this amount. + int64_t num_rows_; }; -class Sampler : public DatasetOp { +class Sampler { public: + // Constructor + // @param int64_t num_samples: the user-requested number of samples ids to generate. A value of 0 + // indicates that the sampler should produce the complete set of ids. 
// @param int64_t samplesPerBuffer: Num of Sampler Ids to fetch via 1 GetNextBuffer call - explicit Sampler(int64_t samples_per_buffer = std::numeric_limits::max()); + explicit Sampler(int64_t num_samples, int64_t samples_per_buffer); + + Sampler(const Sampler &s) : Sampler(s.num_samples_, s.samples_per_buffer_) {} // default destructor ~Sampler() = default; @@ -75,51 +72,38 @@ class Sampler : public DatasetOp { // @param std::unique_ptr pBuffer - Buffer to be returned to StorageOp // @param int32_t workerId - not meant to be used // @return - The error code return - Status GetNextBuffer(std::unique_ptr *out_buffer) override = 0; + virtual Status GetNextSample(std::unique_ptr *out_buffer) = 0; // return all ids in one epoch as a numpy array, then call reset Status GetAllIdsThenReset(py::array *data); // for next epoch of sampleIds // @return - The error code return - Status Reset() override = 0; + virtual Status ResetSampler() = 0; - // setter function for num_rows_ - Status SetNumRowsInDataset(int64_t num_rows); - - // setter function for num_samples_ - Status SetNumSamples(int64_t num_samples); - - int64_t num_samples() { return num_samples_; } - - // first handshake between StorageOp and Sampler. This func will call getNumRows and getNumSamples - // @param op - StorageOp pointer, pass in so Sampler can call getNumSamples() and get ClassIds() + // first handshake between leaf source op and Sampler. This func will determine the amount of data + // in the dataset that we can sample from. 
+ // @param op - leaf op pointer, pass in so Sampler can ask it about how much data there is // @return virtual Status HandshakeRandomAccessOp(const RandomAccessOp *op); // initialize sampler and perform checks on certain vars virtual Status InitSampler() { return Status::OK(); } - // Not meant to be called - // @return - int32_t num_workers() const final { return 0; } - - // Not meant to be called - // @return - int32_t num_consumers() const final { return 0; } - - // Not meant to be called - // @return - int32_t num_producers() const final { return 0; } + // setter for num samples + // @param num_samples - the number of samples to assign. + // @return status error code + Status SetNumSamples(int64_t num_samples); - // Not meant to be called! - // @return - The error code return - Status operator()() final { RETURN_STATUS_UNEXPECTED("Functor not supported in Sampler"); } + // setter for num or records in the dataset + // @param num_rows - the number of records + // @return status error code + Status SetNumRowsInDataset(int64_t num_rows); // Adds a sampler to become our child. // @param std::shared_ptr - The sampler to add as a child. // @return - The error code returned. - Status AddChild(std::shared_ptr child); + Status AddChild(std::shared_ptr child); // A helper function to create a int64_t 1-D Tensor specifically used to hold sampleIds for Sampler // @param std::shared_ptr* sampleIds @@ -127,8 +111,16 @@ class Sampler : public DatasetOp { // @return - The error code returned. 
Status CreateSamplerTensor(std::shared_ptr *sample_ids, int64_t num_elements); - void Print(std::ostream &out, bool show_all) const override; + // A print method typically used for debugging + // @param out - The output stream to write output to + // @param show_all - A bool to control if you want to show all info or just a summary + virtual void Print(std::ostream &out, bool show_all) const; + // << Stream output operator overload + // @notes This allows you to write the debug print info using stream operators + // @param out - reference to the output stream being overloaded + // @param sampler - reference to teh sampler to print + // @return - the output stream must be returned friend std::ostream &operator<<(std::ostream &out, const Sampler &sampler) { sampler.Print(out, false); return out; @@ -151,12 +143,14 @@ class Sampler : public DatasetOp { // output. Otherwise, num_rows_ is the number of rows in the dataset. int64_t num_rows_; - // Number of ids this sampler will return. + // The user may want to sample less than the full amount of data. num_samples_ reduces the number + // of id's returned as request by the user. Derived classes will choose how to sample the smaller + // amount. int64_t num_samples_; - // The max number of ids a DataBuffer returned by this sampler will contain. 
int64_t samples_per_buffer_; std::unique_ptr col_desc_; + std::vector> child_; // Child nodes std::unique_ptr child_ids_; }; } // namespace dataset diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/sequential_sampler.cc b/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/sequential_sampler.cc index 789f232e1e..f0ff6a2c02 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/sequential_sampler.cc +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/sequential_sampler.cc @@ -20,34 +20,42 @@ namespace mindspore { namespace dataset { -SequentialSampler::SequentialSampler(int64_t samples_per_buffer) : Sampler(samples_per_buffer), next_id_(0) {} +SequentialSampler::SequentialSampler(int64_t num_samples, int64_t start_index, int64_t samples_per_buffer) + : Sampler(num_samples, samples_per_buffer), start_index_(start_index), current_id_(start_index), id_count_(0) {} -Status SequentialSampler::GetNextBuffer(std::unique_ptr *out_buffer) { - if (next_id_ > num_samples_) { - RETURN_STATUS_UNEXPECTED("Sequential Sampler Internal Error"); - } else if (next_id_ == num_samples_) { +Status SequentialSampler::GetNextSample(std::unique_ptr *out_buffer) { + if (id_count_ > num_samples_) { + RETURN_STATUS_UNEXPECTED("SequentialSampler Internal Error"); + } else if (id_count_ == num_samples_) { (*out_buffer) = std::make_unique(0, DataBuffer::kDeBFlagEOE); } else { if (HasChildSampler()) { - RETURN_IF_NOT_OK(child_[0]->GetNextBuffer(&child_ids_)); + RETURN_IF_NOT_OK(child_[0]->GetNextSample(&child_ids_)); } - (*out_buffer) = std::make_unique(next_id_, DataBuffer::kDeBFlagNone); + (*out_buffer) = std::make_unique(current_id_, DataBuffer::kDeBFlagNone); std::shared_ptr sampleIds; - int64_t lastId = (samples_per_buffer_ + next_id_ > num_samples_) ? 
num_samples_ : samples_per_buffer_ + next_id_; - RETURN_IF_NOT_OK(CreateSamplerTensor(&sampleIds, lastId - next_id_)); - int64_t *idPtr = reinterpret_cast(sampleIds->GetMutableBuffer()); - while (next_id_ < lastId) { - int64_t sampled_id = next_id_; + + // Compute how many ids are left to pack, and pack this amount into a new buffer. Respect the setting for + // samples per buffer though. + int64_t remaining_ids = num_samples_ - id_count_; + int64_t num_elements = std::min(remaining_ids, samples_per_buffer_); + + RETURN_IF_NOT_OK(CreateSamplerTensor(&sampleIds, num_elements)); + auto idPtr = sampleIds->begin(); + for (int64_t i = 0; i < num_elements; i++) { + int64_t sampled_id = current_id_; if (HasChildSampler()) { RETURN_IF_NOT_OK(GetAssociatedChildId(&sampled_id, sampled_id)); } *idPtr = sampled_id; - next_id_++; + current_id_++; // Move the current id to the next one in the sequence idPtr++; } + id_count_ += num_elements; // Count the packed ids towards our overall sample count + TensorRow row(1, sampleIds); (*out_buffer)->set_tensor_table(std::make_unique(1, row)); } @@ -55,22 +63,27 @@ Status SequentialSampler::GetNextBuffer(std::unique_ptr *out_buffer) } Status SequentialSampler::InitSampler() { - num_samples_ = (num_samples_ <= 0) ? num_rows_ : num_samples_; // if num_samples < 0, try if num_rows is set - if (HasChildSampler()) { - num_samples_ = std::min(num_samples_, num_rows_); + CHECK_FAIL_RETURN_UNEXPECTED(start_index_ >= 0, "start_index < 0\n"); + CHECK_FAIL_RETURN_UNEXPECTED(start_index_ < num_rows_, "start_index >= num_rows\n"); + CHECK_FAIL_RETURN_UNEXPECTED(num_samples_ >= 0, "num_samples < 0\n"); + // Adjust the num_samples count based on the range of ids we are sequencing. If num_samples is 0, we sample + // the entire set. If it's non-zero, we will implicitly cap the amount sampled based on available data. 
+ int64_t available_row_count = num_rows_ - start_index_; + if (num_samples_ == 0 || num_samples_ > available_row_count) { + num_samples_ = available_row_count; } - CHECK_FAIL_RETURN_UNEXPECTED(num_samples_ > 0 && samples_per_buffer_ > 0, "Fail to init Sequential Sampler"); samples_per_buffer_ = samples_per_buffer_ > num_samples_ ? num_samples_ : samples_per_buffer_; return Status::OK(); } -Status SequentialSampler::Reset() { - CHECK_FAIL_RETURN_UNEXPECTED(next_id_ == num_samples_, "ERROR Reset() called early/late"); - next_id_ = 0; +Status SequentialSampler::ResetSampler() { + CHECK_FAIL_RETURN_UNEXPECTED(id_count_ == num_samples_, "ERROR Reset() called early/late"); + current_id_ = start_index_; + id_count_ = 0; if (HasChildSampler()) { - RETURN_IF_NOT_OK(child_[0]->Reset()); + RETURN_IF_NOT_OK(child_[0]->ResetSampler()); } return Status::OK(); diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/sequential_sampler.h b/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/sequential_sampler.h index 4e195d75db..2cb7a9ff8d 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/sequential_sampler.h +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/sequential_sampler.h @@ -26,8 +26,12 @@ namespace dataset { class SequentialSampler : public Sampler { public: // Constructor + // @param num_samples - The number of samples to draw. A value of 0 indicates the sampler should produce the + // full amount of ids from the dataset + // @param start_index - The starting index value // @param int64_t samplesPerBuffer - Num of Sampler Ids to fetch via 1 GetNextBuffer call - explicit SequentialSampler(int64_t samples_per_buffer = std::numeric_limits::max()); + explicit SequentialSampler(int64_t num_samples, int64_t start_index, + int64_t samples_per_buffer = std::numeric_limits::max()); // Destructor. 
~SequentialSampler() = default; @@ -37,18 +41,20 @@ class SequentialSampler : public Sampler { // for next epoch of sampleIds // @return - The error code return - Status Reset() override; + Status ResetSampler() override; // Op calls this to get next Buffer that contains all the sampleIds - // @param std::unique_ptr pBuffer - Buffer to be returned to StorageOp + // @param std::unique_ptr pBuffer - Buffer to be returned to corresponding Dataset Op // @param int32_t workerId - not meant to be used // @return - The error code return - Status GetNextBuffer(std::unique_ptr *out_buffer) override; + Status GetNextSample(std::unique_ptr *out_buffer) override; void Print(std::ostream &out, bool show_all) const override; private: - int64_t next_id_; + int64_t current_id_; // The id sequencer. Each new id increments from this + int64_t start_index_; // The starting id. current_id_ begins from here. + int64_t id_count_; // An internal counter that tracks how many ids have been produced }; } // namespace dataset } // namespace mindspore diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/subset_random_sampler.cc b/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/subset_random_sampler.cc index ca1160299a..54491889fc 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/subset_random_sampler.cc +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/subset_random_sampler.cc @@ -27,29 +27,35 @@ namespace mindspore { namespace dataset { // Constructor. -SubsetRandomSampler::SubsetRandomSampler(const std::vector &indices, int64_t samples_per_buffer) - : Sampler(samples_per_buffer), indices_(indices), sample_id_(0), buffer_id_(0) {} +SubsetRandomSampler::SubsetRandomSampler(int64_t num_samples, const std::vector &indices, + int64_t samples_per_buffer) + : Sampler(num_samples, samples_per_buffer), indices_(indices), sample_id_(0), buffer_id_(0) {} // Initialized this Sampler. 
Status SubsetRandomSampler::InitSampler() { CHECK_FAIL_RETURN_UNEXPECTED(num_rows_ > 0, "num_rows <= 0\n"); - num_samples_ = indices_.size(); - + // Special value of 0 for num_samples means that the user wants to sample the entire set of data. + // In this case, the id's are provided by the user. Cap the num_samples on the number of id's given. + if (num_samples_ == 0 || num_samples_ > static_cast(indices_.size())) { + num_samples_ = static_cast(indices_.size()); + } // Initialize random generator with seed from config manager rand_gen_.seed(GetSeed()); - if (static_cast(samples_per_buffer_) > indices_.size()) { - samples_per_buffer_ = static_cast(indices_.size()); + if (samples_per_buffer_ > num_samples_) { + samples_per_buffer_ = num_samples_; } + // num_samples_ could be smaller than the total number of input id's. + // We will shuffle the full set of id's, but only select the first num_samples_ of them later. std::shuffle(indices_.begin(), indices_.end(), rand_gen_); return Status::OK(); } // Reset the internal variable to the initial state. -Status SubsetRandomSampler::Reset() { +Status SubsetRandomSampler::ResetSampler() { // Reset the internal counters. sample_id_ = 0; buffer_id_ = 0; @@ -59,20 +65,20 @@ Status SubsetRandomSampler::Reset() { std::shuffle(indices_.begin(), indices_.end(), rand_gen_); if (HasChildSampler()) { - RETURN_IF_NOT_OK(child_[0]->Reset()); + RETURN_IF_NOT_OK(child_[0]->ResetSampler()); } return Status::OK(); } // Get the sample ids. 
-Status SubsetRandomSampler::GetNextBuffer(std::unique_ptr *out_buffer) { +Status SubsetRandomSampler::GetNextSample(std::unique_ptr *out_buffer) { // All samples have been drawn - if (sample_id_ == indices_.size()) { + if (sample_id_ == num_samples_) { (*out_buffer) = std::make_unique(buffer_id_++, DataBuffer::kDeBFlagEOE); } else { if (HasChildSampler()) { - RETURN_IF_NOT_OK(child_[0]->GetNextBuffer(&child_ids_)); + RETURN_IF_NOT_OK(child_[0]->GetNextSample(&child_ids_)); } (*out_buffer) = std::make_unique(buffer_id_++, DataBuffer::kDeBFlagNone); @@ -80,15 +86,15 @@ Status SubsetRandomSampler::GetNextBuffer(std::unique_ptr *out_buffe int64_t last_id = sample_id_ + samples_per_buffer_; // Handling the return all samples at once, and when last draw is not a full batch. - if (static_cast(last_id) > indices_.size()) { - last_id = indices_.size(); + if (last_id > num_samples_) { + last_id = num_samples_; } // Allocate tensor RETURN_IF_NOT_OK(CreateSamplerTensor(&outputIds, last_id - sample_id_)); // Initialize tensor - int64_t *id_ptr = reinterpret_cast(outputIds->GetMutableBuffer()); + auto id_ptr = outputIds->begin(); while (sample_id_ < last_id) { if (indices_[sample_id_] >= num_rows_) { std::string err_msg = diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/subset_random_sampler.h b/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/subset_random_sampler.h index 1f4c155748..980ffe578a 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/subset_random_sampler.h +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/subset_random_sampler.h @@ -28,10 +28,11 @@ namespace dataset { class SubsetRandomSampler : public Sampler { public: // Constructor. + // @param num_samples The number of samples to draw. 0 for the full amount. // @param indices List of indices from where we will randomly draw samples. // @param samples_per_buffer The number of ids we draw on each call to GetNextBuffer(). 
// When samplesPerBuffer=0, GetNextBuffer() will draw all the sample ids and return them at once. - explicit SubsetRandomSampler(const std::vector &indices, + explicit SubsetRandomSampler(int64_t num_samples, const std::vector &indices, std::int64_t samples_per_buffer = std::numeric_limits::max()); // Destructor. @@ -43,12 +44,12 @@ class SubsetRandomSampler : public Sampler { // Reset the internal variable to the initial state and reshuffle the indices. // @return Status - Status Reset() override; + Status ResetSampler() override; // Get the sample ids. // @param[out] out_buffer The address of a unique_ptr to DataBuffer where the sample ids will be placed. // @note the sample ids (int64_t) will be placed in one Tensor and be placed into pBuffer. - Status GetNextBuffer(std::unique_ptr *out_buffer) override; + Status GetNextSample(std::unique_ptr *out_buffer) override; private: // A list of indices (already randomized in constructor). diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/subset_sampler.cc b/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/subset_sampler.cc deleted file mode 100644 index 0ae7a7d503..0000000000 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/subset_sampler.cc +++ /dev/null @@ -1,85 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include "dataset/engine/datasetops/source/sampler/subset_sampler.h" - -#include -#include - -#include "dataset/core/config_manager.h" -#include "dataset/core/global_context.h" - -namespace mindspore { -namespace dataset { -// Constructor. -SubsetSampler::SubsetSampler(int64_t start_index, int64_t subset_size) - : Sampler(subset_size), start_index_(start_index), subset_size_(subset_size), current_id_(0) {} - -Status SubsetSampler::InitSampler() { - CHECK_FAIL_RETURN_UNEXPECTED(subset_size_ > 0, "subset_size <= 0\n"); - CHECK_FAIL_RETURN_UNEXPECTED(start_index_ >= 0, "start_index < 0\n"); - CHECK_FAIL_RETURN_UNEXPECTED(start_index_ < num_rows_, "start_index >= num_rows\n"); - CHECK_FAIL_RETURN_UNEXPECTED(start_index_ + subset_size_ - 1 < num_rows_, "Final index out of bounds.\n"); - - num_samples_ = subset_size_; - - return Status::OK(); -} - -Status SubsetSampler::Reset() { - current_id_ = 0; - - if (HasChildSampler()) { - RETURN_IF_NOT_OK(child_[0]->Reset()); - } - - return Status::OK(); -} - -Status SubsetSampler::GetNextBuffer(std::unique_ptr *out_buffer) { - if (current_id_ > subset_size_) { - RETURN_STATUS_UNEXPECTED("SubsetSampler Internal Error"); - } else if (current_id_ == subset_size_) { - (*out_buffer) = std::make_unique(0, DataBuffer::kDeBFlagEOE); - } else { - if (HasChildSampler()) { - RETURN_IF_NOT_OK(child_[0]->GetNextBuffer(&child_ids_)); - } - - (*out_buffer) = std::make_unique(0, DataBuffer::kDeBFlagNone); - std::shared_ptr sampled_ids; - RETURN_IF_NOT_OK(CreateSamplerTensor(&sampled_ids, subset_size_)); - - int64_t *sampled_ids_start_addr = reinterpret_cast(sampled_ids->GetMutableBuffer()); - - while (current_id_ < subset_size_) { - int64_t sampled_id = start_index_ + current_id_; - if (HasChildSampler()) { - RETURN_IF_NOT_OK(GetAssociatedChildId(&sampled_id, sampled_id)); - } - - *(sampled_ids_start_addr + current_id_) = sampled_id; - current_id_++; - } - - TensorRow sampled_ids_row(1, sampled_ids); - 
(*out_buffer)->set_tensor_table(std::make_unique(1, sampled_ids_row)); - } - - return Status::OK(); -} - -} // namespace dataset -} // namespace mindspore diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/subset_sampler.h b/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/subset_sampler.h deleted file mode 100644 index 5e8774f673..0000000000 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/subset_sampler.h +++ /dev/null @@ -1,58 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef DATASET_ENGINE_DATASETOPS_SOURCE_SAMPLER_SUBSET_SAMPLER_H_ -#define DATASET_ENGINE_DATASETOPS_SOURCE_SAMPLER_SUBSET_SAMPLER_H_ - -#include -#include - -#include "dataset/engine/datasetops/source/sampler/sampler.h" - -namespace mindspore { -namespace dataset { - -class SubsetSampler : public Sampler { - public: - // Constructor. - // @param start_index The index we start sampling from. - explicit SubsetSampler(int64_t start_index, int64_t subset_size); - - // Destructor. - ~SubsetSampler() = default; - - // Initialize the sampler. - // @return Status - Status InitSampler() override; - - // Reset the internal variable to the initial state and reshuffle the indices. - // @return Status - Status Reset() override; - - // Get the sample ids. - // @param[out] out_buffer The address of a unique_ptr to DataBuffer where the sample ids will be placed. 
- // @note the sample ids (int64_t) will be placed in one Tensor. - Status GetNextBuffer(std::unique_ptr *out_buffer) override; - - private: - int64_t start_index_; - int64_t subset_size_; - int64_t current_id_; -}; - -} // namespace dataset -} // namespace mindspore - -#endif // DATASET_ENGINE_DATASETOPS_SOURCE_SAMPLER_SUBSET_SAMPLER_H_ diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/weighted_random_sampler.cc b/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/weighted_random_sampler.cc index 5027dcdd67..759af99352 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/weighted_random_sampler.cc +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/weighted_random_sampler.cc @@ -27,25 +27,28 @@ namespace mindspore { namespace dataset { // Constructor. -WeightedRandomSampler::WeightedRandomSampler(const std::vector &weights, int64_t num_samples, bool replacement, +WeightedRandomSampler::WeightedRandomSampler(int64_t num_samples, const std::vector &weights, bool replacement, int64_t samples_per_buffer) - : Sampler(samples_per_buffer), + : Sampler(num_samples, samples_per_buffer), weights_(weights), replacement_(replacement), sample_id_(0), - buffer_id_(0), - user_num_samples_(num_samples) {} + buffer_id_(0) {} // Initialized this Sampler. Status WeightedRandomSampler::InitSampler() { - CHECK_FAIL_RETURN_UNEXPECTED(num_rows_ > 0 && user_num_samples_, "num_samples & num_rows need to be positive"); + // Special value of 0 for num_samples means that the user wants to sample the entire set of data. + // If the user asked to sample more rows than exists in the dataset, adjust the num_samples accordingly. 
+ if (num_samples_ == 0 || num_samples_ > num_rows_) { + num_samples_ = num_rows_; + } + CHECK_FAIL_RETURN_UNEXPECTED(num_rows_ > 0 && num_samples_, "num_samples & num_rows need to be positive"); CHECK_FAIL_RETURN_UNEXPECTED(samples_per_buffer_ > 0, "samples_per_buffer<=0\n"); - num_samples_ = user_num_samples_; // Initialize random generator with seed from config manager rand_gen_.seed(GetSeed()); - samples_per_buffer_ = (samples_per_buffer_ > user_num_samples_) ? user_num_samples_ : samples_per_buffer_; + samples_per_buffer_ = (samples_per_buffer_ > num_samples_) ? num_samples_ : samples_per_buffer_; if (!replacement_) { exp_dist_ = std::make_unique>(1); @@ -67,14 +70,14 @@ void WeightedRandomSampler::InitOnePassSampling() { } // Partial sort the first `numSamples` elements. - std::partial_sort(val_idx.begin(), val_idx.begin() + user_num_samples_, val_idx.end()); - for (int64_t i = 0; i < user_num_samples_; i++) { + std::partial_sort(val_idx.begin(), val_idx.begin() + num_samples_, val_idx.end()); + for (int64_t i = 0; i < num_samples_; i++) { onepass_ids_.push_back(val_idx[i].second); } } // Reset the internal variable to the initial state and reshuffle the indices. -Status WeightedRandomSampler::Reset() { +Status WeightedRandomSampler::ResetSampler() { sample_id_ = 0; buffer_id_ = 0; rand_gen_.seed(GetSeed()); @@ -85,28 +88,28 @@ Status WeightedRandomSampler::Reset() { } if (HasChildSampler()) { - RETURN_IF_NOT_OK(child_[0]->Reset()); + RETURN_IF_NOT_OK(child_[0]->ResetSampler()); } return Status::OK(); } // Get the sample ids. -Status WeightedRandomSampler::GetNextBuffer(std::unique_ptr *out_buffer) { +Status WeightedRandomSampler::GetNextSample(std::unique_ptr *out_buffer) { if (weights_.size() > static_cast(num_rows_)) { return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, "number of samples weights is more than num of rows. 
Might generate id out of bound OR other errors"); } - if (!replacement_ && (weights_.size() < static_cast(user_num_samples_))) { + if (!replacement_ && (weights_.size() < static_cast(num_samples_))) { RETURN_STATUS_UNEXPECTED("Without replacement, sample weights less than numSamples"); } - if (sample_id_ == user_num_samples_) { + if (sample_id_ == num_samples_) { (*out_buffer) = std::make_unique(buffer_id_++, DataBuffer::kDeBFlagEOE); } else { if (HasChildSampler()) { - RETURN_IF_NOT_OK(child_[0]->GetNextBuffer(&child_ids_)); + RETURN_IF_NOT_OK(child_[0]->GetNextSample(&child_ids_)); } (*out_buffer) = std::make_unique(buffer_id_++, DataBuffer::kDeBFlagNone); @@ -114,15 +117,15 @@ Status WeightedRandomSampler::GetNextBuffer(std::unique_ptr *out_buf int64_t last_id = sample_id_ + samples_per_buffer_; // Handling the return all samples at once, and when last draw is not a full batch. - if (last_id > user_num_samples_) { - last_id = user_num_samples_; + if (last_id > num_samples_) { + last_id = num_samples_; } // Allocate tensor. RETURN_IF_NOT_OK(CreateSamplerTensor(&outputIds, last_id - sample_id_)); // Initialize tensor. - int64_t *id_ptr = reinterpret_cast(outputIds->GetMutableBuffer()); + auto id_ptr = outputIds->begin(); // Assign the data to tensor element. while (sample_id_ < last_id) { int64_t genId; diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/weighted_random_sampler.h b/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/weighted_random_sampler.h index 5381bb64b0..257501250d 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/weighted_random_sampler.h +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/weighted_random_sampler.h @@ -29,12 +29,12 @@ namespace dataset { class WeightedRandomSampler : public Sampler { public: // Constructor. - // @param weights A lift of sample weights. // @param num_samples Number of samples to be drawn. + // @param weights A lift of sample weights. 
// @param replacement Determine if samples are drawn with/without replacement. // @param samples_per_buffer The number of ids we draw on each call to GetNextBuffer(). // When samplesPerBuffer=0, GetNextBuffer() will draw all the sample ids and return them at once. - WeightedRandomSampler(const std::vector &weights, int64_t num_samples, bool replacement = true, + WeightedRandomSampler(int64_t num_samples, const std::vector &weights, bool replacement, int64_t samples_per_buffer = std::numeric_limits::max()); // Destructor. @@ -46,12 +46,12 @@ class WeightedRandomSampler : public Sampler { Status InitSampler() override; // Reset the internal variable to the initial state and reshuffle the indices. - Status Reset() override; + Status ResetSampler() override; // Get the sample ids. // @param[out] out_buffer The address of a unique_ptr to DataBuffer where the sample ids will be placed. // @note the sample ids (int64_t) will be placed in one Tensor and be placed into pBuffer. - Status GetNextBuffer(std::unique_ptr *out_buffer) override; + Status GetNextSample(std::unique_ptr *out_buffer) override; private: // A list of weights for each sample. @@ -69,9 +69,6 @@ class WeightedRandomSampler : public Sampler { // Random engine and device std::mt19937 rand_gen_; - // num_samples from user - int64_t user_num_samples_; - // Discrete distribution for generating weighted random numbers with replacement. std::unique_ptr> discrete_dist_; diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/storage_client.cc b/mindspore/ccsrc/dataset/engine/datasetops/source/storage_client.cc deleted file mode 100644 index 7f081af2b7..0000000000 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/storage_client.cc +++ /dev/null @@ -1,190 +0,0 @@ -/** - * Copyright 2019 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#define MAX_INTEGER_INT32 2147483647 - -#include -#include -#include -#include -#include "dataset/core/constants.h" -#include "dataset/engine/datasetops/source/storage_client.h" -#include "dataset/engine/datasetops/source/storage_op.h" -#include "dataset/engine/datasetops/source/tf_client.h" -#include "dataset/util/status.h" - -namespace mindspore { -namespace dataset { -// Name: Constructor -// Description: -StorageClient::StorageClient(std::unique_ptr schema, // In: The schema for this storage client. - StorageOp *store_op) // In: The StorageOp that's using this client - : data_schema_(std::move(schema)), num_rows_in_dataset_(0), storage_op_(store_op), num_classes_(0) {} - -// Name: Print() -// Description: A function that prints info about the StorageClient -// In: The output stream to print to -void StorageClient::Print(std::ostream &out) const { - // not much to show here folks! 
- // out << "Storage client:\n"; -} - -// This is a local-only static function to drive the switch statement for creating -// the storage client (not a static member function) -static Status CreateStorageClientSwitch( - std::unique_ptr schema, // In: The schema to set into the client - StorageOp *store_op, // In: The StorageOp we are operating on - std::shared_ptr *out_client) { // Out: the created storage client - switch (schema->dataset_type()) { - case DatasetType::kArrow: { - return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, - "Storage client not implemented yet for arrow dataset type."); - } - case DatasetType::kTf: { - // Construct the derived class TFClient, stored as base class StorageClient - store_op->set_rows_per_buffer(32); - *out_client = std::make_unique(std::move(schema), store_op); - break; - } - case DatasetType::kUnknown: - default: { - RETURN_STATUS_UNEXPECTED("Invalid dataset type."); - } - } - if (*out_client) { - RETURN_IF_NOT_OK((*out_client)->Init()); - } - return Status::OK(); -} - -// Name: CreateStorageClient() -// Description: A factory method to create the derived storage client. -// Every dataset has a required field for the dataset type in a config -// file. This type will determine the child class to return for the -// type of storage client. It also creates the schema and sticks it -// into the cache. -Status StorageClient::CreateStorageClient( - StorageOp *store_op, // In: A backpointer to the owning cache for this client. - std::string dataset_schema_path, // In: The path to the schema - std::shared_ptr *out_client) { // Out: the created storage client - // Make a new schema first. This only assigns the dataset type. It does not - // create the columns yet. 
- auto new_schema = std::make_unique(); - RETURN_IF_NOT_OK(new_schema->LoadDatasetType(dataset_schema_path)); - RETURN_IF_NOT_OK(CreateStorageClientSwitch(std::move(new_schema), store_op, out_client)); - return Status::OK(); -} - -// Name: CreateStorageClient() -// Description: A factory method to create the derived storage client. -// This creator is a user-override for the schema properties where -// the user has input the layout of the data (typically used in testcases) -Status StorageClient::CreateStorageClient( - StorageOp *store_op, // In: A backpointer to the owning cache for this client. - DatasetType in_type, // In: The type of dataset - std::shared_ptr *out_client) { // Out: the created storage client - // The dataset type is passed in by the user. Create an empty schema with only - // only the dataset type filled in and then create the client with it. - auto new_schema = std::make_unique(); - new_schema->set_dataset_type(in_type); - RETURN_IF_NOT_OK(CreateStorageClientSwitch(std::move(new_schema), store_op, out_client)); - return Status::OK(); -} - -// Name: LoadDatasetLayout() -// Description: There are 2 ways to define the properties of the data in the storage -// layer: LoadDatasetLayout() and AssignDatasetLayout(). -// LoadDatasetLayout() will parse the json config file that comes with -// the dataset. -Status StorageClient::LoadDatasetLayout() { - // Access the json file to populate our schema, assume the json file is accessible - // locally. - RETURN_IF_NOT_OK(data_schema_->LoadSchemaFile(storage_op_->schema_file(), storage_op_->columns_to_load())); - - // The number of rows in the schema file is an optional config. For example, - // maybe the derived storage client will know how to determine the total number - // of rows a different way rather than having it in the schema config json file. - // Thus, mNumRowsInDataset can still be zero and force the derived class override - // to determine it another way. 
- uint32_t num_rows = 0; - RETURN_IF_NOT_OK(this->numRowsFromFile(num_rows)); - CHECK_FAIL_RETURN_UNEXPECTED(num_rows <= MAX_INTEGER_INT32, "numRows exceeds the boundary numRows>2147483647"); - if (num_rows_in_dataset_ == 0 || num_rows < num_rows_in_dataset_) { - num_rows_in_dataset_ = num_rows; - } - - return Status::OK(); -} - -// Name: AssignDatasetLayout() -// Description: There are 2 ways to define the properties of the data in the storage -// layer: LoadDatasetLayout() and AssignDatasetLayout(). -// AssignDatasetLayout() will take input from the caller and assign that -// info into the storage client. -Status StorageClient::AssignDatasetLayout(uint32_t num_rows, // In: The number of rows in the dataset - const DataSchema &schema) { // In: The schema for the dataset - // Since this is just an assignment into the storage client, you probably won't need - // to override this one in a derived class. First some sanity checks - CHECK_FAIL_RETURN_UNEXPECTED(data_schema_->dataset_type() == schema.dataset_type(), - "Assigning a schema into StorageClient with mismatched dataset types!"); - CHECK_FAIL_RETURN_UNEXPECTED(data_schema_->NumColumns() == 0, - "Assigning a schema into StorageClient that already has non-empty schema!"); - - // The current schema was just an empty one with only the dataset field populated. - // Let's copy construct a new one that will be a copy of the input schema (releasing the old - // one) and then set the number of rows that the user requested. - data_schema_ = std::make_unique(schema); - CHECK_FAIL_RETURN_UNEXPECTED(num_rows <= MAX_INTEGER_INT32, "numRows exceeds the boundary numRows>2147483647"); - num_rows_in_dataset_ = num_rows; - - return Status::OK(); -} - -// Name: numRowsFromFile() -// Description: Reads the schema json file to see if the optional numRows field has -// been set and returns it. 
-Status StorageClient::numRowsFromFile(uint32_t &num_rows) const { - std::string schemaFile = storage_op_->schema_file(); - try { - std::ifstream in(schemaFile); - nlohmann::json js; - in >> js; - if (js.find("numRows") == js.end()) { - num_rows = MAX_INTEGER_INT32; - } else { - num_rows = js.value("numRows", 0); - } - if (num_rows == 0) { - std::string err_msg = - "Storage client has not properly done dataset " - "handshake to initialize schema and number of rows."; - RETURN_STATUS_UNEXPECTED(err_msg); - } - } - // Catch any exception and rethrow it as our own - catch (const std::exception &err) { - std::ostringstream ss; - ss << "Schema file failed to load:\n" << err.what(); - std::string err_msg = ss.str(); - RETURN_STATUS_UNEXPECTED(err_msg); - } - return Status::OK(); -} - -// Get'r function -DataSchema *StorageClient::schema() const { return data_schema_.get(); } -} // namespace dataset -} // namespace mindspore diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/storage_client.h b/mindspore/ccsrc/dataset/engine/datasetops/source/storage_client.h deleted file mode 100644 index 6198f4233f..0000000000 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/storage_client.h +++ /dev/null @@ -1,128 +0,0 @@ -/** - * Copyright 2019 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#ifndef DATASET_ENGINE_DATASETOPS_SOURCE_STORAGE_CLIENT_H_ -#define DATASET_ENGINE_DATASETOPS_SOURCE_STORAGE_CLIENT_H_ - -#include -#include -#include -#include -#include "dataset/engine/data_schema.h" -#include "dataset/engine/datasetops/source/storage_op.h" -#include "dataset/util/status.h" - -namespace mindspore { -namespace dataset { -// The Storage Client is the interface and base class that the StorageOp -// will use to perform any interactions with the storage layer. -// The different types of datasets will have different derived classes -// under that storage client super class. -class StorageClient { - public: - // Name: Constructor - // Description: - StorageClient(std::unique_ptr schema, // In: The schema for this storage client. - StorageOp *store_op); // In: The StorageOp that's using this client - - // Destructor - virtual ~StorageClient() { storage_op_ = nullptr; } - - virtual Status Init() { return Status::OK(); } - - // Name: CreateStorageClient() - // Description: A factory method to create the derived storage client. - // Every dataset has a required field for the dataset type in a config - // file. This type will determine the child class to return for the - // type of storage client. - static Status CreateStorageClient(StorageOp *store_op, // In: A backpointer to the owning storage op for this client. - std::string dataset_schema_path, // In: The path to the dataset - std::shared_ptr *out_client); // Out: the created storage client - - // Name: CreateStorageClient() - // Description: A factory method to create the derived storage client. - // This creator is a user-override for the schema properties where - // the user has input the layout of the data (typically used in testcases) - static Status CreateStorageClient(StorageOp *store_op, // In: A backpointer to the owning cache for this client. 
- DatasetType in_type, // In: The type of dataset - std::shared_ptr *out_client); // Out: the created storage client - - // Name: Print() - // Description: A function that prints info about the StorageClient - virtual void Print(std::ostream &out) const; // In: The output stream to print to - - // Provide stream operator for displaying - friend std::ostream &operator<<(std::ostream &out, const StorageClient &storage_client) { - storage_client.Print(out); - return out; - } - - // Name: LoadDatasetLayout() - // Description: There are 2 ways to define the properties of the data in the storage - // layer: LoadDatasetLayout() and AssignDatasetLayout(). - // LoadDatasetLayout() will parse the json config file that comes with - // the dataset and internally populate row counts and schema. - virtual Status LoadDatasetLayout(); - - // Name: AssignDatasetLayout() - // Description: There are 2 ways to define the properties of the data in the storage - // layer: LoadDatasetLayout() and AssignDatasetLayout(). - // AssignDatasetLayout() will take input from the caller and assign that - virtual Status AssignDatasetLayout(uint32_t num_rows, // In: The number of rows in the dataset - const DataSchema &schema); // In: The schema for the dataset - - // Name: Reset() - // Description: Resets any state info inside the client back to it's initialized - // state. - virtual Status Reset() = 0; - - // Name: IsMoreData - // Description: General routine to ask if more data exists in the storage side for - // a given buffer id. - virtual bool IsMoreData(uint32_t id) { return true; } - - // Name: numRowsFromFile() - // Description: Reads the schema json file to see if the optional numRows field has - // been set and returns it. 
- Status numRowsFromFile(uint32_t &num_rows) const; - - // Get'r functions - DataSchema *schema() const; - - uint32_t num_rows() const { return num_rows_in_dataset_; } - - // Name: rows_per_buffer() - // Description: This default version simply gives you the count of the requested - // rows per buffer that the user defined in the storage op. - // However, if some condition down in the storage client layers - // could result in a buffer that has a different number of rows, - // then the derived class can override this method to provide their - // own implementation. - virtual uint32_t rows_per_buffer() { return storage_op_->rows_per_buffer(); } - - // Description: Get the label classes num. Only manifest and Imagenet dataset support this parameter - virtual uint32_t num_classes() const { return 0; } - - protected: - std::unique_ptr data_schema_; // The schema for the data - uint32_t num_rows_in_dataset_; // The number of rows in the dataset - StorageOp *storage_op_; // Back pointer to the owning storage operator. - std::vector col_names_; - uint32_t num_classes_; -}; -} // namespace dataset -} // namespace mindspore - -#endif // DATASET_ENGINE_DATASETOPS_SOURCE_STORAGE_CLIENT_H_ diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/storage_op.cc b/mindspore/ccsrc/dataset/engine/datasetops/source/storage_op.cc deleted file mode 100644 index 052e474b6e..0000000000 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/storage_op.cc +++ /dev/null @@ -1,607 +0,0 @@ -/** - * Copyright 2019 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#define MAX_INTEGER_UINT32 4294967295 -#define MAX_INTEGER_INT32 2147483647 - -#include "dataset/engine/datasetops/source/storage_client.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "common/utils.h" -#include "dataset/core/config_manager.h" -#include "dataset/core/constants.h" -#include "dataset/core/global_context.h" -#include "dataset/engine/data_buffer.h" -#include "dataset/engine/datasetops/dataset_op.h" -#include "dataset/engine/datasetops/parallel_op.h" -#include "dataset/engine/db_connector.h" -#include "dataset/engine/data_schema.h" -#include "dataset/engine/execution_tree.h" -#include "dataset/util/queue.h" -#include "dataset/engine/datasetops/source/storage_op.h" -#include "dataset/util/task_manager.h" -#include "utils/log_adapter.h" - -namespace mindspore { -namespace dataset { -// Builder constructor. Creates the builder object. -StorageOp::Builder::Builder() - : build_dataset_files_dir_(""), - build_schema_file_(""), - build_num_rows_(0), - build_data_distribution_file_(""), - build_batch_size_(1), - build_drop_remainder_(false) { - // Some arguments to the StorageOp constructor have a default argument that is taken - // from the client config. - // The user may choose to change these values for the construction of the StorageOp by - // using the various builder set methods. 
- - std::shared_ptr cfg = GlobalContext::config_manager(); - build_rows_per_buffer_ = cfg->rows_per_buffer(); - build_worker_connector_size_ = cfg->worker_connector_size(); - build_num_workers_ = cfg->num_parallel_workers(); - build_op_connector_size_ = cfg->op_connector_size(); -} - -// The builder "build" method creates the final object. -Status StorageOp::Builder::Build(std::shared_ptr *ptr) { - // There are 2 "flavours" of construction for a StorageOp: - // - // 1) Does a handshake with the dataset to identify row ranges and to identify - // the schema (internally the handshake does lookup against a json file in the dataset) - // - // 2) The user manually creates a schema and defines the row ranges, so there is no real - // dataset handshake. - // - // The decision about which style is called will depend on if the user supplied the - // schema and row range fields. - - const std::string dataset_schema_file("datasetSchema.json"); - if (build_schema_ != nullptr && build_num_rows_ == 0) { - return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, - "Building a StorageOp with a given schema, but the number of rows not specified!"); - } - if (build_schema_ == nullptr && build_num_rows_ != 0) { - return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, - "Building a StorageOp with a given number of rows but schema not specified!"); - } - if (build_dataset_files_dir_.empty() && build_dataset_file_list_.empty()) { - return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, - "Building a StorageOp that has not provided the location of the data files."); - } - if (!build_dataset_files_dir_.empty() && !build_dataset_file_list_.empty()) { - return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, - "Building a StorageOp that has provided conflicting location of the data files."); - } - - std::shared_ptr new_storage_op = std::make_shared( - build_num_workers_, build_worker_connector_size_, build_rows_per_buffer_, build_op_connector_size_, - 
build_columns_to_load_, build_data_distribution_file_, build_batch_size_, build_drop_remainder_); - - // If there is no schema or number of rows given, then we go with construction method 1 - // where we need to handshake with storage client to find out what the schema (and - // number of rows) are based on schema file. - if (build_schema_ == nullptr && build_num_rows_ == 0) { - if (!build_dataset_files_dir_.empty()) { - // We have a dataset files dir, but do not have a schema file. - // Set the default schema file to be inside the same path as the dataset files dir. - if (build_schema_file_.empty()) { - build_schema_file_ = build_dataset_files_dir_ + "/" + dataset_schema_file; - } - RETURN_IF_NOT_OK(new_storage_op->InitOp(build_dataset_files_dir_, build_schema_file_, build_labels_file_name_, - build_dataset_usage_)); - } else { - // dataset is provided by list of files not dir_path - RETURN_IF_NOT_OK(new_storage_op->InitOp(build_dataset_file_list_, build_schema_file_)); - } - } else { - // else, the user gave us a schema and a row range, go with construction method 2, where we use - // the user-provided schema, but we still need to identify our data files. 
- RETURN_IF_NOT_OK(new_storage_op->InitOp(build_num_rows_, build_dataset_files_dir_, std::move(build_schema_), - build_labels_file_name_, build_dataset_usage_)); - } - - // Call the actual workhorse of the constructor - RETURN_IF_NOT_OK(new_storage_op->init()); - *ptr = std::move(new_storage_op); - return Status::OK(); -} - -StorageOp::StorageOp(int32_t num_workers, int32_t worker_connector_size, int32_t rows_per_buffer, - int32_t op_connector_size, std::vector columns_to_load, - std::string data_distribution_file, int32_t batch_size, bool drop_remainder) - : ParallelOp(num_workers, op_connector_size), - worker_conn_size_(worker_connector_size), - rows_per_buffer_(rows_per_buffer), - num_rows_(0), - buffers_fetched_(0), - columns_to_load_(columns_to_load), - data_distribution_file_(data_distribution_file), - device_num_(1), - device_id_(0), - shard_config_("ALL"), - seed_(0), - shuffle_config_(false), - num_classes_(0), - batch_size_(batch_size), - drop_remainder_(drop_remainder) {} - -// Init of the StorageOp. This is 1 of 3 init. -// This version of the init does not take the schema in it's arguments. It must perform an -// internal handshake with the dataset to produce the schema. -Status StorageOp::InitOp(const std::string &dataset_files_dir, const std::string &schema_file, - const std::string &labels_file_name, const std::string &dataset_usage) { - dataset_files_dir_ = dataset_files_dir; - schema_file_ = schema_file; - labels_file_name_ = labels_file_name; - dataset_usage_ = dataset_usage; - - // Storage ops require the internal master/worker connector. create it here - RETURN_IF_NOT_OK(ParallelOp::CreateWorkerConnector(worker_conn_size_)); - - // Get parameter for distribution. - RETURN_IF_NOT_OK(LoadParallelConfig()); - - // Create the storage client. This will read the json file to determine what - // type of client we're creating. 
- RETURN_IF_NOT_OK(StorageClient::CreateStorageClient(this, schema_file_, &store_client_)); - - // Perform the initial handshake with the storage client to further read the - // dataset info to populate schema info and the number of rows in the client. - RETURN_IF_NOT_OK(store_client_->LoadDatasetLayout()); - - // Pull out the number of rows from the client and save into the op. - num_rows_ = store_client_->num_rows(); - num_classes_ = store_client_->num_classes(); - - return Status::OK(); -} - -// Init of the StorageOp. This is 2 of 3 init. -// This version of the init allows the user to input the schema and other dataset properties rather -// than get it from the dataset itself. -Status StorageOp::InitOp(int32_t num_rows, const std::string &dataset_files_dir, - std::unique_ptr data_schema, const std::string &labels_file_name, - const std::string &dataset_usage) { - num_rows_ = num_rows; - dataset_files_dir_ = dataset_files_dir; - labels_file_name_ = labels_file_name; - dataset_usage_ = dataset_usage; - - // Storage ops require the internal master/worker connector. create it here - RETURN_IF_NOT_OK(ParallelOp::CreateWorkerConnector(worker_conn_size_)); - - // Get parameter for distribution. - RETURN_IF_NOT_OK(LoadParallelConfig()); - - // Create the storage client based on the dataset type given from the input schema. - RETURN_IF_NOT_OK(StorageClient::CreateStorageClient(this, data_schema->dataset_type(), &store_client_)); - - // Perform the initial handshake with the storage client to initialize the schema - // and the number of rows in the set. In this case, since the schema and the number - // of rows is input by the user directly, it's not much of a "handshake", it's more - // like an assign. - RETURN_IF_NOT_OK(store_client_->AssignDatasetLayout(num_rows_, *data_schema)); - num_classes_ = store_client_->num_classes(); - - return Status::OK(); -} - -// Init of the StorageOp. This is 3 of 3 init. 
-// This version of the init does not take the schema in it's arguments. It must perform an -// internal handshake with the dataset to produce the schema. Unlike constructor 1, it takes a -// list of files rather than a directory. -Status StorageOp::InitOp(const std::vector &files_list, const std::string &schema_file) { - dataset_file_list_ = files_list; - schema_file_ = schema_file; - - // Storage ops require the internal master/worker connector. create it here - RETURN_IF_NOT_OK(ParallelOp::CreateWorkerConnector(worker_conn_size_)); - - // Get parameter for distribution. - RETURN_IF_NOT_OK(LoadParallelConfig()); - - // Create the storage client. This will read the json file to determine what - // type of client we're creating. - RETURN_IF_NOT_OK(StorageClient::CreateStorageClient(this, schema_file_, &store_client_)); - - // Perform the initial handshake with the storage client to further read the - // dataset info to populate schema info and the number of rows in the client. - RETURN_IF_NOT_OK(store_client_->LoadDatasetLayout()); - - // Pull out the number of rows from the client and save into the op. - num_rows_ = store_client_->num_rows(); - - return Status::OK(); -} - -// Private helper method. This one encapsulates some common construction/reset tasks and is -// designed to be re-entrant so that you can re-init a previously used StorageOp without needing -// to redo the storage client handshake. -Status StorageOp::init() { - // First a sanity check to make sure the StorageClient initialization has done the proper - // handshake and initialized both the schema and the number of rows for the dataset. 
- const DataSchema *the_schema = store_client_->schema(); - if (the_schema->NumColumns() == 0 || num_rows_ == 0) { - return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, - "Storage client did not run handshake to init schema and number of rows."); - } - - // Now that we have schema, generate the column name map (base class field) - for (int32_t i = 0; i < the_schema->NumColumns(); ++i) { - column_name_id_map_[the_schema->column(i).name()] = i; - } - - // If the data buffer vector is not empty, then we may be redoing a scan again after a repeat. - // In such a case, we have vector of nullptrs that used to hold the buffers. get rid of this - // so we can reuse the vector. - if (!data_buffers_.empty()) { - data_buffers_.clear(); - } - int32_t buffers_needed; - - // We have our range of row id's, but we must carve this up into buffers now so that - // each buffer holds a subset of the overall range. - // Instantiate the buffers now, but this does not actually drive a load of actual - // data at this point. - - // First, compute how many buffers we would need to accomplish rowsPerBuffer - buffers_needed = this->num_rows() / rows_per_buffer_; - - // If an extra partial buffer is needed, adjust for that. - if (this->num_rows() % rows_per_buffer_ != 0) { - buffers_needed++; - } - MS_LOG(DEBUG) << "Master: Initializing StorageOp. Dataset files dir: " << dataset_files_dir_ << " Dataset type: " - << static_cast::type>(store_client_->schema()->dataset_type()) - << " Dataset schema file: " << schema_file_ << " Number of rows: " << num_rows_ - << " Rows per buffer: " << rows_per_buffer_ << " Num buffers (computed): " << buffers_needed - << " Number of workers: " << num_workers_ << "."; - - // Next, create each buffer in a loop. 
- int32_t buff_id = 0; - for (buff_id = 0; buff_id < buffers_needed; buff_id++) { - // Create a new data buffer as a base class pointer, using the factory method from - // DataBuffer class - std::unique_ptr new_data_buffer; - RETURN_IF_NOT_OK(DataBuffer::CreateDataBuffer(buff_id, store_client_, &new_data_buffer)); - - // Insert the buffer into our vector - data_buffers_.push_back(std::move(new_data_buffer)); - } - - // Instantiate the action queues. If this was a re-entrant call then these already exist. - // We cannot drop and recreate them because there are threads waiting on them currently. - // They should be empty anyway in a reset codepath - if (action_queue_.empty()) { - // The max size of these queues should ensure they will never get full and they support - // precisely the amount of data that we know they will hold (the total number of buffers). - // There needs to be one queue for each worker, to support the Connector design for how - // data will be fetched and pushed into a Connector in parallel. - // - // Say the total buffers is 5, and we have 2 workers. - // To support this, we'd need 1 queue of size 2 and the other of size 3. - // For simplicity, we'll make both of them 3 so they are the same size. - int32_t action_queue_size = (buffers_needed / num_workers_) + 1; - for (int32_t i = 0; i < num_workers_; ++i) { - auto new_queue = std::make_unique>(action_queue_size); - action_queue_.push_back(std::move(new_queue)); - } - } - - // Extract the list of buffer id's from the vector and use this as our starting action - // queue of buffers. 
- RETURN_IF_NOT_OK(this->FillActionQueue(false)); - return Status::OK(); -} - -// Destructor -StorageOp::~StorageOp() {} - -// A print method typically used for debugging -void StorageOp::Print(std::ostream &out, bool show_all) const { - // Always show the id and name as first line regardless if this summary or detailed print - out << "(" << std::setw(2) << operator_id_ << ") :"; - if (!show_all) { - // Call the super class for displaying any common 1-liner info - ParallelOp::Print(out, show_all); - // Then show any custom derived-internal 1-liner info for this op - out << "\n"; - } else { - // Call the super class for displaying any common detailed info - ParallelOp::Print(out, show_all); - // Then show any custom derived-internal stuff - out << "\nDetailed operator printing has not been implemented for this op.\n\n"; - } -} - -// Private helper method. This one posts a control indicator for each worker thread to consume -// from the action queue. When the worker pops this msg, it will shut itself down gracefully. -Status StorageOp::PostEndOfData() { - MS_LOG(DEBUG) << "Master: Processed all of the buffers. Send end-of-data message to workers."; - - // For each worker we add the message so that they can all get the memo - for (int32_t i = 0; i < num_workers_; ++i) { - RETURN_IF_NOT_OK(action_queue_[i]->Add(kEndOfActions)); - } - return Status::OK(); -} - -// Private helper method. This one populates the action queue with the list of buffer ids. -Status StorageOp::FillActionQueue(bool randomize) { - // We only support adding the new list of id's to the queue if we are sure the old list - // of actions is already done. This might change in the future though - for (int32_t i = 0; i < num_workers_; ++i) { - if (!(action_queue_[i]->empty())) { - return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, - "Attempt to get buffer id's into a queue, but the queue not empty!"); - } - } - if (!data_buffers_.empty()) { - // Add buffer id's to the queue. 
Buffer id's in our vector are just numbers from 0 up, so - // basically just a list of consecutive numbers starting from 0 (incremented by 1). - // If randomize is requested, the list of id's will be jumbled up (so not consecutive - // order) - if (!randomize) { - // Round robin of filling each worker with the buffer id's - int32_t curr_worker = 0; - for (int32_t i = 0; i < data_buffers_.size(); ++i) { - RETURN_IF_NOT_OK(action_queue_[curr_worker]->Add(i)); - curr_worker++; - if (curr_worker == num_workers_) { - curr_worker = 0; - } - } - } else { - std::vector random_ids; - int32_t i; - for (i = 0; i < data_buffers_.size(); ++i) { - random_ids.push_back(i); - } - uint32_t seed = std::chrono::system_clock::now().time_since_epoch().count(); - std::shuffle(random_ids.begin(), random_ids.end(), std::default_random_engine(seed)); - - // Round robin of filling each worker with the buffer id's from randomized list - int32_t curr_worker = 0; - for (i = 0; i < random_ids.size(); ++i) { - RETURN_IF_NOT_OK(action_queue_[curr_worker]->Add(random_ids[i])); - curr_worker++; - if (curr_worker == num_workers_) { - curr_worker = 0; - } - } - } - } - return Status::OK(); -} - -// The entry point code for when workers are launched. -// Given the input bufferId, it returns a shared_ptr to that buffer back to you by driving a -// load operation. This function is intended to be run by worker threads, when they are -// populating the memory with the actual data of the buffer. -Status StorageOp::GetBuffer(int32_t buffer_id, std::unique_ptr *ptr) { - if (!data_buffers_.empty()) { - if (static_cast(buffer_id) >= data_buffers_.size()) { - std::ostringstream ss; - ss << "Error. 
Buffer id " << buffer_id << " is out of range."; - std::string err_msg = ss.str(); - RETURN_STATUS_UNEXPECTED(err_msg); - } - - // execute a load operation to fill this buffer (may result in call to storage layers) - RETURN_IF_NOT_OK(data_buffers_[buffer_id]->Load()); - - // Return the buffer - // Important: The share pointer remains counted for the caller as well as locally in the - // mDataBuffers array. Later when the buffer is sent on it's way up the pipeline, the - // shared_ptr in the array will be reset so that the StorageOp will not hang on to old - // buffers that it has already passed up the pipeline. - *ptr = std::move(data_buffers_[buffer_id]); - } else { - RETURN_STATUS_UNEXPECTED("Requested to get a buffer from an empty cache."); - } - return Status::OK(); -} - -// Class functor operator () override. -// All dataset ops operate by launching a thread (see ExecutionTree). This class functor will -// provide the master loop that drives the logic for performing the work -Status StorageOp::operator()() { - // Before we enter our master loop, kick off our workers and assign them to - // use the StorageOp worker entry code. - RETURN_IF_NOT_OK(tree_->LaunchWorkers(num_workers_, std::bind(&StorageOp::WorkerEntry, this, std::placeholders::_1))); - // Handshake with TaskManager to synchronize thread creation - TaskManager::FindMe()->Post(); - int32_t num_buffers_to_fetch = data_buffers_.size(); - - // The storage op is the bottom node in the tree, so it does not listen to an input - // queue from an operator below us. Instead, we'll will read from the internal queue - // that our workers produce into, and then push that into output queue. - bool done = false; - std::unique_ptr fetched_buffer; - while (!done) { - // Get the next buffer. We are single thread master so thread id hard coded to 0 - // on the connector pop. Count this buffer towards our count, and then push - // it up to the output connector. 
- RETURN_IF_NOT_OK(worker_connector_->PopWithRetry(0, &fetched_buffer)); - buffers_fetched_++; - int32_t buffer_id = fetched_buffer->id(); - - if (buffers_fetched_ == 1) { - num_buffers_to_fetch = static_cast(data_buffers_.size()); - } - - // There should be 2 holders of this buffer currently. We have one in the mDataBuffers - // table, and then ourselves right now with fetchedBuffer. - // Reduce the shared_ptr ref count of this buffer by removing it from the mDataBuffers - // table first before we push the buffer to output connector. - data_buffers_[buffer_id].reset(); - MS_LOG(DEBUG) << "StorageOp master: Consumed buffer " << buffer_id << " from internal worker connector."; - RETURN_IF_NOT_OK(out_connector_->Add(0, std::move(fetched_buffer))); - MS_LOG(DEBUG) << "StorageOp master: pushed buffer " << buffer_id << " to output connector."; - - // Now, check our loop exit conditions and perform appropriate end of data handling if - // we've reached the end of our scan. - if (buffers_fetched_ == num_buffers_to_fetch) { - MS_LOG(DEBUG) << "StorageOp master: Reached end of data."; - - // If we are not inside of a Repeat path in the tree, or we are in a repeat path but - // this was our last repeat, then we do a full quit here with eof control message. - if (!BitTest(op_ctrl_flags_, kDeOpRepeated) || BitTest(op_ctrl_flags_, kDeOpLastRepeat)) { - // Post the control message to tell the workers to stop waiting on action queue - // because we are done! 
- RETURN_IF_NOT_OK(this->PostEndOfData()); - std::unique_ptr eoeBuffer = std::make_unique(0, DataBuffer::kDeBFlagEOE); - RETURN_IF_NOT_OK(out_connector_->Add(0, std::move(eoeBuffer))); - MS_LOG(DEBUG) << "StorageOp master: Flow end-of-data eof message."; - std::unique_ptr eofBuffer = std::make_unique(0, DataBuffer::kDeBFlagEOF); - RETURN_IF_NOT_OK(out_connector_->Add(0, std::move(eofBuffer))); - MS_LOG(DEBUG) << "StorageOp master: Main execution loop complete."; - done = true; // while loop exit - } else { - // We are in a repeat path and it's not the last repeat. - // Flow an end-of-epoch control message up the pipeline. - // RepeatOp above us somewhere in the tree will re-init us with the data to fetch again - // once it gets the end-of-epoch message. - MS_LOG(DEBUG) << "StorageOp master: Flow end-of-epoch eoe message."; - std::unique_ptr eoe_buffer = std::make_unique(0, DataBuffer::kDeBFlagEOE); - RETURN_IF_NOT_OK(out_connector_->Add(0, std::move(eoe_buffer))); - - // reset our buffer count and go to loop again. - buffers_fetched_ = 0; - - // This is a bit of a cheat. Only the repeat op should perform resetting actions - // against us (currently). However, if we go to block/wait on the worker_connector_ - // right now before the reset is done (driven from the repeat op), then we end - // up using stale connector index info and blocking on the wrong thing, causing - // invalid order during the next epoch. - // For now then, do a quick reset of just the connector queue so that we block - // at a safe starting point in the connector. - worker_connector_->Reset(); - } - } - } - return Status::OK(); -} - -// The entry point code for when workers are launched. 
-Status StorageOp::WorkerEntry(int32_t worker_id) { - int32_t next_action_id = 0; - MS_LOG(DEBUG) << "Worker: StorageOp worker entry point."; - - // Handshake with TaskManager to synchronize the creation - TaskManager::FindMe()->Post(); - - // While there is still some actions to perform - RETURN_IF_NOT_OK(action_queue_[worker_id]->PopFront(&next_action_id)); - while (next_action_id != kEndOfActions) { - // Drive a load of this buffer and get a pointer to the buffer after it's loaded in - std::unique_ptr dB; - RETURN_IF_NOT_OK(this->GetBuffer(next_action_id, &dB)); - MS_LOG(DEBUG) << "Worker: Loaded buffer " << next_action_id << "."; - - // Add the buffer to the internal queue for master to consume from later. - // This could end up blocking if the queue is full in which case it waits here - // until the master can drain a buffer off the queue. - RETURN_IF_NOT_OK(worker_connector_->Add(worker_id, std::move(dB))); - MS_LOG(DEBUG) << "Worker: Pushed buffer " << next_action_id << " to internal worker connector."; - - // Get the next action id and loop - RETURN_IF_NOT_OK(action_queue_[worker_id]->PopFront(&next_action_id)); - } - MS_LOG(DEBUG) << "Worker: Received end-of-data message. Worker complete."; - return Status::OK(); -} - -const DataSchema *StorageOp::schema() const { return store_client_->schema(); } - -// Overrides base class reset method. When an operator does a reset, it cleans up any state -// info from it's previous execution and then initializes itself so that it can be executed -// again. -Status StorageOp::Reset() { - RETURN_IF_NOT_OK(ParallelOp::Reset()); // Call our super class reset first. - - // We do not need to redo the handshake with the storage client, since that - // info should be the same as the last time. However there may be stale - // state info in the client from the last execution. The client provides - // a reset method as well to re-initialize. 
- RETURN_IF_NOT_OK(store_client_->Reset()); - - // init method is re-entrant and will refresh everything. - RETURN_IF_NOT_OK(this->init()); - return Status::OK(); -} - -// Name: LoadParallelConfig -// Description: Load parallel config info from a specific config file. In multi-P cases (or single-P cases), we -// need to know deviceID, rank, device number, shard mode -// , shuffle (or not) and seed to prepare to scatter files. -Status StorageOp::LoadParallelConfig() { - if (data_distribution_file_ == "") { - return Status::OK(); - } - try { - std::ifstream in(data_distribution_file_); - nlohmann::json js; - in >> js; - device_num_ = js.value("deviceNum", 0); - device_id_ = js.value("deviceId", 0); - if (device_num_ == 0 || device_num_ > MAX_INTEGER_INT32) { - RETURN_STATUS_UNEXPECTED("Invalid deviceNum"); - } - if (device_id_ > MAX_INTEGER_INT32 || device_id_ >= device_num_) { - MS_LOG(DEBUG) << "In parallel config file " << data_distribution_file_ << ", wrong deviceID provided."; - RETURN_STATUS_UNEXPECTED("Invalid deviceId"); - } - shard_config_ = js.value("shardConfig", ""); - if (shard_config_ != "ALL" && shard_config_ != "UNIQUE" && shard_config_ != "RANDOM") { - MS_LOG(DEBUG) << "In parallel config file " << data_distribution_file_ << " wrong mShardConfig provided."; - RETURN_STATUS_UNEXPECTED("Invalid shardConfig"); - } - std::string shuffle_str = js.value("shuffle", ""); - if (shuffle_str == "ON") { - shuffle_config_ = true; - } else if (shuffle_str == "OFF") { - shuffle_config_ = false; - } else { - MS_LOG(DEBUG) << "In parallel config file " << data_distribution_file_ - << ", shuffle config is wrong: it's not ON or OFF"; - RETURN_STATUS_UNEXPECTED("Invalid shuffle option"); - } - seed_ = js.value("seed", 0); - if (seed_ > MAX_INTEGER_UINT32) { - RETURN_STATUS_UNEXPECTED("Invalid seed"); - } - } catch (const std::exception &e) { - RETURN_STATUS_UNEXPECTED("Load parallel config failed"); - } - return Status::OK(); -} -} // namespace dataset -} // namespace 
mindspore diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/storage_op.h b/mindspore/ccsrc/dataset/engine/datasetops/source/storage_op.h deleted file mode 100644 index 9334addc34..0000000000 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/storage_op.h +++ /dev/null @@ -1,389 +0,0 @@ -/** - * Copyright 2019 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef DATASET_ENGINE_DATASETOPS_SOURCE_STORAGE_OP_H_ -#define DATASET_ENGINE_DATASETOPS_SOURCE_STORAGE_OP_H_ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "dataset/engine/data_schema.h" -#include "dataset/engine/datasetops/parallel_op.h" -#include "dataset/util/status.h" - -namespace mindspore { -namespace dataset { -// Forward declares -template -class Queue; - -// A type for a container of DataBuffer shared_ptr's -using DataBuffers = std::vector>; - -// A type for the queue of buffer id's for workers to fetch. -using ActionQueue = std::vector>>; - -// Forward declare -class DataBuffer; - -class StorageClient; - -class StorageOp : public ParallelOp { - public: - // The nested builder class inside of the StorageOp is used to help manage all of the arguments - // for constructing it. Use the builder by setting each argument with the provided set methods, - // and then finally call the build method to execute the actual construction. - class Builder { - public: - // Builder constructor. 
Creates the builder object. - // @note No default args - // @return This is a constructor. - Builder(); - - // Default destructor - ~Builder() = default; - - // Setter method. - // @return Builder setter method returns reference to the builder. - Builder &SetNumRows(int num_rows) { - build_num_rows_ = num_rows; - return *this; - } - - // Setter method. - // @return Builder setter method returns reference to the builder. - Builder &SetRowsPerBuffer(int rows_per_buffer) { - build_rows_per_buffer_ = rows_per_buffer; - return *this; - } - - // Setter method. - // @return Builder setter method returns reference to the builder. - Builder &SetSchema(std::unique_ptr schema) { - build_schema_ = std::move(schema); - return *this; - } - - // Setter method. - // @return Builder setter method returns reference to the builder. - Builder &SetNumWorkers(int32_t num_workers) { - build_num_workers_ = num_workers; - return *this; - } - - // Setter method. - // @return Builder setter method returns reference to the builder. - Builder &SetWorkerConnectorSize(int32_t connector_size) { - build_worker_connector_size_ = connector_size; - return *this; - } - - // Setter method. - // @return Builder setter method returns reference to the builder. - Builder &SetOpConnectorSize(int32_t connector_size) { - build_op_connector_size_ = connector_size; - return *this; - } - - // Setter method. - // @return Builder setter method returns reference to the builder. - Builder &SetSchemaDir(const std::string &schema_dir) { - build_schema_file_ = schema_dir + "/datasetSchema.json"; - return *this; - } - - // Setter method. - // @return Builder setter method returns reference to the builder. - Builder &SetSchemaFile(const std::string &schema_file) { - build_schema_file_ = schema_file; - return *this; - } - - // Setter method. - // @return Builder setter method returns reference to the builder. 
- Builder &SetDatasetFilesDir(const std::string &files_dir) { - build_dataset_files_dir_ = files_dir; - return *this; - } - - // Setter method. - // @return Builder setter method returns reference to the builder. - Builder &SetDatasetFileList(const std::vector &file_list) { - build_dataset_file_list_ = file_list; - return *this; - } - - // Setter method. - // @return Builder setter method returns reference to the builder. - Builder &SetColumnsToLoad(const std::vector &columns) { - build_columns_to_load_ = columns; - return *this; - } - - // Setter method. - // @return Builder setter method returns reference to the builder. - Builder &SetDataDistributionFile(const std::string &data_distribution_file) { - build_data_distribution_file_ = data_distribution_file; - return *this; - } - - // Setter method. - // @return Builder setter method returns reference to the builder. - Builder &setLabelsFileName(const std::string &labels_file_name) { - build_labels_file_name_ = labels_file_name; - return *this; - } - - // Setter method. - // @return Builder setter method returns reference to the builder. - Builder &SetDatasetUsage(const std::string &dataset_usage) { - build_dataset_usage_ = dataset_usage; - return *this; - } - - // Setter method. - // @return Builder setter method returns reference to the builder. - Builder &SetBatchSize(int32_t batch_size) { - build_batch_size_ = batch_size; - return *this; - } - - // Setter method. - // @return Builder setter method returns reference to the builder. - Builder &SetDropRemainder(bool drop_remainder) { - build_drop_remainder_ = drop_remainder; - return *this; - } - - // The builder "build" method creates the final object. - // @param shared_ptr to the new StorageOp object - // @return Status - The error code return - Status Build(std::shared_ptr *); - - private: - // The builder saves all StorageOp construction arguments internally. - // The following are the arguments. 
- std::string build_dataset_files_dir_; - std::string build_schema_file_; - int32_t build_num_rows_; - std::string build_data_distribution_file_; - int32_t build_rows_per_buffer_; - int32_t build_worker_connector_size_; - int32_t build_num_workers_; - int32_t build_op_connector_size_; - std::unique_ptr build_schema_; - std::vector build_dataset_file_list_; - std::vector build_columns_to_load_; - std::string build_labels_file_name_; - std::string build_dataset_usage_; - int32_t build_batch_size_; - bool build_drop_remainder_; - }; - - // Constructor of the StorageOp. - // @note The builder class should be used to call it - // @param num_workers - The number of workers for the op - // @param worker_connector_size - The internal connector size between workers and master - // @param rows_per_buffer - The requested number of rows per buffer - // @param op_connector_size - The output connector queue size - // @param columns_to_load - The list of columns to use (column name) - StorageOp(int32_t num_workers, int32_t worker_connector_size, int32_t rows_per_buffer, int32_t op_connector_size, - std::vector columns_to_load, std::string data_distribution_file, int32_t batch_size, - bool drop_remainder); - - // Init the StorageOp. This is 1 of 3 init. - // This version of the init does not take the schema in it's arguments. It must perform an - // internal handshake with the dataset to produce the schema. - // @note The builder class should be used to call it - // @param dataset_files_dir - The directory that has the dataset files - // @param schema_file - The schema file for providing column info - Status InitOp(const std::string &dataset_files_dir, const std::string &schema_file, - const std::string &labels_file_name, const std::string &dataset_usage); - - // Init the StorageOp. This is 2 of 3 init. - // This version of the init allows the user to input the schema and other dataset properties rather - // than get it from the dataset itself. 
- // @note The builder class should be used to call it - // @param num_rows - The number of rows in the dataset - // @param dataset_files_dir - The directory that has the dataset files - // @param data_schema - The schema to use - Status InitOp(int32_t num_rows, const std::string &dataset_files_dir, std::unique_ptr data_schema, - const std::string &labels_file_name, const std::string &dataset_usage); - - // Init the StorageOp. This is 3 of 3 init. - // This version of the init does not take the schema in it's arguments. It must perform an - // internal handshake with the dataset to produce the schema. Unlike constructor 1, it takes a - // list of files rather than a directory. - // @note The builder class should be used to call it - // @param files_list - The list of files to use for the dataset - // @param schema_file - The schema file for providing column info - Status InitOp(const std::vector &files_list, const std::string &schema_file); - - // Destructor - ~StorageOp(); - - // A print method typically used for debugging - // @param out - The output stream to write output to - // @param show_all - A bool to control if you want to show all info or just a summary - void Print(std::ostream &out, bool show_all) const override; - - // << Stream output operator overload - // @notes This allows you to write the debug print info using stream operators - // @param out - reference to the output stream being overloaded - // @param storage_op - reference to the StorageOp to display - // @return - the output stream must be returned - friend std::ostream &operator<<(std::ostream &out, const StorageOp &storage_op) { - storage_op.Print(out, false); - return out; - } - - // Class functor operator () override. - // All DatasetOps operate by launching a thread (see ExecutionTree). This class functor will - // provide the master loop that drives the logic for performing the work. 
- // @return Status - The error code return - Status operator()() override; - - // The entry point code for when workers are launched. - // @param worker_id - The worker id - // @return Status - The error code return - Status WorkerEntry(int32_t worker_id) override; - - // The entry point code for when workers are launched. - // Given the input bufferId, it returns a shared_ptr to that buffer back to you by driving a - // load operation. This function is intended to be run by worker threads, when they are - // populating the memory with the actual data of the buffer. - // @param buffer_id - The buffer id to get. - // @param ptr - Pointer to shared_ptr to the buffer that was loaded in. - // @return Status - The error code return - Status GetBuffer(int32_t buffer_id, std::unique_ptr *ptr); - - // Overrides base class reset method. When an operator does a reset, it cleans up any state - // info from it's previous execution and then initializes itself so that it can be executed - // again. - // @return Status - The error code return - Status Reset() override; - - // Getter method - int32_t num_rows() const { return num_rows_; } - - // Setter method - void set_num_rows(int32_t num_rows) { num_rows_ = num_rows; } - - // Getter method - int32_t rows_per_buffer() const { return rows_per_buffer_; } - - // Setter method - void set_rows_per_buffer(int32_t rows_per_buffer) { rows_per_buffer_ = rows_per_buffer; } - - // Getter method - std::string dataset_files_dir() const { return dataset_files_dir_; } - - // Getter method - std::vector dataset_file_list() const { return dataset_file_list_; } - - // Getter method - std::string schema_file() const { return schema_file_; } - - // Getter method - const DataSchema *schema() const; - - // Getter method - const std::vector columns_to_load() const { return columns_to_load_; } - - // Getter method - std::string data_distribution_file() const { return data_distribution_file_; } - - // Getter method - int32_t device_num() const { return 
device_num_; } - - // Getter method - int32_t device_id() const { return device_id_; } - - // Getter method - std::string shard_config() const { return shard_config_; } - - // Getter method - uint32_t seed() const { return seed_; } - - // Getter method - bool shuffle_config() const { return shuffle_config_; } - - // Getter method - int32_t num_classes() const { return num_classes_; } - - // Getter method - std::string labels_file_name() const { return labels_file_name_; } - - // Getter method - std::string dataset_usage() const { return dataset_usage_; } - - // Getter method - int32_t batch_size() const { return batch_size_; } - - // Getter method - bool drop_remainder() const { return drop_remainder_; } - - private: - // Private helper method. This one populates the action queue with the list of buffer ids. - // @param randomize - T/F if the id's in the action queue should be randomized or sequential. - Status FillActionQueue(bool randomize); - - // Private helper method. This one encapsulates some common construction/reset tasks and is - // designed to be re-entrant so that you can re-init a previously used StorageOp without needing - // to redo the storage client handshake. - // @return Status - The error code return - Status init(); - - // Private helper method. This one posts a control indicator for each worker thread to consume - // from the action queue. When the worker pops this msg, it will shut itself down gracefully. - // @return Status - The error code return - Status PostEndOfData(); - - Status LoadParallelConfig(); - - DataBuffers data_buffers_; // A vector of pointers to buffers - std::shared_ptr store_client_; // The client for interacting with storage - ActionQueue action_queue_; // The queues of buffer id's for workers to fetch. - int32_t worker_conn_size_; // connector size for internal worker queue - int32_t rows_per_buffer_; // The number of requested rows per buffer. 
- int32_t num_rows_; // One more than the last row id in the range for this cache - std::string dataset_files_dir_; // The path for the dataset files - std::vector dataset_file_list_; // List of paths to files for the dataset - int32_t buffers_fetched_; // Counter for the buffers that were fetched - std::string schema_file_; // Path to the schema json file - std::vector columns_to_load_; // Columns to load from dataset - std::string data_distribution_file_; // Distribution configuration file - int32_t device_num_; // All device number - int32_t device_id_; // Device id - std::string shard_config_; // ALL UNIQUE RANDOM - uint32_t seed_; // Used for shuffle - bool shuffle_config_; // True or false - std::string labels_file_name_; // File name of labels - int32_t num_classes_; // Label class number - std::string dataset_usage_; // train/eval/inference - int32_t batch_size_; - bool drop_remainder_; -}; -} // namespace dataset -} // namespace mindspore - -#endif // DATASET_ENGINE_DATASETOPS_SOURCE_STORAGE_OP_H_ diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/text_file_op.cc b/mindspore/ccsrc/dataset/engine/datasetops/source/text_file_op.cc index e51eb4e00d..26058cc8b8 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/text_file_op.cc +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/text_file_op.cc @@ -33,7 +33,11 @@ namespace mindspore { namespace dataset { TextFileOp::Builder::Builder() - : builder_device_id_(0), builder_num_devices_(1), builder_num_samples_(0), builder_shuffle_files_(false) { + : builder_device_id_(0), + builder_num_devices_(1), + builder_total_rows_(0), + builder_shuffle_files_(false), + builder_shuffle_global_(false) { std::shared_ptr config_manager = GlobalContext::config_manager(); builder_num_workers_ = config_manager->num_parallel_workers(); builder_op_connector_size_ = config_manager->op_connector_size(); @@ -43,7 +47,7 @@ TextFileOp::Builder::Builder() Status TextFileOp::Builder::ValidateInputs() const { 
std::string err_msg; - err_msg += builder_num_workers_ <= 0 ? "Number of parallel workers should be greate than 0\n" : ""; + err_msg += builder_num_workers_ <= 0 ? "Number of parallel workers should be greater than 0\n" : ""; err_msg += builder_device_id_ >= builder_num_devices_ || builder_num_devices_ < 1 ? "Wrong sharding configs\n" : ""; return err_msg.empty() ? Status::OK() : Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, err_msg); } @@ -62,25 +66,27 @@ Status TextFileOp::Builder::Build(std::shared_ptr *op) { builder_schema_->AddColumn(ColDescriptor("text", DataType(DataType::DE_UINT8), TensorImpl::kFlexible, 1))); std::shared_ptr text_file_op = std::make_shared( - builder_num_workers_, builder_rows_per_buffer_, builder_num_samples_, builder_worker_connector_size_, + builder_num_workers_, builder_rows_per_buffer_, builder_total_rows_, builder_worker_connector_size_, std::move(builder_schema_), builder_text_files_list_, builder_op_connector_size_, builder_shuffle_files_, - builder_num_devices_, builder_device_id_); + builder_shuffle_global_, builder_num_devices_, builder_device_id_); RETURN_IF_NOT_OK(text_file_op->Init()); *op = std::move(text_file_op); return Status::OK(); } -TextFileOp::TextFileOp(int32_t num_workers, int64_t rows_per_buffer, int64_t num_samples, int32_t worker_connector_size, +TextFileOp::TextFileOp(int32_t num_workers, int64_t rows_per_buffer, int64_t total_rows, int32_t worker_connector_size, std::unique_ptr schema, std::vector text_files_list, - int32_t op_connector_size, bool shuffle_files, int32_t num_device, int32_t device_id) + int32_t op_connector_size, bool shuffle_files, bool shuffle_global, int32_t num_device, + int32_t device_id) : ParallelOp(num_workers, op_connector_size), device_id_(device_id), num_devices_(num_device), rows_per_buffer_(rows_per_buffer), - num_samples_(num_samples), + total_rows_(total_rows), text_files_list_(std::move(text_files_list)), shuffle_files_(shuffle_files), + 
shuffle_global_(shuffle_global), data_schema_(std::move(schema)), all_num_rows_(0), num_rows_per_shard_(0), @@ -104,9 +110,9 @@ void TextFileOp::Print(std::ostream &out, bool show_all) const { // Call the super class for displaying any common detailed info ParallelOp::Print(out, show_all); // Then show any custom derived-internal stuff - out << "\nRows per buffer: " << rows_per_buffer_ << "\nSample count: " << num_samples_ - << "\nDevice id: " << device_id_ << "\nNumber of devices: " << num_devices_ - << "\nShuffle files: " << ((shuffle_files_) ? "yes" : "no") << "\nText files list:\n"; + out << "\nRows per buffer: " << rows_per_buffer_ << "\nRow count: " << total_rows_ << "\nDevice id: " << device_id_ + << "\nNumber of devices: " << num_devices_ << "\nShuffle files: " << ((shuffle_files_) ? "yes" : "no") + << "\nText files list:\n"; for (int i = 0; i < text_files_list_.size(); ++i) { out << " " << text_files_list_[i]; } @@ -314,8 +320,7 @@ Status TextFileOp::FillIOBlockQueue(const std::vector &i_keys) { break; } } - auto file_it = filename_index_->Search(*it); - file_index.emplace_back(std::pair(file_it.value(), *it)); + file_index.emplace_back(std::pair((*filename_index_)[*it], *it)); } } else { for (auto it = filename_index_->begin(); it != filename_index_->end(); ++it) { @@ -404,9 +409,9 @@ Status TextFileOp::operator()() { RETURN_IF_NOT_OK(jagged_buffer_connector_->Pop(0, &buffer)); if (buffer->eoe()) { workers_done++; - } else if (num_samples_ == 0 || rows_read < num_samples_) { - if ((num_samples_ > 0) && (rows_read + buffer->NumRows() > num_samples_)) { - int64_t rowsToRemove = buffer->NumRows() - (num_samples_ - rows_read); + } else if (total_rows_ == 0 || rows_read < total_rows_) { + if ((total_rows_ > 0) && (rows_read + buffer->NumRows() > total_rows_)) { + int64_t rowsToRemove = buffer->NumRows() - (total_rows_ - rows_read); RETURN_IF_NOT_OK(buffer->SliceOff(rowsToRemove)); } rows_read += buffer->NumRows(); diff --git 
a/mindspore/ccsrc/dataset/engine/datasetops/source/text_file_op.h b/mindspore/ccsrc/dataset/engine/datasetops/source/text_file_op.h index 8b8eda00fe..dd258d914e 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/text_file_op.h +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/text_file_op.h @@ -107,8 +107,15 @@ class TextFileOp : public ParallelOp { // Setter method. // @return Builder - setter method returns reference to the builder. - Builder &SetNumSamples(int64_t num_samples) { - builder_num_samples_ = num_samples; + Builder &SetShuffleGlobal(bool shuffle_global) { + builder_shuffle_global_ = shuffle_global; + return *this; + } + + // Setter method. + // @return Builder - setter method returns reference to the builder. + Builder &SetTotalRows(int64_t total_rows) { + builder_total_rows_ = total_rows; return *this; } @@ -118,10 +125,11 @@ class TextFileOp : public ParallelOp { int32_t builder_num_workers_; int32_t builder_op_connector_size_; int64_t builder_rows_per_buffer_; - int64_t builder_num_samples_; + int64_t builder_total_rows_; int32_t builder_worker_connector_size_; std::vector builder_text_files_list_; bool builder_shuffle_files_; + bool builder_shuffle_global_; std::unique_ptr builder_schema_; }; @@ -135,10 +143,11 @@ class TextFileOp : public ParallelOp { // @param op_connector_size - size of each queue in the connector that the child operator pulls from. // @param columns_to_load - the names of the columns to load data from. // @param shuffle_files - whether or not to shuffle the files before reading data. + // @param shuffle_global - whether or not to shuffle the entire dataset. // @param equal_rows_per_shard - whether or not to get equal rows for each process. 
- TextFileOp(int32_t num_workers, int64_t rows_per_buffer, int64_t num_samples, int32_t worker_connector_size, + TextFileOp(int32_t num_workers, int64_t rows_per_buffer, int64_t total_rows, int32_t worker_connector_size, std::unique_ptr, std::vector text_files_list, int32_t op_connector_size, - bool shuffle_files, int32_t num_devices, int32_t device_id); + bool shuffle_files, bool shuffle_global, int32_t num_devices, int32_t device_id); // Default destructor ~TextFileOp() = default; @@ -169,6 +178,18 @@ class TextFileOp : public ParallelOp { // @return Status - the error coed returned. static Status CountAllFileRows(const std::vector &files, int64_t *count); + // Op name getter + // @return Name of the current Op + std::string Name() const override { return "TextFileOp"; } + + // File names getter + // @return Vector of the input file names + std::vector FileNames() { return text_files_list_; } + + // Global shuffle flag getter + // @return Bool - whether this Op requires global shuffle + bool RequireGlobalShuffle() { return shuffle_global_; } + private: // The entry point for when workers are launched. // @param worker_id - the id of the worker that is executing this function. 
@@ -246,9 +267,10 @@ class TextFileOp : public ParallelOp { int32_t device_id_; int32_t num_devices_; int64_t rows_per_buffer_; - int64_t num_samples_; + int64_t total_rows_; std::vector text_files_list_; bool shuffle_files_; + bool shuffle_global_; std::unique_ptr data_schema_; int64_t all_num_rows_; int64_t num_rows_per_shard_; diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/tf_buffer.cc b/mindspore/ccsrc/dataset/engine/datasetops/source/tf_buffer.cc deleted file mode 100644 index 8803c3f040..0000000000 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/tf_buffer.cc +++ /dev/null @@ -1,326 +0,0 @@ -/** - * Copyright 2019 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include "dataset/engine/datasetops/source/tf_buffer.h" -#include -#include -#include -#include -#include - -#include "common/utils.h" -#include "utils/log_adapter.h" - -#include "dataset/engine/datasetops/source/tf_client.h" -#include "dataset/core/data_type.h" -#include "dataset/engine/datasetops/source/storage_client.h" -#include "dataset/engine/data_schema.h" - -namespace mindspore { -namespace dataset { -// constructor -TFBuffer::TFBuffer( - uint32_t id, // In: The id for this buffer - BufferFlags flags, // In: The flags for this buffer - const std::shared_ptr &storage_client) // In: Storage client that is related to this buffer type - : DataBuffer(id, flags), storage_client_(storage_client) {} - -// destructor -TFBuffer::~TFBuffer() {} - -// Name: print() -// Description: A function that prints info -void TFBuffer::Print(std::ostream &out, // In: The output stream to print to - bool show_all) const { // In: T/F if it should print everything - out << "TFBuffer print\n"; - - // Call base class printer - DataBuffer::Print(out, show_all); -} - -// Name: load() -// Description: populates the DataBuffer with data -// Overrides base-class method. -Status TFBuffer::Load() { - const DataSchema *the_schema = storage_client_->schema(); - uint32_t num_columns = the_schema->NumColumns(); - uint32_t num_rows_requested = storage_client_->rows_per_buffer(); - uint32_t remaining_rows = storage_client_->num_rows() > buffer_id_ * storage_client_->rows_per_buffer() - ? storage_client_->num_rows() - buffer_id_ * storage_client_->rows_per_buffer() - : 0; - if (remaining_rows < num_rows_requested) { - num_rows_requested = remaining_rows; - } - - // Construct the Tensor table for this buffer. - tensor_table_ = std::make_unique(); - - // At each position in the tensor table, instantiate the shared pointer to it's Tensor. 
- uint32_t row = 0; - while (row < num_rows_requested && (cur_reader_.peek() != EOF || storage_client_->IsMoreData(buffer_id_))) { - TensorRow new_row; - - // Read the data from storage into a tf_file format - dataengine::Example tf_file; - RETURN_IF_NOT_OK(ParseSingleExample(&tf_file)); - for (uint32_t col = 0; col < num_columns; ++col) { - std::shared_ptr new_t; - const ColDescriptor current_col = the_schema->column(col); - const dataengine::Features &example_features = tf_file.features(); - const google::protobuf::Map &feature_map = example_features.feature(); - const dataengine::Feature &column_values_list = feature_map.at(current_col.name()); - const dataengine::Feature::KindCase column_list_type = column_values_list.kind_case(); - RETURN_IF_NOT_OK(LoadFeature(column_list_type, column_values_list, current_col, &new_t)); - - // Add the column to the current tensor row - new_row.push_back(std::move(new_t)); - } - - // Add the new row of tensors to the end of our tensor table - tensor_table_->push_back(new_row); - row++; - } - cur_reader_.close(); - return Status::OK(); -} - -// Name: ParseSingleExample() -// Description: Drives the calls to TFClient for fetching the tf_file info from -// the tf_file files. Returns a single row of data from the tf_file -// files. 
-Status TFBuffer::ParseSingleExample(dataengine::Example *ptr) { - if (cur_reader_.peek() == EOF) { - auto client = std::dynamic_pointer_cast(storage_client_); - if (client == nullptr) { - std::string errMsg = "Unexpected storage client type for TFBuffer"; - RETURN_STATUS_UNEXPECTED(errMsg); - } - RETURN_IF_NOT_OK(client->NextFileInfo(buffer_id_, &cur_f_info_)); - cur_reader_.close(); - cur_reader_.open(cur_f_info_.fileName); - // Seek to the offset - (void)cur_reader_.seekg(static_cast(cur_f_info_.startOffset)); - MS_LOG(DEBUG) << "got new file " << cur_f_info_.fileName << "."; - } - - // one record in tf_file looks like: - // Format of a single record: - // uint64 length - // uint32 masked crc of length - // byte data[length] - // uint32 masked crc of data - // read length - if (cur_reader_.peek() == EOF) { - MS_LOG(ERROR) << "ParseSingleExample failed"; - } - - dataengine::Example tf_file; - try { - uint64_t record_length = 0; - (void)cur_reader_.read(reinterpret_cast(&record_length), static_cast(sizeof(uint64_t))); - - // ignore crc header - (void)cur_reader_.ignore(static_cast(sizeof(uint32_t))); - - // read serialized Example - std::string serialized_example; - serialized_example.resize(record_length); - (void)cur_reader_.read(&serialized_example[0], static_cast(record_length)); - - // ignore crc footer - (void)cur_reader_.ignore(static_cast(sizeof(uint32_t))); - - if (!tf_file.ParseFromString(serialized_example)) { - std::string err_msg = "parse tf_file failed"; - RETURN_STATUS_UNEXPECTED(err_msg); - } - } catch (const std::exception &err) { - std::string err_msg = "Please check if the data file is complete!"; - RETURN_STATUS_UNEXPECTED(err_msg); - } - *ptr = tf_file; - return Status::OK(); -} - -// Name: LoadFeature() -// Description: Given the column type of the tf record and the values list, -// constructs the tensor and returns it. 
-Status TFBuffer::LoadFeature(const dataengine::Feature::KindCase &column_list_type, - const dataengine::Feature &column_values_list, const ColDescriptor ¤t_col, - std::shared_ptr *out_tensor) { - std::string element_str; // For staging data from protobuf deserialization - std::unique_ptr int_array; // For staging data from protobuf deserialization - std::unique_ptr float_array; // For staging data from protobuf deserialization - const unsigned char *data_ptr = nullptr; // Generic pointer used for populating the Tensor - // This variable will point into the above staging - // variables. - uint32_t num_elements = 0; // Generic counter used for setting shape attributes - - // Depending on the type of data from the tf_file, we want to extract 2 things: - // 1) A pointer to the data as a const unsigned char * - // 2) The number of elements of the data - // After those are determined, we can then build the tensor to represent this data. - - switch (column_list_type) { - // CASE : TF record type: kBytesList - case dataengine::Feature::KindCase::kBytesList: { - RETURN_IF_NOT_OK(LoadBytesList(current_col, column_values_list, &element_str)); - - // Get the const pointer representation of this data, and the number of elements - // (number of bytes) for this tensor. 
- data_ptr = reinterpret_cast(common::SafeCStr(element_str)); - num_elements = element_str.length(); - break; - } - - // CASE : TF record type: kFloatList - case dataengine::Feature::KindCase::kFloatList: { - RETURN_IF_NOT_OK(LoadFloatList(current_col, column_values_list, &num_elements, &float_array)); - - data_ptr = reinterpret_cast(float_array.get()); - break; - } - - // CASE : TF record type: kInt64List - case dataengine::Feature::KindCase::kInt64List: { - RETURN_IF_NOT_OK(LoadIntList(current_col, column_values_list, &num_elements, &int_array)); - - data_ptr = reinterpret_cast(int_array.get()); - break; - } - case dataengine::Feature::KindCase::KIND_NOT_SET: { - std::string errMsg = "tf_file column list type enum is KIND_NOT_SET"; - RETURN_STATUS_UNEXPECTED(errMsg); - } - default: { - std::string errMsg = "tf_file column list type enum does not match any known DE type"; - RETURN_STATUS_UNEXPECTED(errMsg); - } - } - - // At this point we have a raw pointer to the data, and we have the number of elements. - // Along with the tensor implementation type and the data type from the schema, we - // enough info to construct the Tensor for it. - TensorShape current_shape = TensorShape::CreateUnknownRankShape(); - RETURN_IF_NOT_OK(CreateTensorShapeForColumn(current_col, num_elements, ¤t_shape)); - - // Now, create this tensor directly into the appropriate slot in our tensor - // table. - RETURN_IF_NOT_OK( - Tensor::CreateTensor(out_tensor, current_col.tensorImpl(), current_shape, current_col.type(), data_ptr)); - - return Status::OK(); -} - -Status TFBuffer::LoadBytesList(const ColDescriptor ¤t_col, const dataengine::Feature &column_values_list, - std::string *element_str) { - // kBytesList can map to the following DE types ONLY! - // DE_UINT8, DE_INT8 - // Must be single byte type for each element! 
- if (current_col.type() != DataType::DE_UINT8 && current_col.type() != DataType::DE_INT8) { - std::string err_msg = "Invalid datatype for Tensor at column: " + current_col.name(); - RETURN_STATUS_UNEXPECTED(err_msg); - } - const dataengine::BytesList &bytes_list = column_values_list.bytes_list(); - - // A bytesList is a special case where the entire list of data can be - // deserialized into a single string. For example, it is not a list - // of bytes, it is a list of strings, where each string represents - // a list of bytes (this is different from the other cases like IntList etc) - // As such, if there is more than one string in this list, that is invalid. - if (bytes_list.value_size() > 1) { - std::string err_msg = "Bytes list contains more than one element for column: " + current_col.name(); - RETURN_STATUS_UNEXPECTED(err_msg); - } - - // Extract the string that contains the bytes we need. Position 0 is the only - // valid string here. - *element_str = bytes_list.value(0); - - return Status::OK(); -} - -Status TFBuffer::LoadFloatList(const ColDescriptor ¤t_col, const dataengine::Feature &column_values_list, - uint32_t *num_elements, std::unique_ptr *float_array) { - // KFloatList can only map to DE types: - // DE_FLOAT32 - if (current_col.type() != DataType::DE_FLOAT32) { - std::string err_msg = "Invalid datatype for Tensor at column: " + current_col.name(); - RETURN_STATUS_UNEXPECTED(err_msg); - } - const dataengine::FloatList &float_list = column_values_list.float_list(); - - // Identify how many values we have and then create a local array of these - // to deserialize into - *num_elements = float_list.value_size(); - *float_array = std::make_unique(*num_elements); - for (int i = 0; i < float_list.value_size(); i++) { - (*float_array)[i] = float_list.value(i); - } - - return Status::OK(); -} - -Status TFBuffer::LoadIntList(const ColDescriptor ¤t_col, const dataengine::Feature &column_values_list, - uint32_t *num_elements, std::unique_ptr *int_array) { - // 
KInt64List can only map to DE types: - // DE_UINT64, DE_INT64, DE_UINT32, DE_INT32, DE_UINT16, DE_INT16, DE_UINT8, DE_INT8 - if (!(current_col.type().IsInt())) { - std::string err_msg = "Invalid datatype/rank for column label in TFBuffer."; - RETURN_STATUS_UNEXPECTED(err_msg); - } - - const dataengine::Int64List &int64_list = column_values_list.int64_list(); - - // Identify how many values we have and then create a local array of these - // to deserialize into - *num_elements = int64_list.value_size(); - *int_array = std::make_unique(*num_elements); - for (int i = 0; i < int64_list.value_size(); i++) { - (*int_array)[i] = int64_list.value(i); - } - - return Status::OK(); -} - -Status TFBuffer::CreateTensorShapeForColumn(const ColDescriptor ¤t_col, uint32_t num_elements, - TensorShape *current_shape) { - // If the shape is assigned by user, we have an assumption that the data is - // already in the appropriate format that we can copy into the Tensor as-is. - if (current_col.hasShape()) { - *current_shape = current_col.shape(); - } else if (current_col.rank() == 1) { - // If shape was not given, then we support 2 possible shapes. - // 1) It's a scalar (rank 0), in which case the shape is empty but we need to flag - // it as a scalar value (empty shape but has a single value) - // 2) It's a rank 1 shape, and the dimension value for that single dimension will - // be comprised of the entire bytes-size of the input data. - *current_shape = TensorShape({num_elements}); - } else if (current_col.rank() == 0) { - // Make this shape into a single value scalar. - *current_shape = TensorShape::CreateScalar(); - } else if (current_col.rank() > 1) { - // All other ranks, except for 0, are invalid because we cannot guess - // what the shape will be. For example, if we have rank 3 and 12 bytes - // of data, is it shape {2,2,3} or is it {2,6,1}. We can't guess at - // the shape dimensions. - const std::string kErrMsg = "Invalid rank (rank>1) for dynamic shape construction. 
Specify shape in schema."; - RETURN_STATUS_UNEXPECTED(kErrMsg); - } - - return Status::OK(); -} -} // namespace dataset -} // namespace mindspore diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/tf_buffer.h b/mindspore/ccsrc/dataset/engine/datasetops/source/tf_buffer.h deleted file mode 100644 index 389f4a76d9..0000000000 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/tf_buffer.h +++ /dev/null @@ -1,91 +0,0 @@ -/** - * Copyright 2019 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef DATASET_ENGINE_DATASETOPS_SOURCE_TF_BUFFER_H_ -#define DATASET_ENGINE_DATASETOPS_SOURCE_TF_BUFFER_H_ - -#include -#include -#include -#include -#include "dataset/engine/data_buffer.h" -#include "proto/example.pb.h" -#include "dataset/engine/datasetops/source/tf_client.h" - -namespace mindspore { -namespace dataset { -// This TFBuffer is the buffer type for dealing with tf record data. 
-class TFBuffer : public DataBuffer { - public: - // constructor - TFBuffer(uint32_t id, // In: The id for this buffer - DataBuffer::BufferFlags flags, // In: The flags for this buffer - const std::shared_ptr - &storage_client); // In: The storage client that is related to this buffer type - - // destructor - ~TFBuffer() override; - - // Name: print() - // Description: A function that prints info - void Print(std::ostream &out, // In: The output stream to print to - bool show_all) const override; // In: T/F if it should print everything - - // Provide stream operator for displaying it - friend std::ostream &operator<<(std::ostream &out, const TFBuffer &tf_buffer) { - tf_buffer.Print(out, false); // Show meta info only - return out; - } - - // Name: load() - // Description: populates the DataBuffer with data. - // Overrides base-class method. - Status Load() override; - - private: - std::ifstream cur_reader_; - FileInfo cur_f_info_; - - std::shared_ptr storage_client_; // The storage client for populating the buffer initially. - - // Name: ParseSingleExample() - // Description: Drives the calls to TFClient for fetching the tf_file info from - // the tf_file files. Returns a single row of data from the tf_file - // files. - Status ParseSingleExample(dataengine::Example *ptr); - - // Name: LoadFeature() - // Description: Given the column type of the tf record and the values list, - // constructs the tensor and returns it. 
- Status LoadFeature(const dataengine::Feature::KindCase &column_list_type, - const dataengine::Feature &column_values_list, const ColDescriptor ¤t_col, - std::shared_ptr *out_tensor); - - Status LoadBytesList(const ColDescriptor ¤t_col, const dataengine::Feature &column_values_list, - std::string *element_str); - - Status LoadFloatList(const ColDescriptor ¤t_col, const dataengine::Feature &column_values_list, - uint32_t *num_elements, std::unique_ptr *float_array); - - Status LoadIntList(const ColDescriptor ¤t_col, const dataengine::Feature &column_values_list, - uint32_t *num_elements, std::unique_ptr *int_array); - - Status CreateTensorShapeForColumn(const ColDescriptor ¤t_col, uint32_t num_elements, - TensorShape *current_shape); -}; -} // namespace dataset -} // namespace mindspore - -#endif // DATASET_ENGINE_DATASETOPS_SOURCE_TF_BUFFER_H_ diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/tf_client.cc b/mindspore/ccsrc/dataset/engine/datasetops/source/tf_client.cc deleted file mode 100644 index 9e8cd67ae6..0000000000 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/tf_client.cc +++ /dev/null @@ -1,376 +0,0 @@ -/** - * Copyright 2019 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "dataset/engine/datasetops/source/tf_client.h" - -#include -#include -#include -#include -#include -#include - -#include "common/utils.h" -#include "proto/example.pb.h" -#include "dataset/engine/datasetops/source/storage_client.h" -#include "dataset/util/path.h" -#include "dataset/util/status.h" -#include "dataset/engine/datasetops/source/storage_op.h" -#include "utils/log_adapter.h" - -namespace mindspore { -namespace dataset { -// Name: Constructor -// Description: Creates the TFClient. -TFClient::TFClient(std::unique_ptr schema, // In: The schema for this storage client. - StorageOp *so) // In: The StorageOp that's using this client - : StorageClient(std::move(schema), so), - rows_per_buffer_(so->rows_per_buffer()), - random_seed_generator_(so->seed()), - random_seed_distribution_(0, std::numeric_limits::max()), - rows_per_shard_(0) {} - -Status TFClient::Init() { - // Initialize queue to hold the tf file names - const std::string kExtensionData = ".data"; - const std::string kExtensionTF = ".tfrecord"; - bool schema_init = false; - if (!storage_op_->dataset_files_dir().empty()) { - MS_LOG(DEBUG) << "Reading dataset using datasetPath."; - Path data_set_directory(storage_op_->dataset_files_dir()); - auto dirIt = Path::DirIterator::OpenDirectory(&data_set_directory); - if (dirIt) { - while (dirIt->hasNext()) { - Path file = dirIt->next(); - std::string filename = file.toString(); - if ((file.Extension() == kExtensionData) || (file.Extension() == kExtensionTF)) { - const std::vector recs_lengths = ParseTfFileLines(filename); - v_total_file_rows_.emplace_back( - std::pair>(filename, std::move(recs_lengths))); - - // schema - if (!schema_init) { - RETURN_IF_NOT_OK(ParseTfFileSchema(filename)); - schema_init = true; - } - MS_LOG(INFO) << "found tf file: " << filename << ", num rows " << recs_lengths.size() << "."; - } - } - } else { - RETURN_STATUS_UNEXPECTED("Unable to open directory " + data_set_directory.toString()); - } - } else { - MS_LOG(DEBUG) 
<< "Reading dataset using dataset files list."; - for (auto filename : storage_op_->dataset_file_list()) { - const std::vector recs_lengths = ParseTfFileLines(filename); - v_total_file_rows_.emplace_back(std::pair>(filename, std::move(recs_lengths))); - - // schema - if (!schema_init) { - RETURN_IF_NOT_OK(ParseTfFileSchema(filename)); - schema_init = true; - } - MS_LOG(INFO) << "Processed tf file: " << filename << ", num rows " << recs_lengths.size() << "."; - } - } - - RETURN_IF_NOT_OK(CalculateRowsPerDevice()); - std::sort(v_total_file_rows_.begin(), v_total_file_rows_.end()); - RETURN_IF_NOT_OK(ScatterFileRows(static_cast(storage_op_->device_id()), storage_op_->shard_config(), - storage_op_->seed(), storage_op_->shuffle_config())); - - CalculateNumRows(); - InitStateInfo(); - return Status::OK(); -} - -// Sharding will reduce the number of rows. Doing this in constructor as we only want to do this once. -void TFClient::CalculateNumRows() { - num_rows_in_dataset_ = 0; - for (auto rows : file_start_end_offset_) { - num_rows_in_dataset_ += (rows.second - rows.first); - } -} - -Status TFClient::CalculateRowsPerDevice() { - uint64_t num = std::accumulate( - v_total_file_rows_.begin(), v_total_file_rows_.end(), 0, - [](uint64_t value, const std::pair> &a) { return value + a.second.size(); }); - if (static_cast(std::floor(num * 1.0 / storage_op_->device_num())) == 0) { - RETURN_STATUS_UNEXPECTED("Num rows of dataset is less than device number"); - } - rows_per_shard_ = static_cast(std::ceil(num * 1.0 / storage_op_->device_num())); - return Status::OK(); -} - -bool TFClient::ValidFileForShard(const uint64_t file_rows, uint64_t *start_offset, uint64_t *end_offset, - const uint64_t &pre_count, uint32_t device_id) const { - *start_offset = 0; - *end_offset = 0; - bool valid = false; - uint64_t start_index = device_id * rows_per_shard_; - uint64_t end_index = (device_id + 1) * rows_per_shard_; - - // First valid file - if (pre_count <= start_index && pre_count + file_rows > 
start_index) { - *start_offset = start_index - pre_count; - valid = true; - if (pre_count < end_index && pre_count + file_rows >= end_index) { - *end_offset = end_index - pre_count; - } else { - *end_offset = file_rows; - } - } - - // Second and subsequent files - if (pre_count > start_index && pre_count < end_index) { - *start_offset = 0; - valid = true; - if (pre_count + file_rows >= end_index) { - *end_offset = end_index - pre_count; - } else { - *end_offset = file_rows; - } - } - - return valid; -} - -void TFClient::GetValidFileForShard(const std::vector>> &v_files, - uint32_t device_id) { - uint64_t start_offset = 0; - uint64_t end_offset = 0; - uint64_t pre_count = 0; - bool finish = false; - while (!finish) { - for (const auto &file : v_files) { - if (ValidFileForShard(file.second.size(), &start_offset, &end_offset, pre_count, device_id)) { - std::pair offset(start_offset, end_offset); - file_start_end_offset_.emplace_back(offset); - v_file_rows_.emplace_back(file); - } - pre_count += file.second.size(); - } - if (pre_count < (device_id + 1) * rows_per_shard_) { - finish = false; - } else { - finish = true; - } - } -} - -// Description: Scatter file rows to local single-P according to config info. -// There are 3 modes: ALL, UNIQUE, RANDOM. For UNIQUE and RANDOM mode, shuffleConfig controls -// whether file row vector would be shuffled or not before a new mEopch. -// For ALL mode, temporarily, we deal with epoch in python part. 
-Status TFClient::ScatterFileRows(uint32_t device_id, const std::string &shard_config, uint32_t seed, - bool shuffle_config) { - if (shard_config == "UNIQUE" || shard_config == "RANDOM") { - std::vector>> v_shuffled_total_file_rows = - ShuffleVector(v_total_file_rows_, seed); - GetValidFileForShard(v_shuffled_total_file_rows, device_id); - if (shuffle_config) { - v_total_file_rows_ = v_shuffled_total_file_rows; - } - } else if (shard_config == "ALL") { - v_file_rows_.insert(v_file_rows_.end(), v_total_file_rows_.begin(), v_total_file_rows_.end()); - if (shuffle_config) { - v_total_file_rows_ = ShuffleVector(v_total_file_rows_, seed); - } - - for (const auto &file : v_file_rows_) { - std::pair offset(0, file.second.size()); - file_start_end_offset_.emplace_back(offset); - } - } else { - RETURN_STATUS_UNEXPECTED("In parallel config file, wrong shuffleConfig or shardConfig provided."); - } - - return Status::OK(); -} - -std::vector>> TFClient::ShuffleVector( - std::vector>> v, uint32_t seed = 1) { - std::default_random_engine randomEngine(seed); - std::shuffle(std::begin(v), std::end(v), randomEngine); - return v; -} - -void TFClient::CalculateStartOffset(const uint64_t start_index, const uint64_t end_index, - const std::vector &vec_length, uint64_t *start_offset) const { - for (size_t i = start_index; i < end_index; i++) { - // Format of a single record: - // uint64 length - // uint32 masked crc of length - // byte data[length] - // uint32 masked crc of data - *start_offset += sizeof(uint64_t) + 2 * sizeof(uint32_t) + vec_length[i]; - } -} - -void TFClient::InitStateInfo() { - uint32_t start_idx = 0, record_num = 0, buffer_id = 0; - uint64_t start_offset = 0; - bool first_buffer = true; - f_info_queue_.emplace_back(QFile()); - std::vector>>::iterator itr = v_file_rows_.begin(); - uint32_t index = 0; - while (itr != v_file_rows_.end()) { - uint32_t file_start_index = file_start_end_offset_[index].first; - uint32_t file_end_index = file_start_end_offset_[index].second; 
- FileInfo f_info; - f_info.fileName = itr->first; - f_info.startRecordIdx = start_idx > file_start_index ? start_idx : file_start_index; - if (first_buffer && f_info.startRecordIdx != 0) { - CalculateStartOffset(0, f_info.startRecordIdx, itr->second, &start_offset); - start_idx = static_cast(f_info.startRecordIdx); - } - first_buffer = false; - f_info.startOffset = start_offset; - if (start_idx + rows_per_buffer_ - record_num < itr->second.size()) { - uint64_t end_idx = start_idx + rows_per_buffer_ - record_num - 1; - f_info.endRecordIdx = end_idx > (file_end_index - 1) ? (file_end_index - 1) : end_idx; - f_info_queue_[buffer_id].push(f_info); - CalculateStartOffset(start_idx, f_info.endRecordIdx + 1, itr->second, &start_offset); - start_idx = start_idx + rows_per_buffer_ - record_num; - record_num = 0; - buffer_id++; - f_info_queue_.emplace_back(QFile()); - if (end_idx >= file_end_index - 1) { - start_idx = start_offset = 0; - ++itr; - ++index; - } - } else { - f_info.endRecordIdx = itr->second.size() - 1 > file_end_index - 1 ? 
file_end_index - 1 : itr->second.size() - 1; - f_info_queue_[buffer_id].push(f_info); - if (start_idx + rows_per_buffer_ - record_num == itr->second.size()) { - record_num = start_idx = start_offset = 0; - buffer_id++; - if (itr + 1 != v_file_rows_.end()) { - f_info_queue_.emplace_back(QFile()); - } - } else { - record_num += static_cast(itr->second.size()) - start_idx; - start_idx = start_offset = 0; - } - ++itr; - ++index; - } - } -} - -// Name: Print() -// Description: A function that prints info about the TFClient -void TFClient::Print(std::ostream &out) const { // In: The output stream to print to - out << "TF client."; -} - -std::vector TFClient::ParseTfFileLines(const std::string &filename) { - std::vector recs_lengths; - std::ifstream reader; - reader.open(filename); - while (true) { - if (reader.peek() == EOF) { - reader.close(); - break; - } - - // read length - uint64_t record_length = 0; - (void)reader.read(reinterpret_cast(&record_length), static_cast(sizeof(uint64_t))); - recs_lengths.push_back(record_length); - - // ignore crc header - (void)reader.ignore(static_cast(sizeof(uint32_t))); - - // ignore data length - (void)reader.ignore(static_cast(record_length)); - - // ignore crc footer - (void)reader.ignore(static_cast(sizeof(uint32_t))); - } - return recs_lengths; -} - -Status TFClient::ParseTfFileSchema(const std::string &filename) { - std::ifstream reader; - reader.open(filename); - std::string serialized_example; - // read length - uint64_t record_length = 0; - (void)reader.read(reinterpret_cast(&record_length), static_cast(sizeof(uint64_t))); - - // ignore crc header - (void)reader.ignore(static_cast(sizeof(uint32_t))); - - // read serialized Example - serialized_example.resize(record_length); - (void)reader.read(&serialized_example[0], static_cast(record_length)); - - // ignore crc footer - (void)reader.ignore(static_cast(sizeof(uint32_t))); - - reader.close(); - dataengine::Example tf_file; - if (!tf_file.ParseFromString(serialized_example)) 
{ - std::string err_msg = "parse tf_file failed, file name is " + filename; - RETURN_STATUS_UNEXPECTED(err_msg); - } - const dataengine::Features &example_features = tf_file.features(); - const google::protobuf::Map &feature_map = example_features.feature(); - for (auto it = feature_map.begin(); it != feature_map.end(); ++it) { - col_names_.push_back(it->first); - } - return Status::OK(); -} - -// Name: Reset() -// Description: Resets any state info inside the client back to it's initialized -// state. -Status TFClient::Reset() { - v_file_rows_.clear(); - file_start_end_offset_.clear(); - - uint32_t next_seed = random_seed_distribution_(random_seed_generator_); - RETURN_IF_NOT_OK(ScatterFileRows(static_cast(storage_op_->device_id()), storage_op_->shard_config(), - next_seed, storage_op_->shuffle_config())); - - CalculateNumRows(); - uint32_t num_rows_in_file = 0; - RETURN_IF_NOT_OK(this->numRowsFromFile(num_rows_in_file)); - if (num_rows_in_file < num_rows_in_dataset_) { - num_rows_in_dataset_ = num_rows_in_file; - } - - storage_op_->set_num_rows(static_cast(num_rows_in_dataset_)); - InitStateInfo(); - - return Status::OK(); -} - -Status TFClient::NextFileInfo(uint32_t id, FileInfo *ptr) { - if (f_info_queue_.empty() || id >= f_info_queue_.size() || f_info_queue_[id].empty()) { - RETURN_STATUS_UNEXPECTED("cannot find next FileInfo in mFInfoQueue"); - } - *ptr = f_info_queue_[id].front(); - f_info_queue_[id].pop(); - return Status::OK(); -} - -bool TFClient::IsMoreData(uint32_t id) { return (!f_info_queue_[id].empty()); } -} // namespace dataset -} // namespace mindspore diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/tf_client.h b/mindspore/ccsrc/dataset/engine/datasetops/source/tf_client.h deleted file mode 100644 index 3602f93351..0000000000 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/tf_client.h +++ /dev/null @@ -1,111 +0,0 @@ -/** - * Copyright 2019 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the 
"License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef DATASET_ENGINE_DATASETOPS_SOURCE_TF_CLIENT_H_ -#define DATASET_ENGINE_DATASETOPS_SOURCE_TF_CLIENT_H_ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "proto/example.pb.h" -#include "dataset/engine/datasetops/source/storage_client.h" -#include "dataset/util/status.h" - -struct FileInfo { - std::string fileName; - uint64_t startRecordIdx; - uint64_t endRecordIdx; - uint64_t startOffset; -}; - -using QFile = std::queue; - -namespace mindspore { -namespace dataset { -// forward declares -class DataSchema; -class ParallelOp; - -class TFClient : public StorageClient { - public: - // Name: Constructor - // Description: Creates the TFClient. - TFClient(std::unique_ptr schema, // In: The schema for this storage client. - StorageOp *so); // In: The ParallelOp that's using this client - - ~TFClient() {} - - Status Init() override; - - // Name: Print() - // Description: A function that prints info about the TFClient - void Print(std::ostream &out) const override; // In: The output stream to print to - - std::vector ParseTfFileLines(const std::string &filename); - - Status ParseTfFileSchema(const std::string &filename); - - Status NextFileInfo(uint32_t id, FileInfo *); - - bool IsMoreData(uint32_t id) override; - - // Name: Reset() - // Description: Resets any state info inside the client back to it's initialized - // state. 
- Status Reset() override; - - Status ScatterFileRows(uint32_t device_id, const std::string &shard_config, uint32_t seed, bool shuffle_config); - - private: - // hardcoded, put this in json schema - // const static int32_t BERT_DATASET_TOTAL_ROWS = 43900; - uint32_t rows_per_buffer_; - std::default_random_engine random_seed_generator_; - std::uniform_int_distribution random_seed_distribution_; - - std::vector>> v_file_rows_; - std::vector>> v_total_file_rows_; - std::vector f_info_queue_; - uint64_t rows_per_shard_; - std::vector> file_start_end_offset_; - - void InitStateInfo(); - - std::vector>> ShuffleVector( - std::vector>> v, uint32_t seed); - - Status CalculateRowsPerDevice(); - - bool ValidFileForShard(const uint64_t file_rows, uint64_t *start_offset, uint64_t *end_offset, - const uint64_t &pre_count, uint32_t device_id) const; - - void CalculateNumRows(); - - void GetValidFileForShard(const std::vector>> &v_files, - uint32_t device_id); - - void CalculateStartOffset(const uint64_t start_index, const uint64_t end_index, - const std::vector &vec_length, uint64_t *start_offset) const; -}; -} // namespace dataset -} // namespace mindspore - -#endif // DATASET_ENGINE_DATASETOPS_SOURCE_TF_CLIENT_H_ diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/tf_reader_op.cc b/mindspore/ccsrc/dataset/engine/datasetops/source/tf_reader_op.cc index 60adddb4a8..23dce8dc10 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/tf_reader_op.cc +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/tf_reader_op.cc @@ -15,14 +15,15 @@ */ #include "dataset/engine/datasetops/source/tf_reader_op.h" -#include -#include +#include +#include #include #include #include #include +#include #include -#include +#include #include "proto/example.pb.h" #include "./securec.h" @@ -32,8 +33,6 @@ #include "dataset/engine/connector.h" #include "dataset/engine/data_schema.h" #include "dataset/engine/datasetops/source/io_block.h" -#include 
"dataset/engine/datasetops/source/storage_client.h" -#include "dataset/engine/datasetops/source/tf_client.h" #include "dataset/engine/db_connector.h" #include "dataset/engine/execution_tree.h" #include "dataset/engine/jagged_connector.h" @@ -56,6 +55,7 @@ TFReaderOp::Builder::Builder() builder_op_connector_size_ = config_manager->op_connector_size(); builder_rows_per_buffer_ = config_manager->rows_per_buffer(); builder_shuffle_files_ = false; + builder_shuffle_global_ = false; builder_data_schema_ = std::make_unique(); } @@ -126,7 +126,8 @@ Status TFReaderOp::Builder::Build(std::shared_ptr *out_tf_reader_op) std::shared_ptr new_tf_reader_op = std::make_shared( builder_num_workers_, builder_worker_connector_size_, builder_rows_per_buffer_, builder_total_rows_, builder_dataset_files_list_, std::move(builder_data_schema_), builder_op_connector_size_, builder_columns_to_load_, - builder_shuffle_files_, builder_num_devices_, builder_device_id_, builder_equal_rows_per_shard_); + builder_shuffle_files_, builder_shuffle_global_, builder_num_devices_, builder_device_id_, + builder_equal_rows_per_shard_); RETURN_IF_NOT_OK(new_tf_reader_op->Init()); *out_tf_reader_op = std::move(new_tf_reader_op); @@ -136,8 +137,8 @@ Status TFReaderOp::Builder::Build(std::shared_ptr *out_tf_reader_op) TFReaderOp::TFReaderOp(int32_t num_workers, int32_t worker_connector_size, int64_t rows_per_buffer, int64_t total_num_rows, std::vector dataset_files_list, std::unique_ptr data_schema, int32_t op_connector_size, - std::vector columns_to_load, bool shuffle_files, int32_t num_device, - int32_t device_id, bool equal_rows_per_shard) + std::vector columns_to_load, bool shuffle_files, bool shuffle_global, + int32_t num_device, int32_t device_id, bool equal_rows_per_shard) : ParallelOp(num_workers, op_connector_size), device_id_(device_id), num_devices_(num_device), @@ -147,6 +148,7 @@ TFReaderOp::TFReaderOp(int32_t num_workers, int32_t worker_connector_size, int64 
columns_to_load_(std::move(columns_to_load)), finished_reading_dataset_(false), shuffle_files_(shuffle_files), + shuffle_global_(shuffle_global), data_schema_(std::move(data_schema)), filename_index_(std::make_unique()), load_io_block_queue_(true), @@ -172,7 +174,8 @@ void TFReaderOp::Print(std::ostream &out, bool show_all) const { // Then show any custom derived-internal stuff out << "\nRows per buffer: " << rows_per_buffer_ << "\nTotal rows: " << total_rows_ << "\nDevice id: " << device_id_ << "\nNumber of devices: " << num_devices_ << "\nShuffle files: " << ((shuffle_files_) ? "yes" : "no") - << "\nDataset files list:\n"; + << "\nShuffle global: " << ((shuffle_global_) ? "yes" : "no") + << "\nDataset files list: Size: " << dataset_files_list_.size() << "\n"; for (int i = 0; i < dataset_files_list_.size(); ++i) { out << " " << dataset_files_list_[i]; } @@ -217,7 +220,6 @@ Status TFReaderOp::Init() { // temporary: make size large enough to hold all files + EOE to avoid hangs int32_t safe_queue_size = static_cast(std::ceil(dataset_files_list_.size() / num_workers_)) + 1; io_block_queues_.Init(num_workers_, safe_queue_size); - dataset_files_list_.clear(); // no longer need the original list of files return Status::OK(); } @@ -451,8 +453,7 @@ Status TFReaderOp::FillIOBlockShuffle(const std::vector &i_keys) { } } else { // Do an index lookup using that key to get the filename. 
- auto file_it = filename_index_->Search(*it); - std::string file_name = file_it.value(); + std::string file_name = (*filename_index_)[*it]; if (NeedPushFileToblockQueue(file_name, &start_offset, &end_offset, pre_count)) { auto ioBlock = std::make_unique(*it, start_offset, end_offset, IOBlock::kDeIoBlockNone); RETURN_IF_NOT_OK(PushIoBlockQueue(queue_index, std::move(ioBlock))); @@ -481,7 +482,7 @@ Status TFReaderOp::FillIOBlockNoShuffle() { int64_t start_offset = 0; int64_t end_offset = 0; bool finish = false; - bool end_of_epoch = true; + bool end_of_epoch = false; while (!finish) { // Iterate over all the keys and add one key to each block. for (auto it = filename_index_->begin(); it != filename_index_->end(); ++it) { @@ -771,53 +772,7 @@ Status TFReaderOp::LoadBytesList(const ColDescriptor ¤t_col, const dataeng // know how many elements there are and the total bytes, create tensor here: TensorShape current_shape = TensorShape::CreateScalar(); RETURN_IF_NOT_OK(current_col.MaterializeTensorShape((*num_elements) * pad_size, ¤t_shape)); - RETURN_IF_NOT_OK(Tensor::CreateTensor(tensor, current_col.tensorImpl(), current_shape, current_col.type())); - - // Tensors are lazily allocated, this eagerly allocates memory for the tensor. 
- unsigned char *current_tensor_addr = (*tensor)->GetMutableBuffer(); - int64_t tensor_bytes_remaining = (*num_elements) * pad_size; - - if (current_tensor_addr == nullptr) { - std::string err_msg = "tensor memory allocation failed"; - RETURN_STATUS_UNEXPECTED(err_msg); - } - - RETURN_IF_NOT_OK(LoadAndPadBytes(current_tensor_addr, bytes_list, tensor_bytes_remaining, pad_size)); - - return Status::OK(); -} - -Status TFReaderOp::LoadAndPadBytes(unsigned char *current_tensor_addr, const dataengine::BytesList &bytes_list, - int64_t tensor_bytes_remaining, int64_t pad_size) { - if (current_tensor_addr == nullptr) { - std::string err_msg = "current_tensor_addr is null"; - RETURN_STATUS_UNEXPECTED(err_msg); - } - - for (int i = 0; i < bytes_list.value_size(); i++) { - // read string data into tensor - const std::string ¤t_element = bytes_list.value(i); - int return_code = - memcpy_s(current_tensor_addr, tensor_bytes_remaining, common::SafeCStr(current_element), current_element.size()); - if (return_code != 0) { - std::string err_msg = "memcpy_s failed when reading bytesList element into Tensor"; - RETURN_STATUS_UNEXPECTED(err_msg); - } - - current_tensor_addr += current_element.size(); - tensor_bytes_remaining -= current_element.size(); - - // pad - int64_t chars_to_pad = pad_size - current_element.size(); - return_code = memset_s(current_tensor_addr, tensor_bytes_remaining, static_cast(' '), chars_to_pad); - if (return_code != 0) { - std::string err_msg = "memset_s failed when padding bytesList in Tensor"; - RETURN_STATUS_UNEXPECTED(err_msg); - } - - current_tensor_addr += chars_to_pad; - tensor_bytes_remaining -= chars_to_pad; - } + RETURN_IF_NOT_OK(Tensor::CreateTensor(tensor, bytes_list, current_shape, current_col.type(), pad_size)); return Status::OK(); } @@ -905,7 +860,7 @@ Status TFReaderOp::LoadIntList(const ColDescriptor ¤t_col, const dataengin return Status::OK(); } -Status TFReaderOp::CreateSchema(const std::string tf_file, const std::vector &columns_to_load) { 
+Status TFReaderOp::CreateSchema(const std::string tf_file, std::vector columns_to_load) { std::ifstream reader; reader.open(tf_file); @@ -926,12 +881,14 @@ Status TFReaderOp::CreateSchema(const std::string tf_file, const std::vector &feature_map = example_features.feature(); - std::vector columns = columns_to_load; - if (columns_to_load.empty()) - (void)std::transform(feature_map.begin(), feature_map.end(), std::back_inserter(columns), + if (columns_to_load.empty()) { + (void)std::transform(feature_map.begin(), feature_map.end(), std::back_inserter(columns_to_load), [](const auto &it) -> std::string { return it.first; }); - for (const auto &curr_col_name : columns) { + std::sort(columns_to_load.begin(), columns_to_load.end()); + } + + for (const auto &curr_col_name : columns_to_load) { auto it = feature_map.find(curr_col_name); if (it == feature_map.end()) { RETURN_STATUS_UNEXPECTED("Failed to find column " + curr_col_name); diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/tf_reader_op.h b/mindspore/ccsrc/dataset/engine/datasetops/source/tf_reader_op.h index 3dc5ee932e..9c92d6d4be 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/tf_reader_op.h +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/tf_reader_op.h @@ -146,6 +146,13 @@ class TFReaderOp : public ParallelOp { return *this; } + // Setter method. + // @return Builder - setter method returns reference to the builder. + Builder &SetShuffleGlobal(bool shuffle_global) { + builder_shuffle_global_ = shuffle_global; + return *this; + } + // Setter method. // @return Builder - setter method returns reference to the builder. 
Builder &SetShardEqualRows(bool shard_equal_rows) { @@ -165,6 +172,7 @@ class TFReaderOp : public ParallelOp { std::vector builder_dataset_files_list_; std::vector builder_columns_to_load_; bool builder_shuffle_files_; + bool builder_shuffle_global_; bool builder_equal_rows_per_shard_; }; @@ -179,11 +187,12 @@ class TFReaderOp : public ParallelOp { // @param op_connector_size - size of each queue in the connector that the child operator pulls from. // @param columns_to_load - the names of the columns to load data from. // @param shuffle_files - whether or not to shuffle the files before reading data. + // @param shuffle_global - whether or not to shuffle the entire dataset. // @param equal_rows_per_shard - whether or not to get equal rows for each process. TFReaderOp(int32_t num_workers, int32_t worker_connector_size, int64_t rows_per_buffer, int64_t total_num_rows, std::vector dataset_files_list, std::unique_ptr data_schema, int32_t op_connector_size, std::vector columns_to_load, bool shuffle_files, - int32_t num_devices, int32_t device_id, bool equal_rows_per_shard); + bool shuffle_global, int32_t num_devices, int32_t device_id, bool equal_rows_per_shard); // Default destructor ~TFReaderOp() = default; @@ -228,6 +237,18 @@ class TFReaderOp : public ParallelOp { // @return - Status of the node visit. Status Accept(NodePass *p, bool *modified) override; + // Op name getter + // @return Name of the current Op + std::string Name() const override { return "TFReaderOp"; } + + // File names getter + // @return Vector of the input file names + std::vector FileNames() { return dataset_files_list_; } + + // Global shuffle flag getter + // @return Bool - whether this Op requires global shuffle + bool RequireGlobalShuffle() { return shuffle_global_; } + private: // The entry point for when workers are launched. // @param worker_id - the id of the worker that is executing this function. 
@@ -292,17 +313,8 @@ class TFReaderOp : public ParallelOp { // @param column_values_list - the cell that contains the bytes list to read from. // @param elementStr - the string we read the value into. // @return Status - the error code returned. - Status LoadBytesList(const ColDescriptor ¤t_col, const dataengine::Feature &column_values_list, - int32_t *num_elements, std::shared_ptr *tensor); - - // Loads all the strings in bytes_list into the memory at current_tensor_addr. - // @param current_tensor_addr - the memory address to load the strings to. - // @param bytes_list - the list of strings to load. - // @param tensor_bytes_remaining - the number of bytes available for this function to use. - // @param pad_size - number of bytes to pad to. - // @return Status - the error code returned. - Status LoadAndPadBytes(unsigned char *current_tensor_addr, const dataengine::BytesList &bytes_list, - int64_t tensor_bytes_remaining, int64_t pad_size); + static Status LoadBytesList(const ColDescriptor ¤t_col, const dataengine::Feature &column_values_list, + int32_t *num_elements, std::shared_ptr *tensor); // Reads values from a float list // @param current_col - the column descriptor containing the expected shape and type of the data. @@ -335,7 +347,7 @@ class TFReaderOp : public ParallelOp { // Reads one row of data from a tf file and creates a schema based on that row // @return Status - the error code returned. - Status CreateSchema(const std::string tf_file, const std::vector &columns_to_load); + Status CreateSchema(const std::string tf_file, std::vector columns_to_load); // Meant to be called async. Will read files in the range [begin, end) and return the total rows // @param filenames - a list of tf data filenames. 
@@ -377,6 +389,7 @@ class TFReaderOp : public ParallelOp { std::vector columns_to_load_; bool finished_reading_dataset_; bool shuffle_files_; + bool shuffle_global_; std::unique_ptr data_schema_; std::unique_ptr filename_index_; bool load_io_block_queue_; diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/voc_op.cc b/mindspore/ccsrc/dataset/engine/datasetops/source/voc_op.cc index d96b3a8872..d3c7ff397f 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/voc_op.cc +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/voc_op.cc @@ -44,7 +44,7 @@ const char kSegmentationExtension[] = ".png"; const char kAnnotationExtension[] = ".xml"; const char kImageSetsExtension[] = ".txt"; -VOCOp::Builder::Builder() : builder_decode_(false), builder_num_samples_(0), builder_sampler_(nullptr) { +VOCOp::Builder::Builder() : builder_decode_(false), builder_sampler_(nullptr) { std::shared_ptr cfg = GlobalContext::config_manager(); builder_num_workers_ = cfg->num_parallel_workers(); builder_rows_per_buffer_ = cfg->rows_per_buffer(); @@ -55,7 +55,9 @@ VOCOp::Builder::Builder() : builder_decode_(false), builder_num_samples_(0), bui Status VOCOp::Builder::Build(std::shared_ptr *ptr) { RETURN_IF_NOT_OK(SanityCheck()); if (builder_sampler_ == nullptr) { - builder_sampler_ = std::make_shared(); + const int64_t num_samples = 0; + const int64_t start_index = 0; + builder_sampler_ = std::make_shared(start_index, num_samples); } builder_schema_ = std::make_unique(); if (builder_task_type_ == TaskType::Segmentation) { @@ -71,8 +73,7 @@ Status VOCOp::Builder::Build(std::shared_ptr *ptr) { } *ptr = std::make_shared(builder_task_type_, builder_task_mode_, builder_dir_, builder_labels_to_read_, builder_num_workers_, builder_rows_per_buffer_, builder_op_connector_size_, - builder_num_samples_, builder_decode_, std::move(builder_schema_), - std::move(builder_sampler_)); + builder_decode_, std::move(builder_schema_), std::move(builder_sampler_)); return Status::OK(); } @@ -81,20 
+82,16 @@ Status VOCOp::Builder::SanityCheck() { std::string err_msg; err_msg += dir.IsDirectory() == false ? "VOC path is invalid or not set\n" : ""; err_msg += builder_num_workers_ <= 0 ? "Num of parallel workers is set to 0 or negative\n" : ""; - err_msg += builder_num_samples_ < 0 ? "num_samples is negative\n" : ""; return err_msg.empty() ? Status::OK() : Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, err_msg); } VOCOp::VOCOp(const TaskType &task_type, const std::string &task_mode, const std::string &folder_path, const std::map &class_index, int32_t num_workers, int32_t rows_per_buffer, - int32_t queue_size, int64_t num_samples, bool decode, std::unique_ptr data_schema, - std::shared_ptr sampler) + int32_t queue_size, bool decode, std::unique_ptr data_schema, std::shared_ptr sampler) : ParallelOp(num_workers, queue_size), decode_(decode), row_cnt_(0), buf_cnt_(0), - num_rows_(0), - num_samples_(num_samples), task_type_(task_type), task_mode_(task_mode), folder_path_(folder_path), @@ -112,7 +109,6 @@ VOCOp::VOCOp(const TaskType &task_type, const std::string &task_mode, const std: Status VOCOp::TraverseSampleIds(const std::shared_ptr &sample_ids, std::vector *keys) { for (auto itr = sample_ids->begin(); itr != sample_ids->end(); ++itr) { if ((*itr) > num_rows_) continue; - if (row_cnt_ == num_samples_) break; keys->push_back(*itr); row_cnt_++; if (row_cnt_ % rows_per_buffer_ == 0) { @@ -127,7 +123,7 @@ Status VOCOp::TraverseSampleIds(const std::shared_ptr &sample_ids, std:: Status VOCOp::operator()() { RETURN_IF_NOT_OK(LaunchThreadsAndInitOp()); std::unique_ptr sampler_buffer; - RETURN_IF_NOT_OK(sampler_->GetNextBuffer(&sampler_buffer)); + RETURN_IF_NOT_OK(sampler_->GetNextSample(&sampler_buffer)); while (true) { std::vector keys; keys.reserve(rows_per_buffer_); @@ -138,7 +134,7 @@ Status VOCOp::operator()() { RETURN_STATUS_UNEXPECTED("Sampler Tensor isn't int64"); } RETURN_IF_NOT_OK(TraverseSampleIds(sample_ids, &keys)); - 
RETURN_IF_NOT_OK(sampler_->GetNextBuffer(&sampler_buffer)); + RETURN_IF_NOT_OK(sampler_->GetNextSample(&sampler_buffer)); } if (keys.empty() == false) { RETURN_IF_NOT_OK(io_block_queues_[(buf_cnt_++) % num_workers_]->Add( @@ -159,7 +155,7 @@ Status VOCOp::operator()() { io_block_queues_[(buf_cnt_++) % num_workers_]->Add(std::make_unique(IOBlock::kDeIoBlockFlagEoe))); RETURN_IF_NOT_OK(wp_.Wait()); wp_.Clear(); - RETURN_IF_NOT_OK(sampler_->GetNextBuffer(&sampler_buffer)); + RETURN_IF_NOT_OK(sampler_->GetNextSample(&sampler_buffer)); } } } @@ -181,23 +177,13 @@ void VOCOp::Print(std::ostream &out, bool show_all) const { } Status VOCOp::Reset() { - RETURN_IF_NOT_OK(sampler_->Reset()); + RETURN_IF_NOT_OK(sampler_->ResetSampler()); row_cnt_ = 0; wp_.Set(); return Status::OK(); } -Status VOCOp::GetNumSamples(int64_t *num) const { - if (num == nullptr || num_rows_ == 0) { - RETURN_STATUS_UNEXPECTED( - "There is no valid data matching the dataset API VOCDataset.Please check file path or dataset API " - "validation first."); - } - (*num) = num_samples_; - return Status::OK(); -} - -Status VOCOp::LoadTensorRow(const std::string &image_id, TensorRow *trow) { +Status VOCOp::LoadTensorRow(row_id_type row_id, const std::string &image_id, TensorRow *trow) { if (task_type_ == TaskType::Segmentation) { std::shared_ptr image, target; const std::string kImageFile = @@ -206,7 +192,7 @@ Status VOCOp::LoadTensorRow(const std::string &image_id, TensorRow *trow) { folder_path_ + std::string(kSegmentationClassFolder) + image_id + std::string(kSegmentationExtension); RETURN_IF_NOT_OK(ReadImageToTensor(kImageFile, data_schema_->column(0), &image)); RETURN_IF_NOT_OK(ReadImageToTensor(kTargetFile, data_schema_->column(1), &target)); - (*trow) = {std::move(image), std::move(target)}; + (*trow) = TensorRow(row_id, {std::move(image), std::move(target)}); } else if (task_type_ == TaskType::Detection) { std::shared_ptr image, annotation; const std::string kImageFile = @@ -215,7 +201,7 @@ Status 
VOCOp::LoadTensorRow(const std::string &image_id, TensorRow *trow) { folder_path_ + std::string(kAnnotationsFolder) + image_id + std::string(kAnnotationExtension); RETURN_IF_NOT_OK(ReadImageToTensor(kImageFile, data_schema_->column(0), &image)); RETURN_IF_NOT_OK(ReadAnnotationToTensor(kAnnotationFile, data_schema_->column(1), &annotation)); - (*trow) = {std::move(image), std::move(annotation)}; + (*trow) = TensorRow(row_id, {std::move(image), std::move(annotation)}); } return Status::OK(); } @@ -224,7 +210,7 @@ Status VOCOp::LoadBuffer(const std::vector &keys, std::unique_ptr deq = std::make_unique(); TensorRow trow; for (const uint64_t &key : keys) { - RETURN_IF_NOT_OK(this->LoadTensorRow(image_ids_[key], &trow)); + RETURN_IF_NOT_OK(this->LoadTensorRow(key, image_ids_[key], &trow)); deq->push_back(std::move(trow)); } (*db)->set_tensor_table(std::move(deq)); @@ -280,7 +266,6 @@ Status VOCOp::ParseImageIds() { in_file.close(); image_ids_.shrink_to_fit(); num_rows_ = image_ids_.size(); - num_samples_ = (num_samples_ == 0 || num_samples_ > num_rows_) ? num_rows_ : num_samples_; return Status::OK(); } @@ -305,7 +290,6 @@ Status VOCOp::ParseAnnotationIds() { } num_rows_ = image_ids_.size(); - num_samples_ = (num_samples_ == 0 || num_samples_ > num_rows_) ? 
num_rows_ : num_samples_; return Status::OK(); } @@ -384,17 +368,7 @@ Status VOCOp::LaunchThreadsAndInitOp() { } Status VOCOp::ReadImageToTensor(const std::string &path, const ColDescriptor &col, std::shared_ptr *tensor) { - std::ifstream fs; - fs.open(path, std::ios::binary | std::ios::in); - if (fs.fail()) { - RETURN_STATUS_UNEXPECTED("Fail to open file: " + path); - } - int64_t num_elements = fs.seekg(0, std::ios::end).tellg(); - (void)fs.seekg(0, std::ios::beg); - RETURN_IF_NOT_OK( - Tensor::CreateTensor(tensor, col.tensorImpl(), TensorShape(std::vector(1, num_elements)), col.type())); - (void)fs.read(reinterpret_cast((*tensor)->GetMutableBuffer()), num_elements); - fs.close(); + RETURN_IF_NOT_OK(Tensor::CreateTensor(tensor, path)); if (decode_ == true) { Status rc = Decode(*tensor, tensor); if (rc.IsError()) { @@ -432,19 +406,8 @@ Status VOCOp::ReadAnnotationToTensor(const std::string &path, const ColDescripto return Status::OK(); } -// Derived from RandomAccessOp -Status VOCOp::GetNumRowsInDataset(int64_t *num) const { - if (num == nullptr || num_rows_ == 0) { - RETURN_STATUS_UNEXPECTED( - "There is no valid data matching the dataset API VOCDataset.Please check file path or dataset API " - "validation first."); - } - (*num) = num_rows_; - return Status::OK(); -} - Status VOCOp::CountTotalRows(const std::string &dir, const std::string &task_type, const std::string &task_mode, - const py::dict &dict, int64_t numSamples, int64_t *count) { + const py::dict &dict, int64_t *count) { if (task_type == "Detection") { std::map input_class_indexing; for (auto p : dict) { @@ -464,14 +427,12 @@ Status VOCOp::CountTotalRows(const std::string &dir, const std::string &task_typ RETURN_IF_NOT_OK(op->ParseImageIds()); *count = static_cast(op->image_ids_.size()); } - *count = (numSamples == 0 || *count < numSamples) ? 
*count : numSamples; return Status::OK(); } Status VOCOp::GetClassIndexing(const std::string &dir, const std::string &task_type, const std::string &task_mode, - const py::dict &dict, int64_t numSamples, - std::map *output_class_indexing) { + const py::dict &dict, std::map *output_class_indexing) { std::map input_class_indexing; for (auto p : dict) { (void)input_class_indexing.insert(std::pair(py::reinterpret_borrow(p.first), diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/voc_op.h b/mindspore/ccsrc/dataset/engine/datasetops/source/voc_op.h index 203ec05fab..bce82a43c9 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/voc_op.h +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/voc_op.h @@ -116,14 +116,6 @@ class VOCOp : public ParallelOp, public RandomAccessOp { return *this; } - // Setter method. - // @param int64_t num_samples - // @return Builder setter method returns reference to the builder. - Builder &SetNumSamples(int64_t num_samples) { - builder_num_samples_ = num_samples; - return *this; - } - // Setter method. // @param std::shared_ptr sampler // @return Builder setter method returns reference to the builder. 
@@ -157,7 +149,6 @@ class VOCOp : public ParallelOp, public RandomAccessOp { int32_t builder_num_workers_; int32_t builder_op_connector_size_; int32_t builder_rows_per_buffer_; - int64_t builder_num_samples_; std::shared_ptr builder_sampler_; std::unique_ptr builder_schema_; std::map builder_labels_to_read_; @@ -171,14 +162,12 @@ class VOCOp : public ParallelOp, public RandomAccessOp { // @param int32_t num_workers - number of workers reading images in parallel // @param int32_t rows_per_buffer - number of images (rows) in each buffer // @param int32_t queue_size - connector queue size - // @param int64_t num_samples - number of samples to read // @param bool decode - whether to decode images // @param std::unique_ptr data_schema - the schema of the VOC dataset // @param std::shared_ptr sampler - sampler tells VOCOp what to read VOCOp(const TaskType &task_type, const std::string &task_mode, const std::string &folder_path, const std::map &class_index, int32_t num_workers, int32_t rows_per_buffer, - int32_t queue_size, int64_t num_samples, bool decode, std::unique_ptr data_schema, - std::shared_ptr sampler); + int32_t queue_size, bool decode, std::unique_ptr data_schema, std::shared_ptr sampler); // Destructor ~VOCOp() = default; @@ -194,15 +183,6 @@ class VOCOp : public ParallelOp, public RandomAccessOp { // @return Status - The error code return Status operator()() override; - // Method derived from RandomAccessOp, enable Sampler to get numRows - // @param uint64_t num - to return numRows - // return Status - The error code return - Status GetNumSamples(int64_t *num) const override; - - // Method derived from RandomAccessOp, enable Sampler to get total number of rows in dataset - // @param uint64_t num - to return numRows - Status GetNumRowsInDataset(int64_t *num) const override; - // A print method typically used for debugging // @param out // @param show_all @@ -212,10 +192,9 @@ class VOCOp : public ParallelOp, public RandomAccessOp { // @param const std::string 
&task_type - task type of reading voc job // @param const std::string &task_mode - task mode of reading voc job // @param const py::dict &dict - input dict of class index - // @param int64_t numSamples - samples number of VOCDataset // @param int64_t *count - output rows number of VOCDataset static Status CountTotalRows(const std::string &dir, const std::string &task_type, const std::string &task_mode, - const py::dict &dict, int64_t numSamples, int64_t *count); + const py::dict &dict, int64_t *count); // @param const std::string &dir - VOC dir path // @param const std::string &task_type - task type of reading voc job @@ -224,8 +203,11 @@ class VOCOp : public ParallelOp, public RandomAccessOp { // @param int64_t numSamples - samples number of VOCDataset // @param std::map *output_class_indexing - output class index of VOCDataset static Status GetClassIndexing(const std::string &dir, const std::string &task_type, const std::string &task_mode, - const py::dict &dict, int64_t numSamples, - std::map *output_class_indexing); + const py::dict &dict, std::map *output_class_indexing); + + // Op name getter + // @return Name of the current Op + std::string Name() const override { return "VOCOp"; } private: // Initialize Sampler, calls sampler->Init() within @@ -233,10 +215,11 @@ class VOCOp : public ParallelOp, public RandomAccessOp { Status InitSampler(); // Load a tensor row according to image id + // @param row_id_type row_id - id for this tensor row // @param std::string image_id - image id // @param TensorRow row - image & target read into this tensor row // @return Status - The error code return - Status LoadTensorRow(const std::string &image_id, TensorRow *row); + Status LoadTensorRow(row_id_type row_id, const std::string &image_id, TensorRow *row); // @param const std::string &path - path to the image file // @param const ColDescriptor &col - contains tensor implementation and datatype @@ -283,8 +266,6 @@ class VOCOp : public ParallelOp, public RandomAccessOp { bool 
decode_; int64_t row_cnt_; int64_t buf_cnt_; - int64_t num_rows_; - int64_t num_samples_; std::string folder_path_; TaskType task_type_; std::string task_mode_; diff --git a/mindspore/ccsrc/dataset/engine/datasetops/take_op.h b/mindspore/ccsrc/dataset/engine/datasetops/take_op.h index 64ba8e69e0..9619a4409d 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/take_op.h +++ b/mindspore/ccsrc/dataset/engine/datasetops/take_op.h @@ -40,7 +40,7 @@ class TakeOp : public PipelineOp { ~Builder() = default; // The builder "build" method creates the final object. - // @return shared_ptr to the new StorageOp object + // @return shared_ptr to the new TakeOp object Status Build(std::shared_ptr *); private: @@ -90,6 +90,10 @@ class TakeOp : public PipelineOp { // @return - Status of the node visit. Status Accept(NodePass *p, bool *modified) override; + // Op name getter + // @return Name of the current Op + std::string Name() const override { return "TakeOp"; } + private: int32_t max_takes_; // The number of takes that the user requested int32_t take_count_; // A counter for the current number of executed takes diff --git a/mindspore/ccsrc/dataset/engine/datasetops/zip_op.h b/mindspore/ccsrc/dataset/engine/datasetops/zip_op.h index 1140a98dd7..08b93c18b5 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/zip_op.h +++ b/mindspore/ccsrc/dataset/engine/datasetops/zip_op.h @@ -65,7 +65,7 @@ class ZipOp : public PipelineOp { } // The builder "build" method creates the ZipOp dataset Operator. - // @return shared_ptr to the new StorageOp object + // @return shared_ptr to the new ZipOp object Status Build(std::shared_ptr *); private: @@ -110,6 +110,10 @@ class ZipOp : public PipelineOp { // @return - Status of the node visit. 
Status Accept(NodePass *p, bool *modified) override; + // Op name getter + // @return Name of the current Op + std::string Name() const override { return "ZipOp"; } + private: // Handles preprocessing of the main loop, used when starting new epoch Status prepare(TensorQTable *const table); diff --git a/mindspore/ccsrc/dataset/engine/execution_tree.cc b/mindspore/ccsrc/dataset/engine/execution_tree.cc index bcb387082b..5c921bba84 100644 --- a/mindspore/ccsrc/dataset/engine/execution_tree.cc +++ b/mindspore/ccsrc/dataset/engine/execution_tree.cc @@ -19,8 +19,10 @@ #include "dataset/engine/datasetops/dataset_op.h" #include "dataset/engine/datasetops/shuffle_op.h" #include "dataset/util/task_manager.h" - -#include "dataset/engine/opt/util/printer_pass.h" +#include "dataset/engine/opt/pre/map_column_reorder.h" +#include "dataset/engine/opt/pre/global_shuffle.h" +#include "dataset/engine/perf/profiling.h" +#include "dataset/engine/perf/monitor.h" namespace mindspore { namespace dataset { @@ -29,6 +31,8 @@ ExecutionTree::ExecutionTree() : id_count_(0) { tg_ = std::make_unique(); tree_state_ = kDeTStateInit; prepare_flags_ = kDePrepNone; + perf_monitor_ = std::make_unique(this); + profiling_manager_ = std::make_unique(this); } // Destructor @@ -77,8 +81,6 @@ Status ExecutionTree::AssignRoot(const std::shared_ptr &op) { // Then add it as the root. root_ = op; - // The tree has an assigned root now and it's ready to be prepared. 
- tree_state_ = kDeTStatePrepare; return Status::OK(); } @@ -120,6 +122,15 @@ Status ExecutionTree::Launch() { } std::ostringstream ss; ss << *this; + + // Profiling infrastructures need to be initialized before Op launching + if (profiling_manager_->IsProfilingEnable()) { + // Setup profiling manager + RETURN_IF_NOT_OK(profiling_manager_->Initialize()); + // Launch Monitor Thread + RETURN_IF_NOT_OK(tg_->CreateAsyncTask("Monitor Thread launched", std::ref(*perf_monitor_))); + } + MS_LOG(DEBUG) << "Printing the tree before launch tasks:\n" << ss.str(); for (auto itr = this->begin(); itr != this->end(); ++itr) { // An inlined operator is one that has an output connector size of 0, and it does not @@ -132,7 +143,9 @@ Status ExecutionTree::Launch() { // Set the state of the Operator as running. This only matters in Leaf ops, CacheOp and TakeOp } } + tree_state_ = kDeTStateExecuting; + return Status::OK(); } @@ -194,9 +207,24 @@ Status ExecutionTree::Prepare() { return Status::OK(); } -Status ExecutionTree::PrepareTreePreAction() { return Status::OK(); } +Status ExecutionTree::PrepareTreePreAction() { + bool modified = false; + std::vector pre_actions; + // Construct pre actions + pre_actions.push_back(new MapColumnReorder()); + pre_actions.push_back(new GlobalShufflePass()); + // Apply pre action passes + for (auto &pass : pre_actions) { + RETURN_IF_NOT_OK(pass->Run(this, &modified)); + } + return Status::OK(); +} -Status ExecutionTree::PrepareTreePostAction() { return Status::OK(); } +Status ExecutionTree::PrepareTreePostAction() { + // The tree is ready to be prepared. 
+ tree_state_ = kDeTStatePrepare; + return Status::OK(); +} Status ExecutionTree::Optimize() { // auto pp = new PrinterPass(); diff --git a/mindspore/ccsrc/dataset/engine/execution_tree.h b/mindspore/ccsrc/dataset/engine/execution_tree.h index f0c894f05b..e1c5e8ff54 100644 --- a/mindspore/ccsrc/dataset/engine/execution_tree.h +++ b/mindspore/ccsrc/dataset/engine/execution_tree.h @@ -23,12 +23,14 @@ #include #include "dataset/engine/datasetops/dataset_op.h" #include "dataset/util/status.h" +#include "mindspore/ccsrc/dataset/engine/perf/profiling.h" namespace mindspore { namespace dataset { // Forward declares class TaskGroup; class DatasetOp; +class Monitor; class ExecutionTree { public: @@ -40,11 +42,12 @@ class ExecutionTree { // State flags for the lifecycle of the tree enum TreeState { - kDeTStateInit = 0, // The freshly initialized state after construction - kDeTStateBuilding, // The tree is being built, nodes are being added - kDeTStatePrepare, // The tree has been assigned a root node and is pending prepare - kDeTStateReady, // The tree has been prepared and is ready to be launched - kDeTStateExecuting // The tree has been launched and is executing + kDeTStateInit = 0, // The freshly initialized state after construction + kDeTStateBuilding, // The tree is being built, nodes are being added + kDeTStatePrepare, // The tree has been assigned a root node and is pending prepare + kDeTStateReady, // The tree has been prepared and is ready to be launched + kDeTStateExecuting, // The tree has been launched and is executing + kDeTStateFinished // The tree has been drained, dataset iterator received EOF }; class Iterator { @@ -120,7 +123,7 @@ class ExecutionTree { // Returns an iterator positioned at the start // @return Iterator - The iterator ExecutionTree::Iterator begin(const std::shared_ptr &root = nullptr) const { - return Iterator((root == nullptr) ? root_ : root); + return Iterator(root == nullptr ? 
root_ : root); } // Returns an iterator positioned at the end @@ -207,6 +210,16 @@ class ExecutionTree { // @return raw pointer to the TaskGroup TaskGroup *AllTasks() const { return tg_.get(); } + // Return if the ExecutionTree is finished (iterator receives EOF). + // @return Bool - true is ExecutionTree is finished + bool isFinished() const { return tree_state_ == TreeState::kDeTStateFinished; } + + // Set the ExecutionTree to Finished state. + void SetFinished() { tree_state_ = TreeState::kDeTStateFinished; } + + // Getter for profiling manager, no ownership + ProfilingManager *GetProfilingManager() { return profiling_manager_.get(); } + private: // A helper functions for doing the recursive printing // @param dataset_op - The dataset op to print @@ -222,6 +235,8 @@ class ExecutionTree { uint32_t prepare_flags_; // Flags used during tree prepare TreeState tree_state_; // Tracking the current tree state std::stack> repeat_stack_; // A stack used during prepare phase + std::unique_ptr perf_monitor_; // Performance Monitor + std::unique_ptr profiling_manager_; // Profiling manager }; } // namespace dataset } // namespace mindspore diff --git a/mindspore/ccsrc/dataset/engine/gnn/graph.cc b/mindspore/ccsrc/dataset/engine/gnn/graph.cc index 74e7b85153..1017657397 100644 --- a/mindspore/ccsrc/dataset/engine/gnn/graph.cc +++ b/mindspore/ccsrc/dataset/engine/gnn/graph.cc @@ -17,29 +17,30 @@ #include #include +#include #include #include #include "dataset/core/tensor_shape.h" +#include "dataset/util/random.h" namespace mindspore { namespace dataset { namespace gnn { -Graph::Graph(std::string dataset_file, int32_t num_workers) : dataset_file_(dataset_file), num_workers_(num_workers) { +Graph::Graph(std::string dataset_file, int32_t num_workers) + : dataset_file_(dataset_file), num_workers_(num_workers), rnd_(GetRandomDevice()), random_walk_(this) { + rnd_.seed(GetSeed()); MS_LOG(INFO) << "num_workers:" << num_workers; } -Status Graph::GetNodes(NodeType node_type, NodeIdType 
node_num, std::shared_ptr *out) { +Status Graph::GetAllNodes(NodeType node_type, std::shared_ptr *out) { auto itr = node_type_map_.find(node_type); if (itr == node_type_map_.end()) { std::string err_msg = "Invalid node type:" + std::to_string(node_type); RETURN_STATUS_UNEXPECTED(err_msg); } else { - if (node_num == -1) { - RETURN_IF_NOT_OK(CreateTensorByVector({itr->second}, DataType(DataType::DE_INT32), out)); - } else { - } + RETURN_IF_NOT_OK(CreateTensorByVector({itr->second}, DataType(DataType::DE_INT32), out)); } return Status::OK(); } @@ -58,10 +59,10 @@ Status Graph::CreateTensorByVector(const std::vector> &data, Data size_t n = data[0].size(); RETURN_IF_NOT_OK(Tensor::CreateTensor( &tensor, TensorImpl::kFlexible, TensorShape({static_cast(m), static_cast(n)}), type, nullptr)); - T *ptr = reinterpret_cast(tensor->GetMutableBuffer()); - for (auto id_m : data) { + auto ptr = tensor->begin(); + for (const auto &id_m : data) { CHECK_FAIL_RETURN_UNEXPECTED(id_m.size() == n, "Each member of the vector has a different size"); - for (auto id_n : id_m) { + for (const auto &id_n : id_m) { *ptr = id_n; ptr++; } @@ -89,7 +90,38 @@ Status Graph::ComplementVector(std::vector> *data, size_t max_siz return Status::OK(); } -Status Graph::GetEdges(EdgeType edge_type, EdgeIdType edge_num, std::shared_ptr *out) { return Status::OK(); } +Status Graph::GetAllEdges(EdgeType edge_type, std::shared_ptr *out) { + auto itr = edge_type_map_.find(edge_type); + if (itr == edge_type_map_.end()) { + std::string err_msg = "Invalid edge type:" + std::to_string(edge_type); + RETURN_STATUS_UNEXPECTED(err_msg); + } else { + RETURN_IF_NOT_OK(CreateTensorByVector({itr->second}, DataType(DataType::DE_INT32), out)); + } + return Status::OK(); +} + +Status Graph::GetNodesFromEdges(const std::vector &edge_list, std::shared_ptr *out) { + if (edge_list.empty()) { + RETURN_STATUS_UNEXPECTED("Input edge_list is empty"); + } + + std::vector> node_list; + node_list.reserve(edge_list.size()); + for (const 
auto &edge_id : edge_list) { + auto itr = edge_id_map_.find(edge_id); + if (itr == edge_id_map_.end()) { + std::string err_msg = "Invalid edge id:" + std::to_string(edge_id); + RETURN_STATUS_UNEXPECTED(err_msg); + } else { + std::pair, std::shared_ptr> nodes; + RETURN_IF_NOT_OK(itr->second->GetNode(&nodes)); + node_list.push_back({nodes.first->id(), nodes.second->id()}); + } + } + RETURN_IF_NOT_OK(CreateTensorByVector(node_list, DataType(DataType::DE_INT32), out)); + return Status::OK(); +} Status Graph::GetAllNeighbors(const std::vector &node_list, NodeType neighbor_type, std::shared_ptr *out) { @@ -105,14 +137,10 @@ Status Graph::GetAllNeighbors(const std::vector &node_list, NodeType size_t max_neighbor_num = 0; neighbors.resize(node_list.size()); for (size_t i = 0; i < node_list.size(); ++i) { - auto itr = node_id_map_.find(node_list[i]); - if (itr != node_id_map_.end()) { - RETURN_IF_NOT_OK(itr->second->GetNeighbors(neighbor_type, -1, &neighbors[i])); - max_neighbor_num = max_neighbor_num > neighbors[i].size() ? max_neighbor_num : neighbors[i].size(); - } else { - std::string err_msg = "Invalid node id:" + std::to_string(node_list[i]); - RETURN_STATUS_UNEXPECTED(err_msg); - } + std::shared_ptr node; + RETURN_IF_NOT_OK(GetNodeByNodeId(node_list[i], &node)); + RETURN_IF_NOT_OK(node->GetAllNeighbors(neighbor_type, &neighbors[i])); + max_neighbor_num = max_neighbor_num > neighbors[i].size() ? 
max_neighbor_num : neighbors[i].size(); } RETURN_IF_NOT_OK(ComplementVector(&neighbors, max_neighbor_num, kDefaultNodeId)); @@ -121,18 +149,104 @@ Status Graph::GetAllNeighbors(const std::vector &node_list, NodeType return Status::OK(); } -Status Graph::GetSampledNeighbor(const std::vector &node_list, const std::vector &neighbor_nums, - const std::vector &neighbor_types, std::shared_ptr *out) { +Status Graph::GetSampledNeighbors(const std::vector &node_list, + const std::vector &neighbor_nums, + const std::vector &neighbor_types, std::shared_ptr *out) { + CHECK_FAIL_RETURN_UNEXPECTED(!node_list.empty(), "Input node_list is empty."); + CHECK_FAIL_RETURN_UNEXPECTED(neighbor_nums.size() == neighbor_types.size(), + "The sizes of neighbor_nums and neighbor_types are inconsistent."); + std::vector> neighbors_vec(node_list.size()); + for (size_t node_idx = 0; node_idx < node_list.size(); ++node_idx) { + neighbors_vec[node_idx].emplace_back(node_list[node_idx]); + std::vector input_list = {node_list[node_idx]}; + for (size_t i = 0; i < neighbor_nums.size(); ++i) { + std::vector neighbors; + neighbors.reserve(input_list.size() * neighbor_nums[i]); + for (const auto &node_id : input_list) { + if (node_id == kDefaultNodeId) { + for (int32_t j = 0; j < neighbor_nums[i]; ++j) { + neighbors.emplace_back(kDefaultNodeId); + } + } else { + std::shared_ptr node; + RETURN_IF_NOT_OK(GetNodeByNodeId(node_id, &node)); + std::vector out; + RETURN_IF_NOT_OK(node->GetSampledNeighbors(neighbor_types[i], neighbor_nums[i], &out)); + neighbors.insert(neighbors.end(), out.begin(), out.end()); + } + } + neighbors_vec[node_idx].insert(neighbors_vec[node_idx].end(), neighbors.begin(), neighbors.end()); + input_list = std::move(neighbors); + } + } + RETURN_IF_NOT_OK(CreateTensorByVector(neighbors_vec, DataType(DataType::DE_INT32), out)); + return Status::OK(); +} + +Status Graph::NegativeSample(const std::vector &data, const std::unordered_set &exclude_data, + int32_t samples_num, std::vector 
*out_samples) { + CHECK_FAIL_RETURN_UNEXPECTED(!data.empty(), "Input data is empty."); + std::vector shuffled_id(data.size()); + std::iota(shuffled_id.begin(), shuffled_id.end(), 0); + std::shuffle(shuffled_id.begin(), shuffled_id.end(), rnd_); + for (const auto &index : shuffled_id) { + if (exclude_data.find(data[index]) != exclude_data.end()) { + continue; + } + out_samples->emplace_back(data[index]); + if (out_samples->size() >= samples_num) { + break; + } + } return Status::OK(); } -Status Graph::GetNegSampledNeighbor(const std::vector &node_list, NodeIdType samples_num, - NodeType neg_neighbor_type, std::shared_ptr *out) { +Status Graph::GetNegSampledNeighbors(const std::vector &node_list, NodeIdType samples_num, + NodeType neg_neighbor_type, std::shared_ptr *out) { + CHECK_FAIL_RETURN_UNEXPECTED(!node_list.empty(), "Input node_list is empty."); + std::vector> neighbors_vec; + neighbors_vec.resize(node_list.size()); + for (size_t node_idx = 0; node_idx < node_list.size(); ++node_idx) { + std::shared_ptr node; + RETURN_IF_NOT_OK(GetNodeByNodeId(node_list[node_idx], &node)); + std::vector neighbors; + RETURN_IF_NOT_OK(node->GetAllNeighbors(neg_neighbor_type, &neighbors)); + std::unordered_set exclude_node; + std::transform(neighbors.begin(), neighbors.end(), + std::insert_iterator>(exclude_node, exclude_node.begin()), + [](const NodeIdType node) { return node; }); + auto itr = node_type_map_.find(neg_neighbor_type); + if (itr == node_type_map_.end()) { + std::string err_msg = "Invalid node type:" + std::to_string(neg_neighbor_type); + RETURN_STATUS_UNEXPECTED(err_msg); + } else { + neighbors_vec[node_idx].emplace_back(node->id()); + if (itr->second.size() > exclude_node.size()) { + while (neighbors_vec[node_idx].size() < samples_num + 1) { + RETURN_IF_NOT_OK(NegativeSample(itr->second, exclude_node, samples_num - neighbors_vec[node_idx].size(), + &neighbors_vec[node_idx])); + } + } else { + MS_LOG(DEBUG) << "There are no negative neighbors. 
node_id:" << node->id() + << " neg_neighbor_type:" << neg_neighbor_type; + // If there are no negative neighbors, they are filled with kDefaultNodeId + for (int32_t i = 0; i < samples_num; ++i) { + neighbors_vec[node_idx].emplace_back(kDefaultNodeId); + } + } + } + } + RETURN_IF_NOT_OK(CreateTensorByVector(neighbors_vec, DataType(DataType::DE_INT32), out)); return Status::OK(); } -Status Graph::RandomWalk(const std::vector &node_list, const std::vector &meta_path, float p, - float q, NodeIdType default_node, std::shared_ptr *out) { +Status Graph::RandomWalk(const std::vector &node_list, const std::vector &meta_path, + float step_home_param, float step_away_param, NodeIdType default_node, + std::shared_ptr *out) { + RETURN_IF_NOT_OK(random_walk_.Build(node_list, meta_path, step_home_param, step_away_param, default_node)); + std::vector> walks; + RETURN_IF_NOT_OK(random_walk_.SimulateWalk(&walks)); + RETURN_IF_NOT_OK(CreateTensorByVector({walks}, DataType(DataType::DE_INT32), out)); return Status::OK(); } @@ -154,7 +268,7 @@ Status Graph::GetNodeFeature(const std::shared_ptr &nodes, const std::ve } CHECK_FAIL_RETURN_UNEXPECTED(!feature_types.empty(), "Inpude feature_types is empty"); TensorRow tensors; - for (auto f_type : feature_types) { + for (const auto &f_type : feature_types) { std::shared_ptr default_feature; // If no feature can be obtained, fill in the default value RETURN_IF_NOT_OK(GetNodeDefaultFeature(f_type, &default_feature)); @@ -169,18 +283,14 @@ Status Graph::GetNodeFeature(const std::shared_ptr &nodes, const std::ve dsize_t index = 0; for (auto node_itr = nodes->begin(); node_itr != nodes->end(); ++node_itr) { - auto itr = node_id_map_.find(*node_itr); std::shared_ptr feature; - if (itr != node_id_map_.end()) { - if (!itr->second->GetFeatures(f_type, &feature).IsOk()) { - feature = default_feature; - } + if (*node_itr == kDefaultNodeId) { + feature = default_feature; } else { - if (*node_itr == kDefaultNodeId) { + std::shared_ptr node; + 
RETURN_IF_NOT_OK(GetNodeByNodeId(*node_itr, &node)); + if (!node->GetFeatures(f_type, &feature).IsOk()) { feature = default_feature; - } else { - std::string err_msg = "Invalid node id:" + std::to_string(*node_itr); - RETURN_STATUS_UNEXPECTED(err_msg); } } RETURN_IF_NOT_OK(fea_tensor->InsertTensor({index}, feature->Value())); @@ -209,35 +319,54 @@ Status Graph::Init() { return Status::OK(); } -Status Graph::GetMetaInfo(std::vector *node_info, std::vector *edge_info) { - node_info->reserve(node_type_map_.size()); - for (auto node : node_type_map_) { - NodeMetaInfo n_info; - n_info.type = node.first; - n_info.num = node.second.size(); - auto itr = node_feature_map_.find(node.first); - if (itr != node_feature_map_.end()) { - for (auto f_type : itr->second) { - n_info.feature_type.push_back(f_type); - } - std::sort(n_info.feature_type.begin(), n_info.feature_type.end()); +Status Graph::GetMetaInfo(MetaInfo *meta_info) { + meta_info->node_type.resize(node_type_map_.size()); + std::transform(node_type_map_.begin(), node_type_map_.end(), meta_info->node_type.begin(), + [](auto itr) { return itr.first; }); + std::sort(meta_info->node_type.begin(), meta_info->node_type.end()); + + meta_info->edge_type.resize(edge_type_map_.size()); + std::transform(edge_type_map_.begin(), edge_type_map_.end(), meta_info->edge_type.begin(), + [](auto itr) { return itr.first; }); + std::sort(meta_info->edge_type.begin(), meta_info->edge_type.end()); + + for (const auto &node : node_type_map_) { + meta_info->node_num[node.first] = node.second.size(); + } + + for (const auto &edge : edge_type_map_) { + meta_info->edge_num[edge.first] = edge.second.size(); + } + + for (const auto &node_feature : node_feature_map_) { + for (auto type : node_feature.second) { + meta_info->node_feature_type.emplace_back(type); } - node_info->push_back(n_info); } + std::sort(meta_info->node_feature_type.begin(), meta_info->node_feature_type.end()); + auto unique_node = 
std::unique(meta_info->node_feature_type.begin(), meta_info->node_feature_type.end()); + meta_info->node_feature_type.erase(unique_node, meta_info->node_feature_type.end()); - edge_info->reserve(edge_type_map_.size()); - for (auto edge : edge_type_map_) { - EdgeMetaInfo e_info; - e_info.type = edge.first; - e_info.num = edge.second.size(); - auto itr = edge_feature_map_.find(edge.first); - if (itr != edge_feature_map_.end()) { - for (auto f_type : itr->second) { - e_info.feature_type.push_back(f_type); - } + for (const auto &edge_feature : edge_feature_map_) { + for (const auto &type : edge_feature.second) { + meta_info->edge_feature_type.emplace_back(type); } - edge_info->push_back(e_info); } + std::sort(meta_info->edge_feature_type.begin(), meta_info->edge_feature_type.end()); + auto unique_edge = std::unique(meta_info->edge_feature_type.begin(), meta_info->edge_feature_type.end()); + meta_info->edge_feature_type.erase(unique_edge, meta_info->edge_feature_type.end()); + return Status::OK(); +} + +Status Graph::GraphInfo(py::dict *out) { + MetaInfo meta_info; + RETURN_IF_NOT_OK(GetMetaInfo(&meta_info)); + (*out)["node_type"] = py::cast(meta_info.node_type); + (*out)["edge_type"] = py::cast(meta_info.edge_type); + (*out)["node_num"] = py::cast(meta_info.node_num); + (*out)["edge_num"] = py::cast(meta_info.edge_num); + (*out)["node_feature_type"] = py::cast(meta_info.node_feature_type); + (*out)["edge_feature_type"] = py::cast(meta_info.edge_feature_type); return Status::OK(); } @@ -250,6 +379,207 @@ Status Graph::LoadNodeAndEdge() { &node_feature_map_, &edge_feature_map_, &default_feature_map_)); return Status::OK(); } + +Status Graph::GetNodeByNodeId(NodeIdType id, std::shared_ptr *node) { + auto itr = node_id_map_.find(id); + if (itr == node_id_map_.end()) { + std::string err_msg = "Invalid node id:" + std::to_string(id); + RETURN_STATUS_UNEXPECTED(err_msg); + } else { + *node = itr->second; + } + return Status::OK(); +} + 
+Graph::RandomWalkBase::RandomWalkBase(Graph *graph) + : graph_(graph), step_home_param_(1.0), step_away_param_(1.0), default_node_(-1), num_walks_(1), num_workers_(1) {} + +Status Graph::RandomWalkBase::Build(const std::vector &node_list, const std::vector &meta_path, + float step_home_param, float step_away_param, const NodeIdType default_node, + int32_t num_walks, int32_t num_workers) { + node_list_ = node_list; + if (meta_path.empty() || meta_path.size() > kMaxNumWalks) { + std::string err_msg = "Failed, meta path required between 1 and " + std::to_string(kMaxNumWalks) + + ". The size of input path is " + std::to_string(meta_path.size()); + RETURN_STATUS_UNEXPECTED(err_msg); + } + meta_path_ = meta_path; + if (step_home_param < kGnnEpsilon || step_away_param < kGnnEpsilon) { + std::string err_msg = "Failed, step_home_param and step_away_param required greater than " + + std::to_string(kGnnEpsilon) + ". step_home_param: " + std::to_string(step_home_param) + + ", step_away_param: " + std::to_string(step_away_param); + RETURN_STATUS_UNEXPECTED(err_msg); + } + step_home_param_ = step_home_param; + step_away_param_ = step_away_param; + default_node_ = default_node; + num_walks_ = num_walks; + num_workers_ = num_workers; + return Status::OK(); +} + +Status Graph::RandomWalkBase::Node2vecWalk(const NodeIdType &start_node, std::vector *walk_path) { + // Simulate a random walk starting from start node. 
+ auto walk = std::vector(1, start_node); // walk is an vector + // walk simulate + while (walk.size() - 1 < meta_path_.size()) { + // current nodE + auto cur_node_id = walk.back(); + std::shared_ptr cur_node; + RETURN_IF_NOT_OK(graph_->GetNodeByNodeId(cur_node_id, &cur_node)); + + // current neighbors + std::vector cur_neighbors; + RETURN_IF_NOT_OK(cur_node->GetAllNeighbors(meta_path_[walk.size() - 1], &cur_neighbors, true)); + std::sort(cur_neighbors.begin(), cur_neighbors.end()); + + // break if no neighbors + if (cur_neighbors.empty()) { + break; + } + + // walk by the fist node, then by the previous 2 nodes + std::shared_ptr stochastic_index; + if (walk.size() == 1) { + RETURN_IF_NOT_OK(GetNodeProbability(cur_node_id, meta_path_[0], &stochastic_index)); + } else { + NodeIdType prev_node_id = walk[walk.size() - 2]; + RETURN_IF_NOT_OK(GetEdgeProbability(prev_node_id, cur_node_id, walk.size() - 2, &stochastic_index)); + } + NodeIdType next_node_id = cur_neighbors[WalkToNextNode(*stochastic_index)]; + walk.push_back(next_node_id); + } + + while (walk.size() - 1 < meta_path_.size()) { + walk.push_back(default_node_); + } + + *walk_path = std::move(walk); + return Status::OK(); +} + +Status Graph::RandomWalkBase::SimulateWalk(std::vector> *walks) { + // Repeatedly simulate random walks from each node + std::vector permutation(node_list_.size()); + std::iota(permutation.begin(), permutation.end(), 0); + for (int32_t i = 0; i < num_walks_; i++) { + unsigned seed = std::chrono::system_clock::now().time_since_epoch().count(); + std::shuffle(permutation.begin(), permutation.end(), std::default_random_engine(seed)); + for (const auto &i_perm : permutation) { + std::vector walk; + RETURN_IF_NOT_OK(Node2vecWalk(node_list_[i_perm], &walk)); + walks->push_back(walk); + } + } + return Status::OK(); +} + +Status Graph::RandomWalkBase::GetNodeProbability(const NodeIdType &node_id, const NodeType &node_type, + std::shared_ptr *node_probability) { + // Generate alias nodes + 
std::shared_ptr node; + graph_->GetNodeByNodeId(node_id, &node); + std::vector neighbors; + RETURN_IF_NOT_OK(node->GetAllNeighbors(node_type, &neighbors, true)); + std::sort(neighbors.begin(), neighbors.end()); + auto non_normalized_probability = std::vector(neighbors.size(), 1.0); + *node_probability = + std::make_shared(GenerateProbability(Normalize(non_normalized_probability))); + return Status::OK(); +} + +Status Graph::RandomWalkBase::GetEdgeProbability(const NodeIdType &src, const NodeIdType &dst, uint32_t meta_path_index, + std::shared_ptr *edge_probability) { + // Get the alias edge setup lists for a given edge. + std::shared_ptr src_node; + graph_->GetNodeByNodeId(src, &src_node); + std::vector src_neighbors; + RETURN_IF_NOT_OK(src_node->GetAllNeighbors(meta_path_[meta_path_index], &src_neighbors, true)); + + std::shared_ptr dst_node; + graph_->GetNodeByNodeId(dst, &dst_node); + std::vector dst_neighbors; + RETURN_IF_NOT_OK(dst_node->GetAllNeighbors(meta_path_[meta_path_index + 1], &dst_neighbors, true)); + + std::sort(dst_neighbors.begin(), dst_neighbors.end()); + std::vector non_normalized_probability; + for (const auto &dst_nbr : dst_neighbors) { + if (dst_nbr == src) { + non_normalized_probability.push_back(1.0 / step_home_param_); // replace 1.0 with G[dst][dst_nbr]['weight'] + continue; + } + auto it = std::find(src_neighbors.begin(), src_neighbors.end(), dst_nbr); + if (it != src_neighbors.end()) { + // stay close, this node connect both src and dst + non_normalized_probability.push_back(1.0); // replace 1.0 with G[dst][dst_nbr]['weight'] + } else { + // step far away + non_normalized_probability.push_back(1.0 / step_away_param_); // replace 1.0 with G[dst][dst_nbr]['weight'] + } + } + + *edge_probability = + std::make_shared(GenerateProbability(Normalize(non_normalized_probability))); + return Status::OK(); +} + +StochasticIndex Graph::RandomWalkBase::GenerateProbability(const std::vector &probability) { + uint32_t K = probability.size(); + 
std::vector switch_to_large_index(K, 0); + std::vector weight(K, .0); + std::vector smaller; + std::vector larger; + auto random_device = GetRandomDevice(); + std::uniform_real_distribution<> distribution(-kGnnEpsilon, kGnnEpsilon); + float accumulate_threshold = 0.0; + for (uint32_t i = 0; i < K; i++) { + float threshold_one = distribution(random_device); + accumulate_threshold += threshold_one; + weight[i] = i < K - 1 ? probability[i] * K + threshold_one : probability[i] * K - accumulate_threshold; + weight[i] < 1.0 ? smaller.push_back(i) : larger.push_back(i); + } + + while ((!smaller.empty()) && (!larger.empty())) { + uint32_t small = smaller.back(); + smaller.pop_back(); + uint32_t large = larger.back(); + larger.pop_back(); + switch_to_large_index[small] = large; + weight[large] = weight[large] + weight[small] - 1.0; + weight[large] < 1.0 ? smaller.push_back(large) : larger.push_back(large); + } + return StochasticIndex(switch_to_large_index, weight); +} + +uint32_t Graph::RandomWalkBase::WalkToNextNode(const StochasticIndex &stochastic_index) { + auto switch_to_large_index = stochastic_index.first; + auto weight = stochastic_index.second; + const uint32_t size_of_index = switch_to_large_index.size(); + + auto random_device = GetRandomDevice(); + std::uniform_real_distribution<> distribution(0.0, 1.0); + + // Generate random integer between [0, K) + uint32_t random_idx = std::floor(distribution(random_device) * size_of_index); + + if (distribution(random_device) < weight[random_idx]) { + return random_idx; + } + return switch_to_large_index[random_idx]; +} + +template +std::vector Graph::RandomWalkBase::Normalize(const std::vector &non_normalized_probability) { + float sum_probability = + 1.0 * std::accumulate(non_normalized_probability.begin(), non_normalized_probability.end(), 0); + if (sum_probability < kGnnEpsilon) { + sum_probability = 1.0; + } + std::vector normalized_probability; + std::transform(non_normalized_probability.begin(), 
non_normalized_probability.end(), + std::back_inserter(normalized_probability), [&](T value) -> float { return value / sum_probability; }); + return normalized_probability; +} } // namespace gnn } // namespace dataset } // namespace mindspore diff --git a/mindspore/ccsrc/dataset/engine/gnn/graph.h b/mindspore/ccsrc/dataset/engine/gnn/graph.h index 3dd6444807..ea10363053 100644 --- a/mindspore/ccsrc/dataset/engine/gnn/graph.h +++ b/mindspore/ccsrc/dataset/engine/gnn/graph.h @@ -16,13 +16,17 @@ #ifndef DATASET_ENGINE_GNN_GRAPH_H_ #define DATASET_ENGINE_GNN_GRAPH_H_ +#include #include #include +#include #include #include #include +#include #include "dataset/core/tensor.h" +#include "dataset/core/tensor_row.h" #include "dataset/engine/gnn/graph_loader.h" #include "dataset/engine/gnn/feature.h" #include "dataset/engine/gnn/node.h" @@ -33,24 +37,17 @@ namespace mindspore { namespace dataset { namespace gnn { -struct NodeMetaInfo { - NodeType type; - NodeIdType num; - std::vector feature_type; - NodeMetaInfo() { - type = 0; - num = 0; - } -}; - -struct EdgeMetaInfo { - EdgeType type; - EdgeIdType num; - std::vector feature_type; - EdgeMetaInfo() { - type = 0; - num = 0; - } +const float kGnnEpsilon = 0.0001; +const uint32_t kMaxNumWalks = 80; +using StochasticIndex = std::pair, std::vector>; + +struct MetaInfo { + std::vector node_type; + std::vector edge_type; + std::map node_num; + std::map edge_num; + std::vector node_feature_type; + std::vector edge_feature_type; }; class Graph { @@ -62,19 +59,23 @@ class Graph { ~Graph() = default; - // Get the nodes from the graph. + // Get all nodes from the graph. 
// @param NodeType node_type - type of node - // @param NodeIdType node_num - Number of nodes to be acquired, if -1 means all nodes are acquired // @param std::shared_ptr *out - Returned nodes id // @return Status - The error code return - Status GetNodes(NodeType node_type, NodeIdType node_num, std::shared_ptr *out); + Status GetAllNodes(NodeType node_type, std::shared_ptr *out); - // Get the edges from the graph. + // Get all edges from the graph. // @param NodeType edge_type - type of edge - // @param NodeIdType edge_num - Number of edges to be acquired, if -1 means all edges are acquired // @param std::shared_ptr *out - Returned edge ids // @return Status - The error code return - Status GetEdges(EdgeType edge_type, EdgeIdType edge_num, std::shared_ptr *out); + Status GetAllEdges(EdgeType edge_type, std::shared_ptr *out); + + // Get the node id from the edge. + // @param std::vector edge_list - List of edges + // @param std::shared_ptr *out - Returned node ids + // @return Status - The error code return + Status GetNodesFromEdges(const std::vector &edge_list, std::shared_ptr *out); // All neighbors of the acquisition node. // @param std::vector node_list - List of nodes @@ -86,12 +87,35 @@ class Graph { Status GetAllNeighbors(const std::vector &node_list, NodeType neighbor_type, std::shared_ptr *out); - Status GetSampledNeighbor(const std::vector &node_list, const std::vector &neighbor_nums, - const std::vector &neighbor_types, std::shared_ptr *out); - Status GetNegSampledNeighbor(const std::vector &node_list, NodeIdType samples_num, - NodeType neg_neighbor_type, std::shared_ptr *out); - Status RandomWalk(const std::vector &node_list, const std::vector &meta_path, float p, float q, - NodeIdType default_node, std::shared_ptr *out); + // Get sampled neighbors. 
+ // @param std::vector node_list - List of nodes + // @param std::vector neighbor_nums - Number of neighbors sampled per hop + // @param std::vector neighbor_types - Neighbor type sampled per hop + // @param std::shared_ptr *out - Returned neighbor's id. + // @return Status - The error code return + Status GetSampledNeighbors(const std::vector &node_list, const std::vector &neighbor_nums, + const std::vector &neighbor_types, std::shared_ptr *out); + + // Get negative sampled neighbors. + // @param std::vector node_list - List of nodes + // @param NodeIdType samples_num - Number of neighbors sampled + // @param NodeType neg_neighbor_type - The type of negative neighbor. + // @param std::shared_ptr *out - Returned negative neighbor's id. + // @return Status - The error code return + Status GetNegSampledNeighbors(const std::vector &node_list, NodeIdType samples_num, + NodeType neg_neighbor_type, std::shared_ptr *out); + + // Node2vec random walk. + // @param std::vector node_list - List of nodes + // @param std::vector meta_path - node type of each step + // @param float step_home_param - return hyper parameter in node2vec algorithm + // @param float step_away_param - inout hyper parameter in node2vec algorithm + // @param NodeIdType default_node - default node id + // @param std::shared_ptr *out - Returned nodes id in walk path + // @return Status - The error code return + Status RandomWalk(const std::vector &node_list, const std::vector &meta_path, + float step_home_param, float step_away_param, NodeIdType default_node, + std::shared_ptr *out); // Get the feature of a node // @param std::shared_ptr nodes - List of nodes @@ -112,14 +136,55 @@ class Graph { TensorRow *out); // Get meta information of graph - // @param std::vector *node_info - Returned meta information of node - // @param std::vector *node_info - Returned meta information of edge + // @param MetaInfo *meta_info - Returned meta information // @return Status - The error code return - Status 
GetMetaInfo(std::vector *node_info, std::vector *edge_info); + Status GetMetaInfo(MetaInfo *meta_info); + + // Return meta information to python layer + Status GraphInfo(py::dict *out); Status Init(); private: + class RandomWalkBase { + public: + explicit RandomWalkBase(Graph *graph); + + Status Build(const std::vector &node_list, const std::vector &meta_path, + float step_home_param = 1.0, float step_away_param = 1.0, NodeIdType default_node = -1, + int32_t num_walks = 1, int32_t num_workers = 1); + + ~RandomWalkBase() = default; + + Status SimulateWalk(std::vector> *walks); + + private: + Status Node2vecWalk(const NodeIdType &start_node, std::vector *walk_path); + + Status GetNodeProbability(const NodeIdType &node_id, const NodeType &node_type, + std::shared_ptr *node_probability); + + Status GetEdgeProbability(const NodeIdType &src, const NodeIdType &dst, uint32_t meta_path_index, + std::shared_ptr *edge_probability); + + static StochasticIndex GenerateProbability(const std::vector &probability); + + static uint32_t WalkToNextNode(const StochasticIndex &stochastic_index); + + template + std::vector Normalize(const std::vector &non_normalized_probability); + + Graph *graph_; + std::vector node_list_; + std::vector meta_path_; + float step_home_param_; // Return hyper parameter. Default is 1.0 + float step_away_param_; // Inout hyper parameter. Default is 1.0 + NodeIdType default_node_; + + int32_t num_walks_; // Number of walks per source. Default is 10 + int32_t num_workers_; // The number of worker threads. 
Default is 1 + }; + // Load graph data from mindrecord file // @return Status - The error code return Status LoadNodeAndEdge(); @@ -146,8 +211,25 @@ class Graph { // @return Status - The error code return Status GetNodeDefaultFeature(FeatureType feature_type, std::shared_ptr *out_feature); + // Find node object using node id + // @param NodeIdType id - + // @param std::shared_ptr *node - Returned node object + // @return Status - The error code return + Status GetNodeByNodeId(NodeIdType id, std::shared_ptr *node); + + // Negative sampling + // @param std::vector &input_data - The data set to be sampled + // @param std::unordered_set &exclude_data - Data to be excluded + // @param int32_t samples_num - + // @param std::vector *out_samples - Sampling results returned + // @return Status - The error code return + Status NegativeSample(const std::vector &input_data, const std::unordered_set &exclude_data, + int32_t samples_num, std::vector *out_samples); + std::string dataset_file_; int32_t num_workers_; // The number of worker threads + std::mt19937 rnd_; + RandomWalkBase random_walk_; std::unordered_map> node_type_map_; std::unordered_map> node_id_map_; diff --git a/mindspore/ccsrc/dataset/engine/gnn/graph_loader.cc b/mindspore/ccsrc/dataset/engine/gnn/graph_loader.cc index c517fda969..6504d088bf 100644 --- a/mindspore/ccsrc/dataset/engine/gnn/graph_loader.cc +++ b/mindspore/ccsrc/dataset/engine/gnn/graph_loader.cc @@ -36,6 +36,7 @@ GraphLoader::GraphLoader(std::string mr_filepath, int32_t num_workers) : mr_path_(mr_filepath), num_workers_(num_workers), row_id_(0), + shard_reader_(nullptr), keys_({"first_id", "second_id", "third_id", "attribute", "type", "node_feature_index", "edge_feature_index"}) {} Status GraphLoader::GetNodesAndEdges(NodeIdMap *n_id_map, EdgeIdMap *e_id_map, NodeTypeMap *n_type_map, @@ -203,7 +204,8 @@ Status GraphLoader::LoadFeatureIndex(const std::string &key, const std::vectorPost(); - ShardTuple rows = shard_reader_->GetNextById(row_id_++, 
worker_id); + auto ret = shard_reader_->GetNextById(row_id_++, worker_id); + ShardTuple rows = ret.second; while (rows.empty() == false) { RETURN_IF_INTERRUPTED(); for (const auto &tupled_row : rows) { @@ -224,7 +226,8 @@ Status GraphLoader::WorkerEntry(int32_t worker_id) { MS_LOG(WARNING) << "attribute:" << attr << " is neither edge nor node."; } } - rows = shard_reader_->GetNextById(row_id_++, worker_id); + auto rc = shard_reader_->GetNextById(row_id_++, worker_id); + rows = rc.second; } return Status::OK(); } diff --git a/mindspore/ccsrc/dataset/engine/gnn/local_node.cc b/mindspore/ccsrc/dataset/engine/gnn/local_node.cc index 24e865dff7..c829f8e8ca 100644 --- a/mindspore/ccsrc/dataset/engine/gnn/local_node.cc +++ b/mindspore/ccsrc/dataset/engine/gnn/local_node.cc @@ -20,12 +20,13 @@ #include #include "dataset/engine/gnn/edge.h" +#include "dataset/util/random.h" namespace mindspore { namespace dataset { namespace gnn { -LocalNode::LocalNode(NodeIdType id, NodeType type) : Node(id, type) {} +LocalNode::LocalNode(NodeIdType id, NodeType type) : Node(id, type), rnd_(GetRandomDevice()) { rnd_.seed(GetSeed()); } Status LocalNode::GetFeatures(FeatureType feature_type, std::shared_ptr *out_feature) { auto itr = features_.find(feature_type); @@ -38,21 +39,57 @@ Status LocalNode::GetFeatures(FeatureType feature_type, std::shared_ptr } } -Status LocalNode::GetNeighbors(NodeType neighbor_type, int32_t samples_num, std::vector *out_neighbors) { +Status LocalNode::GetAllNeighbors(NodeType neighbor_type, std::vector *out_neighbors, bool exclude_itself) { std::vector neighbors; auto itr = neighbor_nodes_.find(neighbor_type); if (itr != neighbor_nodes_.end()) { - if (samples_num == -1) { - // Return all neighbors + if (exclude_itself) { + neighbors.resize(itr->second.size()); + std::transform(itr->second.begin(), itr->second.end(), neighbors.begin(), + [](const std::shared_ptr node) { return node->id(); }); + } else { neighbors.resize(itr->second.size() + 1); neighbors[0] = id_; 
std::transform(itr->second.begin(), itr->second.end(), neighbors.begin() + 1, [](const std::shared_ptr node) { return node->id(); }); - } else { } } else { - neighbors.push_back(id_); MS_LOG(DEBUG) << "No neighbors. node_id:" << id_ << " neighbor_type:" << neighbor_type; + if (!exclude_itself) { + neighbors.emplace_back(id_); + } + } + *out_neighbors = std::move(neighbors); + return Status::OK(); +} + +Status LocalNode::GetSampledNeighbors(const std::vector> &neighbors, int32_t samples_num, + std::vector *out) { + std::vector shuffled_id(neighbors.size()); + std::iota(shuffled_id.begin(), shuffled_id.end(), 0); + std::shuffle(shuffled_id.begin(), shuffled_id.end(), rnd_); + int32_t num = std::min(samples_num, static_cast(neighbors.size())); + for (int32_t i = 0; i < num; ++i) { + out->emplace_back(neighbors[shuffled_id[i]]->id()); + } + return Status::OK(); +} + +Status LocalNode::GetSampledNeighbors(NodeType neighbor_type, int32_t samples_num, + std::vector *out_neighbors) { + std::vector neighbors; + neighbors.reserve(samples_num); + auto itr = neighbor_nodes_.find(neighbor_type); + if (itr != neighbor_nodes_.end()) { + while (neighbors.size() < samples_num) { + RETURN_IF_NOT_OK(GetSampledNeighbors(itr->second, samples_num - neighbors.size(), &neighbors)); + } + } else { + MS_LOG(DEBUG) << "There are no neighbors. 
node_id:" << id_ << " neighbor_type:" << neighbor_type; + // If there are no neighbors, they are filled with kDefaultNodeId + for (int32_t i = 0; i < samples_num; ++i) { + neighbors.emplace_back(kDefaultNodeId); + } } *out_neighbors = std::move(neighbors); return Status::OK(); diff --git a/mindspore/ccsrc/dataset/engine/gnn/local_node.h b/mindspore/ccsrc/dataset/engine/gnn/local_node.h index 25f24818e1..bc069d073f 100644 --- a/mindspore/ccsrc/dataset/engine/gnn/local_node.h +++ b/mindspore/ccsrc/dataset/engine/gnn/local_node.h @@ -43,12 +43,20 @@ class LocalNode : public Node { // @return Status - The error code return Status GetFeatures(FeatureType feature_type, std::shared_ptr *out_feature) override; - // Get the neighbors of a node + // Get the all neighbors of a node // @param NodeType neighbor_type - type of neighbor - // @param int32_t samples_num - Number of neighbors to be acquired, if -1 means all neighbors are acquired // @param std::vector *out_neighbors - Returned neighbors id // @return Status - The error code return - Status GetNeighbors(NodeType neighbor_type, int32_t samples_num, std::vector *out_neighbors) override; + Status GetAllNeighbors(NodeType neighbor_type, std::vector *out_neighbors, + bool exclude_itself = false) override; + + // Get the sampled neighbors of a node + // @param NodeType neighbor_type - type of neighbor + // @param int32_t samples_num - Number of neighbors to be acquired + // @param std::vector *out_neighbors - Returned neighbors id + // @return Status - The error code return + Status GetSampledNeighbors(NodeType neighbor_type, int32_t samples_num, + std::vector *out_neighbors) override; // Add neighbor of node // @param std::shared_ptr node - @@ -61,6 +69,10 @@ class LocalNode : public Node { Status UpdateFeature(const std::shared_ptr &feature) override; private: + Status GetSampledNeighbors(const std::vector> &neighbors, int32_t samples_num, + std::vector *out); + + std::mt19937 rnd_; std::unordered_map> features_; 
std::unordered_map>> neighbor_nodes_; }; diff --git a/mindspore/ccsrc/dataset/engine/gnn/node.h b/mindspore/ccsrc/dataset/engine/gnn/node.h index 8e3db51d65..282f856797 100644 --- a/mindspore/ccsrc/dataset/engine/gnn/node.h +++ b/mindspore/ccsrc/dataset/engine/gnn/node.h @@ -52,12 +52,20 @@ class Node { // @return Status - The error code return virtual Status GetFeatures(FeatureType feature_type, std::shared_ptr *out_feature) = 0; - // Get the neighbors of a node + // Get the all neighbors of a node // @param NodeType neighbor_type - type of neighbor - // @param int32_t samples_num - Number of neighbors to be acquired, if -1 means all neighbors are acquired // @param std::vector *out_neighbors - Returned neighbors id // @return Status - The error code return - virtual Status GetNeighbors(NodeType neighbor_type, int32_t samples_num, std::vector *out_neighbors) = 0; + virtual Status GetAllNeighbors(NodeType neighbor_type, std::vector *out_neighbors, + bool exclude_itself = false) = 0; + + // Get the sampled neighbors of a node + // @param NodeType neighbor_type - type of neighbor + // @param int32_t samples_num - Number of neighbors to be acquired + // @param std::vector *out_neighbors - Returned neighbors id + // @return Status - The error code return + virtual Status GetSampledNeighbors(NodeType neighbor_type, int32_t samples_num, + std::vector *out_neighbors) = 0; // Add neighbor of node // @param std::shared_ptr node - diff --git a/mindspore/ccsrc/dataset/engine/opt/CMakeLists.txt b/mindspore/ccsrc/dataset/engine/opt/CMakeLists.txt index 9804b85d3a..170cbb55e5 100644 --- a/mindspore/ccsrc/dataset/engine/opt/CMakeLists.txt +++ b/mindspore/ccsrc/dataset/engine/opt/CMakeLists.txt @@ -2,5 +2,7 @@ file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc" set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD) add_library(engine-opt OBJECT pass.cc + pre/map_column_reorder.cc + 
pre/global_shuffle.cc util/printer_pass.cc ) \ No newline at end of file diff --git a/mindspore/ccsrc/dataset/engine/opt/pass.cc b/mindspore/ccsrc/dataset/engine/opt/pass.cc index e6bd9fe247..a032d46cba 100644 --- a/mindspore/ccsrc/dataset/engine/opt/pass.cc +++ b/mindspore/ccsrc/dataset/engine/opt/pass.cc @@ -27,7 +27,6 @@ #include "dataset/engine/datasetops/shuffle_op.h" #include "dataset/engine/datasetops/source/generator_op.h" #include "dataset/engine/datasetops/source/mindrecord_op.h" -#include "dataset/engine/datasetops/source/storage_op.h" #include "dataset/engine/datasetops/source/tf_reader_op.h" #include "dataset/engine/datasetops/source/image_folder_op.h" #include "dataset/engine/datasetops/take_op.h" @@ -37,10 +36,18 @@ namespace mindspore { namespace dataset { // Driver method for TreePass -Status TreePass::Run(ExecutionTree *tree, bool *modified) { return this->RunOnTree(tree, modified); } +Status TreePass::Run(ExecutionTree *tree, bool *modified) { + if (tree == nullptr || modified == nullptr) { + return Status(StatusCode::kUnexpectedError, "Null pointer passed to TreePass"); + } + return this->RunOnTree(tree, modified); +} // Driver method for NodePass Status NodePass::Run(ExecutionTree *tree, bool *modified) { + if (tree == nullptr || modified == nullptr) { + return Status(StatusCode::kUnexpectedError, "Null pointer passed to NodePass"); + } std::shared_ptr root = tree->root(); if (traversalOrder_ == Order::DFS) { // DFS diff --git a/mindspore/ccsrc/dataset/engine/opt/pass.h b/mindspore/ccsrc/dataset/engine/opt/pass.h index bac464f401..39682b22f7 100644 --- a/mindspore/ccsrc/dataset/engine/opt/pass.h +++ b/mindspore/ccsrc/dataset/engine/opt/pass.h @@ -57,10 +57,10 @@ class ImageFolderOp; // The actual implementation of the passes will be derived from here. class Pass : public std::enable_shared_from_this { public: - // Run the transformation pass again the execution tree. + // Run the transformation pass against the execution tree. 
// @param tree - Pointer to the execution tree to be transformed. // @param modified - Pointer to the modified flag, - virtual Status Run(ExecutionTree *tree, bool *modified) { return Status::OK(); } + virtual Status Run(ExecutionTree *tree, bool *modified) = 0; }; // TreePass is a basic Pass class which performs transformation on ExecutionTree directly. diff --git a/mindspore/ccsrc/dataset/engine/opt/pre/global_shuffle.cc b/mindspore/ccsrc/dataset/engine/opt/pre/global_shuffle.cc new file mode 100644 index 0000000000..2adf734a6c --- /dev/null +++ b/mindspore/ccsrc/dataset/engine/opt/pre/global_shuffle.cc @@ -0,0 +1,98 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include "dataset/engine/opt/pre/global_shuffle.h" +#include "dataset/engine/execution_tree.h" +#include "dataset/engine/datasetops/shuffle_op.h" +#include "dataset/engine/datasetops/source/tf_reader_op.h" +#include "dataset/engine/datasetops/source/text_file_op.h" +#include "dataset/engine/datasetops/source/clue_op.h" + +namespace mindspore { +namespace dataset { + +Status GlobalShufflePass::RunOnTree(ExecutionTree *tree, bool *modified) { + std::vector> tf_readers; + std::vector> text_files; + std::vector> clues; + + // Pass 1, search for all sources which requires global shuffle + for (auto &op : *tree) { + if (auto ptr = std::dynamic_pointer_cast(op.shared_from_this())) { + if (ptr->RequireGlobalShuffle()) { + tf_readers.push_back(ptr); + continue; + } + } + if (auto ptr = std::dynamic_pointer_cast(op.shared_from_this())) { + if (ptr->RequireGlobalShuffle()) { + text_files.push_back(ptr); + continue; + } + } + if (auto ptr = std::dynamic_pointer_cast(op.shared_from_this())) { + if (ptr->RequireGlobalShuffle()) { + clues.push_back(ptr); + continue; + } + } + } + + // Pass 2, insert shuffle nodes + // The following blocks can be implemented with template if we unify the CountTotalRows across all source nodes . 
+ for (auto node : tf_readers) { + std::shared_ptr builder = std::make_shared(); + int64_t total_rows = 0; + TFReaderOp::CountTotalRows(&total_rows, node->FileNames(), 8, true); + int32_t avg_file_size = total_rows / (node->FileNames().size()); + builder->SetShuffleSize(std::max(avg_file_size * 4, 10000)); + std::shared_ptr op; + RETURN_IF_NOT_OK(builder->Build(&op)); + RETURN_IF_NOT_OK(tree->AssociateNode(op)); + RETURN_IF_NOT_OK(node->InsertAsParent(op)); + } + + for (auto node : text_files) { + std::shared_ptr builder = std::make_shared(); + int64_t total_rows = 0; + TextFileOp::CountAllFileRows(node->FileNames(), &total_rows); + int32_t avg_file_size = total_rows / (node->FileNames().size()); + builder->SetShuffleSize(std::max(avg_file_size * 4, 10000)); + std::shared_ptr op; + RETURN_IF_NOT_OK(builder->Build(&op)); + RETURN_IF_NOT_OK(tree->AssociateNode(op)); + RETURN_IF_NOT_OK(node->InsertAsParent(op)); + } + + for (auto node : clues) { + std::shared_ptr builder = std::make_shared(); + int64_t total_rows = 0; + ClueOp::CountAllFileRows(node->FileNames(), &total_rows); + int32_t avg_file_size = total_rows / (node->FileNames().size()); + builder->SetShuffleSize(std::max(avg_file_size * 4, 10000)); + std::shared_ptr op; + RETURN_IF_NOT_OK(builder->Build(&op)); + RETURN_IF_NOT_OK(tree->AssociateNode(op)); + RETURN_IF_NOT_OK(node->InsertAsParent(op)); + } + + return Status::OK(); +} + +} // namespace dataset +} // namespace mindspore diff --git a/mindspore/ccsrc/dataset/engine/opt/pre/global_shuffle.h b/mindspore/ccsrc/dataset/engine/opt/pre/global_shuffle.h new file mode 100644 index 0000000000..6865ac9391 --- /dev/null +++ b/mindspore/ccsrc/dataset/engine/opt/pre/global_shuffle.h @@ -0,0 +1,35 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef DATASET_ENGINE_OPT_PASS_PRE_GLOBALSHUFFLE_H +#define DATASET_ENGINE_OPT_PASS_PRE_GLOBALSHUFFLE_H + +#include +#include "dataset/engine/opt/pass.h" + +namespace mindspore { +namespace dataset { +// Global Shuffle Pass will insert ShuffleOp when the leaf nodes requires global shuffle. +// Example: +// Input Tree: TFReader(GLOBAL_SHUFFLE) -> Batch +// Output Tree: TFReader -> Shuffle -> Batch +class GlobalShufflePass : public TreePass { + Status RunOnTree(ExecutionTree *tree, bool *modified) override; +}; +} // namespace dataset +} // namespace mindspore + +#endif // DATASET_ENGINE_OPT_PASS_PRE_GLOBALSHUFFLE_H diff --git a/mindspore/ccsrc/dataset/engine/opt/pre/map_column_reorder.cc b/mindspore/ccsrc/dataset/engine/opt/pre/map_column_reorder.cc new file mode 100644 index 0000000000..a3dbbfcc54 --- /dev/null +++ b/mindspore/ccsrc/dataset/engine/opt/pre/map_column_reorder.cc @@ -0,0 +1,51 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include "dataset/engine/opt/pre/map_column_reorder.h" +#include "dataset/engine/execution_tree.h" +#include "dataset/engine/datasetops/map_op.h" +#include "dataset/engine/datasetops/project_op.h" + +namespace mindspore { +namespace dataset { + +Status MapColumnReorder::RunOnTree(ExecutionTree *tree, bool *modified) { + std::vector> to_process; + + // Pass 1, search for all MapOp with column orders + for (auto &op : *tree) { + if (auto mapOp = std::dynamic_pointer_cast(op.shared_from_this())) { + if (mapOp->ColumnsOrder().size() != 0) { + to_process.push_back(mapOp); + } + } + } + + // Pass 2, insert nodes for all MapOp + for (auto node : to_process) { + std::shared_ptr builder = std::make_shared(node->ColumnsOrder()); + std::shared_ptr op; + RETURN_IF_NOT_OK(builder->Build(&op)); + RETURN_IF_NOT_OK(tree->AssociateNode(op)); + RETURN_IF_NOT_OK(node->InsertAsParent(op)); + } + return Status::OK(); +} + +} // namespace dataset +} // namespace mindspore diff --git a/mindspore/ccsrc/dataset/engine/opt/pre/map_column_reorder.h b/mindspore/ccsrc/dataset/engine/opt/pre/map_column_reorder.h new file mode 100644 index 0000000000..84274db3d5 --- /dev/null +++ b/mindspore/ccsrc/dataset/engine/opt/pre/map_column_reorder.h @@ -0,0 +1,35 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef DATASET_ENGINE_OPT_PASS_PRE_MAPCOLREORDER_H +#define DATASET_ENGINE_OPT_PASS_PRE_MAPCOLREORDER_H + +#include +#include "dataset/engine/opt/pass.h" + +namespace mindspore { +namespace dataset { +// Map Column Recorder Pass will insert ProjectOp when MapOp requires a full output columns reorder. +// Example: +// Input Tree: TFReader -> MapOp(with col_order) -> Batch +// Output Tree: TFReader -> MapOp -> ProjectOp(col_order) -> Batch +class MapColumnReorder : public TreePass { + Status RunOnTree(ExecutionTree *tree, bool *modified) override; +}; +} // namespace dataset +} // namespace mindspore + +#endif // DATASET_ENGINE_OPT_PASS_PRE_MAPCOLREORDER_H diff --git a/mindspore/ccsrc/dataset/engine/perf/CMakeLists.txt b/mindspore/ccsrc/dataset/engine/perf/CMakeLists.txt new file mode 100644 index 0000000000..0b67469d2d --- /dev/null +++ b/mindspore/ccsrc/dataset/engine/perf/CMakeLists.txt @@ -0,0 +1,6 @@ +add_library(engine-perf OBJECT + profiling.cc + monitor.cc + device_queue_tracing.cc + connector_size.cc + dataset_iterator_tracing.cc) diff --git a/mindspore/ccsrc/dataset/engine/perf/connector_size.cc b/mindspore/ccsrc/dataset/engine/perf/connector_size.cc new file mode 100644 index 0000000000..862ec51c49 --- /dev/null +++ b/mindspore/ccsrc/dataset/engine/perf/connector_size.cc @@ -0,0 +1,89 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "dataset/engine/perf/connector_size.h" + +#include +#include +#include +#include +#include "dataset/core/config_manager.h" +#include "dataset/engine/execution_tree.h" +#include "dataset/util/path.h" + +using json = nlohmann::json; +namespace mindspore { +namespace dataset { +using Qrow = std::vector; + +// Sample action +Status ConnectorSize::Sample() { + Qrow cur_row; + std::transform(tree_->begin(), tree_->end(), std::back_inserter(cur_row), + [](DatasetOp &op) { return op.ConnectorSize(); }); + // Push new row of sample + sample_table_.push_back(cur_row); + return Status::OK(); +} + +// JSON serializer helper function +json ConnectorSize::ParseOpInfo(const DatasetOp &node, const std::vector &size) { + auto children = node.Children(); + std::vector children_id; + std::transform(children.begin(), children.end(), std::back_inserter(children_id), + [](std::shared_ptr op) -> int32_t { return op->id(); }); + json json_node; + json_node["op_id"] = node.id(); + json_node["op_type"] = node.Name(); + json_node["num_workers"] = node.num_workers(); + json metrics; + // DeviceQueueOp is a special op,it is not inlined but its output queue is invalid. + // So we should not output its queue size. 
+ if (!node.inlined() && node.Name() != "DeviceQueueOp") { + metrics["output_queue"] = {{"size", size}, {"length", node.ConnectorCapacity()}}; + } + json_node["metrics"] = metrics; + if (!children_id.empty()) { + json_node["children"] = children_id; + } + + return json_node; +} + +// Save profiling data to file +Status ConnectorSize::SaveToFile() { + std::ofstream os(file_path_, std::ios::trunc); + uint32_t idx = 0; + json output; + std::shared_ptr cfg = GlobalContext::config_manager(); + output["sampling_interval"] = cfg->monitor_sampling_interval(); + // Traverse the ExecutionTree for JSON node generation + for (auto &node : *tree_) { + std::vector cur_queue_size; + std::transform(sample_table_.begin(), sample_table_.end(), std::back_inserter(cur_queue_size), + [&](const ConnectorSizeSample &sample) { return sample[idx]; }); + json json_node = ParseOpInfo(node, cur_queue_size); + output["op_info"].push_back(json_node); + idx++; + } + os << output; + return Status::OK(); +} +Status ConnectorSize::Init(const std::string &dir_path, const std::string &device_id) { + file_path_ = (Path(dir_path) / Path("pipeline_profiling_" + device_id + ".json")).toString(); + return Status::OK(); +} +} // namespace dataset +} // namespace mindspore diff --git a/mindspore/ccsrc/dataset/engine/perf/connector_size.h b/mindspore/ccsrc/dataset/engine/perf/connector_size.h new file mode 100644 index 0000000000..6840ffe244 --- /dev/null +++ b/mindspore/ccsrc/dataset/engine/perf/connector_size.h @@ -0,0 +1,70 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_QUEUE_DEPTH_H +#define MINDSPORE_QUEUE_DEPTH_H + +#include +#include +#include +#include "dataset/engine/perf/profiling.h" +#include "dataset/engine/datasetops/dataset_op.h" + +using json = nlohmann::json; + +namespace mindspore { +namespace dataset { +class ExecutionTree; + +// Connector size sampling samples the output connector size of each op in the pipeline. +// It support JSON serialization for external usage. +class ConnectorSize : public Sampling { + // Connecto size sampling data is stored as a 2D vector + // op_0 ... op_m + // sample_0 size_0_0 ... size_m_0 + // ... ... ... ... + // sample_n size_0_m ... size_m_n + // + // A circular buffer will be implemented in the future to make this table more flexible. + using ConnectorSizeSample = std::vector; + using ConnectorSizeSampleTable = std::vector; + + public: + explicit ConnectorSize(ExecutionTree *tree) : tree_(tree) {} + + ~ConnectorSize() override = default; + + // Driver function for connector size sampling. 
+ // This function samples the connector size of every nodes within the ExecutionTree + Status Sample() override; + + std::string Name() const override { return kDeviceQueueTracingName; }; + + // Save sampling data to file + // @return Status - The error code return + Status SaveToFile() override; + + Status Init(const std::string &dir_path, const std::string &device_id) override; + + // Parse op infomation and transform to json format + json ParseOpInfo(const DatasetOp &node, const std::vector &size); + + private: + ExecutionTree *tree_ = nullptr; // ExecutionTree pointer + ConnectorSizeSampleTable sample_table_; // Dataset structure to store all samples of connector size sampling +}; +} // namespace dataset +} // namespace mindspore +#endif // MINDSPORE_QUEUE_DEPTH_H diff --git a/mindspore/ccsrc/dataset/engine/perf/dataset_iterator_tracing.cc b/mindspore/ccsrc/dataset/engine/perf/dataset_iterator_tracing.cc new file mode 100644 index 0000000000..99b0c2d7e0 --- /dev/null +++ b/mindspore/ccsrc/dataset/engine/perf/dataset_iterator_tracing.cc @@ -0,0 +1,64 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include "dataset/engine/perf/dataset_iterator_tracing.h" +#include "dataset/util/path.h" + +namespace mindspore { +namespace dataset { + +Status DatasetIteratorTracing::Record(const int32_t type, const int32_t extra_info, const int32_t batch_num, + const int32_t value) { + // Format: "type extra-info batch-num value" + // type: 0: time, 1: connector size + // extra-info: if type is 0 - 0: pipeline time, 1: push tdt time, 2: batch time + // if type is 1 - connector capacity + // batch-num: batch number + // value: if type is 0 - value is time(ms) + // if type is 1 - value is connector size + // Examples: + // 0 0 20 10 - The 20th batch took 10ms to get data from pipeline. + // 1 64 20 5 - Connector size is 5 when get the 20th batch.Connector capacity is 64. + std::string data = std::to_string(type) + " " + std::to_string(extra_info) + " " + std::to_string(batch_num) + " " + + std::to_string(value); + value_.emplace_back(data); + return Status::OK(); +} + +Status DatasetIteratorTracing::SaveToFile() { + if (value_.empty()) { + return Status::OK(); + } + + std::ofstream handle(file_path_, std::ios::trunc); + if (!handle.is_open()) { + RETURN_STATUS_UNEXPECTED("Profiling file can not be opened."); + } + for (auto value : value_) { + handle << value << "\n"; + } + handle.close(); + + return Status::OK(); +} + +Status DatasetIteratorTracing::Init(const std::string &dir_path, const std::string &device_id) { + file_path_ = (Path(dir_path) / Path("dataset_iterator_profiling_" + device_id + ".txt")).toString(); + return Status::OK(); +} +} // namespace dataset +} // namespace mindspore diff --git a/mindspore/ccsrc/dataset/engine/perf/dataset_iterator_tracing.h b/mindspore/ccsrc/dataset/engine/perf/dataset_iterator_tracing.h new file mode 100644 index 0000000000..00264939fc --- /dev/null +++ b/mindspore/ccsrc/dataset/engine/perf/dataset_iterator_tracing.h @@ -0,0 +1,51 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under 
the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_DATASET_ITERATOR_TRACING_H +#define MINDSPORE_DATASET_ITERATOR_TRACING_H + +#include +#include +#include "dataset/engine/perf/profiling.h" + +namespace mindspore { +namespace dataset { +class DatasetIteratorTracing : public Tracing { + public: + // Constructor + DatasetIteratorTracing() = default; + + // Destructor + ~DatasetIteratorTracing() override = default; + + // Record tracing data + // @return Status - The error code return + Status Record(const int32_t type, const int32_t extra_info, const int32_t batch_num, const int32_t value); + + std::string Name() const override { return kDatasetIteratorTracingName; }; + + // Save tracing data to file + // @return Status - The error code return + Status SaveToFile() override; + + Status Init(const std::string &dir_path, const std::string &device_id) override; + + private: + std::vector value_; +}; +} // namespace dataset +} // namespace mindspore + +#endif // MINDSPORE_DATASET_ITERATOR_TRACING_H diff --git a/mindspore/ccsrc/dataset/engine/perf/device_queue_tracing.cc b/mindspore/ccsrc/dataset/engine/perf/device_queue_tracing.cc new file mode 100644 index 0000000000..204a83e3fb --- /dev/null +++ b/mindspore/ccsrc/dataset/engine/perf/device_queue_tracing.cc @@ -0,0 +1,64 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include "dataset/engine/perf/device_queue_tracing.h" +#include "dataset/util/path.h" +namespace mindspore { +namespace dataset { + +Status DeviceQueueTracing::Record(const int32_t type, const int32_t extra_info, const int32_t batch_num, + const int32_t value) { + // Format: "type extra-info batch-num value" + // type: 0: time, 1: connector size + // extra-info: if type is 0 - 0: pipeline time, 1: push tdt time, 2: batch time + // if type is 1 - connector capacity + // batch-num: batch number + // value: if type is 0 - value is time(ms) + // if type is 1 - value is connector size + // Examples: + // 0 0 20 10 - The 20th batch took 10ms to get data from pipeline. + // 1 64 20 5 - Connector size is 5 when get the 20th batch.Connector capacity is 64. 
+ std::string data = std::to_string(type) + " " + std::to_string(extra_info) + " " + std::to_string(batch_num) + " " + + std::to_string(value); + value_.emplace_back(data); + return Status::OK(); +} + +Status DeviceQueueTracing::SaveToFile() { + if (value_.empty()) { + return Status::OK(); + } + + std::ofstream handle(file_path_, std::ios::trunc); + if (!handle.is_open()) { + RETURN_STATUS_UNEXPECTED("Profiling file can not be opened."); + } + for (auto value : value_) { + handle << value << "\n"; + } + handle.close(); + + return Status::OK(); +} + +Status DeviceQueueTracing::Init(const std::string &dir_path, const std::string &device_id) { + file_path_ = (Path(dir_path) / Path("device_queue_profiling_" + device_id + ".txt")).toString(); + return Status::OK(); +} +} // namespace dataset +} // namespace mindspore diff --git a/mindspore/ccsrc/dataset/engine/perf/device_queue_tracing.h b/mindspore/ccsrc/dataset/engine/perf/device_queue_tracing.h new file mode 100644 index 0000000000..f7c6da3a04 --- /dev/null +++ b/mindspore/ccsrc/dataset/engine/perf/device_queue_tracing.h @@ -0,0 +1,52 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_DEVICE_QUEUE_TRACING_H +#define MINDSPORE_DEVICE_QUEUE_TRACING_H + +#include +#include +#include "dataset/engine/perf/profiling.h" + +namespace mindspore { +namespace dataset { +class DeviceQueueTracing : public Tracing { + public: + // Constructor + DeviceQueueTracing() = default; + + // Destructor + ~DeviceQueueTracing() override = default; + + // Record tracing data + // @return Status - The error code return + Status Record(const int32_t type, const int32_t extra_info, const int32_t batch_num, const int32_t value); + + std::string Name() const override { return "Device Queue Tracing"; }; + + // Save tracing data to file + // @return Status - The error code return + Status SaveToFile() override; + + Status Init(const std::string &dir_path, const std::string &device_id) override; + + private: + std::vector value_; +}; +} // namespace dataset +} // namespace mindspore + +#endif // MINDSPORE_DEVICE_QUEUE_TRACING_H diff --git a/mindspore/ccsrc/dataset/engine/perf/monitor.cc b/mindspore/ccsrc/dataset/engine/perf/monitor.cc new file mode 100644 index 0000000000..c9dce004b5 --- /dev/null +++ b/mindspore/ccsrc/dataset/engine/perf/monitor.cc @@ -0,0 +1,52 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include "dataset/core/config_manager.h" +#include "dataset/engine/perf/monitor.h" +#include "dataset/engine/execution_tree.h" + +namespace mindspore { +namespace dataset { + +Monitor::Monitor(ExecutionTree *tree) : tree_(tree) { + std::shared_ptr cfg = GlobalContext::config_manager(); + sampling_interval_ = cfg->monitor_sampling_interval(); + max_samples_ = 0; + cur_row_ = 0; +} + +Status Monitor::operator()() { + // Register this thread with TaskManager to receive proper interrupt signal. + TaskManager::FindMe()->Post(); + + // Keep sampling if + // 1) Monitor Task is not interrupted by TaskManager AND + // 2) Iterator has not received EOF + while (!this_thread::is_interrupted() && !(tree_->isFinished())) { + for (auto &node : tree_->GetProfilingManager()->GetSamplingNodes()) { + RETURN_IF_NOT_OK(node.second->Sample()); + std::this_thread::sleep_for(std::chrono::milliseconds(sampling_interval_)); + } + } + + // Output all profiling data upon request. + tree_->GetProfilingManager()->SaveProfilingData(); + return Status::OK(); +} + +} // namespace dataset +} // namespace mindspore diff --git a/mindspore/ccsrc/dataset/engine/perf/monitor.h b/mindspore/ccsrc/dataset/engine/perf/monitor.h new file mode 100644 index 0000000000..2a482a6ad7 --- /dev/null +++ b/mindspore/ccsrc/dataset/engine/perf/monitor.h @@ -0,0 +1,54 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_MONITOR_H +#define MINDSPORE_MONITOR_H + +#include +#include +#include +#include "dataset/util/status.h" +#include "dataset/engine/perf/profiling.h" + +namespace mindspore { +namespace dataset { +class ExecutionTree; +class Monitor { + public: + // Monitor object constructor + explicit Monitor(ExecutionTree *tree); + + Monitor() = default; + + ~Monitor() = default; + + // Functor for Perf Monitor main loop. + // This function will be the entry point of mindspore::Dataset::Task + Status operator()(); + + int64_t GetSamplingInterval() { return sampling_interval_; } + + private: + int64_t cur_row_; + int64_t max_samples_; + int64_t sampling_interval_; + ExecutionTree *tree_; + std::vector> sampling_list_; +}; +} // namespace dataset +} // namespace mindspore + +#endif // MINDSPORE_MONITOR_H diff --git a/mindspore/ccsrc/dataset/engine/perf/profiling.cc b/mindspore/ccsrc/dataset/engine/perf/profiling.cc new file mode 100644 index 0000000000..4786b8dd69 --- /dev/null +++ b/mindspore/ccsrc/dataset/engine/perf/profiling.cc @@ -0,0 +1,153 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "dataset/engine/perf/profiling.h" + +#include +#include +#include +#include "common/utils.h" +#include "dataset/util/path.h" +#include "dataset/engine/perf/monitor.h" +#include "dataset/engine/perf/device_queue_tracing.h" +#include "dataset/engine/perf/connector_size.h" +#include "dataset/engine/perf/dataset_iterator_tracing.h" +#include "utils/log_adapter.h" + +namespace mindspore { +namespace dataset { + +bool ProfilingManager::IsProfilingEnable() const { + auto profiling = common::GetEnv("PROFILING_MODE"); + if (profiling.empty() || profiling != "true") { + return false; + } + return true; +} + +Status ProfilingManager::Initialize() { + // Register nodes based on config + std::string dir = common::GetEnv("MINDDATA_PROFILING_DIR"); + if (dir.empty()) { + RETURN_STATUS_UNEXPECTED("Profiling dir is not set."); + } + char real_path[PATH_MAX] = {0}; + if (dir.size() >= PATH_MAX) { + RETURN_STATUS_UNEXPECTED("Profiling dir is invalid."); + } +#if defined(_WIN32) || defined(_WIN64) + if (_fullpath(real_path, common::SafeCStr(dir), PATH_MAX) == nullptr) { + RETURN_STATUS_UNEXPECTED("Profiling dir is invalid."); + } +#else + if (realpath(common::SafeCStr(dir), real_path) == nullptr) { + RETURN_STATUS_UNEXPECTED("Profiling dir is invalid."); + } +#endif + dir_path_ = real_path; + + // If DEVICE_ID is not set,defult value is 0 + device_id_ = common::GetEnv("DEVICE_ID"); + if (device_id_.empty()) { + device_id_ = "0"; + } + + // Register all profiling node. 
+ // device_queue node is used for graph mode + std::shared_ptr device_queue_tracing = std::make_shared(); + RETURN_IF_NOT_OK(RegisterTracingNode(device_queue_tracing)); + // dataset_iterator node is used for graph mode + std::shared_ptr dataset_iterator_tracing = std::make_shared(); + RETURN_IF_NOT_OK(RegisterTracingNode(dataset_iterator_tracing)); + + std::shared_ptr monitor_sampling = std::make_shared(tree_); + RETURN_IF_NOT_OK(RegisterSamplingNode(monitor_sampling)); + + return Status::OK(); +} + +// Profiling node registration +Status ProfilingManager::RegisterTracingNode(std::shared_ptr node) { + // Check if node with the same name has already been registered. + auto exist = tracing_nodes_.find(node->Name()); + if (exist != tracing_nodes_.end()) { + return Status(StatusCode::kProfilingError, "Profiling node already exist: " + node->Name()); + } + // Register the node with its name as key. + RETURN_IF_NOT_OK(node->Init(dir_path_, device_id_)); + tracing_nodes_[node->Name()] = node; + return Status::OK(); +} + +// Profiling node getter +Status ProfilingManager::GetTracingNode(const std::string &name, std::shared_ptr *node) { + // Check if node with the same name has already been registered. + auto exist = tracing_nodes_.find(name); + if (exist == tracing_nodes_.end()) { + return Status(StatusCode::kProfilingError, "Profiling node does not exist: " + name); + } + // Fetch node. + *node = tracing_nodes_[name]; + return Status::OK(); +} + +// Profiling node registration +Status ProfilingManager::RegisterSamplingNode(std::shared_ptr node) { + // Check if node with the same name has already been registered. + auto exist = sampling_nodes_.find(node->Name()); + if (exist != sampling_nodes_.end()) { + return Status(StatusCode::kProfilingError, "Profiling node already exist: " + node->Name()); + } + // Register the node with its name as key. 
+ RETURN_IF_NOT_OK(node->Init(dir_path_, device_id_)); + sampling_nodes_[node->Name()] = node; + return Status::OK(); +} + +// Profiling node getter +Status ProfilingManager::GetSamplingNode(const std::string &name, std::shared_ptr *node) { + // Check if node with the same name has already been registered. + auto exist = sampling_nodes_.find(name); + if (exist == sampling_nodes_.end()) { + return Status(StatusCode::kProfilingError, "Profiling node does not exist: " + name); + } + // Fetch node. + *node = sampling_nodes_[name]; + return Status::OK(); +} + +Status ProfilingManager::SaveProfilingData() { + if (!IsProfilingEnable()) { + return Status::OK(); + } + MS_LOG(INFO) << "Start to save profiling data."; + for (auto node : tracing_nodes_) { + RETURN_IF_NOT_OK(node.second->SaveToFile()); + } + for (auto node : sampling_nodes_) { + RETURN_IF_NOT_OK(node.second->SaveToFile()); + } + MS_LOG(INFO) << "Save profiling data end."; + + return Status::OK(); +} + +double ProfilingTime::GetCurMilliSecond() { + struct timeval tv = {0, 0}; + (void)gettimeofday(&tv, nullptr); + return tv.tv_sec * 1000 + tv.tv_usec / 1000; +} +} // namespace dataset +} // namespace mindspore diff --git a/mindspore/ccsrc/dataset/engine/perf/profiling.h b/mindspore/ccsrc/dataset/engine/perf/profiling.h new file mode 100644 index 0000000000..d0ea91d566 --- /dev/null +++ b/mindspore/ccsrc/dataset/engine/perf/profiling.h @@ -0,0 +1,140 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef DATASET_UTIL_PROFILE_H_ +#define DATASET_UTIL_PROFILE_H_ + +#include +#include +#include +#include +#include "dataset/util/status.h" + +namespace mindspore { +namespace dataset { + +class Monitor; +class ExecutionTree; + +const char kDeviceQueueTracingName[] = "Device Queue Tracing"; +const char kDatasetIteratorTracingName[] = "Dataset Iterator Tracing"; +const char kConnectorSizeSamplingName[] = "Connector Size Sampling"; + +// Profiling is a class of basic unit of profiling action +// This base class encapsulate the serialization output logic +class Profiling : std::enable_shared_from_this { + public: + // Constructor + Profiling() = default; + + // Destructor + virtual ~Profiling() = default; + + virtual Status Init(const std::string &dir_path, const std::string &device_id) = 0; + + // Default serialization file generator + virtual Status SaveToFile() = 0; + + // Profiling name + virtual std::string Name() const = 0; + + protected: + std::string file_path_; +}; + +// Sampling is a class of profiling which generate samples periodically. +class Sampling : public Profiling { + public: + // Sampling action function. This function will be invoked by performance monitor thread. + virtual Status Sample() = 0; +}; + +// Tracing is class of profiling which record samples upon request. +class Tracing : public Profiling { + // Tracing does not define a fixed interface to provide flexible on data recording. 
+}; + +// ProfilingManager is a class manages all profiling infrastructure +// It serves the following purposes: +// 1) Fetch profiling configs from global contexts +// 2) Setup all profiling node based on config +// 3) Provide access of profiling nodes for profiling actions +// 4) Manage profiling data serialization process +class ProfilingManager { + public: + explicit ProfilingManager(ExecutionTree *tree) : tree_(tree) {} + + ~ProfilingManager() = default; + + Status Initialize(); + + // Save profile data to file + // @return Status - The error code return + Status SaveProfilingData(); + + // Sampling node getter + // @param name - The name of the requested node + // @param node - Pointer to the shared pointer for the Sampling node + // @return Status - The error code return + Status GetSamplingNode(const std::string &name, std::shared_ptr *node); + + // Tracing node getter + // @param name - The name of the requested node + // @param node - Pointer to the shared pointer for the Tracing node + // @return Status - The error code return + Status GetTracingNode(const std::string &name, std::shared_ptr *node); + + // If profiling is enabled. 
+ bool IsProfilingEnable() const; + + const std::unordered_map> &GetSamplingNodes() { return sampling_nodes_; } + + private: + std::unordered_map> tracing_nodes_; + + std::unordered_map> sampling_nodes_; + + // Register profile node to tree + // @param node - Profiling node + // @return Status - The error code return + Status RegisterTracingNode(std::shared_ptr node); + + // Register profile node to tree + // @param node - Profiling node + // @return Status - The error code return + Status RegisterSamplingNode(std::shared_ptr node); + + ExecutionTree *tree_ = nullptr; // ExecutionTree pointer + std::string dir_path_; // where to create profiling file + std::string device_id_; // used when create profiling file,filename_deviceid.suffix +}; + +enum ProfilingType { TIME, CONNECTOR_DEPTH }; + +enum ProfilingTimeSubType { + PIPELINE_TIME, + TDT_PUSH_TIME, + BATCH_TIME, + INVALID_TIME, +}; + +class ProfilingTime { + public: + static double GetCurMilliSecond(); +}; + +} // namespace dataset +} // namespace mindspore +#endif diff --git a/mindspore/ccsrc/dataset/engine/tdt/tdt_plugin.cc b/mindspore/ccsrc/dataset/engine/tdt/tdt_plugin.cc index e457de52ae..ca9f2176f5 100644 --- a/mindspore/ccsrc/dataset/engine/tdt/tdt_plugin.cc +++ b/mindspore/ccsrc/dataset/engine/tdt/tdt_plugin.cc @@ -16,6 +16,7 @@ #include "dataset/engine/tdt/tdt_plugin.h" #include "common/utils.h" #include "utils/log_adapter.h" +#include "dataset/engine/perf/profiling.h" namespace mindspore { namespace dataset { @@ -28,18 +29,26 @@ std::shared_ptr TdtPlugin::GetInstance() { return instance_ptr_; } -TdtStatus TdtPlugin::hostPush(TensorRow ts_row, bool is_wait, std::string channel_name) { - MS_LOG(INFO) << "TDT channel name is " << channel_name << "."; +TdtStatus TdtPlugin::hostPush(TensorRow ts_row, bool is_wait, std::string channel_name, bool profiling, int32_t &time) { + MS_LOG(DEBUG) << "TDT channel name is " << channel_name << "."; std::vector items; + double start_time; auto ret = translate(ts_row, 
items); if (ret != SUCCESS) { MS_LOG(ERROR) << "TDT converting tensor failed!"; return FAILED; } + if (profiling) { + start_time = ProfilingTime::GetCurMilliSecond(); + } if (tdt::TdtHostPushData(channel_name, items) != 0) { MS_LOG(ERROR) << "TDT pushing data failed!"; return FAILED; } + if (profiling) { + double end_time = ProfilingTime::GetCurMilliSecond(); + time = (int32_t)(end_time - start_time); + } return SUCCESS; } @@ -110,10 +119,11 @@ TdtStatus TdtPlugin::translate(const TensorRow &ts_row, std::vector &i data_item.tensorShape_ = dataShapes; data_item.tensorType_ = datatype; data_item.dataLen_ = ts->SizeInBytes(); - data_item.dataPtr_ = std::shared_ptr(reinterpret_cast(ts->GetMutableBuffer()), [](void *elem) {}); + data_item.dataPtr_ = + std::shared_ptr(reinterpret_cast(&(*ts->begin())), [](const void *elem) {}); items.emplace_back(data_item); - MS_LOG(INFO) << "TDT data type is " << datatype << ", data shape is " << dataShapes << ", data length is " - << ts->Size() << "."; + MS_LOG(DEBUG) << "TDT data type is " << datatype << ", data shape is " << dataShapes << ", data length is " + << ts->Size() << "."; } return SUCCESS; } diff --git a/mindspore/ccsrc/dataset/engine/tdt/tdt_plugin.h b/mindspore/ccsrc/dataset/engine/tdt/tdt_plugin.h index a25deb4aab..304b205b81 100644 --- a/mindspore/ccsrc/dataset/engine/tdt/tdt_plugin.h +++ b/mindspore/ccsrc/dataset/engine/tdt/tdt_plugin.h @@ -26,6 +26,7 @@ #include "dataset/core/data_type.h" #include "dataset/core/tensor.h" +#include "dataset/core/tensor_row.h" namespace mindspore { namespace dataset { @@ -37,7 +38,7 @@ class TdtPlugin { public: static std::shared_ptr GetInstance(); - TdtStatus hostPush(TensorRow ts_row, bool is_wait, std::string channel_name); + TdtStatus hostPush(TensorRow ts_row, bool is_wait, std::string channel_name, bool profilig, int32_t &time); private: TdtPlugin() {} diff --git a/mindspore/ccsrc/dataset/kernels/data/CMakeLists.txt b/mindspore/ccsrc/dataset/kernels/data/CMakeLists.txt index 
8472ab5192..9131c9c667 100644 --- a/mindspore/ccsrc/dataset/kernels/data/CMakeLists.txt +++ b/mindspore/ccsrc/dataset/kernels/data/CMakeLists.txt @@ -1,8 +1,14 @@ file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc") set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD) add_library(kernels-data OBJECT - data_utils.cc - one_hot_op.cc - type_cast_op.cc - to_float16_op.cc - ) + data_utils.cc + one_hot_op.cc + pad_end_op.cc + type_cast_op.cc + to_float16_op.cc + fill_op.cc + slice_op.cc + mask_op.cc + concatenate_op.cc + duplicate_op.cc + ) diff --git a/mindspore/ccsrc/dataset/kernels/data/concatenate_op.cc b/mindspore/ccsrc/dataset/kernels/data/concatenate_op.cc new file mode 100644 index 0000000000..87115fd3ce --- /dev/null +++ b/mindspore/ccsrc/dataset/kernels/data/concatenate_op.cc @@ -0,0 +1,55 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "dataset/kernels/data/concatenate_op.h" + +#include "dataset/core/tensor.h" +#include "dataset/kernels/data/data_utils.h" +#include "dataset/kernels/tensor_op.h" + +namespace mindspore { +namespace dataset { + +Status ConcatenateOp::Compute(const TensorRow &input, TensorRow *output) { + IO_CHECK_VECTOR(input, output); + RETURN_IF_NOT_OK(Concatenate(input, output, axis_, prepend_, append_)); + return Status::OK(); +} + +Status ConcatenateOp::OutputShape(const std::vector &inputs, std::vector &outputs) { + RETURN_IF_NOT_OK(TensorOp::OutputShape(inputs, outputs)); + + std::vector inputs_copy; + inputs_copy.push_back(inputs[0].Squeeze()); + + CHECK_FAIL_RETURN_UNEXPECTED(inputs.at(0).Rank() == 1, "Only 1D input tensors supported"); + + outputs.clear(); + dsize_t output_shape = 0; + output_shape = output_shape + inputs.at(0).NumOfElements(); + if (prepend_ != nullptr) { + CHECK_FAIL_RETURN_UNEXPECTED(prepend_->shape().Rank() == 1, "Only 1D prepend tensors supported"); + output_shape = output_shape + prepend_->shape().NumOfElements(); + } + if (append_ != nullptr) { + CHECK_FAIL_RETURN_UNEXPECTED(append_->shape().Rank() == 1, "Only 1D append tensors supported"); + output_shape = output_shape + append_->shape().NumOfElements(); + } + + outputs.emplace_back(std::vector{output_shape}); + return Status::OK(); +} +} // namespace dataset +} // namespace mindspore diff --git a/mindspore/ccsrc/dataset/kernels/data/concatenate_op.h b/mindspore/ccsrc/dataset/kernels/data/concatenate_op.h new file mode 100644 index 0000000000..4e4c7ad4e0 --- /dev/null +++ b/mindspore/ccsrc/dataset/kernels/data/concatenate_op.h @@ -0,0 +1,66 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef DATASET_KERNELS_DATA_CONCATENATE_OP_H_ +#define DATASET_KERNELS_DATA_CONCATENATE_OP_H_ + +#include +#include +#include + +#include "dataset/core/tensor.h" +#include "dataset/kernels/tensor_op.h" + +namespace mindspore { +namespace dataset { + +class ConcatenateOp : public TensorOp { + public: + /// Constructor to ConcatenateOp. + /// @param int8_t axis - axis to concatenate tensors along. + /// @param std::shared_ptr prepend - prepend tensor. + /// @param std::shared_ptr append -append tensor. + explicit ConcatenateOp(int8_t axis, std::shared_ptr prepend, std::shared_ptr append) + : axis_(axis), prepend_(prepend), append_(append) {} + + ~ConcatenateOp() override = default; + + /// Print method to see which tensor Op this is. + /// @param std::ostream &out - output stream object. 
+ void Print(std::ostream &out) const override { out << "ConcatenateOp"; } + + /// Compute method allowing multiple tensors as inputs + /// @param TensorRow &input - input tensor rows + /// @param TensorRow *output - output tensor rows + Status Compute(const TensorRow &input, TensorRow *output) override; + + /// Compute tensor output shape + /// @param std::vector &inputs - vector of input tensor shapes + /// @param std::vector &inputs, std::vector &outputs) override; + + /// Number of inputs the tensor operation accepts + uint32_t NumInput() override { return 0; } + + private: + int8_t axis_; + std::shared_ptr prepend_; + std::shared_ptr append_; +}; +} // namespace dataset +} // namespace mindspore + +#endif // MINDSPORE_CONCATENATE_OP_H diff --git a/mindspore/ccsrc/dataset/kernels/data/data_utils.cc b/mindspore/ccsrc/dataset/kernels/data/data_utils.cc index f2635c1fe3..40eba1edf6 100644 --- a/mindspore/ccsrc/dataset/kernels/data/data_utils.cc +++ b/mindspore/ccsrc/dataset/kernels/data/data_utils.cc @@ -15,12 +15,19 @@ */ #include "dataset/kernels/data/data_utils.h" + +#include +#include +#include #include + #include "dataset/core/constants.h" -#include "dataset/core/tensor.h" -#include "dataset/core/tensor_shape.h" #include "dataset/core/data_type.h" #include "dataset/core/pybind_support.h" +#include "dataset/core/tensor.h" +#include "dataset/core/tensor_shape.h" +#include "dataset/kernels/data/type_cast_op.h" +#include "dataset/util/status.h" namespace mindspore { namespace dataset { @@ -76,6 +83,7 @@ Status OneHotEncodingSigned(const std::shared_ptr &input, std::shared_pt Status OneHotEncoding(std::shared_ptr input, std::shared_ptr *output, dsize_t num_classes) { input->Squeeze(); + if (input->Rank() > 1) { // We expect the input to be int he first dimension RETURN_STATUS_UNEXPECTED("One hot only supports scalars or 1D shape Tensors."); } @@ -104,11 +112,121 @@ Status OneHotEncoding(std::shared_ptr input, std::shared_ptr *ou } } +Status Fill(const 
std::shared_ptr input, std::shared_ptr *output, std::shared_ptr fill_value) { + CHECK_FAIL_RETURN_UNEXPECTED(!((fill_value->type() == DataType::DE_STRING) && (input->type() != DataType::DE_STRING)), + "Types do not match"); + + CHECK_FAIL_RETURN_UNEXPECTED(fill_value->shape() == TensorShape({}), "fill_value is not a scalar"); + + std::shared_ptr out; + + const DataType &to = input->type(); + std::unique_ptr op(new TypeCastOp(to)); + + std::shared_ptr fill_output; + RETURN_IF_NOT_OK(op->Compute(fill_value, &fill_output)); + + RETURN_IF_NOT_OK(Tensor::CreateTensor(&out, TensorImpl::kFlexible, input->shape(), input->type())); + + switch (input->type().value()) { + case DataType::DE_BOOL: { + bool value = 0; + RETURN_IF_NOT_OK(fill_output->GetItemAt(&value, {})); + out->Fill(value); + break; + } + case DataType::DE_INT8: { + int8_t value = 0; + RETURN_IF_NOT_OK(fill_output->GetItemAt(&value, {})); + out->Fill(value); + break; + } + case DataType::DE_UINT8: { + uint8_t value = 0; + RETURN_IF_NOT_OK(fill_output->GetItemAt(&value, {})); + out->Fill(value); + break; + } + case DataType::DE_UINT16: { + uint16_t value = 0; + RETURN_IF_NOT_OK(fill_output->GetItemAt(&value, {})); + out->Fill(value); + break; + } + case DataType::DE_INT16: { + int16_t value = 0; + RETURN_IF_NOT_OK(fill_output->GetItemAt(&value, {})); + out->Fill(value); + break; + } + case DataType::DE_UINT32: { + uint32_t value = 0; + RETURN_IF_NOT_OK(fill_output->GetItemAt(&value, {})); + out->Fill(value); + break; + } + case DataType::DE_INT32: { + int32_t value = 0; + RETURN_IF_NOT_OK(fill_output->GetItemAt(&value, {})); + out->Fill(value); + break; + } + case DataType::DE_UINT64: { + uint64_t value = 0; + RETURN_IF_NOT_OK(fill_output->GetItemAt(&value, {})); + out->Fill(value); + break; + } + case DataType::DE_INT64: { + int64_t value = 0; + RETURN_IF_NOT_OK(fill_output->GetItemAt(&value, {})); + out->Fill(value); + break; + } + case DataType::DE_FLOAT16: { + int64_t value = 0; + 
RETURN_IF_NOT_OK(fill_output->GetItemAt(&value, {})); + out->Fill(value); + break; + } + case DataType::DE_FLOAT32: { + float value = 0; + RETURN_IF_NOT_OK(fill_output->GetItemAt(&value, {})); + out->Fill(value); + break; + } + case DataType::DE_FLOAT64: { + double value = 0; + RETURN_IF_NOT_OK(fill_output->GetItemAt(&value, {})); + out->Fill(value); + break; + } + case DataType::DE_STRING: { + std::vector strings; + std::string_view fill_string_view; + RETURN_IF_NOT_OK(fill_value->GetItemAt(&fill_string_view, {})); + std::string fill_string = std::string(fill_string_view); + for (int i = 0; i < input->shape().NumOfElements(); i++) { + strings.emplace_back(fill_string); + } + RETURN_IF_NOT_OK(Tensor::CreateTensor(&out, strings, input->shape())); + break; + } + case DataType::DE_UNKNOWN: { + RETURN_STATUS_UNEXPECTED("FillOp does not support input of this type."); + break; + } + } + + *output = out; + return Status::OK(); +} template void Cast(const std::shared_ptr &input, std::shared_ptr *output) { auto in_itr = input->begin(); auto out_itr = (*output)->begin(); auto out_end = (*output)->end(); + for (; out_itr != out_end; static_cast(in_itr++), static_cast(out_itr++)) *out_itr = static_cast(*in_itr); } @@ -216,7 +334,314 @@ Status ToFloat16(const std::shared_ptr &input, std::shared_ptr * auto in_itr = input->begin(); auto out_itr = (*output)->begin(); auto out_end = (*output)->end(); - for (; out_itr != out_end; in_itr++, out_itr++) *out_itr = Eigen::half(*in_itr); + + for (; out_itr != out_end; in_itr++, out_itr++) { + float element = *in_itr; + float float16_max = static_cast(std::numeric_limits::max()); + float float16_min = static_cast(std::numeric_limits::lowest()); + if (element > float16_max || element < float16_min) { + RETURN_STATUS_UNEXPECTED("Value " + std::to_string(element) + " is outside of valid float16 range [" + + std::to_string(float16_max) + ", " + std::to_string(float16_min) + "]."); + } + + *out_itr = Eigen::half(*in_itr); + } + + return 
Status::OK(); +} + +Status PadEnd(const std::shared_ptr &src, std::shared_ptr *dst, const std::vector &pad_shape, + const std::shared_ptr &pad_val) { + if (pad_val == nullptr) { + if (src->type().IsNumeric()) { + return PadEndNumeric(src, dst, pad_shape, 0); + } else { + return PadEndString(src, dst, pad_shape, ""); + } + } + CHECK_FAIL_RETURN_UNEXPECTED(src->type().IsNumeric() == pad_val->type().IsNumeric(), + "Source and pad_value tensors are not of the same type."); + if (pad_val->type().IsNumeric()) { + std::shared_ptr float_pad_value; + RETURN_IF_NOT_OK(TypeCast(pad_val, &float_pad_value, DataType(DataType::DE_FLOAT32))); + float val = 0; + RETURN_IF_NOT_OK(float_pad_value->GetItemAt(&val, {})); + return PadEndNumeric(src, dst, pad_shape, val); + } + std::string_view val; + RETURN_IF_NOT_OK(pad_val->GetItemAt(&val, {})); + return PadEndString(src, dst, pad_shape, std::string(val)); +} + +Status PadEndNumeric(const std::shared_ptr &src, std::shared_ptr *dst, + const std::vector &pad_shape, float pad_val) { + CHECK_FAIL_RETURN_UNEXPECTED(src != nullptr && dst != nullptr, "tensor can't be nullptr"); + if (src->Rank() == 0 || src->shape().AsVector() == pad_shape) { + (*dst) = src; // if no padding, copy the pointer + } else { + CHECK_FAIL_RETURN_UNEXPECTED(src->Rank() == pad_shape.size(), "Pad to diff rank not allowed"); + RETURN_IF_NOT_OK(Tensor::CreateTensor(dst, TensorImpl::kFlexible, TensorShape(pad_shape), src->type())); + auto tensor_type = src->type().value(); + if (pad_val == 0) { // if pad with zero, don't care what type it is + RETURN_IF_NOT_OK((*dst)->Zero()); + } else if (tensor_type == DataType::DE_INT8) { + RETURN_IF_NOT_OK((*dst)->Fill(pad_val)); + } else if (tensor_type == DataType::DE_BOOL) { + RETURN_IF_NOT_OK((*dst)->Fill(pad_val)); + } else if (tensor_type == DataType::DE_UINT8) { + RETURN_IF_NOT_OK((*dst)->Fill(pad_val)); + } else if (tensor_type == DataType::DE_INT16) { + RETURN_IF_NOT_OK((*dst)->Fill(pad_val)); + } else if (tensor_type == 
DataType::DE_FLOAT16) { + RETURN_IF_NOT_OK((*dst)->Fill(static_cast(pad_val))); + } else if (tensor_type == DataType::DE_UINT16) { + RETURN_IF_NOT_OK((*dst)->Fill(pad_val)); + } else if (tensor_type == DataType::DE_INT32) { + RETURN_IF_NOT_OK((*dst)->Fill(pad_val)); + } else if (tensor_type == DataType::DE_UINT32) { + RETURN_IF_NOT_OK((*dst)->Fill(pad_val)); + } else if (tensor_type == DataType::DE_INT64) { + RETURN_IF_NOT_OK((*dst)->Fill(pad_val)); + } else if (tensor_type == DataType::DE_UINT64) { + RETURN_IF_NOT_OK((*dst)->Fill(pad_val)); + } else if (tensor_type == DataType::DE_FLOAT32) { + RETURN_IF_NOT_OK((*dst)->Fill(pad_val)); + } else if (tensor_type == DataType::DE_FLOAT64) { + RETURN_IF_NOT_OK((*dst)->Fill(pad_val)); + } else { + RETURN_STATUS_UNEXPECTED("Incorrect/Unknown tensor type"); + } + std::vector cur_ind(src->Rank(), 0); + RETURN_IF_NOT_OK(PadEndNumericHelper(src, *dst, cur_ind, 0)); + } + return Status::OK(); +} +Status PadEndNumericHelper(const std::shared_ptr &src, std::shared_ptr dst, + std::vector cur_ind, size_t cur_dim) { + if (cur_dim == src->Rank() - 1) { // if this is the last dimension, copy the data + dst->CopyLastDimAt(src, cur_ind); + } else { // not the last dimension, keep doing recursion + dsize_t min_ind = std::min(dst->shape()[cur_dim], src->shape()[cur_dim]); + for (dsize_t i = 0; i < min_ind; i++) { + cur_ind[cur_dim] = i; + RETURN_IF_NOT_OK(PadEndNumericHelper(src, dst, cur_ind, cur_dim + 1)); + } + } + return Status::OK(); +} + +Status PadEndString(const std::shared_ptr &src, std::shared_ptr *dst, + const std::vector &pad_shape, const std::string &pad_val) { + CHECK_FAIL_RETURN_UNEXPECTED(src != nullptr && dst != nullptr, "tensor can't be nullptr"); + if (src->Rank() == 0 || src->shape().AsVector() == pad_shape) { + (*dst) = src; // if no padding, copy the pointer + } else { + CHECK_FAIL_RETURN_UNEXPECTED(src->Rank() == pad_shape.size(), "Pad to diff rank not allowed"); + std::vector cur_ind(src->Rank(), 0); + std::vector 
strings; + RETURN_IF_NOT_OK(PadEndStringHelper(src, &strings, TensorShape(pad_shape), cur_ind, 0, pad_val)); + RETURN_IF_NOT_OK(Tensor::CreateTensor(dst, strings, TensorShape(pad_shape))); + } + return Status::OK(); +} + +Status PadEndStringHelper(const std::shared_ptr &src, std::vector *dst, + const TensorShape &dst_shape, std::vector cur_ind, size_t cur_dim, + const std::string &pad_value) { + if (cur_dim == src->Rank() - 1) { // if this is the last dimension, copy the data + dsize_t min_ind = std::min(dst_shape[cur_dim], src->shape()[cur_dim]); + for (dsize_t i = 0; i < min_ind; i++) { + cur_ind[cur_dim] = i; + std::string_view item; + RETURN_IF_NOT_OK(src->GetItemAt(&item, cur_ind)); + dst->emplace_back(item); + } + for (dsize_t i = min_ind; i < dst_shape[cur_dim]; i++) { + dst->emplace_back(pad_value); + } + + } else { // not the last dimension, keep doing recursion + dsize_t min_ind = std::min(dst_shape[cur_dim], src->shape()[cur_dim]); + for (dsize_t i = 0; i < min_ind; i++) { + cur_ind[cur_dim] = i; + RETURN_IF_NOT_OK(PadEndStringHelper(src, dst, dst_shape, cur_ind, cur_dim + 1, pad_value)); + } + dsize_t count = (dst_shape[cur_dim] - min_ind) * dst_shape.Strides()[cur_dim]; + for (dsize_t i = 0; i < count; i++) { + dst->emplace_back(pad_value); + } + } + return Status::OK(); +} + +template +Status MaskHelper(const std::shared_ptr &input, const std::shared_ptr &output, + const std::shared_ptr &value_tensor, RelationalOp op) { + T value; + RETURN_IF_NOT_OK(value_tensor->GetItemAt(&value, {})); + auto in_itr = input->begin(); + auto out_itr = output->begin(); + for (; in_itr != input->end(); in_itr++, out_itr++) { + switch (op) { + case RelationalOp::kEqual: + *out_itr = (*in_itr == value); + break; + case RelationalOp::kNotEqual: + *out_itr = (*in_itr != value); + break; + case RelationalOp::kGreater: + *out_itr = (*in_itr > value); + break; + case RelationalOp::kGreaterEqual: + *out_itr = (*in_itr >= value); + break; + case RelationalOp::kLess: + *out_itr = 
(*in_itr < value); + break; + case RelationalOp::kLessEqual: + *out_itr = (*in_itr <= value); + break; + default: + RETURN_STATUS_UNEXPECTED("Unknown relational operator."); + } + } + return Status::OK(); +} + +Status Mask(const std::shared_ptr &input, std::shared_ptr *output, const std::shared_ptr &value, + RelationalOp op) { + CHECK_FAIL_RETURN_UNEXPECTED(input->type().IsNumeric() == value->type().IsNumeric(), + "Cannot convert constant value to the type of the input tensor."); + CHECK_FAIL_RETURN_UNEXPECTED(value->shape() == TensorShape::CreateScalar(), "Value is not a scalar"); + + RETURN_IF_NOT_OK(Tensor::CreateTensor(output, TensorImpl::kFlexible, input->shape(), DataType(DataType::DE_BOOL))); + + std::unique_ptr value_cast_op(new TypeCastOp(input->type())); + std::shared_ptr casted_value; + if (input->type().IsNumeric()) { + RETURN_IF_NOT_OK(value_cast_op->Compute(value, &casted_value)); + } else { + casted_value = value; + } + + switch (input->type().value()) { + case DataType::DE_BOOL: + RETURN_IF_NOT_OK(MaskHelper(input, *output, casted_value, op)); + break; + case DataType::DE_INT8: + RETURN_IF_NOT_OK(MaskHelper(input, *output, casted_value, op)); + break; + case DataType::DE_UINT8: + RETURN_IF_NOT_OK(MaskHelper(input, *output, casted_value, op)); + break; + case DataType::DE_UINT16: + RETURN_IF_NOT_OK(MaskHelper(input, *output, casted_value, op)); + break; + case DataType::DE_INT16: + RETURN_IF_NOT_OK(MaskHelper(input, *output, casted_value, op)); + break; + case DataType::DE_UINT32: + RETURN_IF_NOT_OK(MaskHelper(input, *output, casted_value, op)); + break; + case DataType::DE_INT32: + RETURN_IF_NOT_OK(MaskHelper(input, *output, casted_value, op)); + break; + case DataType::DE_UINT64: + RETURN_IF_NOT_OK(MaskHelper(input, *output, casted_value, op)); + break; + case DataType::DE_INT64: + RETURN_IF_NOT_OK(MaskHelper(input, *output, casted_value, op)); + break; + case DataType::DE_FLOAT16: + RETURN_IF_NOT_OK(MaskHelper(input, *output, casted_value, op)); + 
break; + case DataType::DE_FLOAT32: + RETURN_IF_NOT_OK(MaskHelper(input, *output, casted_value, op)); + break; + case DataType::DE_FLOAT64: + RETURN_IF_NOT_OK(MaskHelper(input, *output, casted_value, op)); + break; + case DataType::DE_STRING: + RETURN_IF_NOT_OK(MaskHelper(input, *output, casted_value, op)); + break; + case DataType::DE_UNKNOWN: + RETURN_STATUS_UNEXPECTED("Unsupported input type."); + break; + } + return Status::OK(); +} + +Status Concatenate(const TensorRow &input, TensorRow *output, int8_t axis, std::shared_ptr prepend, + std::shared_ptr append) { + CHECK_FAIL_RETURN_UNEXPECTED(input[0]->shape().Rank() == 1, "Only 1D tensors supported"); + CHECK_FAIL_RETURN_UNEXPECTED(axis == 0 || axis == -1, "Only concatenation along the last dimension supported"); + + axis = Tensor::HandleNeg(axis, input[0]->shape().Rank()); + CHECK_FAIL_RETURN_UNEXPECTED(axis == 0, "Only axis=0 is supported"); + + std::shared_ptr out; + if (prepend != nullptr) { + CHECK_FAIL_RETURN_UNEXPECTED(prepend->shape().Rank() == 1, "Only 1D tensors supported"); + RETURN_IF_NOT_OK(ConcatenateHelper(prepend, &out, axis, input[0])); + } else { + out = input[0]; + } + for (dsize_t i = 1; i < input.size(); i++) { + std::shared_ptr out_t; + CHECK_FAIL_RETURN_UNEXPECTED(input[i]->shape().Rank() == 1, "Only 1D tensors supported"); + RETURN_IF_NOT_OK(ConcatenateHelper(out, &out_t, axis, input[i])); + out = out_t; + } + std::shared_ptr out_t; + if (append != nullptr) { + CHECK_FAIL_RETURN_UNEXPECTED(append->shape().Rank() == 1, "Only 1D tensors supported"); + RETURN_IF_NOT_OK(ConcatenateHelper(out, &out_t, axis, append)); + } else { + out_t = out; + } + output->push_back(out_t); + + return Status::OK(); +} + +Status ConcatenateHelper(const std::shared_ptr &input, std::shared_ptr *output, int8_t axis, + std::shared_ptr append) { + CHECK_FAIL_RETURN_UNEXPECTED(input->type() == append->type(), "Tensor types do not match"); + + TensorShape t({}); + + for (dsize_t i = 0; i < input->shape().Rank(); i++) 
{ + if (i != axis) { + t = t.AppendDim(input->shape()[i]); + } else { + dsize_t new_shape = input->shape()[i] + append->shape()[i]; + + t = t.AppendDim(new_shape); + } + } + std::shared_ptr out; + + if (input->type().IsNumeric()) { + RETURN_IF_NOT_OK(Tensor::CreateTensor(&out, TensorImpl::kFlexible, t, input->type())); + + RETURN_IF_NOT_OK(out->Concatenate({0}, input)); + RETURN_IF_NOT_OK(out->Concatenate({input->shape()[0]}, append)); + *output = out; + } else { + std::vector strings; + + auto itr = input->begin(); + for (; itr != input->end(); itr++) { + strings.emplace_back(*itr); + } + itr = append->begin(); + for (; itr != append->end(); itr++) { + strings.emplace_back(*itr); + } + RETURN_IF_NOT_OK(Tensor::CreateTensor(&out, strings, t)); + + *output = out; + } return Status::OK(); } diff --git a/mindspore/ccsrc/dataset/kernels/data/data_utils.h b/mindspore/ccsrc/dataset/kernels/data/data_utils.h index 2bd168a1fe..6034e2a0eb 100644 --- a/mindspore/ccsrc/dataset/kernels/data/data_utils.h +++ b/mindspore/ccsrc/dataset/kernels/data/data_utils.h @@ -17,11 +17,13 @@ #define DATASET_KERNELS_DATA_DATA_UTILS_H_ #include +#include #include #include "dataset/core/constants.h" #include "dataset/core/cv_tensor.h" #include "dataset/core/data_type.h" #include "dataset/core/tensor.h" +#include "dataset/core/tensor_row.h" namespace mindspore { namespace dataset { @@ -42,6 +44,13 @@ Status OneHotEncodingUnsigned(const std::shared_ptr &input, std::shared_ Status OneHotEncodingSigned(const std::shared_ptr &input, std::shared_ptr *output, dsize_t num_classes, int64_t index); +// Returns a tensor of shape input filled with the passed fill_value +// @param input Tensor +// @param output Tensor. The shape and type of the output tensor is same as input +// @param fill_value Tensor. A scalar tensor used to fill the output tensor + +Status Fill(const std::shared_ptr input, std::shared_ptr *output, std::shared_ptr fill_value); + // Returns a type changed input tensor. 
// Example: if input tensor is float64, the output will the specified dataType. See DataTypes.cpp // @param input Tensor @@ -58,6 +67,96 @@ void Cast(const std::shared_ptr &input, std::shared_ptr *output) Status ToFloat16(const std::shared_ptr &input, std::shared_ptr *output); Status TypeCast(const std::shared_ptr &input, std::shared_ptr *output, const DataType &data_type); + +// Pad input tensor according pad_shape, need to have same rank. +// Based on the type of the input tensor, PadEndNumeric/String will be called. +// @param std::shared_ptr src - tensor to pad from +// @param std::shared_ptr *dst - return tensor padded +// @param std::vector pad_shape - shape to pad to +// @param std::shared_ptr pad_val - value to pad with in Tensor format, +// @return - The error code return +Status PadEnd(const std::shared_ptr &src, std::shared_ptr *dst, const std::vector &pad_shape, + const std::shared_ptr &pad_val); + +// Pad input numeric tensor according pad_shape, need to have same rank. +// @param std::shared_ptr src - tensor to pad from +// @param std::shared_ptr *dst - return tensor padded +// @param std::vector pad_shape - shape to pad to +// @param float pad_val - value to pad with +// @return - The error code return +Status PadEndNumeric(const std::shared_ptr &src, std::shared_ptr *dst, + const std::vector &pad_shape, float pad_val); + +// recursive helper function for padding numric tensors. This function could be very expensive if called on a +// multi-dimensional tensor it is only meant to be called by PadEndNumeric. 
+// @tparam T - type of tensor and fill value +// @param std::shared_ptr src - Tensor to pad from +// @param std::shared_ptr* dst - Tensor to pad to, return value +// @param std::vector cur_ind - recursion helper +// @param T pad_val - value to pad tensor with +// @param size_t cur_dim - recursion helper +// @return Status - The error code return +Status PadEndNumericHelper(const std::shared_ptr &src, std::shared_ptr dst, + std::vector cur_ind, size_t cur_dim = 0); + +// Pad input string tensor according pad_shape, need to have same rank. +// @param std::shared_ptr src - tensor to pad from +// @param std::shared_ptr *dst - return tensor padded +// @param std::vector pad_shape - shape to pad to +// @param std::string pad_val - value to pad with +// @return - The error code return +Status PadEndString(const std::shared_ptr &src, std::shared_ptr *dst, + const std::vector &pad_shape, const std::string &pad_val); + +// recursive helper function for padding string tensors. This function could be very expensive if called on a +// multi-dimensional tensor it is only meant to be called by PadEndString. 
+// @tparam T - type of tensor and fill value +// @param std::shared_ptr src - Tensor to pad from +// @param std::shared_ptr* dst - Tensor to pad to, return value +// @param std::vector cur_ind - recursion helperas text +// @param std::string pad_val - value to pad tensor with +// @param size_t cur_dim - recursion helper +// @return Status - The error code return +Status PadEndStringHelper(const std::shared_ptr &src, std::vector *dst, + const TensorShape &dst_shape, std::vector cur_ind, size_t cur_dim, + const std::string &pad_value); + +enum class RelationalOp { + kEqual = 0, // == + kNotEqual, // != + kLess, // < + kLessEqual, // <= + kGreater, // > + kGreaterEqual, // >= +}; + +/// Helper method that masks the input tensor +/// @tparam T type of the tensor +/// @param input[in] input tensor +/// @param output[out] output tensor +/// @param value_tensor[in] scalar tensor value to compared with +/// @param op[in] RelationalOp enum +/// @return Status ok/error +template +Status MaskHelper(const std::shared_ptr &input, const std::shared_ptr &output, + const std::shared_ptr &value_tensor, RelationalOp op); + +/// Mask the input tensor +/// @param input[in] input tensor +/// @param output[out] output tensor +/// @param value[in] scalar tensor value to compared with +/// @param op[in] RelationalOp enum +/// @return Status ok/error +Status Mask(const std::shared_ptr &input, std::shared_ptr *output, const std::shared_ptr &value, + RelationalOp op); + +Status Concatenate(const TensorRow &input, TensorRow *output, int8_t axis, std::shared_ptr prepend, + std::shared_ptr append); + +// helper for concat, always append to the input, and pass that to the output +Status ConcatenateHelper(const std::shared_ptr &input, std::shared_ptr *output, int8_t axis, + std::shared_ptr append); + } // namespace dataset } // namespace mindspore diff --git a/mindspore/ccsrc/dataset/kernels/data/duplicate_op.cc b/mindspore/ccsrc/dataset/kernels/data/duplicate_op.cc new file mode 100644 index 
0000000000..959516a4aa --- /dev/null +++ b/mindspore/ccsrc/dataset/kernels/data/duplicate_op.cc @@ -0,0 +1,35 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dataset/kernels/data/duplicate_op.h" + +#include "dataset/core/tensor.h" +#include "dataset/kernels/tensor_op.h" + +namespace mindspore { +namespace dataset { + +Status DuplicateOp::Compute(const TensorRow &input, TensorRow *output) { + IO_CHECK_VECTOR(input, output); + CHECK_FAIL_RETURN_UNEXPECTED(input.size() == 1, "Input should be one tensor"); + std::shared_ptr out; + RETURN_IF_NOT_OK(Tensor::CreateTensor(&out, input[0])); + output->push_back(input[0]); + output->push_back(out); + return Status::OK(); +} +} // namespace dataset +} // namespace mindspore diff --git a/mindspore/ccsrc/dataset/kernels/data/duplicate_op.h b/mindspore/ccsrc/dataset/kernels/data/duplicate_op.h new file mode 100644 index 0000000000..4c9d6d36c9 --- /dev/null +++ b/mindspore/ccsrc/dataset/kernels/data/duplicate_op.h @@ -0,0 +1,42 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef DATASET_KERNELS_DATA_DUPLICATE_OP_H_ +#define DATASET_KERNELS_DATA_DUPLICATE_OP_H_ + +#include +#include + +#include "dataset/core/tensor.h" +#include "dataset/kernels/tensor_op.h" + +namespace mindspore { +namespace dataset { + +class DuplicateOp : public TensorOp { + public: + DuplicateOp() = default; + + ~DuplicateOp() override = default; + + void Print(std::ostream &out) const override { out << "DuplicateOp"; } + + Status Compute(const TensorRow &input, TensorRow *output) override; + + uint32_t NumOutput() override { return 2; } +}; +} // namespace dataset +} // namespace mindspore +#endif // DATASET_KERNELS_DUPLICATE_OP_H_ diff --git a/mindspore/ccsrc/dataset/kernels/data/fill_op.cc b/mindspore/ccsrc/dataset/kernels/data/fill_op.cc new file mode 100644 index 0000000000..63895d3a95 --- /dev/null +++ b/mindspore/ccsrc/dataset/kernels/data/fill_op.cc @@ -0,0 +1,30 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "dataset/kernels/data/fill_op.h" + +#include "dataset/core/tensor.h" +#include "dataset/kernels/data/data_utils.h" +#include "dataset/kernels/tensor_op.h" + +namespace mindspore { +namespace dataset { +Status FillOp::Compute(const std::shared_ptr &input, std::shared_ptr *output) { + IO_CHECK(input, output); + Status s = Fill(input, output, fill_value_); + return s; +} +} // namespace dataset +} // namespace mindspore diff --git a/mindspore/ccsrc/dataset/kernels/data/fill_op.h b/mindspore/ccsrc/dataset/kernels/data/fill_op.h new file mode 100644 index 0000000000..03f59f3e67 --- /dev/null +++ b/mindspore/ccsrc/dataset/kernels/data/fill_op.h @@ -0,0 +1,44 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef DATASET_KERNELS_DATA_FILL_OP_H_ +#define DATASET_KERNELS_DATA_FILL_OP_H_ + +#include +#include +#include + +#include "dataset/core/tensor.h" +#include "dataset/kernels/tensor_op.h" + +namespace mindspore { +namespace dataset { +class FillOp : public TensorOp { + public: + explicit FillOp(std::shared_ptr value) : fill_value_(value) {} + + ~FillOp() override = default; + void Print(std::ostream &out) const override { out << "FillOp"; } + + Status Compute(const std::shared_ptr &input, std::shared_ptr *output) override; + + private: + std::shared_ptr fill_value_; +}; +} // namespace dataset +} // namespace mindspore + +#endif // MINDSPORE_FILL_OP_H diff --git a/mindspore/ccsrc/dataset/kernels/data/mask_op.cc b/mindspore/ccsrc/dataset/kernels/data/mask_op.cc new file mode 100644 index 0000000000..2cfeb7e36f --- /dev/null +++ b/mindspore/ccsrc/dataset/kernels/data/mask_op.cc @@ -0,0 +1,49 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dataset/kernels/data/mask_op.h" + +#include "dataset/core/tensor.h" +#include "dataset/kernels/tensor_op.h" + +namespace mindspore { +namespace dataset { + +Status MaskOp::Compute(const std::shared_ptr &input, std::shared_ptr *output) { + IO_CHECK(input, output); + std::shared_ptr temp_output; + CHECK_FAIL_RETURN_UNEXPECTED(type_.IsNumeric(), "Cannot generate a string mask. 
Type should be numeric."); + + RETURN_IF_NOT_OK(Mask(input, &temp_output, value_, op_)); + + // cast the output to the the required type. Skip casting if type_ is bool. + if (type_ != DataType::DE_BOOL) { + RETURN_IF_NOT_OK(cast_->Compute(temp_output, output)); + } else { + *output = std::move(temp_output); + } + + return Status::OK(); +} + +Status MaskOp::OutputType(const std::vector &inputs, std::vector &outputs) { + RETURN_IF_NOT_OK(TensorOp::OutputType(inputs, outputs)); + outputs[0] = type_; + return Status::OK(); +} + +} // namespace dataset +} // namespace mindspore diff --git a/mindspore/ccsrc/dataset/kernels/data/mask_op.h b/mindspore/ccsrc/dataset/kernels/data/mask_op.h new file mode 100644 index 0000000000..0affe543bb --- /dev/null +++ b/mindspore/ccsrc/dataset/kernels/data/mask_op.h @@ -0,0 +1,54 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef DATASET_KERNELS_DATA_MASK_OP_H_ +#define DATASET_KERNELS_DATA_MASK_OP_H_ + +#include +#include +#include +#include +#include + +#include "dataset/core/tensor.h" +#include "dataset/kernels/tensor_op.h" +#include "dataset/kernels/data/type_cast_op.h" +#include "dataset/kernels/data/data_utils.h" + +namespace mindspore { +namespace dataset { + +class MaskOp : public TensorOp { + public: + MaskOp(RelationalOp op, std::shared_ptr value, DataType type = DataType(DataType::DE_BOOL)) + : op_(op), value_(std::move(value)), type_(type), cast_(new TypeCastOp(type)) {} + + ~MaskOp() override = default; + + void Print(std::ostream &out) const override { out << "MaskOp"; } + + Status Compute(const std::shared_ptr &input, std::shared_ptr *output) override; + + Status OutputType(const std::vector &inputs, std::vector &outputs) override; + + private: + RelationalOp op_; + std::shared_ptr value_; + DataType type_; + std::unique_ptr cast_; +}; +} // namespace dataset +} // namespace mindspore +#endif // DATASET_KERNELS_DATA_MASK_OP_H_ diff --git a/mindspore/ccsrc/dataset/kernels/data/pad_end_op.cc b/mindspore/ccsrc/dataset/kernels/data/pad_end_op.cc new file mode 100644 index 0000000000..5b3b4cbe16 --- /dev/null +++ b/mindspore/ccsrc/dataset/kernels/data/pad_end_op.cc @@ -0,0 +1,40 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "dataset/kernels/data/pad_end_op.h" + +#include "dataset/core/tensor.h" +#include "dataset/kernels/data/data_utils.h" +#include "dataset/kernels/tensor_op.h" + +namespace mindspore { +namespace dataset { +Status PadEndOp::Compute(const std::shared_ptr &input, std::shared_ptr *output) { + IO_CHECK(input, output); + Status s = PadEnd(input, output, output_shape_.AsVector(), pad_val_); + return s; +} + +Status PadEndOp::OutputShape(const std::vector &inputs, std::vector &outputs) { + RETURN_IF_NOT_OK(TensorOp::OutputShape(inputs, outputs)); + outputs.clear(); + for (auto s : inputs) { + outputs.emplace_back(TensorShape(output_shape_.AsVector())); + } + CHECK_FAIL_RETURN_UNEXPECTED(!outputs.empty(), "Input has a wrong shape"); + return Status::OK(); +} +} // namespace dataset +} // namespace mindspore diff --git a/mindspore/ccsrc/dataset/kernels/data/pad_end_op.h b/mindspore/ccsrc/dataset/kernels/data/pad_end_op.h new file mode 100644 index 0000000000..c6bc0c430e --- /dev/null +++ b/mindspore/ccsrc/dataset/kernels/data/pad_end_op.h @@ -0,0 +1,47 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef DATASET_KERNELS_DATA_PAD_END_OP_H_ +#define DATASET_KERNELS_DATA_PAD_END_OP_H_ + +#include +#include +#include + +#include "dataset/core/tensor.h" +#include "dataset/kernels/tensor_op.h" + +namespace mindspore { +namespace dataset { +class PadEndOp : public TensorOp { + public: + explicit PadEndOp(const TensorShape &pad_shape, const std::shared_ptr &pad_value) + : output_shape_(pad_shape), pad_val_(pad_value) {} + + ~PadEndOp() override = default; + + void Print(std::ostream &out) const override { out << "PadEndOp"; } + + Status Compute(const std::shared_ptr &input, std::shared_ptr *output) override; + + Status OutputShape(const std::vector &inputs, std::vector &outputs) override; + + private: + TensorShape output_shape_; + std::shared_ptr pad_val_; +}; +} // namespace dataset +} // namespace mindspore +#endif // DATASET_KERNELS_DATA_PAD_END_OP_H_ diff --git a/mindspore/ccsrc/dataset/kernels/data/slice_op.cc b/mindspore/ccsrc/dataset/kernels/data/slice_op.cc new file mode 100644 index 0000000000..2eebf26e84 --- /dev/null +++ b/mindspore/ccsrc/dataset/kernels/data/slice_op.cc @@ -0,0 +1,47 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "dataset/kernels/data/slice_op.h" + +#include "dataset/core/tensor.h" +#include "dataset/kernels/tensor_op.h" + +namespace mindspore { +namespace dataset { +Status SliceOp::Compute(const std::shared_ptr &input, std::shared_ptr *output) { + IO_CHECK(input, output); + CHECK_FAIL_RETURN_UNEXPECTED(input->shape().Rank() == 1, "SliceOp supports 1D Tensors only for now."); + + // if `all` flag is true, output is just the input. + if (all_) { + *output = input; + return Status::OK(); + } + + // if slice object was provided, indices should be empty. Generate indices from the slice object. + if (slice_.valid() && indices_.empty()) { + dsize_t len = input->shape()[0]; + std::vector indices = slice_.Indices(len); + return input->Slice(output, indices); + } + + // if indices are not empty, slices should be invalid, use indices_ to slice + if (!indices_.empty() && !slice_.valid()) { + return input->Slice(output, indices_); + } + RETURN_STATUS_UNEXPECTED("The indexing parameters are invalid"); +} +} // namespace dataset +} // namespace mindspore diff --git a/mindspore/ccsrc/dataset/kernels/data/slice_op.h b/mindspore/ccsrc/dataset/kernels/data/slice_op.h new file mode 100644 index 0000000000..0a24ae171e --- /dev/null +++ b/mindspore/ccsrc/dataset/kernels/data/slice_op.h @@ -0,0 +1,85 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef DATASET_KERNELS_DATA_SLICE_OP_H_ +#define DATASET_KERNELS_DATA_SLICE_OP_H_ + +#include +#include +#include +#include +#include + +#include "dataset/core/tensor.h" +#include "dataset/kernels/tensor_op.h" + +namespace mindspore { +namespace dataset { +class Slice { + public: + Slice() : start_(0), stop_(0), step_(0) {} + Slice(dsize_t start, dsize_t stop, dsize_t step) : start_(start), stop_(stop), step_(step) {} + Slice(dsize_t start, dsize_t stop) : start_(start), stop_(stop), step_(1) {} + explicit Slice(dsize_t stop) : start_(0), stop_(stop), step_(1) {} + + ~Slice() = default; + + std::vector Indices(dsize_t length) { + std::vector indices; + dsize_t index = std::min(Tensor::HandleNeg(start_, length), length); + dsize_t end_index = std::min(Tensor::HandleNeg(stop_, length), length); + if (step_ > 0) { + for (; index < end_index; index += step_) { + indices.push_back(index); + } + } else { + for (; index > end_index; index += step_) { + indices.push_back(index); + } + } + return indices; + } + + bool valid() { return !(start_ == 0 && stop_ == 0 && step_ == 0); } + + dsize_t start_; + dsize_t stop_; + dsize_t step_; +}; + +class SliceOp : public TensorOp { + public: + explicit SliceOp(std::vector indices) : indices_(std::move(indices)) {} + explicit SliceOp(Slice slice) : slice_(slice) {} + explicit SliceOp(bool all) : all_(all) {} + + ~SliceOp() override = default; + + void Print(std::ostream &out) const override { out << "SliceOp"; } + + Status Compute(const std::shared_ptr &input, std::shared_ptr *output) override; + + private: + // only on of the following will be valid + // given indices to slice the Tensor. Empty vector if invalid. + std::vector indices_; + // Slice object. All start, stop and step are 0 if invalid. + Slice slice_; + // Flag to read all indcies in the dim. 
+ bool all_ = false; +}; +} // namespace dataset +} // namespace mindspore +#endif // DATASET_KERNELS_DATA_SLICE_OP_H_ diff --git a/mindspore/ccsrc/dataset/kernels/image/CMakeLists.txt b/mindspore/ccsrc/dataset/kernels/image/CMakeLists.txt index 583a732f7d..3d88d9989c 100644 --- a/mindspore/ccsrc/dataset/kernels/image/CMakeLists.txt +++ b/mindspore/ccsrc/dataset/kernels/image/CMakeLists.txt @@ -10,12 +10,17 @@ add_library(kernels-image OBJECT pad_op.cc random_color_adjust_op.cc random_crop_decode_resize_op.cc + random_crop_and_resize_with_bbox_op.cc random_crop_and_resize_op.cc random_crop_op.cc + random_crop_with_bbox_op.cc random_horizontal_flip_op.cc + random_horizontal_flip_bbox_op.cc + bounding_box_augment_op.cc random_resize_op.cc random_rotation_op.cc random_vertical_flip_op.cc + random_vertical_flip_with_bbox_op.cc rescale_op.cc resize_bilinear_op.cc resize_op.cc diff --git a/mindspore/ccsrc/dataset/kernels/image/bounding_box_augment_op.cc b/mindspore/ccsrc/dataset/kernels/image/bounding_box_augment_op.cc new file mode 100644 index 0000000000..04e00d878d --- /dev/null +++ b/mindspore/ccsrc/dataset/kernels/image/bounding_box_augment_op.cc @@ -0,0 +1,78 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include "dataset/kernels/image/bounding_box_augment_op.h" +#include "dataset/kernels/image/resize_op.h" +#include "dataset/kernels/image/image_utils.h" +#include "dataset/core/cv_tensor.h" + +namespace mindspore { +namespace dataset { +const float BoundingBoxAugmentOp::kDefRatio = 0.3; + +BoundingBoxAugmentOp::BoundingBoxAugmentOp(std::shared_ptr transform, float ratio) + : ratio_(ratio), transform_(std::move(transform)) { + rnd_.seed(GetSeed()); +} + +Status BoundingBoxAugmentOp::Compute(const TensorRow &input, TensorRow *output) { + IO_CHECK_VECTOR(input, output); + BOUNDING_BOX_CHECK(input); // check if bounding boxes are valid + uint32_t num_of_boxes = input[1]->shape()[0]; + uint32_t num_to_aug = num_of_boxes * ratio_; // cast to int + std::vector boxes(num_of_boxes); + std::vector selected_boxes; + for (uint32_t i = 0; i < num_of_boxes; i++) boxes[i] = i; + // sample bboxes according to ratio picked by user + std::sample(boxes.begin(), boxes.end(), std::back_inserter(selected_boxes), num_to_aug, rnd_); + std::shared_ptr crop_out; + std::shared_ptr res_out; + std::shared_ptr input_restore = CVTensor::AsCVTensor(input[0]); + + for (uint32_t i = 0; i < num_to_aug; i++) { + uint32_t min_x = 0; + uint32_t min_y = 0; + uint32_t b_w = 0; + uint32_t b_h = 0; + // get the required items + input[1]->GetItemAt(&min_x, {selected_boxes[i], 0}); + input[1]->GetItemAt(&min_y, {selected_boxes[i], 1}); + input[1]->GetItemAt(&b_w, {selected_boxes[i], 2}); + input[1]->GetItemAt(&b_h, {selected_boxes[i], 3}); + Crop(input_restore, &crop_out, min_x, min_y, b_w, b_h); + // transform the cropped bbox region + transform_->Compute(crop_out, &res_out); + // place the transformed region back in the restored input + std::shared_ptr res_img = CVTensor::AsCVTensor(res_out); + // check if transformed crop is out of bounds of the box + if (res_img->mat().cols > b_w || res_img->mat().rows > b_h || res_img->mat().cols < b_w || + res_img->mat().rows < b_h) { + // 
if so, resize to fit in the box + std::shared_ptr resize_op = std::make_shared(b_h, b_w); + resize_op->Compute(std::static_pointer_cast(res_img), &res_out); + res_img = CVTensor::AsCVTensor(res_out); + } + res_img->mat().copyTo(input_restore->mat()(cv::Rect(min_x, min_y, res_img->mat().cols, res_img->mat().rows))); + } + (*output).push_back(std::move(std::static_pointer_cast(input_restore))); + (*output).push_back(input[1]); + return Status::OK(); +} + +} // namespace dataset +} // namespace mindspore diff --git a/mindspore/ccsrc/dataset/kernels/image/bounding_box_augment_op.h b/mindspore/ccsrc/dataset/kernels/image/bounding_box_augment_op.h new file mode 100644 index 0000000000..6c106f75dc --- /dev/null +++ b/mindspore/ccsrc/dataset/kernels/image/bounding_box_augment_op.h @@ -0,0 +1,61 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef DATASET_KERNELS_IMAGE_BOUNDING_BOX_AUGMENT_OP_H_ +#define DATASET_KERNELS_IMAGE_BOUNDING_BOX_AUGMENT_OP_H_ + +#include +#include +#include +#include +#include "dataset/core/tensor.h" +#include "dataset/kernels/tensor_op.h" +#include "dataset/util/status.h" +#include "dataset/util/random.h" + +namespace mindspore { +namespace dataset { +class BoundingBoxAugmentOp : public TensorOp { + public: + // Default values, also used by python_bindings.cc + static const float kDefRatio; + + // Constructor for BoundingBoxAugmentOp + // @param std::shared_ptr transform transform: C++ opration to apply on select bounding boxes + // @param float ratio: ratio of bounding boxes to have the transform applied on + BoundingBoxAugmentOp(std::shared_ptr transform, float ratio); + + ~BoundingBoxAugmentOp() override = default; + + // Provide stream operator for displaying it + friend std::ostream &operator<<(std::ostream &out, const BoundingBoxAugmentOp &so) { + so.Print(out); + return out; + } + + void Print(std::ostream &out) const override { out << "BoundingBoxAugmentOp"; } + + Status Compute(const TensorRow &input, TensorRow *output) override; + + private: + float ratio_; + std::mt19937 rnd_; + std::shared_ptr transform_; +}; +} // namespace dataset +} // namespace mindspore + +#endif // DATASET_KERNELS_IMAGE_BOUNDING_BOX_AUGMENT_OP_H_ diff --git a/mindspore/ccsrc/dataset/kernels/image/image_utils.cc b/mindspore/ccsrc/dataset/kernels/image/image_utils.cc index bf470173d9..bb88f991a4 100644 --- a/mindspore/ccsrc/dataset/kernels/image/image_utils.cc +++ b/mindspore/ccsrc/dataset/kernels/image/image_utils.cc @@ -16,6 +16,7 @@ #include "dataset/kernels/image/image_utils.h" #include #include +#include #include #include #include @@ -119,17 +120,14 @@ Status Resize(const std::shared_ptr &input, std::shared_ptr *out } } -bool HasJpegMagic(const unsigned char *data, size_t data_size) { +bool HasJpegMagic(const std::shared_ptr &input) { const unsigned char *kJpegMagic = (unsigned 
char *)"\xFF\xD8\xFF"; constexpr size_t kJpegMagicLen = 3; - return data_size >= kJpegMagicLen && memcmp(data, kJpegMagic, kJpegMagicLen) == 0; + return input->SizeInBytes() >= kJpegMagicLen && memcmp(input->GetBuffer(), kJpegMagic, kJpegMagicLen) == 0; } Status Decode(const std::shared_ptr &input, std::shared_ptr *output) { - if (input->GetMutableBuffer() == nullptr) { - RETURN_STATUS_UNEXPECTED("Tensor is nullptr"); - } - if (HasJpegMagic(input->GetMutableBuffer(), input->SizeInBytes())) { + if (HasJpegMagic(input)) { return JpegCropAndDecode(input, output); } else { return DecodeCv(input, output); @@ -283,7 +281,7 @@ Status JpegCropAndDecode(const std::shared_ptr &input, std::shared_ptrGetMutableBuffer(), input->SizeInBytes()); + JpegSetSource(&cinfo, input->GetBuffer(), input->SizeInBytes()); (void)jpeg_read_header(&cinfo, TRUE); RETURN_IF_NOT_OK(JpegSetColorSpace(&cinfo)); jpeg_calc_output_dimensions(&cinfo); @@ -312,7 +310,7 @@ Status JpegCropAndDecode(const std::shared_ptr &input, std::shared_ptr(ts, DataType(DataType::DE_UINT8)); const int buffer_size = output_tensor->SizeInBytes(); - JSAMPLE *buffer = static_cast(output_tensor->GetMutableBuffer()); + JSAMPLE *buffer = static_cast(reinterpret_cast(&(*output_tensor->begin()))); const int max_scanlines_to_read = skipped_scanlines + crop_h; // stride refers to output tensor, which has 3 components at most const int stride = crop_w * kOutNumComponents; @@ -376,8 +374,9 @@ Status HwcToChw(std::shared_ptr input, std::shared_ptr *output) *output = input; return Status::OK(); } - if (input_cv->shape().Size() != 3 && input_cv->shape()[2] != 3) { - RETURN_STATUS_UNEXPECTED("The shape is incorrect: number of channels is not equal 3"); + if (input_cv->shape().Size() < 2 || input_cv->shape().Size() > 3 || + (input_cv->shape().Size() == 3 && input_cv->shape()[2] != 3 && input_cv->shape()[2] != 1)) { + RETURN_STATUS_UNEXPECTED("The shape is incorrect: number of channels does not equal 3 nor 1"); } cv::Mat output_img; @@ 
-401,8 +400,8 @@ Status HwcToChw(std::shared_ptr input, std::shared_ptr *output) Status SwapRedAndBlue(std::shared_ptr input, std::shared_ptr *output) { try { std::shared_ptr input_cv = CVTensor::AsCVTensor(std::move(input)); - if (input_cv->shape().Size() != 3 && input_cv->shape()[2] != 3) { - RETURN_STATUS_UNEXPECTED("The shape is incorrect: number of channels is not equal 3"); + if (input_cv->shape().Size() != 3 || input_cv->shape()[2] != 3) { + RETURN_STATUS_UNEXPECTED("The shape is incorrect: number of channels does not equal 3"); } auto output_cv = std::make_shared(input_cv->shape(), input_cv->type()); RETURN_UNEXPECTED_IF_NULL(output_cv); @@ -422,7 +421,7 @@ Status CropAndResize(const std::shared_ptr &input, std::shared_ptrRank() != 3 && input_cv->Rank() != 2) { - RETURN_STATUS_UNEXPECTED("Ishape not or "); + RETURN_STATUS_UNEXPECTED("Shape not or "); } // image too large or too small if (crop_height == 0 || crop_width == 0 || target_height == 0 || target_height > crop_height * 1000 || @@ -541,8 +540,8 @@ Status AdjustBrightness(const std::shared_ptr &input, std::shared_ptrmat().data) { RETURN_STATUS_UNEXPECTED("Could not convert to CV Tensor"); } - if (input_cv->Rank() != 3 && input_cv->shape()[2] != 3) { - RETURN_STATUS_UNEXPECTED("Shape not or "); + if (input_cv->Rank() != 3 || input_cv->shape()[2] != 3) { + RETURN_STATUS_UNEXPECTED("The shape is incorrect: number of channels does not equal 3"); } auto output_cv = std::make_shared(input_cv->shape(), input_cv->type()); RETURN_UNEXPECTED_IF_NULL(output_cv); @@ -561,8 +560,8 @@ Status AdjustContrast(const std::shared_ptr &input, std::shared_ptrmat().data) { RETURN_STATUS_UNEXPECTED("Could not convert to CV Tensor"); } - if (input_cv->Rank() != 3 && input_cv->shape()[2] != 3) { - RETURN_STATUS_UNEXPECTED("Shape not or "); + if (input_cv->Rank() != 3 || input_cv->shape()[2] != 3) { + RETURN_STATUS_UNEXPECTED("The shape is incorrect: number of channels does not equal 3"); } cv::Mat gray, output_img; 
cv::cvtColor(input_img, gray, CV_RGB2GRAY); @@ -587,8 +586,8 @@ Status AdjustSaturation(const std::shared_ptr &input, std::shared_ptrmat().data) { RETURN_STATUS_UNEXPECTED("Could not convert to CV Tensor"); } - if (input_cv->Rank() != 3 && input_cv->shape()[2] != 3) { - RETURN_STATUS_UNEXPECTED("Shape not or "); + if (input_cv->Rank() != 3 || input_cv->shape()[2] != 3) { + RETURN_STATUS_UNEXPECTED("The shape is incorrect: number of channels does not equal 3"); } auto output_cv = std::make_shared(input_cv->shape(), input_cv->type()); RETURN_UNEXPECTED_IF_NULL(output_cv); @@ -615,8 +614,8 @@ Status AdjustHue(const std::shared_ptr &input, std::shared_ptr * if (!input_cv->mat().data) { RETURN_STATUS_UNEXPECTED("Could not convert to CV Tensor"); } - if (input_cv->Rank() != 3 && input_cv->shape()[2] != 3) { - RETURN_STATUS_UNEXPECTED("Shape not or "); + if (input_cv->Rank() != 3 || input_cv->shape()[2] != 3) { + RETURN_STATUS_UNEXPECTED("The shape is incorrect: number of channels does not equal 3"); } auto output_cv = std::make_shared(input_cv->shape(), input_cv->type()); RETURN_UNEXPECTED_IF_NULL(output_cv); @@ -644,7 +643,7 @@ Status Erase(const std::shared_ptr &input, std::shared_ptr *outp uint8_t fill_g, uint8_t fill_b) { try { std::shared_ptr input_cv = CVTensor::AsCVTensor(input); - if (input_cv->mat().data == nullptr || (input_cv->Rank() != 3 && input_cv->shape()[2] != 3)) { + if (input_cv->mat().data == nullptr || input_cv->Rank() != 3 || input_cv->shape()[2] != 3) { RETURN_STATUS_UNEXPECTED("bad CV Tensor input for erase"); } cv::Mat input_img = input_cv->mat(); @@ -726,5 +725,101 @@ Status Pad(const std::shared_ptr &input, std::shared_ptr *output RETURN_STATUS_UNEXPECTED("Unexpected error in pad"); } } +// -------- BBOX OPERATIONS -------- // +Status UpdateBBoxesForCrop(std::shared_ptr *bboxList, size_t *bboxCount, int CB_Xmin, int CB_Ymin, int CB_Xmax, + int CB_Ymax) { + // PASS LIST, COUNT OF BOUNDING BOXES + // Also PAss X/Y Min/Max of image cropped region - 
normally obtained from 'GetCropBox' functions + uint32_t bb_Xmin_t, bb_Ymin_t, bb_Xmax_t, bb_Ymax_t; + + std::vector correct_ind; + std::vector copyVals; + dsize_t bboxDim = (*bboxList)->shape()[1]; + bool retFlag = false; // true unless overlap found + for (int i = 0; i < *bboxCount; i++) { + int bb_Xmin, bb_Xmax, bb_Ymin, bb_Ymax; + RETURN_IF_NOT_OK((*bboxList)->GetUnsignedIntAt(&bb_Xmin_t, {i, 0})); + RETURN_IF_NOT_OK((*bboxList)->GetUnsignedIntAt(&bb_Ymin_t, {i, 1})); + RETURN_IF_NOT_OK((*bboxList)->GetUnsignedIntAt(&bb_Xmax_t, {i, 2})); + RETURN_IF_NOT_OK((*bboxList)->GetUnsignedIntAt(&bb_Ymax_t, {i, 3})); + bb_Xmin = bb_Xmin_t; + bb_Ymin = bb_Ymin_t; + bb_Xmax = bb_Xmax_t; + bb_Ymax = bb_Ymax_t; + bb_Xmax = bb_Xmin + bb_Xmax; + bb_Ymax = bb_Ymin + bb_Ymax; + // check for image / BB overlap + if (((bb_Xmin > CB_Xmax) || (bb_Ymin > CB_Ymax)) || ((bb_Xmax < CB_Xmin) || (bb_Ymax < CB_Ymin))) { + continue; // no overlap found + } + // Update this bbox and select it to move to the final output tensor + correct_ind.push_back(i); + // adjust BBox corners by bringing into new CropBox if beyond + // Also reseting/adjusting for boxes to lie within CropBox instead of Image - subtract CropBox Xmin/YMin + bb_Xmin = bb_Xmin - (std::min(0, (bb_Xmin - CB_Xmin)) + CB_Xmin); + bb_Xmax = bb_Xmax - (std::max(0, (bb_Xmax - CB_Xmax)) + CB_Xmin); + bb_Ymin = bb_Ymin - (std::min(0, (bb_Ymin - CB_Ymin)) + CB_Ymin); + bb_Ymax = bb_Ymax - (std::max(0, (bb_Ymax - CB_Ymax)) + CB_Ymin); + // reset min values and calculate width/height from Box corners + RETURN_IF_NOT_OK((*bboxList)->SetItemAt({i, 0}, static_cast(bb_Xmin))); + RETURN_IF_NOT_OK((*bboxList)->SetItemAt({i, 1}, static_cast(bb_Ymin))); + RETURN_IF_NOT_OK((*bboxList)->SetItemAt({i, 2}, static_cast(bb_Xmax - bb_Xmin))); + RETURN_IF_NOT_OK((*bboxList)->SetItemAt({i, 3}, static_cast(bb_Ymax - bb_Ymin))); + } + // create new tensor and copy over bboxes still valid to the image + // bboxes outside of new cropped region are ignored - 
empty tensor returned in case of none + *bboxCount = correct_ind.size(); + uint32_t temp; + for (auto slice : correct_ind) { // for every index in the loop + for (int ix = 0; ix < bboxDim; ix++) { + RETURN_IF_NOT_OK((*bboxList)->GetUnsignedIntAt(&temp, {slice, ix})); + copyVals.push_back(temp); + } + } + std::shared_ptr retV; + RETURN_IF_NOT_OK(Tensor::CreateTensor(&retV, copyVals, TensorShape({static_cast(*bboxCount), bboxDim}))); + (*bboxList) = retV; // reset pointer + return Status::OK(); +} + +Status PadBBoxes(std::shared_ptr *bboxList, const size_t &bboxCount, int32_t pad_top, int32_t pad_left) { + for (int i = 0; i < bboxCount; i++) { + uint32_t xMin, yMin; + RETURN_IF_NOT_OK((*bboxList)->GetUnsignedIntAt(&xMin, {i, 0})); + RETURN_IF_NOT_OK((*bboxList)->GetUnsignedIntAt(&yMin, {i, 1})); + xMin += static_cast(pad_left); // should not be negative + yMin += static_cast(pad_top); + RETURN_IF_NOT_OK((*bboxList)->SetItemAt({i, 0}, xMin)); + RETURN_IF_NOT_OK((*bboxList)->SetItemAt({i, 1}, yMin)); + } + return Status::OK(); +} + +Status UpdateBBoxesForResize(const std::shared_ptr &bboxList, const size_t &bboxCount, int32_t target_width_, + int32_t target_height_, int orig_width, int orig_height) { + uint32_t bb_Xmin, bb_Ymin, bb_Xwidth, bb_Ywidth; + // cast to float to preseve fractional + double W_aspRatio = (target_width_ * 1.0) / (orig_width * 1.0); + double H_aspRatio = (target_height_ * 1.0) / (orig_height * 1.0); + for (int i = 0; i < bboxCount; i++) { + // for each bounding box + RETURN_IF_NOT_OK(bboxList->GetUnsignedIntAt(&bb_Xmin, {i, 0})); + RETURN_IF_NOT_OK(bboxList->GetUnsignedIntAt(&bb_Ymin, {i, 1})); + RETURN_IF_NOT_OK(bboxList->GetUnsignedIntAt(&bb_Xwidth, {i, 2})); + RETURN_IF_NOT_OK(bboxList->GetUnsignedIntAt(&bb_Ywidth, {i, 3})); + // update positions and widths + bb_Xmin = bb_Xmin * W_aspRatio; + bb_Ymin = bb_Ymin * H_aspRatio; + bb_Xwidth = bb_Xwidth * W_aspRatio; + bb_Ywidth = bb_Ywidth * H_aspRatio; + // reset bounding box values + 
RETURN_IF_NOT_OK(bboxList->SetItemAt({i, 0}, bb_Xmin)); + RETURN_IF_NOT_OK(bboxList->SetItemAt({i, 1}, bb_Ymin)); + RETURN_IF_NOT_OK(bboxList->SetItemAt({i, 2}, bb_Xwidth)); + RETURN_IF_NOT_OK(bboxList->SetItemAt({i, 3}, bb_Ywidth)); + } + return Status::OK(); +} + } // namespace dataset } // namespace mindspore diff --git a/mindspore/ccsrc/dataset/kernels/image/image_utils.h b/mindspore/ccsrc/dataset/kernels/image/image_utils.h index 394323974a..231ee77de0 100644 --- a/mindspore/ccsrc/dataset/kernels/image/image_utils.h +++ b/mindspore/ccsrc/dataset/kernels/image/image_utils.h @@ -96,7 +96,7 @@ Status Decode(const std::shared_ptr &input, std::shared_ptr *out Status DecodeCv(const std::shared_ptr &input, std::shared_ptr *output); -bool HasJpegMagic(const unsigned char *data, size_t data_size); +bool HasJpegMagic(const std::shared_ptr &input); void JpegSetSource(j_decompress_ptr c_info, const void *data, int64_t data_size); @@ -225,7 +225,39 @@ Status Erase(const std::shared_ptr &input, std::shared_ptr *outp Status Pad(const std::shared_ptr &input, std::shared_ptr *output, const int32_t &pad_top, const int32_t &pad_bottom, const int32_t &pad_left, const int32_t &pad_right, const BorderType &border_types, uint8_t fill_r = 0, uint8_t fill_g = 0, uint8_t fill_b = 0); + +// -------- BBOX OPERATIONS -------- // +// Updates and checks bounding boxes for new cropped region of image +// @param bboxList: A tensor contaning bounding box tensors +// @param bboxCount: total Number of bounding boxes - required within caller function to run update loop +// @param CB_Xmin: Image's CropBox Xmin coordinate +// @param CB_Xmin: Image's CropBox Ymin coordinate +// @param CB_Xmax: Image's CropBox Xmax coordinate - (Xmin + width) +// @param CB_Xmax: Image's CropBox Ymax coordinate - (Ymin + height) +Status UpdateBBoxesForCrop(std::shared_ptr *bboxList, size_t *bboxCount, int CB_Xmin, int CB_Ymin, int CB_Xmax, + int CB_Ymax); + +// Updates bounding boxes with required Top and Left padding 
+// Top and Left padding amounts required to adjust bboxs min X,Y values according to padding 'push' +// Top/Left since images 0,0 coordinate is taken from top left +// @param bboxList: A tensor contaning bounding box tensors +// @param bboxCount: total Number of bounding boxes - required within caller function to run update loop +// @param pad_top: Total amount of padding applied to image top +// @param pad_left: Total amount of padding applied to image left side +Status PadBBoxes(std::shared_ptr *bboxList, const size_t &bboxCount, int32_t pad_top, int32_t pad_left); + +// Updates bounding boxes for an Image Resize Operation - Takes in set of valid BBoxes +// For e.g those that remain after a crop +// @param bboxList: A tensor contaning bounding box tensors +// @param bboxCount: total Number of bounding boxes - required within caller function to run update loop +// @param bboxList: A tensor contaning bounding box tensors +// @param target_width_: required width of image post resize +// @param target_width_: required height of image post resize +// @param orig_width: current width of image pre resize +// @param orig_height: current height of image pre resize +Status UpdateBBoxesForResize(const std::shared_ptr &bboxList, const size_t &bboxCount, int32_t target_width_, + int32_t target_height_, int orig_width, int orig_height); + } // namespace dataset } // namespace mindspore - #endif // DATASET_KERNELS_IMAGE_IMAGE_UTILS_H_ diff --git a/mindspore/ccsrc/dataset/kernels/image/random_crop_and_resize_op.cc b/mindspore/ccsrc/dataset/kernels/image/random_crop_and_resize_op.cc index a3cf8cefb5..c5b5f20c63 100644 --- a/mindspore/ccsrc/dataset/kernels/image/random_crop_and_resize_op.cc +++ b/mindspore/ccsrc/dataset/kernels/image/random_crop_and_resize_op.cc @@ -35,8 +35,10 @@ RandomCropAndResizeOp::RandomCropAndResizeOp(int32_t target_height, int32_t targ : target_height_(target_height), target_width_(target_width), rnd_scale_(scale_lb, scale_ub), - rnd_aspect_(aspect_lb, 
aspect_ub), + rnd_aspect_(log(aspect_lb), log(aspect_ub)), interpolation_(interpolation), + aspect_lb_(aspect_lb), + aspect_ub_(aspect_ub), max_iter_(max_iter) { rnd_.seed(GetSeed()); } @@ -64,33 +66,42 @@ Status RandomCropAndResizeOp::OutputShape(const std::vector &inputs return Status(StatusCode::kUnexpectedError, "Input has a wrong shape"); } Status RandomCropAndResizeOp::GetCropBox(int h_in, int w_in, int *x, int *y, int *crop_height, int *crop_width) { - double scale, aspect; *crop_width = w_in; *crop_height = h_in; - bool crop_success = false; + CHECK_FAIL_RETURN_UNEXPECTED(w_in != 0, "Width is 0"); + CHECK_FAIL_RETURN_UNEXPECTED(h_in != 0, "Height is 0"); + CHECK_FAIL_RETURN_UNEXPECTED(aspect_lb_ > 0, "Aspect lower bound must be greater than zero"); for (int32_t i = 0; i < max_iter_; i++) { - scale = rnd_scale_(rnd_); - aspect = rnd_aspect_(rnd_); - *crop_width = static_cast(std::round(std::sqrt(h_in * w_in * scale / aspect))); - *crop_height = static_cast(std::round(*crop_width * aspect)); + double const sample_scale = rnd_scale_(rnd_); + // In case of non-symmetrical aspect ratios, use uniform distribution on a logarithmic sample_scale. + // Note rnd_aspect_ is already a random distribution of the input aspect ratio in logarithmic sample_scale. 
+ double const sample_aspect = exp(rnd_aspect_(rnd_)); + + *crop_width = static_cast(std::round(std::sqrt(h_in * w_in * sample_scale * sample_aspect))); + *crop_height = static_cast(std::round(*crop_width / sample_aspect)); if (*crop_width <= w_in && *crop_height <= h_in) { - crop_success = true; - break; + std::uniform_int_distribution<> rd_x(0, w_in - *crop_width); + std::uniform_int_distribution<> rd_y(0, h_in - *crop_height); + *x = rd_x(rnd_); + *y = rd_y(rnd_); + return Status::OK(); } } - if (!crop_success) { - CHECK_FAIL_RETURN_UNEXPECTED(w_in != 0, "Width is 0"); - aspect = static_cast(h_in) / w_in; - scale = rnd_scale_(rnd_); - *crop_width = static_cast(std::round(std::sqrt(h_in * w_in * scale / aspect))); - *crop_height = static_cast(std::round(*crop_width * aspect)); - *crop_height = (*crop_height > h_in) ? h_in : *crop_height; - *crop_width = (*crop_width > w_in) ? w_in : *crop_width; + double const img_aspect = static_cast(w_in) / h_in; + if (img_aspect < aspect_lb_) { + *crop_width = w_in; + *crop_height = static_cast(std::round(*crop_width / static_cast(aspect_lb_))); + } else { + if (img_aspect > aspect_ub_) { + *crop_height = h_in; + *crop_width = static_cast(std::round(*crop_height * static_cast(aspect_ub_))); + } else { + *crop_width = w_in; + *crop_height = h_in; + } } - std::uniform_int_distribution<> rd_x(0, w_in - *crop_width); - std::uniform_int_distribution<> rd_y(0, h_in - *crop_height); - *x = rd_x(rnd_); - *y = rd_y(rnd_); + *x = static_cast(std::round((w_in - *crop_width) / 2.0)); + *y = static_cast(std::round((h_in - *crop_height) / 2.0)); return Status::OK(); } } // namespace dataset diff --git a/mindspore/ccsrc/dataset/kernels/image/random_crop_and_resize_op.h b/mindspore/ccsrc/dataset/kernels/image/random_crop_and_resize_op.h index 97ee9f6092..db805a9374 100644 --- a/mindspore/ccsrc/dataset/kernels/image/random_crop_and_resize_op.h +++ b/mindspore/ccsrc/dataset/kernels/image/random_crop_and_resize_op.h @@ -60,6 +60,8 @@ class 
RandomCropAndResizeOp : public TensorOp { std::mt19937 rnd_; InterpolationMode interpolation_; int32_t max_iter_; + double aspect_lb_; + double aspect_ub_; }; } // namespace dataset } // namespace mindspore diff --git a/mindspore/ccsrc/dataset/kernels/image/random_crop_and_resize_with_bbox_op.cc b/mindspore/ccsrc/dataset/kernels/image/random_crop_and_resize_with_bbox_op.cc new file mode 100644 index 0000000000..b820779ed1 --- /dev/null +++ b/mindspore/ccsrc/dataset/kernels/image/random_crop_and_resize_with_bbox_op.cc @@ -0,0 +1,59 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include "dataset/util/random.h" +#include "dataset/util/status.h" +#include "dataset/kernels/image/image_utils.h" +#include "dataset/kernels/image/random_crop_and_resize_with_bbox_op.h" + +namespace mindspore { +namespace dataset { + +Status RandomCropAndResizeWithBBoxOp::Compute(const TensorRow &input, TensorRow *output) { + IO_CHECK_VECTOR(input, output); + BOUNDING_BOX_CHECK(input); + CHECK_FAIL_RETURN_UNEXPECTED(input[0]->shape().Size() >= 2, "The shape of input is abnormal"); + + (*output).push_back(nullptr); // init memory for return vector + (*output).push_back(nullptr); + (*output)[1] = std::move(input[1]); // move boxes over to output + + size_t bboxCount = input[1]->shape()[0]; // number of rows in bbox tensor + int h_in = input[0]->shape()[0]; + int w_in = input[0]->shape()[1]; + int x = 0; + int y = 0; + int crop_height = 0; + int crop_width = 0; + + RETURN_IF_NOT_OK(RandomCropAndResizeOp::GetCropBox(h_in, w_in, &x, &y, &crop_height, &crop_width)); + + int maxX = x + crop_width; // max dims of selected CropBox on image + int maxY = y + crop_height; + + RETURN_IF_NOT_OK(UpdateBBoxesForCrop(&(*output)[1], &bboxCount, x, y, maxX, maxY)); // IMAGE_UTIL + RETURN_IF_NOT_OK(CropAndResize(input[0], &(*output)[0], x, y, crop_height, crop_width, target_height_, target_width_, + interpolation_)); + + RETURN_IF_NOT_OK( + UpdateBBoxesForResize((*output)[1], bboxCount, target_width_, target_height_, crop_width, crop_height)); + return Status::OK(); +} +} // namespace dataset +} // namespace mindspore diff --git a/mindspore/ccsrc/dataset/kernels/image/random_crop_and_resize_with_bbox_op.h b/mindspore/ccsrc/dataset/kernels/image/random_crop_and_resize_with_bbox_op.h new file mode 100644 index 0000000000..9675d43933 --- /dev/null +++ b/mindspore/ccsrc/dataset/kernels/image/random_crop_and_resize_with_bbox_op.h @@ -0,0 +1,46 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the 
"License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef DATASET_KERNELS_IMAGE_RANDOM_CROP_AND_RESIZE_WITH_BBOX_OP_H_ +#define DATASET_KERNELS_IMAGE_RANDOM_CROP_AND_RESIZE_WITH_BBOX_OP_H_ + +#include "dataset/kernels/image/random_crop_and_resize_op.h" + +namespace mindspore { +namespace dataset { + +class RandomCropAndResizeWithBBoxOp : public RandomCropAndResizeOp { + public: + // Constructor for RandomCropAndResizeWithBBoxOp, with default value and passing to base class constructor + RandomCropAndResizeWithBBoxOp(int32_t target_height, int32_t target_width, float scale_lb = kDefScaleLb, + float scale_ub = kDefScaleUb, float aspect_lb = kDefAspectLb, + float aspect_ub = kDefAspectUb, InterpolationMode interpolation = kDefInterpolation, + int32_t max_iter = kDefMaxIter) + : RandomCropAndResizeOp(target_height, target_width, scale_lb, scale_ub, aspect_lb, aspect_ub, interpolation, + max_iter) {} + + ~RandomCropAndResizeWithBBoxOp() override = default; + + void Print(std::ostream &out) const override { + out << "RandomCropAndResizeWithBBox: " << RandomCropAndResizeOp::target_height_ << " " + << RandomCropAndResizeOp::target_width_; + } + + Status Compute(const TensorRow &input, TensorRow *output) override; +}; +} // namespace dataset +} // namespace mindspore + +#endif // DATASET_KERNELS_IMAGE_RANDOM_CROP_AND_RESIZE_WITH_BBOX_OP_H_ diff --git a/mindspore/ccsrc/dataset/kernels/image/random_crop_decode_resize_op.cc b/mindspore/ccsrc/dataset/kernels/image/random_crop_decode_resize_op.cc index c11b5b5968..74aa91ea7e 
100644 --- a/mindspore/ccsrc/dataset/kernels/image/random_crop_decode_resize_op.cc +++ b/mindspore/ccsrc/dataset/kernels/image/random_crop_decode_resize_op.cc @@ -31,7 +31,7 @@ Status RandomCropDecodeResizeOp::Compute(const std::shared_ptr &input, s if (input == nullptr) { RETURN_STATUS_UNEXPECTED("input tensor is null"); } - if (!HasJpegMagic(input->GetMutableBuffer(), input->SizeInBytes())) { + if (!HasJpegMagic(input)) { DecodeOp op(true); std::shared_ptr decoded; RETURN_IF_NOT_OK(op.Compute(input, &decoded)); @@ -43,7 +43,7 @@ Status RandomCropDecodeResizeOp::Compute(const std::shared_ptr &input, s jerr.pub.error_exit = JpegErrorExitCustom; try { jpeg_create_decompress(&cinfo); - JpegSetSource(&cinfo, input->GetMutableBuffer(), input->SizeInBytes()); + JpegSetSource(&cinfo, input->GetBuffer(), input->SizeInBytes()); (void)jpeg_read_header(&cinfo, TRUE); jpeg_calc_output_dimensions(&cinfo); } catch (std::runtime_error &e) { diff --git a/mindspore/ccsrc/dataset/kernels/image/random_crop_op.cc b/mindspore/ccsrc/dataset/kernels/image/random_crop_op.cc index 7662c64cc4..110d769f26 100644 --- a/mindspore/ccsrc/dataset/kernels/image/random_crop_op.cc +++ b/mindspore/ccsrc/dataset/kernels/image/random_crop_op.cc @@ -48,44 +48,81 @@ RandomCropOp::RandomCropOp(int32_t crop_height, int32_t crop_width, int32_t pad_ rnd_.seed(GetSeed()); } -Status RandomCropOp::Compute(const std::shared_ptr &input, std::shared_ptr *output) { - IO_CHECK(input, output); - - // Apply padding first then crop - std::shared_ptr pad_image; +Status RandomCropOp::ImagePadding(const std::shared_ptr &input, std::shared_ptr *pad_image, + int32_t *t_pad_top, int32_t *t_pad_bottom, int32_t *t_pad_left, int32_t *t_pad_right, + int32_t *padded_image_w, int32_t *padded_image_h, bool *crop_further) { + *t_pad_top = pad_top_; + *t_pad_bottom = pad_bottom_; + *t_pad_left = pad_left_; + *t_pad_right = pad_right_; RETURN_IF_NOT_OK( - Pad(input, &pad_image, pad_top_, pad_bottom_, pad_left_, pad_right_, 
border_type_, fill_r_, fill_g_, fill_b_)); - CHECK_FAIL_RETURN_UNEXPECTED(pad_image->shape().Size() >= 2, "Abnormal shape"); - int32_t padded_image_h = pad_image->shape()[0]; - int32_t padded_image_w = pad_image->shape()[1]; - // no need to crop if same size - if (padded_image_h == crop_height_ && padded_image_w == crop_width_) { - *output = pad_image; + Pad(input, pad_image, pad_top_, pad_bottom_, pad_left_, pad_right_, border_type_, fill_r_, fill_g_, fill_b_)); + CHECK_FAIL_RETURN_UNEXPECTED((*pad_image)->shape().Size() >= 2, "Abnormal shape"); + + *padded_image_h = (*pad_image)->shape()[0]; + *padded_image_w = (*pad_image)->shape()[1]; + + if (*padded_image_h == crop_height_ && *padded_image_w == crop_width_) { + *crop_further = false; // no need for further crop return Status::OK(); - } - if (pad_if_needed_) { + } else if (pad_if_needed_) { // check the dimensions of the image for padding, if we do need padding, then we change the pad values - if (padded_image_h < crop_height_) { - RETURN_IF_NOT_OK(Pad(pad_image, &pad_image, crop_height_ - padded_image_h, crop_height_ - padded_image_h, 0, 0, + if (*padded_image_h < crop_height_) { + RETURN_IF_NOT_OK(Pad(*pad_image, pad_image, crop_height_ - *padded_image_h, crop_height_ - *padded_image_h, 0, 0, border_type_, fill_r_, fill_g_, fill_b_)); + + // update pad total above/below + t_pad_top += (crop_height_ - *padded_image_h); + t_pad_bottom += (crop_height_ - *padded_image_h); } - if (padded_image_w < crop_width_) { - RETURN_IF_NOT_OK(Pad(pad_image, &pad_image, 0, 0, crop_width_ - padded_image_w, crop_width_ - padded_image_w, + if (*padded_image_w < crop_width_) { + RETURN_IF_NOT_OK(Pad(*pad_image, pad_image, 0, 0, crop_width_ - *padded_image_w, crop_width_ - *padded_image_w, border_type_, fill_r_, fill_g_, fill_b_)); + // update pad total left/right + t_pad_left += (crop_width_ - *padded_image_w); + t_pad_right += (crop_width_ - *padded_image_w); } - padded_image_h = pad_image->shape()[0]; - padded_image_w = 
pad_image->shape()[1]; + *padded_image_h = (*pad_image)->shape()[0]; + *padded_image_w = (*pad_image)->shape()[1]; } - if (padded_image_h < crop_height_ || padded_image_w < crop_width_ || crop_height_ == 0 || crop_width_ == 0) { + + if (*padded_image_h < crop_height_ || *padded_image_w < crop_width_ || crop_height_ == 0 || crop_width_ == 0) { return Status(StatusCode::kShapeMisMatch, __LINE__, __FILE__, "Crop size is greater than the image dimensions or is zero."); } - // random top corner - int x = std::uniform_int_distribution(0, padded_image_w - crop_width_)(rnd_); - int y = std::uniform_int_distribution(0, padded_image_h - crop_height_)(rnd_); + return Status::OK(); +} + +void RandomCropOp::GenRandomXY(int *x, int *y, const int32_t &padded_image_w, const int32_t &padded_image_h) { + // GenCropPoints for cropping + *x = std::uniform_int_distribution(0, padded_image_w - crop_width_)(rnd_); + *y = std::uniform_int_distribution(0, padded_image_h - crop_height_)(rnd_); +} + +Status RandomCropOp::Compute(const std::shared_ptr &input, std::shared_ptr *output) { + IO_CHECK(input, output); + + // Apply padding first then crop + std::shared_ptr pad_image; + int32_t t_pad_top, t_pad_bottom, t_pad_left, t_pad_right; + int32_t padded_image_w; + int32_t padded_image_h; + bool crop_further = true; // whether image needs further cropping based on new size & requirements + + RETURN_IF_NOT_OK( // error code sent back directly + ImagePadding(input, &pad_image, &t_pad_top, &t_pad_bottom, &t_pad_left, &t_pad_right, &padded_image_w, + &padded_image_h, &crop_further)); + if (!crop_further) { + *output = pad_image; + return Status::OK(); + } + + int x, y; + GenRandomXY(&x, &y, padded_image_w, padded_image_h); return Crop(pad_image, output, x, y, crop_width_, crop_height_); } + Status RandomCropOp::OutputShape(const std::vector &inputs, std::vector &outputs) { RETURN_IF_NOT_OK(TensorOp::OutputShape(inputs, outputs)); outputs.clear(); diff --git 
a/mindspore/ccsrc/dataset/kernels/image/random_crop_op.h b/mindspore/ccsrc/dataset/kernels/image/random_crop_op.h index d4ec49cd7b..cd43ec1efb 100644 --- a/mindspore/ccsrc/dataset/kernels/image/random_crop_op.h +++ b/mindspore/ccsrc/dataset/kernels/image/random_crop_op.h @@ -50,11 +50,33 @@ class RandomCropOp : public TensorOp { void Print(std::ostream &out) const override { out << "RandomCropOp: " << crop_height_ << " " << crop_width_; } Status Compute(const std::shared_ptr &input, std::shared_ptr *output) override; + + // Function breaks out the compute function's image padding functionality and makes available to other Ops + // Using this class as a base - restructrued to allow for RandomCropWithBBox Augmentation Op + // @param input: Input is the original Image + // @param pad_image: Pointer to new Padded image + // @param t_pad_top: Total Top Padding - Based on input and value calculated in function if required + // @param t_pad_bottom: Total bottom Padding - Based on input and value calculated in function if required + // @param t_pad_left: Total left Padding - Based on input and value calculated in function if required + // @param t_pad_right: Total right Padding - Based on input and value calculated in function if required + // @param padded_image_w: Final Width of the 'pad_image' + // @param padded_image_h: Final Height of the 'pad_image' + // @param crop_further: Whether image required cropping after padding - False if new padded image matches required + // dimensions + Status ImagePadding(const std::shared_ptr &input, std::shared_ptr *pad_image, int32_t *t_pad_top, + int32_t *t_pad_bottom, int32_t *t_pad_left, int32_t *t_pad_right, int32_t *padded_image_w, + int32_t *padded_image_h, bool *crop_further); + + // Function breaks X,Y generation functionality out of original compute function and makes available to other Ops + void GenRandomXY(int *x, int *y, const int32_t &padded_image_w, const int32_t &padded_image_h); + Status OutputShape(const std::vector 
&inputs, std::vector &outputs) override; - private: + protected: int32_t crop_height_ = 0; int32_t crop_width_ = 0; + + private: int32_t pad_top_ = 0; int32_t pad_bottom_ = 0; int32_t pad_left_ = 0; diff --git a/mindspore/ccsrc/dataset/kernels/image/random_crop_with_bbox_op.cc b/mindspore/ccsrc/dataset/kernels/image/random_crop_with_bbox_op.cc new file mode 100644 index 0000000000..2be37f1da3 --- /dev/null +++ b/mindspore/ccsrc/dataset/kernels/image/random_crop_with_bbox_op.cc @@ -0,0 +1,67 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include "dataset/kernels/image/random_crop_with_bbox_op.h" +#include "dataset/kernels/image/image_utils.h" +#include "dataset/util/random.h" +#include "dataset/util/status.h" + +namespace mindspore { +namespace dataset { +Status RandomCropWithBBoxOp::Compute(const TensorRow &input, TensorRow *output) { + IO_CHECK_VECTOR(input, output); + BOUNDING_BOX_CHECK(input); + + std::shared_ptr pad_image; + int32_t t_pad_top, t_pad_bottom, t_pad_left, t_pad_right; + size_t boxCount = input[1]->shape()[0]; // number of rows + + int32_t padded_image_h; + int32_t padded_image_w; + + (*output).push_back(nullptr); + (*output).push_back(nullptr); + (*output)[1] = std::move(input[1]); // since some boxes may be removed + + bool crop_further = true; // Whether further cropping will be required or not, true unless required size matches + RETURN_IF_NOT_OK( // Error passed back to caller + RandomCropOp::ImagePadding(input[0], &pad_image, &t_pad_top, &t_pad_bottom, &t_pad_left, &t_pad_right, + &padded_image_w, &padded_image_h, &crop_further)); + + // update bounding boxes with new values based on relevant image padding + if (t_pad_left || t_pad_bottom) { + RETURN_IF_NOT_OK(PadBBoxes(&(*output)[1], boxCount, t_pad_left, t_pad_top)); + } + if (!crop_further) { + // no further cropping required + (*output)[0] = pad_image; + (*output)[1] = std::move(input[1]); + return Status::OK(); + } + + int x, y; + RandomCropOp::GenRandomXY(&x, &y, padded_image_w, padded_image_h); + int maxX = x + RandomCropOp::crop_width_; // max dims of selected CropBox on image + int maxY = y + RandomCropOp::crop_height_; + RETURN_IF_NOT_OK(UpdateBBoxesForCrop(&(*output)[1], &boxCount, x, y, maxX, maxY)); + return Crop(pad_image, &(*output)[0], x, y, RandomCropOp::crop_width_, RandomCropOp::crop_height_); +} +} // namespace dataset +} // namespace mindspore diff --git a/mindspore/ccsrc/dataset/kernels/image/random_crop_with_bbox_op.h 
b/mindspore/ccsrc/dataset/kernels/image/random_crop_with_bbox_op.h new file mode 100644 index 0000000000..88a58d3557 --- /dev/null +++ b/mindspore/ccsrc/dataset/kernels/image/random_crop_with_bbox_op.h @@ -0,0 +1,48 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef DATASET_KERNELS_IMAGE_RANDOM_CROP_WITH_BBOX_OP_H_ +#define DATASET_KERNELS_IMAGE_RANDOM_CROP_WITH_BBOX_OP_H_ + +#include +#include + +#include "dataset/kernels/image/random_crop_op.h" + +namespace mindspore { +namespace dataset { +class RandomCropWithBBoxOp : public RandomCropOp { + public: + // Constructor for RandomCropWithBBoxOp, with default value and passing to base class constructor + RandomCropWithBBoxOp(int32_t crop_height, int32_t crop_width, int32_t pad_top = kDefPadTop, + int32_t pad_bottom = kDefPadBottom, int32_t pad_left = kDefPadLeft, + int32_t pad_right = kDefPadRight, BorderType border_types = kDefBorderType, + bool pad_if_needed = kDefPadIfNeeded, uint8_t fill_r = kDefFillR, uint8_t fill_g = kDefFillG, + uint8_t fill_b = kDefFillB) + : RandomCropOp(crop_height, crop_width, pad_top, pad_bottom, pad_left, pad_right, border_types, pad_if_needed, + fill_r, fill_g, fill_b) {} + + ~RandomCropWithBBoxOp() override = default; + + void Print(std::ostream &out) const override { + out << "RandomCropWithBBoxOp: " << RandomCropOp::crop_height_ << " " << RandomCropOp::crop_width_; + } + + Status Compute(const TensorRow 
&input, TensorRow *output) override; +}; +} // namespace dataset +} // namespace mindspore + +#endif // DATASET_KERNELS_IMAGE_RANDOM_CROP_WITH_BBOX_OP_H_ diff --git a/mindspore/ccsrc/dataset/kernels/image/random_horizontal_flip_bbox_op.cc b/mindspore/ccsrc/dataset/kernels/image/random_horizontal_flip_bbox_op.cc new file mode 100644 index 0000000000..5a5c632e81 --- /dev/null +++ b/mindspore/ccsrc/dataset/kernels/image/random_horizontal_flip_bbox_op.cc @@ -0,0 +1,60 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include "dataset/kernels/image/random_horizontal_flip_bbox_op.h" +#include "dataset/kernels/image/image_utils.h" +#include "dataset/util/status.h" +#include "dataset/core/cv_tensor.h" +#include "dataset/core/pybind_support.h" + +namespace mindspore { +namespace dataset { +const float RandomHorizontalFlipWithBBoxOp::kDefProbability = 0.5; + +Status RandomHorizontalFlipWithBBoxOp::Compute(const TensorRow &input, TensorRow *output) { + IO_CHECK_VECTOR(input, output); + BOUNDING_BOX_CHECK(input); + if (distribution_(rnd_)) { + // To test bounding boxes algorithm, create random bboxes from image dims + size_t num_of_boxes = input[1]->shape()[0]; // set to give number of bboxes + float img_center = (input[0]->shape()[1] / 2.); // get the center of the image + + for (int i = 0; i < num_of_boxes; i++) { + uint32_t b_w = 0; // bounding box width + uint32_t min_x = 0; + // get the required items + input[1]->GetItemAt(&min_x, {i, 0}); + input[1]->GetItemAt(&b_w, {i, 2}); + // do the flip + float diff = img_center - min_x; // get distance from min_x to center + uint32_t refl_min_x = diff + img_center; // get reflection of min_x + uint32_t new_min_x = refl_min_x - b_w; // subtract from the reflected min_x to get the new one + input[1]->SetItemAt({i, 0}, new_min_x); + } + (*output).push_back(nullptr); + (*output).push_back(nullptr); + // move input to output pointer of bounding boxes + (*output)[1] = std::move(input[1]); + // perform HorizontalFlip on the image + std::shared_ptr input_cv = CVTensor::AsCVTensor(std::move(input[0])); + return HorizontalFlip(std::static_pointer_cast(input_cv), &(*output)[0]); + } + *output = input; + return Status::OK(); +} + +} // namespace dataset +} // namespace mindspore diff --git a/mindspore/ccsrc/dataset/kernels/image/random_horizontal_flip_bbox_op.h b/mindspore/ccsrc/dataset/kernels/image/random_horizontal_flip_bbox_op.h new file mode 100644 index 0000000000..06c96e11ae --- /dev/null +++ 
b/mindspore/ccsrc/dataset/kernels/image/random_horizontal_flip_bbox_op.h @@ -0,0 +1,62 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef DATASET_KERNELS_IMAGE_RANDOM_HORIZONTAL_FLIP_BBOX_OP_H_ +#define DATASET_KERNELS_IMAGE_RANDOM_HORIZONTAL_FLIP_BBOX_OP_H_ + +#include +#include +#include +#include +#include +#include +#include "dataset/core/tensor.h" +#include "dataset/kernels/tensor_op.h" +#include "dataset/util/random.h" +#include "dataset/util/status.h" +#include "pybind11/pybind11.h" +#include "pybind11/stl_bind.h" + +namespace mindspore { +namespace dataset { +class RandomHorizontalFlipWithBBoxOp : public TensorOp { + public: + // Default values, also used by python_bindings.cc + static const float kDefProbability; + + explicit RandomHorizontalFlipWithBBoxOp(float probability = kDefProbability) : distribution_(probability) { + rnd_.seed(GetSeed()); + } + + ~RandomHorizontalFlipWithBBoxOp() override = default; + + // Provide stream operator for displaying it + friend std::ostream &operator<<(std::ostream &out, const RandomHorizontalFlipWithBBoxOp &so) { + so.Print(out); + return out; + } + + void Print(std::ostream &out) const override { out << "RandomHorizontalFlipWithBBoxOp"; } + + Status Compute(const TensorRow &input, TensorRow *output) override; + + private: + std::mt19937 rnd_; + std::bernoulli_distribution distribution_; +}; +} // namespace dataset +} // namespace mindspore + 
+#endif // DATASET_KERNELS_IMAGE_RANDOM_HORIZONTAL_FLIP_BBOX_OP_H_ diff --git a/mindspore/ccsrc/dataset/kernels/image/random_vertical_flip_with_bbox_op.cc b/mindspore/ccsrc/dataset/kernels/image/random_vertical_flip_with_bbox_op.cc new file mode 100644 index 0000000000..d88c009559 --- /dev/null +++ b/mindspore/ccsrc/dataset/kernels/image/random_vertical_flip_with_bbox_op.cc @@ -0,0 +1,58 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include "dataset/util/status.h" +#include "dataset/kernels/image/image_utils.h" +#include "dataset/kernels/image/random_vertical_flip_with_bbox_op.h" + +namespace mindspore { +namespace dataset { +const float RandomVerticalFlipWithBBoxOp::kDefProbability = 0.5; +Status RandomVerticalFlipWithBBoxOp::Compute(const TensorRow &input, TensorRow *output) { + IO_CHECK_VECTOR(input, output); + BOUNDING_BOX_CHECK(input); + + if (distribution_(rnd_)) { + dsize_t imHeight = input[0]->shape()[0]; + size_t boxCount = input[1]->shape()[0]; // number of rows in tensor + + // one time allocation -> updated in the loop + // type defined based on VOC test dataset + for (int i = 0; i < boxCount; i++) { + uint32_t boxCorner_y = 0; + uint32_t boxHeight = 0; + uint32_t newBoxCorner_y = 0; + RETURN_IF_NOT_OK(input[1]->GetUnsignedIntAt(&boxCorner_y, {i, 1})); // get min y of bbox + RETURN_IF_NOT_OK(input[1]->GetUnsignedIntAt(&boxHeight, {i, 3})); // get height of bbox + + // subtract (curCorner + height) from (max) for new Corner position + newBoxCorner_y = (imHeight - 1) - (boxCorner_y + boxHeight); + RETURN_IF_NOT_OK(input[1]->SetItemAt({i, 1}, newBoxCorner_y)); + } + + (*output).push_back(nullptr); + (*output).push_back(nullptr); + (*output)[1] = std::move(input[1]); + + return VerticalFlip(input[0], &(*output)[0]); + } + *output = input; + return Status::OK(); +} +} // namespace dataset +} // namespace mindspore diff --git a/mindspore/ccsrc/dataset/kernels/image/random_vertical_flip_with_bbox_op.h b/mindspore/ccsrc/dataset/kernels/image/random_vertical_flip_with_bbox_op.h new file mode 100644 index 0000000000..4764cc2b75 --- /dev/null +++ b/mindspore/ccsrc/dataset/kernels/image/random_vertical_flip_with_bbox_op.h @@ -0,0 +1,52 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef DATASET_KERNELS_IMAGE_RANDOM_VERTICAL_FLIP_WITH_BBOX_OP_H_ +#define DATASET_KERNELS_IMAGE_RANDOM_VERTICAL_FLIP_WITH_BBOX_OP_H_ + +#include +#include + +#include "dataset/core/tensor.h" +#include "dataset/kernels/tensor_op.h" +#include "dataset/util/status.h" +#include "dataset/util/random.h" + +namespace mindspore { +namespace dataset { +class RandomVerticalFlipWithBBoxOp : public TensorOp { + public: + // Default values, also used by python_bindings.cc + static const float kDefProbability; + // Constructor for RandomVerticalFlipWithBBoxOp + // @param probability: Probablity of Image flipping, 0.5 by default + explicit RandomVerticalFlipWithBBoxOp(float probability = kDefProbability) : distribution_(probability) { + rnd_.seed(GetSeed()); + } + + ~RandomVerticalFlipWithBBoxOp() override = default; + + void Print(std::ostream &out) const override { out << "RandomVerticalFlipWithBBoxOp"; } + + Status Compute(const TensorRow &input, TensorRow *output) override; + + private: + std::mt19937 rnd_; + std::bernoulli_distribution distribution_; +}; +} // namespace dataset +} // namespace mindspore + +#endif // DATASET_KERNELS_IMAGE_RANDOM_VERTICAL_FLIP_WITH_BBOX_OP_H_ diff --git a/mindspore/ccsrc/dataset/kernels/image/uniform_aug_op.cc b/mindspore/ccsrc/dataset/kernels/image/uniform_aug_op.cc index 147955ebac..7889b3b157 100644 --- a/mindspore/ccsrc/dataset/kernels/image/uniform_aug_op.cc +++ b/mindspore/ccsrc/dataset/kernels/image/uniform_aug_op.cc @@ -27,8 +27,7 @@ UniformAugOp::UniformAugOp(std::vector> op_list, int32 } // compute method to 
apply uniformly random selected augmentations from a list -Status UniformAugOp::Compute(const std::vector> &input, - std::vector> *output) { +Status UniformAugOp::Compute(const TensorRow &input, TensorRow *output) { IO_CHECK_VECTOR(input, output); // randomly select ops to be applied diff --git a/mindspore/ccsrc/dataset/kernels/image/uniform_aug_op.h b/mindspore/ccsrc/dataset/kernels/image/uniform_aug_op.h index 605f510746..824898ba2d 100644 --- a/mindspore/ccsrc/dataset/kernels/image/uniform_aug_op.h +++ b/mindspore/ccsrc/dataset/kernels/image/uniform_aug_op.h @@ -44,8 +44,7 @@ class UniformAugOp : public TensorOp { // Overrides the base class compute function // @return Status - The error code return - Status Compute(const std::vector> &input, - std::vector> *output) override; + Status Compute(const TensorRow &input, TensorRow *output) override; private: int32_t num_ops_; diff --git a/mindspore/ccsrc/dataset/kernels/py_func_op.cc b/mindspore/ccsrc/dataset/kernels/py_func_op.cc index c9e5d5b169..0a6a1452b5 100644 --- a/mindspore/ccsrc/dataset/kernels/py_func_op.cc +++ b/mindspore/ccsrc/dataset/kernels/py_func_op.cc @@ -24,8 +24,7 @@ namespace mindspore { namespace dataset { -Status PyFuncOp::Compute(const std::vector> &input, - std::vector> *output) { +Status PyFuncOp::Compute(const TensorRow &input, TensorRow *output) { IO_CHECK_VECTOR(input, output); Status ret = Status(StatusCode::kOK, "PyFunc Call Succeed"); { diff --git a/mindspore/ccsrc/dataset/kernels/py_func_op.h b/mindspore/ccsrc/dataset/kernels/py_func_op.h index af61f6ac55..a50aceafbb 100644 --- a/mindspore/ccsrc/dataset/kernels/py_func_op.h +++ b/mindspore/ccsrc/dataset/kernels/py_func_op.h @@ -36,8 +36,7 @@ class __attribute__((visibility("hidden"))) PyFuncOp : public TensorOp { uint32_t NumOutput() override { return 0; } // Compute function for n-n mapping. 
- Status Compute(const std::vector> &input, - std::vector> *output) override; + Status Compute(const TensorRow &input, TensorRow *output) override; private: py::function py_func_ptr_; diff --git a/mindspore/ccsrc/dataset/kernels/tensor_op.cc b/mindspore/ccsrc/dataset/kernels/tensor_op.cc index 390dd42a71..92aef8dc9e 100644 --- a/mindspore/ccsrc/dataset/kernels/tensor_op.cc +++ b/mindspore/ccsrc/dataset/kernels/tensor_op.cc @@ -37,8 +37,7 @@ Status TensorOp::Compute(const std::shared_ptr &input, std::shared_ptr> &input, - std::vector> *output) { +Status TensorOp::Compute(const TensorRow &input, TensorRow *output) { IO_CHECK_VECTOR(input, output); if (OneToOne()) { output->resize(1); diff --git a/mindspore/ccsrc/dataset/kernels/tensor_op.h b/mindspore/ccsrc/dataset/kernels/tensor_op.h index 73fba4e28d..293d4a4f99 100644 --- a/mindspore/ccsrc/dataset/kernels/tensor_op.h +++ b/mindspore/ccsrc/dataset/kernels/tensor_op.h @@ -21,6 +21,7 @@ #include #include "dataset/core/tensor.h" +#include "dataset/core/tensor_row.h" #include "dataset/util/status.h" #define IO_CHECK(input, output) \ @@ -42,6 +43,40 @@ } \ } while (false) +#define BOUNDING_BOX_CHECK(input) \ + do { \ + if (input[1]->shape().Size() < 2) { \ + return Status(StatusCode::kBoundingBoxInvalidShape, __LINE__, __FILE__, \ + "Bounding boxes shape should have at least two dims"); \ + } \ + uint32_t num_of_features = input[1]->shape()[1]; \ + if (num_of_features < 4) { \ + return Status(StatusCode::kBoundingBoxInvalidShape, __LINE__, __FILE__, \ + "Bounding boxes should be have at least 4 features"); \ + } \ + uint32_t num_of_boxes = input[1]->shape()[0]; \ + uint32_t img_h = input[0]->shape()[0]; \ + uint32_t img_w = input[0]->shape()[1]; \ + for (uint32_t i = 0; i < num_of_boxes; i++) { \ + uint32_t min_x = 0; \ + uint32_t min_y = 0; \ + uint32_t b_w = 0; \ + uint32_t b_h = 0; \ + input[1]->GetItemAt(&min_x, {i, 0}); \ + input[1]->GetItemAt(&min_y, {i, 1}); \ + input[1]->GetItemAt(&b_w, {i, 2}); \ + 
input[1]->GetItemAt(&b_h, {i, 3}); \ + if ((min_x + b_w > img_w) || (min_y + b_h > img_h)) { \ + return Status(StatusCode::kBoundingBoxOutOfBounds, __LINE__, __FILE__, \ + "At least one of the bounding boxes is out of bounds of the image."); \ + } \ + if (static_cast(min_x) < 0 || static_cast(min_y) < 0) { \ + return Status(StatusCode::kBoundingBoxOutOfBounds, __LINE__, __FILE__, \ + "At least one of the bounding boxes has negative min_x or min_y."); \ + } \ + } \ + } while (false) + namespace mindspore { namespace dataset { // A class that does a computation on a Tensor @@ -75,8 +110,7 @@ class TensorOp { // @param input is a vector of shared_ptr to Tensor (pass by const reference). // @param output is the address to an empty vector of shared_ptr to Tensor. // @return Status - virtual Status Compute(const std::vector> &input, - std::vector> *output); + virtual Status Compute(const TensorRow &input, TensorRow *output); // Returns true oif the TensorOp takes one input and returns one output. 
// @return true/false diff --git a/mindspore/ccsrc/dataset/text/kernels/CMakeLists.txt b/mindspore/ccsrc/dataset/text/kernels/CMakeLists.txt index 87d3dbad34..449bb93d8b 100644 --- a/mindspore/ccsrc/dataset/text/kernels/CMakeLists.txt +++ b/mindspore/ccsrc/dataset/text/kernels/CMakeLists.txt @@ -1,7 +1,23 @@ file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc") set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD) +if (NOT (CMAKE_SYSTEM_NAME MATCHES "Windows")) + set(ICU_DEPEND_FILES + basic_tokenizer_op.cc + bert_tokenizer_op.cc + case_fold_op.cc + normalize_utf8_op.cc + regex_replace_op.cc + regex_tokenizer_op.cc + unicode_script_tokenizer_op.cc + whitespace_tokenizer_op.cc) +endif() add_library(text-kernels OBJECT lookup_op.cc jieba_tokenizer_op.cc unicode_char_tokenizer_op.cc + ngram_op.cc + wordpiece_tokenizer_op.cc + truncate_sequence_pair_op.cc + to_number_op.cc + ${ICU_DEPEND_FILES} ) diff --git a/mindspore/ccsrc/dataset/text/kernels/basic_tokenizer_op.cc b/mindspore/ccsrc/dataset/text/kernels/basic_tokenizer_op.cc new file mode 100644 index 0000000000..1128990b44 --- /dev/null +++ b/mindspore/ccsrc/dataset/text/kernels/basic_tokenizer_op.cc @@ -0,0 +1,94 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "dataset/text/kernels/basic_tokenizer_op.h" +#include +#include +#include +#include +#include + +namespace mindspore { +namespace dataset { +const bool BasicTokenizerOp::kDefLowerCase = false; +const bool BasicTokenizerOp::kDefKeepWhitespace = false; +const NormalizeForm BasicTokenizerOp::kDefNormalizationForm = NormalizeForm::kNone; +const bool BasicTokenizerOp::kDefPreserveUnusedToken = true; +const char BasicTokenizerOp::kCommonPattern[] = + "[!-/]" + "|[:-@]" + "|[\\[-`]" + "|[{-~]" + "|[\\p{P}]" + "|[\\x{4E00}-\\x{9FFF}]" + "|[\\x{3400}-\\x{4DBF}]" + "|[\\x{20000}-\\x{2A6DF}]" + "|[\\x{2A700}-\\x{2B73F}]" + "|[\\x{2B740}-\\x{2B81F}]" + "|[\\x{2B820}-\\x{2CEAF}]" + "|[\\x{F900}-\\x{FAFF}]" + "|[\\x{2F800}-\\x{2FA1F}]"; +const char BasicTokenizerOp::kUnusedPattern[] = "\\[CLS\\]|\\[SEP\\]|\\[UNK\\]|\\[PAD\\]|\\[MASK\\]|"; + +BasicTokenizerOp::BasicTokenizerOp(bool lower_case, bool keep_whitespace, NormalizeForm normalization_form, + bool preserve_unused_token) + : lower_case_(lower_case), + keep_whitespace_(keep_whitespace), + preserve_unused_token_(preserve_unused_token), + case_fold_(std::make_unique()), + nfd_normalize_(std::make_unique(NormalizeForm::kNfd)), + normalization_form_(normalization_form), + common_normalize_(std::make_unique(normalization_form)), + replace_accent_chars_(std::make_unique("\\p{Mn}", "")), + replace_control_chars_(std::make_unique("\\p{Cc}|\\p{Cf}", " ")) { + std::string delim_pattern = std::string("\\s+|") + kCommonPattern; + std::string keep_delim_pattern; + if (keep_whitespace_) { + keep_delim_pattern = delim_pattern; + } else { + keep_delim_pattern = kCommonPattern; + } + if (preserve_unused_token_) { + keep_delim_pattern = kUnusedPattern + keep_delim_pattern; + delim_pattern = kUnusedPattern + delim_pattern; + } + regex_tokenizer_ = std::make_unique(delim_pattern, keep_delim_pattern); +} + +Status BasicTokenizerOp::Compute(const std::shared_ptr &input, std::shared_ptr *output) { + IO_CHECK(input, output); + if 
(input->Rank() != 0 || input->type() != DataType::DE_STRING) { + RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor"); + } + std::shared_ptr cur_input; + std::shared_ptr processed_tensor; + if (lower_case_) { + // to lower case + RETURN_IF_NOT_OK(case_fold_->Compute(input, &processed_tensor)); + cur_input = processed_tensor; + // strip accent characters + RETURN_IF_NOT_OK(nfd_normalize_->Compute(cur_input, &processed_tensor)); + cur_input = processed_tensor; + RETURN_IF_NOT_OK(replace_accent_chars_->Compute(cur_input, &processed_tensor)); + } else { + RETURN_IF_NOT_OK(common_normalize_->Compute(input, &processed_tensor)); + } + // strip control characters + cur_input = processed_tensor; + RETURN_IF_NOT_OK(replace_control_chars_->Compute(cur_input, &processed_tensor)); + return regex_tokenizer_->Compute(processed_tensor, output); +} +} // namespace dataset +} // namespace mindspore diff --git a/mindspore/ccsrc/dataset/text/kernels/basic_tokenizer_op.h b/mindspore/ccsrc/dataset/text/kernels/basic_tokenizer_op.h new file mode 100644 index 0000000000..a37e841573 --- /dev/null +++ b/mindspore/ccsrc/dataset/text/kernels/basic_tokenizer_op.h @@ -0,0 +1,64 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef DATASET_TEXT_KERNELS_BASIC_TOKENIZER_OP_H_ +#define DATASET_TEXT_KERNELS_BASIC_TOKENIZER_OP_H_ +#include +#include + +#include "dataset/core/tensor.h" +#include "dataset/kernels/tensor_op.h" +#include "dataset/text/kernels/case_fold_op.h" +#include "dataset/text/kernels/normalize_utf8_op.h" +#include "dataset/text/kernels/regex_replace_op.h" +#include "dataset/text/kernels/regex_tokenizer_op.h" +#include "dataset/util/status.h" + +namespace mindspore { +namespace dataset { + +class BasicTokenizerOp : public TensorOp { + public: + static const bool kDefLowerCase; + static const bool kDefKeepWhitespace; + static const NormalizeForm kDefNormalizationForm; + static const bool kDefPreserveUnusedToken; + explicit BasicTokenizerOp(bool lower_case = kDefLowerCase, bool keep_whitespace = kDefKeepWhitespace, + NormalizeForm normalization_form = kDefNormalizationForm, + bool preserve_unused_token = kDefPreserveUnusedToken); + + ~BasicTokenizerOp() override = default; + + void Print(std::ostream &out) const override { out << "BasicTokenizerOp"; } + + Status Compute(const std::shared_ptr &input, std::shared_ptr *output) override; + + private: + static const char kCommonPattern[]; + static const char kUnusedPattern[]; + bool lower_case_; + bool keep_whitespace_; + NormalizeForm normalization_form_; + bool preserve_unused_token_; + std::unique_ptr case_fold_; + std::unique_ptr nfd_normalize_; + std::unique_ptr common_normalize_; + std::unique_ptr replace_accent_chars_; + std::unique_ptr replace_control_chars_; + std::unique_ptr regex_tokenizer_; +}; +} // namespace dataset +} // namespace mindspore +#endif // DATASET_TEXT_KERNELS_BASIC_TOKENIZER_OP_H_ diff --git a/mindspore/ccsrc/kernel/gpu/nn/relu_gpu_kernel.cc b/mindspore/ccsrc/dataset/text/kernels/bert_tokenizer_op.cc similarity index 55% rename from mindspore/ccsrc/kernel/gpu/nn/relu_gpu_kernel.cc rename to mindspore/ccsrc/dataset/text/kernels/bert_tokenizer_op.cc index d4cefc73ca..2b68a5accb 100644 --- 
a/mindspore/ccsrc/kernel/gpu/nn/relu_gpu_kernel.cc +++ b/mindspore/ccsrc/dataset/text/kernels/bert_tokenizer_op.cc @@ -1,5 +1,5 @@ /** - * Copyright 2019 Huawei Technologies Co., Ltd + * Copyright 2020 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,14 +13,15 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - -#include "kernel/gpu/nn/relu_gpu_kernel.h" - +#include "dataset/text/kernels/bert_tokenizer_op.h" namespace mindspore { -namespace kernel { -MS_REG_GPU_KERNEL_ONE(ReLU, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), - ReLUGpuFwdKernel, float) -MS_REG_GPU_KERNEL_ONE(ReLU, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16), - ReLUGpuFwdKernel, half) -} // namespace kernel +namespace dataset { +Status BertTokenizerOp::Compute(const std::shared_ptr &input, std::shared_ptr *output) { + IO_CHECK(input, output); + std::shared_ptr basic_tensor; + RETURN_IF_NOT_OK(basic_tokenizer_.Compute(input, &basic_tensor)); + RETURN_IF_NOT_OK(wordpiece_tokenizer_.Compute(basic_tensor, output)); + return Status::OK(); +} +} // namespace dataset } // namespace mindspore diff --git a/mindspore/ccsrc/dataset/text/kernels/bert_tokenizer_op.h b/mindspore/ccsrc/dataset/text/kernels/bert_tokenizer_op.h new file mode 100644 index 0000000000..660fdc7ba5 --- /dev/null +++ b/mindspore/ccsrc/dataset/text/kernels/bert_tokenizer_op.h @@ -0,0 +1,54 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef DATASET_TEXT_KERNELS_BERT_TOKENIZER_OP_H_ +#define DATASET_TEXT_KERNELS_BERT_TOKENIZER_OP_H_ +#include +#include + +#include "dataset/core/tensor.h" +#include "dataset/kernels/tensor_op.h" +#include "dataset/text/kernels/basic_tokenizer_op.h" +#include "dataset/text/kernels/wordpiece_tokenizer_op.h" +#include "dataset/util/status.h" + +namespace mindspore { +namespace dataset { +class BertTokenizerOp : public TensorOp { + public: + explicit BertTokenizerOp(const std::shared_ptr &vocab, + const std::string &suffix_indicator = WordpieceTokenizerOp::kDefSuffixIndicator, + const int &max_bytes_per_token = WordpieceTokenizerOp::kDefMaxBytesPerToken, + const std::string &unknown_token = WordpieceTokenizerOp::kDefUnknownToken, + bool lower_case = BasicTokenizerOp::kDefLowerCase, + bool keep_whitespace = BasicTokenizerOp::kDefKeepWhitespace, + NormalizeForm normalization_form = BasicTokenizerOp::kDefNormalizationForm, + bool preserve_unused_token = BasicTokenizerOp::kDefPreserveUnusedToken) + : wordpiece_tokenizer_(vocab, suffix_indicator, max_bytes_per_token, unknown_token), + basic_tokenizer_(lower_case, keep_whitespace, normalization_form, preserve_unused_token) {} + + ~BertTokenizerOp() override = default; + + void Print(std::ostream &out) const override { out << "BertTokenizerOp"; } + + Status Compute(const std::shared_ptr &input, std::shared_ptr *output) override; + + private: + WordpieceTokenizerOp wordpiece_tokenizer_; + BasicTokenizerOp basic_tokenizer_; +}; +} // namespace dataset +} // namespace mindspore +#endif // 
DATASET_TEXT_KERNELS_BERT_TOKENIZER_OP_H_ diff --git a/mindspore/ccsrc/dataset/text/kernels/case_fold_op.cc b/mindspore/ccsrc/dataset/text/kernels/case_fold_op.cc new file mode 100644 index 0000000000..d935608efd --- /dev/null +++ b/mindspore/ccsrc/dataset/text/kernels/case_fold_op.cc @@ -0,0 +1,46 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "dataset/text/kernels/case_fold_op.h" +#include +#include +#include +#include +#include + +#include "unicode/errorcode.h" +#include "unicode/normalizer2.h" +#include "unicode/utypes.h" + +namespace mindspore { +namespace dataset { + +Status CaseFoldOp::Compute(const std::shared_ptr &input, std::shared_ptr *output) { + IO_CHECK(input, output); + icu::ErrorCode error; + const icu::Normalizer2 *nfkc_case_fold = icu::Normalizer2::getNFKCCasefoldInstance(error); + CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "getNFKCCasefoldInstance failed."); + std::vector strs(input->Size()); + int i = 0; + for (auto iter = input->begin(); iter != input->end(); iter++) { + icu::StringByteSink sink(&strs[i++]); + nfkc_case_fold->normalizeUTF8(0, icu::StringPiece((*iter).data(), (*iter).size()), sink, nullptr, error); + CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "normalizeUTF8 failed."); + } + *output = std::make_shared(std::move(strs), input->shape()); + return Status::OK(); +} +} // namespace dataset +} // namespace mindspore diff --git 
a/mindspore/ccsrc/dataset/text/kernels/case_fold_op.h b/mindspore/ccsrc/dataset/text/kernels/case_fold_op.h new file mode 100644 index 0000000000..d1b5ba53f1 --- /dev/null +++ b/mindspore/ccsrc/dataset/text/kernels/case_fold_op.h @@ -0,0 +1,39 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef DATASET_TEXT_KERNELS_CASE_FOLD_OP_H_ +#define DATASET_TEXT_KERNELS_CASE_FOLD_OP_H_ +#include + +#include "dataset/core/tensor.h" +#include "dataset/kernels/tensor_op.h" +#include "dataset/util/status.h" + +namespace mindspore { +namespace dataset { + +class CaseFoldOp : public TensorOp { + public: + CaseFoldOp() {} + + ~CaseFoldOp() override = default; + + void Print(std::ostream &out) const override { out << "CaseFoldOp"; } + + Status Compute(const std::shared_ptr &input, std::shared_ptr *output) override; +}; +} // namespace dataset +} // namespace mindspore +#endif // DATASET_TEXT_KERNELS_CASE_FOLD_OP_H_ diff --git a/mindspore/ccsrc/dataset/text/kernels/jieba_tokenizer_op.cc b/mindspore/ccsrc/dataset/text/kernels/jieba_tokenizer_op.cc index 16f9409645..de1d915fbb 100644 --- a/mindspore/ccsrc/dataset/text/kernels/jieba_tokenizer_op.cc +++ b/mindspore/ccsrc/dataset/text/kernels/jieba_tokenizer_op.cc @@ -29,6 +29,7 @@ JiebaTokenizerOp::JiebaTokenizerOp(const std::string &hmm_path, const std::strin } Status JiebaTokenizerOp::Compute(const std::shared_ptr &input, std::shared_ptr *output) { + 
IO_CHECK(input, output); RETURN_UNEXPECTED_IF_NULL(jieba_parser_); if (input->Rank() != 0 || input->type() != DataType::DE_STRING) { diff --git a/mindspore/ccsrc/dataset/text/kernels/lookup_op.cc b/mindspore/ccsrc/dataset/text/kernels/lookup_op.cc index d4661ea16b..07cf7aef5c 100644 --- a/mindspore/ccsrc/dataset/text/kernels/lookup_op.cc +++ b/mindspore/ccsrc/dataset/text/kernels/lookup_op.cc @@ -24,6 +24,7 @@ LookupOp::LookupOp(std::shared_ptr vocab, WordIdType default_id) : vocab_(vocab), default_id_(default_id), type_(DataType("int32")) {} Status LookupOp::Compute(const std::shared_ptr &input, std::shared_ptr *output) { + IO_CHECK(input, output); RETURN_UNEXPECTED_IF_NULL(vocab_); CHECK_FAIL_RETURN_UNEXPECTED(input->type() == DataType::DE_STRING, "None String Tensor"); std::vector word_ids; diff --git a/mindspore/ccsrc/dataset/text/kernels/lookup_op.h b/mindspore/ccsrc/dataset/text/kernels/lookup_op.h index 58dea21d37..dad99c3241 100644 --- a/mindspore/ccsrc/dataset/text/kernels/lookup_op.h +++ b/mindspore/ccsrc/dataset/text/kernels/lookup_op.h @@ -14,8 +14,8 @@ * limitations under the License. 
*/ -#ifndef DATASET_NLP_KERNELS_LOOKUP_OP_H_ -#define DATASET_NLP_KERNELS_LOOKUP_OP_H_ +#ifndef DATASET_TEXT_KERNELS_LOOKUP_OP_H_ +#define DATASET_TEXT_KERNELS_LOOKUP_OP_H_ #include #include @@ -33,7 +33,7 @@ class LookupOp : public TensorOp { // constructor for lookup, takes in a vocab object // @param std::shared_ptr vocab - // @param WordIdType default_id, id to lookup if a word is not in vocab - explicit LookupOp(std::shared_ptr vocab, WordIdType default_id = Vocab::kSpecialTokens::unk); + explicit LookupOp(std::shared_ptr vocab, WordIdType default_id = 1); ~LookupOp() = default; @@ -61,4 +61,4 @@ class LookupOp : public TensorOp { } // namespace dataset } // namespace mindspore -#endif // DATASET_NLP_KERNELS_LOOKUP_OP_H_ +#endif // DATASET_TEXT_KERNELS_LOOKUP_OP_H_ diff --git a/mindspore/ccsrc/dataset/text/kernels/ngram_op.cc b/mindspore/ccsrc/dataset/text/kernels/ngram_op.cc new file mode 100644 index 0000000000..bbe449a89a --- /dev/null +++ b/mindspore/ccsrc/dataset/text/kernels/ngram_op.cc @@ -0,0 +1,96 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "dataset/text/kernels/ngram_op.h" + +#include +#include +#include +#include + +namespace mindspore { +namespace dataset { + +NgramOp::NgramOp(const std::vector &ngrams, int32_t l_len, int32_t r_len, const std::string &l_pad, + const std::string &r_pad, const std::string &separator) + : ngrams_(ngrams), + l_len_(l_len), + r_len_(r_len), + l_pad_with_sp_(l_pad + separator), + r_pad_with_sp_(r_pad + separator), + separator_(separator) {} + +Status NgramOp::Compute(const std::shared_ptr &input, std::shared_ptr *output) { + IO_CHECK(input, output); + CHECK_FAIL_RETURN_UNEXPECTED(input->type() == DataType::DE_STRING && input->Rank() == 1, "Not a 1-D str Tensor"); + std::vector offsets; // offsets for each str + std::vector res; // holds the result of ngrams + std::string str_buffer; // concat all pad tokens with string interleaved with separators + res.reserve(input->shape().NumOfElements()); // this should be more than enough + offsets.reserve(1 + l_len_ + r_len_ + input->shape().NumOfElements()); + str_buffer.reserve(l_pad_with_sp_.size() * l_len_ + r_pad_with_sp_.size() * r_len_ + input->SizeInBytes()); + offsets.push_back(str_buffer.size()); // insert 0 as the starting pos + for (int i = 0; i < l_len_; i++) offsets.push_back((str_buffer += l_pad_with_sp_).size()); + + for (auto itr = input->begin(); itr != input->end(); itr++) { + str_buffer += (*itr); + str_buffer += separator_; + offsets.push_back(str_buffer.size()); + } + + for (int i = 0; i < r_len_; i++) offsets.push_back((str_buffer += r_pad_with_sp_).size()); + + for (auto n : ngrams_) { + CHECK_FAIL_RETURN_UNEXPECTED(n > 0, "n gram needs to be a positive number.\n"); + int32_t start_ind = l_len_ - std::min(l_len_, n - 1); + int32_t end_ind = offsets.size() - r_len_ + std::min(r_len_, n - 1); + if (end_ind - start_ind <= n) { + res.emplace_back(std::string()); // push back empty string + } else { + CHECK_FAIL_RETURN_UNEXPECTED(end_ind - n >= 0, "Incorrect loop condition"); + + for (int i = 
start_ind; i < end_ind - n; i++) { + res.emplace_back(str_buffer.substr(offsets[i], offsets[i + n] - offsets[i] - separator_.size())); + } + } + } + RETURN_IF_NOT_OK(Tensor::CreateTensor(output, res, TensorShape({static_cast(res.size())}))); + return Status::OK(); +} + +void NgramOp::Print(std::ostream &out) const { + out << "NgramOp: " + << "left pad width: " << l_len_ << " left pad token with separator: " << l_pad_with_sp_ << "\n" + << "right pad width: " << r_len_ << " right pad token with separator: " << r_pad_with_sp_ << "\n" + << "separator: " << separator_ << "\n"; +} + +Status NgramOp::OutputShape(const std::vector &inputs, std::vector &outputs) { + CHECK_FAIL_RETURN_UNEXPECTED(inputs.size() == NumInput(), "incorrect num of inputs\n"); + CHECK_FAIL_RETURN_UNEXPECTED(inputs[0].Rank() == 1, "ngram only works with 1-dim data\n"); + dsize_t num_elements = ngrams_.size(); + for (int32_t n : ngrams_) { + // here since rank == 1, NumOfElements == shape[0]. add padding length to string + int32_t len_with_padding = inputs[0].NumOfElements() + std::min(n - 1, l_len_) + std::min(n - 1, r_len_); + // if len_with_padding - n < 0, this would return an empty string + num_elements += std::max(len_with_padding - n, 0); + } + outputs.emplace_back(TensorShape({num_elements})); + CHECK_FAIL_RETURN_UNEXPECTED(outputs.size() == NumOutput(), "incorrect num of outputs\n"); + return Status::OK(); +} +} // namespace dataset +} // namespace mindspore diff --git a/mindspore/ccsrc/dataset/text/kernels/ngram_op.h b/mindspore/ccsrc/dataset/text/kernels/ngram_op.h new file mode 100644 index 0000000000..3d2c547f79 --- /dev/null +++ b/mindspore/ccsrc/dataset/text/kernels/ngram_op.h @@ -0,0 +1,74 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef DATASET_TEXT_KERNELS_NGRAM_OP_H_ +#define DATASET_TEXT_KERNELS_NGRAM_OP_H_ + +#include +#include +#include + +#include "dataset/core/tensor.h" +#include "dataset/kernels/tensor_op.h" +#include "dataset/util/status.h" + +namespace mindspore { +namespace dataset { +namespace py = pybind11; + +class NgramOp : public TensorOp { + public: + // Constructor of Ngram model + // @param const std::vector &ngrams + // @param int32_tl_len - padding length on the left + // @param int32_t r_len - padding length on the right + // @param const std::string &l_pad - padding token on the left + // @param const std::string &r_pad - padding token on the right + // @param const std::string &separator - use to join strings + NgramOp(const std::vector &ngrams, int32_t l_len, int32_t r_len, const std::string &l_pad, + const std::string &r_pad, const std::string &separator); + + // perform ngram model on each tensor + // @param const std::shared_ptr &input + // @param std::shared_ptr *output + // @return error code + Status Compute(const std::shared_ptr &input, std::shared_ptr *output) override; + + // destructor + ~NgramOp() override = default; + + // @param std::vector &inputs - shape of input tensors + // @param std::vector &outputs - shape of output tensors + // @return error code + Status OutputShape(const std::vector &inputs, std::vector &outputs) override; + + // print arg for debugging + // @param std::ostream &out + void Print(std::ostream &out) const override; + + private: + std::vector ngrams_; // list of n grams + int32_t l_len_; // left padding 
length + int32_t r_len_; // right padding length + std::string l_pad_with_sp_; // left padding appended with separator + std::string r_pad_with_sp_; // right padding appended with separator + std::string separator_; // separator +}; + +} // namespace dataset +} // namespace mindspore + +#endif // DATASET_TEXT_KERNELS_NGRAM_OP_H_ diff --git a/mindspore/ccsrc/dataset/text/kernels/normalize_utf8_op.cc b/mindspore/ccsrc/dataset/text/kernels/normalize_utf8_op.cc new file mode 100644 index 0000000000..b902286576 --- /dev/null +++ b/mindspore/ccsrc/dataset/text/kernels/normalize_utf8_op.cc @@ -0,0 +1,75 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "dataset/text/kernels/normalize_utf8_op.h" +#include +#include +#include +#include +#include + +#include "unicode/errorcode.h" +#include "unicode/normalizer2.h" +#include "unicode/utypes.h" + +namespace mindspore { +namespace dataset { +const NormalizeForm NormalizeUTF8Op::kDefNormalizeForm = NormalizeForm::kNfkc; +Status NormalizeUTF8Op::Compute(const std::shared_ptr &input, std::shared_ptr *output) { + IO_CHECK(input, output); + icu::ErrorCode error; + const icu::Normalizer2 *normalize = nullptr; + switch (normalize_form_) { + case NormalizeForm::kNone: { + *output = input; + return Status::OK(); + } + case NormalizeForm::kNfc: { + normalize = icu::Normalizer2::getNFCInstance(error); + CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "getNFCInstance failed"); + break; + } + case NormalizeForm::kNfkc: { + normalize = icu::Normalizer2::getNFKCInstance(error); + CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "getNFKCInstance failed"); + break; + } + case NormalizeForm::kNfd: { + normalize = icu::Normalizer2::getNFDInstance(error); + CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "getNFDInstance failed"); + break; + } + case NormalizeForm::kNfkd: { + normalize = icu::Normalizer2::getNFKDInstance(error); + CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "getNFKDInstance failed"); + break; + } + default: { + RETURN_STATUS_UNEXPECTED("unexpected normalize form"); + break; + } + } + std::vector strs(input->Size()); + int i = 0; + for (auto iter = input->begin(); iter != input->end(); iter++) { + icu::StringByteSink sink(&strs[i++]); + normalize->normalizeUTF8(0, icu::StringPiece((*iter).data(), (*iter).size()), sink, nullptr, error); + CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "normalizeUTF8 failed."); + } + *output = std::make_shared(std::move(strs), input->shape()); + return Status::OK(); +} +} // namespace dataset +} // namespace mindspore diff --git a/mindspore/ccsrc/dataset/text/kernels/normalize_utf8_op.h 
b/mindspore/ccsrc/dataset/text/kernels/normalize_utf8_op.h new file mode 100644 index 0000000000..5033f2355f --- /dev/null +++ b/mindspore/ccsrc/dataset/text/kernels/normalize_utf8_op.h @@ -0,0 +1,50 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef DATASET_TEXT_KERNELS_NORMALIZE_UTF8_OP_H_ +#define DATASET_TEXT_KERNELS_NORMALIZE_UTF8_OP_H_ +#include + +#include "dataset/core/tensor.h" +#include "dataset/kernels/tensor_op.h" +#include "dataset/util/status.h" + +namespace mindspore { +namespace dataset { +enum class NormalizeForm { + kNone = 0, + kNfc, + kNfkc, + kNfd, + kNfkd, +}; + +class NormalizeUTF8Op : public TensorOp { + public: + static const NormalizeForm kDefNormalizeForm; + explicit NormalizeUTF8Op(NormalizeForm normalize_form = kDefNormalizeForm) : normalize_form_(normalize_form) {} + + ~NormalizeUTF8Op() override = default; + + void Print(std::ostream &out) const override { out << "NormalizeUTF8Op"; } + + Status Compute(const std::shared_ptr &input, std::shared_ptr *output) override; + + private: + NormalizeForm normalize_form_; +}; +} // namespace dataset +} // namespace mindspore +#endif // DATASET_TEXT_KERNELS_NORMALIZE_UTF8_OP_H_ diff --git a/mindspore/ccsrc/dataset/text/kernels/regex_replace_op.cc b/mindspore/ccsrc/dataset/text/kernels/regex_replace_op.cc new file mode 100644 index 0000000000..1ce2c5ea61 --- /dev/null +++ 
b/mindspore/ccsrc/dataset/text/kernels/regex_replace_op.cc @@ -0,0 +1,57 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "dataset/text/kernels/regex_replace_op.h" +#include +#include +#include +#include +#include + +namespace mindspore { +namespace dataset { + +Status RegexReplaceOp::RegexReplace(icu::RegexMatcher *const matcher, const std::string_view &text, + std::string *out) const { + CHECK_FAIL_RETURN_UNEXPECTED((matcher != nullptr && out != nullptr), "Input is null"); + UErrorCode icu_error = U_ZERO_ERROR; + icu::UnicodeString unicode_text = icu::UnicodeString::fromUTF8(text); + matcher->reset(unicode_text); + icu::UnicodeString unicode_out; + if (replace_all_) { + unicode_out = matcher->replaceAll(replace_, icu_error); + } else { + unicode_out = matcher->replaceFirst(replace_, icu_error); + } + CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(icu_error), "RegexReplace failed"); + unicode_out.toUTF8String(*out); + return Status::OK(); +} + +Status RegexReplaceOp::Compute(const std::shared_ptr &input, std::shared_ptr *output) { + IO_CHECK(input, output); + UErrorCode icu_error = U_ZERO_ERROR; + icu::RegexMatcher matcher(pattern_, 0, icu_error); + CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(icu_error), "Create icu RegexMatcher failed, you may input one error pattern"); + std::vector strs(input->Size()); + int i = 0; + for (auto iter = input->begin(); iter != input->end(); iter++) { + 
RETURN_IF_NOT_OK(RegexReplace(&matcher, *iter, &strs[i])); + } + *output = std::make_shared(std::move(strs), input->shape()); + return Status::OK(); +} +} // namespace dataset +} // namespace mindspore diff --git a/mindspore/ccsrc/dataset/text/kernels/regex_replace_op.h b/mindspore/ccsrc/dataset/text/kernels/regex_replace_op.h new file mode 100644 index 0000000000..30fae13241 --- /dev/null +++ b/mindspore/ccsrc/dataset/text/kernels/regex_replace_op.h @@ -0,0 +1,55 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef DATASET_TEXT_KERNELS_REGEX_REPLACE_OP_H_ +#define DATASET_TEXT_KERNELS_REGEX_REPLACE_OP_H_ +#include +#include + +#include "unicode/regex.h" +#include "unicode/errorcode.h" +#include "unicode/utypes.h" + +#include "dataset/core/tensor.h" +#include "dataset/kernels/tensor_op.h" +#include "dataset/util/status.h" + +namespace mindspore { +namespace dataset { + +class RegexReplaceOp : public TensorOp { + public: + RegexReplaceOp(const std::string &pattern, const std::string &replace, bool replace_all = true) + : pattern_(icu::UnicodeString::fromUTF8(pattern)), + replace_(icu::UnicodeString::fromUTF8(replace)), + replace_all_(replace_all) {} + + ~RegexReplaceOp() override = default; + + void Print(std::ostream &out) const override { out << "RegexReplaceOp"; } + + Status Compute(const std::shared_ptr &input, std::shared_ptr *output) override; + + protected: + Status RegexReplace(icu::RegexMatcher *const matcher, const std::string_view &text, std::string *out) const; + + private: + const icu::UnicodeString pattern_; + const icu::UnicodeString replace_; + const bool replace_all_; +}; +} // namespace dataset +} // namespace mindspore +#endif // DATASET_TEXT_KERNELS_REGEX_REPLACE_OP_H_ diff --git a/mindspore/ccsrc/dataset/text/kernels/regex_tokenizer_op.cc b/mindspore/ccsrc/dataset/text/kernels/regex_tokenizer_op.cc new file mode 100644 index 0000000000..34c06f28ea --- /dev/null +++ b/mindspore/ccsrc/dataset/text/kernels/regex_tokenizer_op.cc @@ -0,0 +1,103 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "dataset/text/kernels/regex_tokenizer_op.h" +#include +#include +#include +#include +#include + +namespace mindspore { +namespace dataset { +Status RegexTokenizerOp::GetUnicodeSubstr(const icu::UnicodeString &input, int start, int len, std::string *out_utf8, + icu::UnicodeString *out_unicode) const { + CHECK_FAIL_RETURN_UNEXPECTED((out_utf8 != nullptr || out_unicode != nullptr), "Wrong input"); + int total_len = input.length(); + int end = start + len; + CHECK_FAIL_RETURN_UNEXPECTED((start >= 0 && len > 0 && end <= total_len), "Out of range"); + icu::UnicodeString temp; + input.extract(start, len, temp); + if (out_utf8 != nullptr) { + temp.toUTF8String(*out_utf8); + } + if (out_unicode != nullptr) { + *out_unicode = temp; + } + return Status::OK(); +} + +Status RegexTokenizerOp::GetRegexTokens(const std::string &text, std::vector *out_tokens) const { + UErrorCode status = U_ZERO_ERROR; + out_tokens->clear(); + icu::RegexMatcher token_matcher(delim_pattern_, 0, status); + CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(status), "Create icu RegexMatcher failed, you may input one error pattern"); + icu::RegexMatcher delim_matcher(keep_delim_pattern_, 0, status); + CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(status), "Create icu RegexMatcher failed, you may input one error pattern"); + + icu::UnicodeString utext(icu::UnicodeString::fromUTF8(text)); + token_matcher.reset(utext); + + int token_start_index = 0; + status = U_ZERO_ERROR; + while (token_matcher.find(status) && U_SUCCESS(status)) { + int deli_start_index = token_matcher.start(status); + 
CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(status), "Get RegexMatcher matched start index failed"); + int deli_end_index = token_matcher.end(status); + CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(status), "Get RegexMatcher matched start index failed"); + + // Add non-empty token + int token_len = deli_start_index - token_start_index; + if (token_len > 0) { + std::string token; + RETURN_IF_NOT_OK(GetUnicodeSubstr(utext, token_start_index, token_len, &token)); + out_tokens->emplace_back(std::move(token)); + } + + int delim_len = deli_end_index - deli_start_index; + if (keep_delim_ && delim_len > 0) { + icu::UnicodeString delim_str; + std::string delim_utf8_str; + RETURN_IF_NOT_OK(GetUnicodeSubstr(utext, deli_start_index, delim_len, &delim_utf8_str, &delim_str)); + delim_matcher.reset(delim_str); + if (delim_matcher.matches(status) && U_SUCCESS(status)) { + out_tokens->emplace_back(std::move(delim_utf8_str)); + } + } + token_start_index = deli_end_index; + } + + if (token_start_index < utext.length()) { + std::string temp; + RETURN_IF_NOT_OK(GetUnicodeSubstr(utext, token_start_index, utext.length() - token_start_index, &temp)); + out_tokens->emplace_back(std::move(temp)); + } + return Status::OK(); +} + +Status RegexTokenizerOp::Compute(const std::shared_ptr &input, std::shared_ptr *output) { + IO_CHECK(input, output); + if (input->Rank() != 0 || input->type() != DataType::DE_STRING) { + RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor"); + } + std::string_view text; + RETURN_IF_NOT_OK(input->GetItemAt(&text, {})); + std::vector tokens; + RETURN_IF_NOT_OK(GetRegexTokens(std::string(text.data(), text.size()), &tokens)); + *output = std::make_shared(std::move(tokens), TensorShape({(dsize_t)tokens.size()})); + return Status::OK(); +} +} // namespace dataset +} // namespace mindspore diff --git a/mindspore/ccsrc/dataset/text/kernels/regex_tokenizer_op.h b/mindspore/ccsrc/dataset/text/kernels/regex_tokenizer_op.h new file mode 100644 index 
0000000000..bcf02a4a11 --- /dev/null +++ b/mindspore/ccsrc/dataset/text/kernels/regex_tokenizer_op.h @@ -0,0 +1,58 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef DATASET_TEXT_REGEX_TOKENIZER_OP_H_ +#define DATASET_TEXT_REGEX_TOKENIZER_OP_H_ +#include +#include +#include + +#include "unicode/regex.h" +#include "unicode/errorcode.h" +#include "unicode/utypes.h" + +#include "dataset/core/tensor.h" +#include "dataset/kernels/tensor_op.h" +#include "dataset/util/status.h" + +namespace mindspore { +namespace dataset { + +class RegexTokenizerOp : public TensorOp { + public: + RegexTokenizerOp(const std::string &delim_pattern, const std::string &keep_delim_pattern) + : delim_pattern_(icu::UnicodeString::fromUTF8(delim_pattern)), + keep_delim_pattern_(icu::UnicodeString::fromUTF8(keep_delim_pattern)), + keep_delim_(!keep_delim_pattern.empty()) {} + + ~RegexTokenizerOp() override = default; + + void Print(std::ostream &out) const override { out << "RegexTokenizerOp"; } + + Status Compute(const std::shared_ptr &input, std::shared_ptr *output) override; + + protected: + Status GetUnicodeSubstr(const icu::UnicodeString &input, int start, int len, std::string *out_utf8, + icu::UnicodeString *out_unicode = nullptr) const; + Status GetRegexTokens(const std::string &text, std::vector *out_tokens) const; + + private: + const icu::UnicodeString delim_pattern_; + const icu::UnicodeString 
keep_delim_pattern_; + const bool keep_delim_; +}; +} // namespace dataset +} // namespace mindspore +#endif // DATASET_TEXT_REGEX_TOKENIZER_OP_H_ diff --git a/mindspore/ccsrc/dataset/text/kernels/to_number_op.cc b/mindspore/ccsrc/dataset/text/kernels/to_number_op.cc new file mode 100644 index 0000000000..1368684daf --- /dev/null +++ b/mindspore/ccsrc/dataset/text/kernels/to_number_op.cc @@ -0,0 +1,241 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "dataset/text/kernels/to_number_op.h" + +#include +#include +#include +#include +#include +#include + +#include "dataset/core/data_type.h" +#include "dataset/core/tensor.h" +#include "dataset/core/tensor_shape.h" +#include "dataset/kernels/data/data_utils.h" +#include "dataset/util/status.h" + +namespace mindspore { +namespace dataset { + +ToNumberOp::ToNumberOp(const DataType &cast_to_type) : cast_to_type_(cast_to_type) {} + +ToNumberOp::ToNumberOp(const std::string &cast_to_type) : cast_to_type_(DataType(cast_to_type)) {} + +Status ToNumberOp::Compute(const std::shared_ptr &input, std::shared_ptr *output) { + CHECK_FAIL_RETURN_UNEXPECTED(input->type() == DataType::DE_STRING, "Input tenosrs should have type string."); + + switch (cast_to_type_.value()) { + case DataType::DE_INT8: + RETURN_IF_NOT_OK(ToSignedIntegral(input, output)); + break; + case DataType::DE_INT16: + RETURN_IF_NOT_OK(ToSignedIntegral(input, output)); + break; + case DataType::DE_INT32: + RETURN_IF_NOT_OK(ToSignedIntegral(input, output)); + break; + case DataType::DE_INT64: + RETURN_IF_NOT_OK(ToSignedIntegral(input, output)); + break; + case DataType::DE_UINT8: + RETURN_IF_NOT_OK(ToUnsignedIntegral(input, output)); + break; + case DataType::DE_UINT16: + RETURN_IF_NOT_OK(ToUnsignedIntegral(input, output)); + break; + case DataType::DE_UINT32: + RETURN_IF_NOT_OK(ToUnsignedIntegral(input, output)); + break; + case DataType::DE_UINT64: + RETURN_IF_NOT_OK(ToUnsignedIntegral(input, output)); + break; + case DataType::DE_FLOAT16: + RETURN_IF_NOT_OK(this->ToFloat16(input, output)); + break; + case DataType::DE_FLOAT32: + RETURN_IF_NOT_OK(ToFloat(input, output)); + break; + case DataType::DE_FLOAT64: + RETURN_IF_NOT_OK(ToDouble(input, output)); + break; + } + + return Status::OK(); +} + +void ToNumberOp::Print(std::ostream &out) const { out << "ToNumberOp: casting to " << '\n'; } + +Status ToNumberOp::OutputShape(const std::vector &input_shapes, std::vector &output_shapes) { + 
(void)std::copy(input_shapes.begin(), input_shapes.end(), std::back_inserter(output_shapes)); + return Status::OK(); +} + +template +Status ToNumberOp::ToSignedIntegral(const std::shared_ptr &input, std::shared_ptr *output) { + std::vector casted; + + for (auto it = input->begin(); it != input->end(); ++it) { + bool is_cast_out_of_range = false; + int64_t result = 0; + + try { + result = std::stoll(std::string(*it)); + } catch (const std::out_of_range &) { + is_cast_out_of_range = true; + } catch (const std::invalid_argument &) { + RETURN_STATUS_UNEXPECTED("It is invalid to convert " + std::string(*it) + " to a number."); + } + + if (result > std::numeric_limits::max() || result < std::numeric_limits::min() || is_cast_out_of_range) { + std::string error_message = "String input " + std::string(*it) + " will be out of bounds if casted to " + + cast_to_type_.ToString() + ". The valid range is: [" + + std::to_string(std::numeric_limits::min()) + ", " + + std::to_string(std::numeric_limits::max()) + "]."; + + RETURN_STATUS_UNEXPECTED(error_message); + } + + T casted_result = static_cast(result); + casted.push_back(casted_result); + } + + RETURN_IF_NOT_OK(Tensor::CreateTensor(output, casted, input->shape())); + return Status::OK(); +} + +template +Status ToNumberOp::ToUnsignedIntegral(const std::shared_ptr &input, std::shared_ptr *output) { + std::vector casted; + + for (auto it = input->begin(); it != input->end(); ++it) { + bool is_cast_out_of_range = false; + uint64_t result = 0; + + // If there is a - at the start of the string, it is considered by us to + // be out of bounds. 
If the - is somewhere else in the string, it is + // deemed invalid by std::stoull and will throw std::invalid_argument + for (int i = 0; i < (*it).size(); i++) { + if ((*it)[i] == '-') { + is_cast_out_of_range = true; + break; + } + } + + try { + result = std::stoull(std::string(*it)); + } catch (const std::out_of_range &) { + is_cast_out_of_range = true; + } catch (const std::invalid_argument &) { + RETURN_STATUS_UNEXPECTED("It is invalid to convert " + std::string(*it) + " to an unsigned integer."); + } + + if (result > std::numeric_limits::max() || result < std::numeric_limits::min() || is_cast_out_of_range) { + std::string error_message = "String input " + std::string(*it) + " will be out of bounds if casted to " + + cast_to_type_.ToString() + ". The valid range is: [" + + std::to_string(std::numeric_limits::min()) + ", " + + std::to_string(std::numeric_limits::max()) + "]."; + + RETURN_STATUS_UNEXPECTED(error_message); + } + + T casted_result = static_cast(result); + casted.push_back(casted_result); + } + + RETURN_IF_NOT_OK(Tensor::CreateTensor(output, casted, input->shape())); + return Status::OK(); +} + +Status ToNumberOp::ToFloat16(const std::shared_ptr &input, std::shared_ptr *output) { + // special case, float16 does not exist in c++, no native support for + // casting, so cast to float first then use this method, which use Eigen. 
+ std::shared_ptr temp; + RETURN_IF_NOT_OK(Tensor::CreateTensor(&temp, TensorImpl::kFlexible, input->shape(), DataType("float32"))); + RETURN_IF_NOT_OK(ToFloat(input, &temp)); + RETURN_IF_NOT_OK(mindspore::dataset::ToFloat16(temp, output)); + return Status::OK(); +} + +Status ToNumberOp::ToFloat(const std::shared_ptr &input, std::shared_ptr *output) { + std::vector casted; + + for (auto it = input->begin(); it != input->end(); ++it) { + bool is_cast_out_of_range = false; + float result = 0; + + try { + result = std::stof(std::string(*it)); + } catch (const std::out_of_range &) { + is_cast_out_of_range = true; + } catch (const std::invalid_argument &) { + RETURN_STATUS_UNEXPECTED("It is invalid to convert " + std::string(*it) + " to an unsigned integer."); + } + + if (result > std::numeric_limits::max() || result < std::numeric_limits::lowest() || + is_cast_out_of_range) { + std::string error_message = "String input " + std::string(*it) + " will be out of bounds if casted to " + + cast_to_type_.ToString() + ". 
The valid range is: [" + + std::to_string(std::numeric_limits::lowest()) + ", " + + std::to_string(std::numeric_limits::max()) + "]."; + + RETURN_STATUS_UNEXPECTED(error_message); + } + + float casted_result = static_cast(result); + casted.push_back(casted_result); + } + + RETURN_IF_NOT_OK(Tensor::CreateTensor(output, casted, input->shape())); + return Status::OK(); +} + +Status ToNumberOp::ToDouble(const std::shared_ptr &input, std::shared_ptr *output) { + std::vector casted; + + for (auto it = input->begin(); it != input->end(); ++it) { + bool is_cast_out_of_range = false; + double result = 0; + + try { + result = std::stod(std::string(*it)); + } catch (const std::out_of_range &) { + is_cast_out_of_range = true; + } catch (const std::invalid_argument &) { + RETURN_STATUS_UNEXPECTED("It is invalid to convert " + std::string(*it) + " to an unsigned integer."); + } + + if (result > std::numeric_limits::max() || result < std::numeric_limits::lowest() || + is_cast_out_of_range) { + std::string error_message = "String input " + std::string(*it) + " will be out of bounds if casted to " + + cast_to_type_.ToString() + ". 
The valid range is: [" + + std::to_string(std::numeric_limits::lowest()) + ", " + + std::to_string(std::numeric_limits::max()) + "]."; + + RETURN_STATUS_UNEXPECTED(error_message); + } + + double casted_result = static_cast(result); + casted.push_back(casted_result); + } + + RETURN_IF_NOT_OK(Tensor::CreateTensor(output, casted, input->shape())); + return Status::OK(); +} + +} // namespace dataset +} // namespace mindspore diff --git a/mindspore/ccsrc/dataset/text/kernels/to_number_op.h b/mindspore/ccsrc/dataset/text/kernels/to_number_op.h new file mode 100644 index 0000000000..1346ce2f47 --- /dev/null +++ b/mindspore/ccsrc/dataset/text/kernels/to_number_op.h @@ -0,0 +1,79 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef DATASET_TEXT_KERNELS_TO_NUMBER_OP_H_ +#define DATASET_TEXT_KERNELS_TO_NUMBER_OP_H_ + +#include +#include +#include + +#include "dataset/core/data_type.h" +#include "dataset/core/tensor.h" +#include "dataset/kernels/tensor_op.h" +#include "dataset/util/status.h" + +namespace mindspore { +namespace dataset { + +class ToNumberOp : public TensorOp { + public: + // Constructor of ToNumberOp + // @param const DataType &cast_to_type - the type to convert string inputs to. + explicit ToNumberOp(const DataType &cast_to_type); + + // Constructor of ToNumberOp + // @param const std::string &cast_to_type - the type in string form to convert string inputs to. 
+ explicit ToNumberOp(const std::string &cast_to_type); + + ~ToNumberOp() override = default; + + // Perform numeric conversion on each string in each tensor. + // @param const std::shared_ptr &input + // @param std::shared_ptr *output + // @return error code + Status Compute(const std::shared_ptr &input, std::shared_ptr *output) override; + + // For each input shape, find the output shape + // @param std::vector &inputs - shape of input tensors + // @param std::vector &outputs - shape of output tensors + // @return error code + Status OutputShape(const std::vector &input_shapes, std::vector &output_shapes) override; + + // print arg for debugging + // @param std::ostream &out + void Print(std::ostream &out) const override; + + private: + template + Status ToSignedIntegral(const std::shared_ptr &input, std::shared_ptr *output); + + template + Status ToUnsignedIntegral(const std::shared_ptr &input, std::shared_ptr *output); + + Status ToFloat16(const std::shared_ptr &input, std::shared_ptr *output); + + Status ToFloat(const std::shared_ptr &input, std::shared_ptr *output); + + Status ToDouble(const std::shared_ptr &input, std::shared_ptr *output); + + DataType cast_to_type_; +}; + +} // namespace dataset +} // namespace mindspore + +#endif // DATASET_TEXT_KERNELS_TO_NUMBER_OP_H_ diff --git a/mindspore/ccsrc/dataset/text/kernels/truncate_sequence_pair_op.cc b/mindspore/ccsrc/dataset/text/kernels/truncate_sequence_pair_op.cc new file mode 100644 index 0000000000..136d5006df --- /dev/null +++ b/mindspore/ccsrc/dataset/text/kernels/truncate_sequence_pair_op.cc @@ -0,0 +1,66 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dataset/text/kernels/truncate_sequence_pair_op.h" + +#include "dataset/core/tensor.h" +#include "dataset/kernels/tensor_op.h" +#include "dataset/kernels/data/slice_op.h" + +namespace mindspore { +namespace dataset { + +Status TruncateSequencePairOp::Compute(const TensorRow &input, TensorRow *output) { + IO_CHECK_VECTOR(input, output); + CHECK_FAIL_RETURN_UNEXPECTED(input.size() == 2, "Number of inputs should be two."); + std::shared_ptr seq1 = input[0]; + std::shared_ptr seq2 = input[1]; + CHECK_FAIL_RETURN_UNEXPECTED(seq1->shape().Rank() == 1 && seq2->shape().Rank() == 1, + "Both sequences should be of rank 1"); + dsize_t length1 = seq1->shape()[0]; + dsize_t length2 = seq2->shape()[0]; + dsize_t outLength1 = length1; + dsize_t outLength2 = length2; + + dsize_t total = length1 + length2; + while (total > max_length_) { + if (outLength1 > outLength2) + outLength1--; + else + outLength2--; + total--; + } + std::shared_ptr outSeq1; + if (length1 != outLength1) { + std::unique_ptr slice1(new SliceOp(Slice(outLength1 - length1))); + RETURN_IF_NOT_OK(slice1->Compute(seq1, &outSeq1)); + } else { + outSeq1 = std::move(seq1); + } + + std::shared_ptr outSeq2; + if (length2 != outLength2) { + std::unique_ptr slice2(new SliceOp(Slice(outLength2 - length2))); + RETURN_IF_NOT_OK(slice2->Compute(seq2, &outSeq2)); + } else { + outSeq2 = std::move(seq2); + } + output->push_back(outSeq1); + output->push_back(outSeq2); + return Status::OK(); +} +} // namespace dataset +} // namespace mindspore diff --git 
a/mindspore/ccsrc/dataset/text/kernels/truncate_sequence_pair_op.h b/mindspore/ccsrc/dataset/text/kernels/truncate_sequence_pair_op.h new file mode 100644 index 0000000000..e8be6802a8 --- /dev/null +++ b/mindspore/ccsrc/dataset/text/kernels/truncate_sequence_pair_op.h @@ -0,0 +1,48 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef DATASET_KERNELS_DATA_TRUNCATE_SEQUENCE_PAIR_OP_H_ +#define DATASET_KERNELS_DATA_TRUNCATE_SEQUENCE_PAIR_OP_H_ + +#include +#include +#include +#include +#include + +#include "dataset/core/tensor.h" +#include "dataset/kernels/tensor_op.h" +#include "dataset/kernels/data/type_cast_op.h" +#include "dataset/kernels/data/data_utils.h" + +namespace mindspore { +namespace dataset { + +class TruncateSequencePairOp : public TensorOp { + public: + explicit TruncateSequencePairOp(dsize_t length) : max_length_(length) {} + + ~TruncateSequencePairOp() override = default; + + void Print(std::ostream &out) const override { out << "TruncateSequencePairOp"; } + + Status Compute(const TensorRow &input, TensorRow *output) override; + + private: + dsize_t max_length_; +}; +} // namespace dataset +} // namespace mindspore +#endif // DATASET_KERNELS_DATA_TRUNCATE_SEQUENCE_PAIR_OP_H_ diff --git a/mindspore/ccsrc/dataset/text/kernels/unicode_char_tokenizer_op.cc b/mindspore/ccsrc/dataset/text/kernels/unicode_char_tokenizer_op.cc index 343e079153..063bf21630 100644 --- 
a/mindspore/ccsrc/dataset/text/kernels/unicode_char_tokenizer_op.cc +++ b/mindspore/ccsrc/dataset/text/kernels/unicode_char_tokenizer_op.cc @@ -28,6 +28,7 @@ namespace mindspore { namespace dataset { Status UnicodeCharTokenizerOp::Compute(const std::shared_ptr &input, std::shared_ptr *output) { + IO_CHECK(input, output); if (input->Rank() != 0 || input->type() != DataType::DE_STRING) { RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor"); } diff --git a/mindspore/ccsrc/dataset/text/kernels/unicode_char_tokenizer_op.h b/mindspore/ccsrc/dataset/text/kernels/unicode_char_tokenizer_op.h index 53c42d599e..01a84eca8b 100644 --- a/mindspore/ccsrc/dataset/text/kernels/unicode_char_tokenizer_op.h +++ b/mindspore/ccsrc/dataset/text/kernels/unicode_char_tokenizer_op.h @@ -13,8 +13,8 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#ifndef DATASET_KERNELS_TEXT_UNICODE_CHAR_TOKENIZER_OP_H_ -#define DATASET_KERNELS_TEXT_UNICODE_CHAR_TOKENIZER_OP_H_ +#ifndef DATASET_TEXT_KERNELS_UNICODE_CHAR_TOKENIZER_OP_H_ +#define DATASET_TEXT_KERNELS_UNICODE_CHAR_TOKENIZER_OP_H_ #include #include "dataset/core/tensor.h" @@ -37,4 +37,4 @@ class UnicodeCharTokenizerOp : public TensorOp { } // namespace dataset } // namespace mindspore -#endif // DATASET_KERNELS_TEXT_UNICODE_CHAR_TOKENIZER_OP_H_ +#endif // DATASET_TEXT_KERNELS_UNICODE_CHAR_TOKENIZER_OP_H_ diff --git a/mindspore/ccsrc/dataset/text/kernels/unicode_script_tokenizer_op.cc b/mindspore/ccsrc/dataset/text/kernels/unicode_script_tokenizer_op.cc new file mode 100644 index 0000000000..97a4f1333d --- /dev/null +++ b/mindspore/ccsrc/dataset/text/kernels/unicode_script_tokenizer_op.cc @@ -0,0 +1,93 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "dataset/text/kernels/unicode_script_tokenizer_op.h" +#include +#include +#include +#include +#include + +#include "cppjieba/Unicode.hpp" +#include "unicode/errorcode.h" +#include "unicode/uchar.h" +#include "unicode/uscript.h" + +using cppjieba::DecodeRunesInString; +using cppjieba::RuneStrArray; + +namespace mindspore { +namespace dataset { + +const bool UnicodeScriptTokenizerOp::kDefKeepWhitespace = false; + +Status UnicodeScriptTokenizerOp::Compute(const std::shared_ptr &input, std::shared_ptr *output) { + IO_CHECK(input, output); + if (input->Rank() != 0 || input->type() != DataType::DE_STRING) { + RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor"); + } + std::string_view str; + RETURN_IF_NOT_OK(input->GetItemAt(&str, {})); + RuneStrArray runes; + if (!DecodeRunesInString(str.data(), str.size(), runes)) { + RETURN_STATUS_UNEXPECTED("Decode utf8 string failed."); + } + + UScriptCode last_script = USCRIPT_INVALID_CODE; + icu::ErrorCode status; + int start = 0; + int len = 0; + std::vector splits; + + bool was_space = false; + for (size_t i = 0; i < runes.size(); i++) { + bool is_space = u_isUWhiteSpace(runes[i].rune); + UScriptCode script = uscript_getScript(runes[i].rune, status); + if (status.isFailure()) { + status.reset(); + script = USCRIPT_INVALID_CODE; + } + // 1) Seperate UTF-8 strings of different UScriptCode values + // (such as: "Chinese中国" should be splited to ["Chinese", "中国"]) + // 2) Seperate whitespace and non-whitespace UTF-8 strings + // (such as: " ." 
should be split to [" ", "."]) + if (len > 0 && (script != last_script || is_space != was_space)) { + // 3) If keep_whitespace_ is false, all the whitespace characters will be discard + if (keep_whitespace_ || !was_space) { + std::string temp(str.substr(start, len)); + splits.emplace_back(std::move(temp)); + } + start = runes[i].offset; + len = runes[i].len; + } else { + len += runes[i].len; + } + last_script = script; + was_space = is_space; + } + + if (len > 0 && (keep_whitespace_ || !was_space)) { + std::string temp(str.substr(start, len)); + splits.emplace_back(std::move(temp)); + } + // 4) If the input is empty scalar string, the output will be 1-D empty string. + if (splits.empty()) { + splits.emplace_back(""); + } + *output = std::make_shared(splits, TensorShape({(dsize_t)splits.size()})); + return Status::OK(); +} +} // namespace dataset +} // namespace mindspore diff --git a/mindspore/ccsrc/dataset/text/kernels/unicode_script_tokenizer_op.h b/mindspore/ccsrc/dataset/text/kernels/unicode_script_tokenizer_op.h new file mode 100644 index 0000000000..a77b0b3fa3 --- /dev/null +++ b/mindspore/ccsrc/dataset/text/kernels/unicode_script_tokenizer_op.h @@ -0,0 +1,44 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef DATASET_TEXT_KERNELS_UNICODE_SCRIPT_TOKENIZER_OP_H_ +#define DATASET_TEXT_KERNELS_UNICODE_SCRIPT_TOKENIZER_OP_H_ +#include + +#include "dataset/core/tensor.h" +#include "dataset/kernels/tensor_op.h" +#include "dataset/util/status.h" + +namespace mindspore { +namespace dataset { + +class UnicodeScriptTokenizerOp : public TensorOp { + public: + static const bool kDefKeepWhitespace; + + explicit UnicodeScriptTokenizerOp(bool keep_whitespace = kDefKeepWhitespace) : keep_whitespace_(keep_whitespace) {} + + ~UnicodeScriptTokenizerOp() override = default; + + void Print(std::ostream &out) const override { out << "UnicodeScriptTokenizerOp"; } + + Status Compute(const std::shared_ptr &input, std::shared_ptr *output) override; + + private: + bool keep_whitespace_; // If or not keep whitespace tokens +}; +} // namespace dataset +} // namespace mindspore +#endif // DATASET_TEXT_KERNELS_UNICODE_SCRIPT_TOKENIZER_OP_H_ diff --git a/mindspore/ccsrc/dataset/text/kernels/whitespace_tokenizer_op.cc b/mindspore/ccsrc/dataset/text/kernels/whitespace_tokenizer_op.cc new file mode 100644 index 0000000000..35f3f8d0e2 --- /dev/null +++ b/mindspore/ccsrc/dataset/text/kernels/whitespace_tokenizer_op.cc @@ -0,0 +1,73 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "dataset/text/kernels/whitespace_tokenizer_op.h" +#include +#include +#include +#include +#include + +#include "cppjieba/Unicode.hpp" +#include "unicode/errorcode.h" +#include "unicode/uchar.h" +#include "unicode/uscript.h" + +using cppjieba::DecodeRunesInString; +using cppjieba::RuneStrArray; + +namespace mindspore { +namespace dataset { +Status WhitespaceTokenizerOp::Compute(const std::shared_ptr &input, std::shared_ptr *output) { + IO_CHECK(input, output); + if (input->Rank() != 0 || input->type() != DataType::DE_STRING) { + RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor"); + } + std::string_view str; + RETURN_IF_NOT_OK(input->GetItemAt(&str, {})); + + RuneStrArray runes; + if (!DecodeRunesInString(str.data(), str.size(), runes)) { + RETURN_STATUS_UNEXPECTED("Decode utf8 string failed."); + } + std::vector splits; + int start = 0; + int len = 0; + for (size_t i = 0; i < runes.size(); i++) { + if (u_isUWhiteSpace(runes[i].rune)) { + if (len > 0) { + std::string temp(str.substr(start, len)); + splits.emplace_back(std::move(temp)); + len = 0; + } + } else { + if (len == 0) { + start = runes[i].offset; + } + len += runes[i].len; + } + } + if (len > 0) { + std::string temp(str.substr(start, len)); + splits.emplace_back(std::move(temp)); + } + if (splits.empty()) { + splits.emplace_back(""); + } + *output = std::make_shared(splits, TensorShape({(dsize_t)splits.size()})); + return Status::OK(); +} +} // namespace dataset +} // namespace mindspore diff --git a/mindspore/ccsrc/dataset/text/kernels/whitespace_tokenizer_op.h b/mindspore/ccsrc/dataset/text/kernels/whitespace_tokenizer_op.h new file mode 100644 index 0000000000..6d0bab0bea --- /dev/null +++ b/mindspore/ccsrc/dataset/text/kernels/whitespace_tokenizer_op.h @@ -0,0 +1,39 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef DATASET_TEXT_KERNELS_WHITESPACE_TOKENIZER_OP_H_ +#define DATASET_TEXT_KERNELS_WHITESPACE_TOKENIZER_OP_H_ +#include + +#include "dataset/core/tensor.h" +#include "dataset/kernels/tensor_op.h" +#include "dataset/util/status.h" + +namespace mindspore { +namespace dataset { + +class WhitespaceTokenizerOp : public TensorOp { + public: + WhitespaceTokenizerOp() {} + + ~WhitespaceTokenizerOp() override = default; + + void Print(std::ostream &out) const override { out << "WhitespaceTokenizerOp"; } + + Status Compute(const std::shared_ptr &input, std::shared_ptr *output) override; +}; +} // namespace dataset +} // namespace mindspore +#endif // DATASET_TEXT_KERNELS_WHITESPACE_TOKENIZER_OP_H_ diff --git a/mindspore/ccsrc/dataset/text/kernels/wordpiece_tokenizer_op.cc b/mindspore/ccsrc/dataset/text/kernels/wordpiece_tokenizer_op.cc new file mode 100644 index 0000000000..48092d89cd --- /dev/null +++ b/mindspore/ccsrc/dataset/text/kernels/wordpiece_tokenizer_op.cc @@ -0,0 +1,117 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dataset/text/kernels/wordpiece_tokenizer_op.h" +#include +#include + +namespace mindspore { +namespace dataset { + +const char WordpieceTokenizerOp::kDefSuffixIndicator[] = "##"; +const int WordpieceTokenizerOp::kDefMaxBytesPerToken = 100; +const char WordpieceTokenizerOp::kDefUnknownToken[] = "[UNK]"; + +WordpieceTokenizerOp::WordpieceTokenizerOp(const std::shared_ptr &vocab, const std::string &suffix_indicator, + const int &max_bytes_per_token, const std::string &unknown_token) + : vocab_(vocab), + suffix_indicator_(suffix_indicator), + max_bytes_per_token_(max_bytes_per_token), + unknown_token_(unknown_token) {} + +Status WordpieceTokenizerOp::LookupWord(const std::string &input_token, const RuneStrArray &runes, const int start, + bool *out_found, int *out_end) const { + CHECK_FAIL_RETURN_UNEXPECTED(start >= 0 && start < input_token.size(), "Out of range"); + *out_found = false; + for (int i = runes.size() - 1; i >= 0; i--) { + *out_end = runes[i].offset + runes[i].len; + int len = *out_end - start; + std::string word = input_token.substr(start, len); + if (start > 0) { + word = suffix_indicator_ + word; + } + WordIdType default_id = -1; + if (vocab_->Lookup(word, default_id) != default_id) { + *out_found = true; + break; + } + } + return Status::OK(); +} + +Status WordpieceTokenizerOp::FoundNoToken(const std::string &input_token, std::vector *out_tokens) const { + out_tokens->clear(); + if (unknown_token_.empty()) { + out_tokens->emplace_back(input_token); + } else { + out_tokens->emplace_back(unknown_token_); + } + return Status::OK(); +} + +Status WordpieceTokenizerOp::AddSubword(const std::string &input_token, const int start, const int end, + std::vector *out_tokens) const { + CHECK_FAIL_RETURN_UNEXPECTED(start >= 0 && end > start && end <= input_token.size(), "Out of range"); + std::string subword = input_token.substr(start, end - 
start); + if (start > 0) { + subword = suffix_indicator_ + subword; + } + out_tokens->emplace_back(subword); + return Status::OK(); +} + +Status WordpieceTokenizerOp::GetTokens(const std::string &input_token, std::vector *out_tokens) const { + if (input_token.size() > max_bytes_per_token_) { + return FoundNoToken(input_token, out_tokens); + } + RuneStrArray runes; + if (!DecodeRunesInString(input_token.data(), input_token.size(), runes)) { + RETURN_STATUS_UNEXPECTED("Decode utf8 string failed."); + } + int end; + for (int start = 0; start < input_token.size();) { + bool found; + RETURN_IF_NOT_OK(LookupWord(input_token, runes, start, &found, &end)); + if (found) { + RETURN_IF_NOT_OK(AddSubword(input_token, start, end, out_tokens)); + start = end; + } else { + return FoundNoToken(input_token, out_tokens); + } + } + return Status::OK(); +} + +Status WordpieceTokenizerOp::Compute(const std::shared_ptr &input, std::shared_ptr *output) { + IO_CHECK(input, output); + if (input->Rank() > 1 || input->type() != DataType::DE_STRING) { + RETURN_STATUS_UNEXPECTED("The input tensor should be scalar or 1-D string tensor"); + } + std::vector out_tokens; + for (auto iter = input->begin(); iter != input->end(); iter++) { + std::vector temp_tokens; + RETURN_IF_NOT_OK(GetTokens(std::string(*iter), &temp_tokens)); + out_tokens.insert(out_tokens.end(), temp_tokens.begin(), temp_tokens.end()); + } + if (out_tokens.empty()) { + out_tokens.emplace_back(""); + } + *output = std::make_shared(out_tokens, TensorShape({(dsize_t)out_tokens.size()})); + return Status::OK(); +} + +} // namespace dataset +} // namespace mindspore diff --git a/mindspore/ccsrc/dataset/text/kernels/wordpiece_tokenizer_op.h b/mindspore/ccsrc/dataset/text/kernels/wordpiece_tokenizer_op.h new file mode 100644 index 0000000000..c9a75025c6 --- /dev/null +++ b/mindspore/ccsrc/dataset/text/kernels/wordpiece_tokenizer_op.h @@ -0,0 +1,66 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache 
License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef DATASET_TEXT_KERNELS_WORDPIECE_TOKENIZER_OP_H_ +#define DATASET_TEXT_KERNELS_WORDPIECE_TOKENIZER_OP_H_ +#include +#include +#include +#include + +#include "cppjieba/Unicode.hpp" + +#include "dataset/core/tensor.h" +#include "dataset/kernels/tensor_op.h" +#include "dataset/text/vocab.h" +#include "dataset/util/status.h" + +using cppjieba::DecodeRunesInString; +using cppjieba::RuneStrArray; +namespace mindspore { +namespace dataset { + +class WordpieceTokenizerOp : public TensorOp { + public: + static const char kDefSuffixIndicator[]; + static const int kDefMaxBytesPerToken; + static const char kDefUnknownToken[]; + WordpieceTokenizerOp(const std::shared_ptr &vocab, const std::string &suffix_indicator = kDefSuffixIndicator, + const int &max_bytes_per_token = kDefMaxBytesPerToken, + const std::string &unknown_token = kDefUnknownToken); + + ~WordpieceTokenizerOp() override = default; + + void Print(std::ostream &out) const override { out << "WordpieceTokenizerOp"; } + + Status Compute(const std::shared_ptr &input, std::shared_ptr *output) override; + + protected: + Status AddSubword(const std::string &input_token, const int start, const int end, + std::vector *out_token) const; + Status FoundNoToken(const std::string &input_token, std::vector *out_tokens) const; + Status LookupWord(const std::string &input_token, const RuneStrArray &runes, const int start, bool *out_found, + int *out_end) const; + Status GetTokens(const std::string 
&input_token, std::vector *out_tokens) const; + + private: + const std::shared_ptr vocab_; + const std::string suffix_indicator_; + const int max_bytes_per_token_; + const std::string unknown_token_; +}; +} // namespace dataset +} // namespace mindspore +#endif // DATASET_TEXT_KERNELS_WORDPIECE_TOKENIZER_OP_H_ diff --git a/mindspore/ccsrc/dataset/text/vocab.cc b/mindspore/ccsrc/dataset/text/vocab.cc index 893336c62a..100dc9d655 100644 --- a/mindspore/ccsrc/dataset/text/vocab.cc +++ b/mindspore/ccsrc/dataset/text/vocab.cc @@ -14,51 +14,53 @@ * limitations under the License. */ #include -#include +#include +#include #include #include "dataset/text/vocab.h" namespace mindspore { namespace dataset { -Vocab::Vocab(std::unordered_map word2id) { - word2id_ = std::move(word2id); - id2word_.resize(word2id_.size()); - for (auto p : word2id_) { - id2word_[p.second - kSpecialTokens::num_tokens] = p.first; - } -} +Vocab::Vocab(std::unordered_map word2id) { word2id_ = std::move(word2id); } WordIdType Vocab::Lookup(const WordType &word, WordIdType default_id) const { auto itr = word2id_.find(word); return itr == word2id_.end() ? 
default_id : itr->second; } -WordType Vocab::Lookup(WordIdType id) const { - if (id < kSpecialTokens::num_tokens) { - return reserved_token_str_[id]; - } else if (id - kSpecialTokens::num_tokens >= id2word_.size()) { - return reserved_token_str_[kSpecialTokens::unk]; - } else { - return id2word_[id - kSpecialTokens::num_tokens]; - } -} -Status Vocab::BuildFromPyList(const py::list &words, std::shared_ptr *vocab) { +Status Vocab::BuildFromPyList(const py::list &words, const py::list &special_tokens, bool prepend_special, + std::shared_ptr *vocab) { + // check of duplication on both words and special_tokens will be performed in python + // special_tokens and words both need to be unique, and shouldn't overlap std::unordered_map word2id; - WordIdType word_id = kSpecialTokens::num_tokens; + // if special is added in front, normal words id will start from number of special tokens + WordIdType word_id = prepend_special ? static_cast(special_tokens.size()) : 0; + for (auto word : words) { - const std::string s = py::str(word); - CHECK_FAIL_RETURN_UNEXPECTED(word2id.find(s) == word2id.end(), "duplicate word:" + s); - word2id[s] = word_id++; + word2id[py::str(word)] = word_id++; } + + word_id = prepend_special ? 0 : word2id.size(); + + for (auto special_token : special_tokens) { + word2id[py::str(special_token)] = word_id++; + } + *vocab = std::make_shared(std::move(word2id)); return Status::OK(); } Status Vocab::BuildFromFile(const std::string &path, const std::string &delimiter, int32_t vocab_size, - std::shared_ptr *vocab) { + const py::list &special_tokens, bool prepend_special, std::shared_ptr *vocab) { + // python validator checks special_tokens doesn't contain any duplicate words + std::unordered_set specials; + // used to check that words in file don't contain any special token that already exists + for (auto word : special_tokens) { + specials.insert(py::str(word)); + } + WordIdType word_id = prepend_special ? 
static_cast(special_tokens.size()) : 0; std::unordered_map word2id; - WordIdType word_id = kSpecialTokens::num_tokens; std::fstream handle(path, std::ios::in); CHECK_FAIL_RETURN_UNEXPECTED(handle.good() && handle.is_open(), "fail to open:" + path); std::string word; @@ -67,35 +69,36 @@ Status Vocab::BuildFromFile(const std::string &path, const std::string &delimite // if delimiter is not found, find_first_of would return std::string::npos which is -1 word = word.substr(0, word.find_first_of(delimiter)); } - CHECK_FAIL_RETURN_UNEXPECTED(word2id.find(word) == word2id.end(), "duplicate word:" + word); + CHECK_FAIL_RETURN_UNEXPECTED(word2id.find(word) == word2id.end(), "duplicate word:" + word + "."); + CHECK_FAIL_RETURN_UNEXPECTED(specials.find(word) == specials.end(), word + " is already in special_tokens."); word2id[word] = word_id++; // break if enough row is read, if vocab_size is smaller than 0 - if (word_id == vocab_size + kSpecialTokens::num_tokens) break; + if (word2id.size() == vocab_size) break; } + + word_id = prepend_special ? 
0 : word2id.size(); + + for (auto special_token : special_tokens) { + word2id[py::str(special_token)] = word_id++; + } + *vocab = std::make_shared(std::move(word2id)); return Status::OK(); } Status Vocab::BuildFromPyDict(const py::dict &words, std::shared_ptr *vocab) { std::unordered_map word2id; - std::map id2word; for (auto p : words) { - WordIdType word_id = py::reinterpret_borrow(p.second); - if (word_id < kSpecialTokens::num_tokens) continue; // skip id that are reserved - std::string word = py::str(p.first); - CHECK_FAIL_RETURN_UNEXPECTED(id2word.find(word_id) == id2word.end(), "duplicate id:" + word); - id2word[word_id] = word; - } - - WordIdType cnt = kSpecialTokens::num_tokens; - for (auto p : id2word) { - CHECK_FAIL_RETURN_UNEXPECTED(p.first == cnt++, "word id needs to be continuous starting from 2"); - word2id[p.second] = p.first; + word2id[py::str(p.first)] = py::reinterpret_borrow(p.second); } - *vocab = std::make_shared(std::move(word2id)); return Status::OK(); } -const std::vector Vocab::reserved_token_str_ = {"", ""}; + +void Vocab::append_word(const std::string &word) { + if (word2id_.find(word) == word2id_.end()) { + word2id_[word] = word2id_.size(); + } +} } // namespace dataset } // namespace mindspore diff --git a/mindspore/ccsrc/dataset/text/vocab.h b/mindspore/ccsrc/dataset/text/vocab.h index 3dcc88c434..fc21c380a2 100644 --- a/mindspore/ccsrc/dataset/text/vocab.h +++ b/mindspore/ccsrc/dataset/text/vocab.h @@ -14,8 +14,8 @@ * limitations under the License. 
*/ -#ifndef DATASET_NLP_VOCAB_H_ -#define DATASET_NLP_VOCAB_H_ +#ifndef DATASET_TEXT_VOCAB_H_ +#define DATASET_TEXT_VOCAB_H_ #include #include @@ -45,7 +45,8 @@ class Vocab { // @param const py::list &words - a list of string, used to build vocab, id starts from 2 // @param std::shared_ptr *vocab - return value, vocab object // @return error code - static Status BuildFromPyList(const py::list &words, std::shared_ptr *vocab); + static Status BuildFromPyList(const py::list &words, const py::list &special_tokens, bool prepend_special, + std::shared_ptr *vocab); // Build a vocab from reading a vocab file, id are automatically assigned, start from 2 // @param std::string &path - path to vocab file , each line is assumed to contain 1 word @@ -54,7 +55,7 @@ class Vocab { // @param std::shared_ptr *vocab - return value, vocab object // @return error code static Status BuildFromFile(const std::string &path, const std::string &delimiter, int32_t vocab_size, - std::shared_ptr *vocab); + const py::list &special_tokens, bool prepend_special, std::shared_ptr *vocab); // Lookup the id of a word, if word doesn't exist in vocab, return default_id // @param const WordType word - word to look up @@ -65,26 +66,26 @@ class Vocab { // reverse lookup, lookup the word based on its id // @param WordIdType id - word id to lookup to // @return WordType the word - WordType Lookup(WordIdType id) const; + WordType Lookup(WordIdType id); // constructor, shouldn't be called directly, can't be private due to std::make_unique() // @param std::unordered_map map - sanitized word2id map explicit Vocab(std::unordered_map map); - ~Vocab() = default; + Vocab() = default; - // enum type that holds all special tokens, add more if needed - enum kSpecialTokens : WordIdType { pad = 0, unk = 1, num_tokens = 2 }; + // add one word to vocab, increment it's index automatically + // @param std::string & word - word to be added will skip if word already exists + void append_word(const std::string &word); - // 
reversed lookup table for the reserved tokens - static const std::vector reserved_token_str_; + // destructor + ~Vocab() = default; private: std::unordered_map word2id_; - std::vector id2word_; // reverse lookup }; } // namespace dataset } // namespace mindspore -#endif // DATASET_NLP_VOCAB_H_ +#endif // DATASET_TEXT_VOCAB_H_ diff --git a/mindspore/ccsrc/dataset/util/README.md b/mindspore/ccsrc/dataset/util/README.md index f62d77d1df..7cad3c0d7d 100644 --- a/mindspore/ccsrc/dataset/util/README.md +++ b/mindspore/ccsrc/dataset/util/README.md @@ -1,72 +1,426 @@ -# Event -The header file WaitPost.h contains the implementation of an event which is a type of synchronization mechanism that is used to indicate to waiting processes when a particular condition has become true. +This folder contains miscellaneous utilities used by the dataset code. We will describe a couple important classes in this file. +## Thread Management +This picture summarizes a few important classes that we will cover in the next few sections. -An event is created with initial state set to false. It provides the following operations: -* `wait` - causes the suspension of the executing process until the state of the event is set to true. If the state is already set to true has no effect. -* `set` - sets the event's state to true, releasing all waiting processes. -* `clear` - sets the event's state to false. +![Thread management](https://images.gitee.com/uploads/images/2020/0601/220111_9b07c8fa_7342120.jpeg "task_manager.JPG") -# Counting Semaphore -The header file Semaphore.h contains the implementation of counting semaphore. Conceptually, a semaphore is a nonnegative integer count. Semaphores are typically used to coordinate access to resources, with the semaphore count initialized to the number of free resources. Threads then atomically increment the count when resources are added and atomically decrement the count when resources are removed. 
+## Task +A Task object corresponds to an instance of std::future returning from std::async. In general, a user will not create a Task object directly. Most work will go through TaskManager's TaskGroup interface which we will cover later in this document. Here are some important members and functions of Task class. +```cpp +std::function fnc_obj_; +``` +It is the entry function when the thead is spawned. The function does not take any input and will return a Status object. The returned Status object will be saved in this member +```cpp +Status rc_; +``` +To retrieve the executed result from the entry function, call the following function +```cpp +Status Task::GetTaskErrorIfAny(); +``` +Here is roughly the pseudo code of a lifetime of a Task. Some extra works needed to spawn the thread are omitted for the purpose of simplicity. As mentioned previously, a user never spawn a thread directly using a Task class without using any helper. + +```cpp +1 Task tk = Task("A name for this thread", []() -> Status { +2 return Status::OK(); +3 }); +4 RETURN_IF_NOT_OK(tk.Run()); +5 RETURN_IF_NOT_OK(tk.Join();) +6 RETURN_IF_NOT_OK(tk.GetTaskErrorIfAny()); +``` +In the above example line 1 to 3 we use Task constructor to prepare a thread that we are going to create and what it will be running. We also assign a name to this thread. The name is for eye catcher purpose. The second parameter is the real job for this thread to run. +
Line 4 we spawn the thread. In the above example, the thread will execute the lambda function which does nothing but return an OK Status object. +
Line 5 we wait for the thread to complete. +
Line 6 We retrieve the result from running the thread which should be the OK Status object. + +Another purpose of Task object is to wrap around the entry function and capture any possible exceptions thrown by running the entry function but not being caught within the entry function. +```cpp + try { + rc_ = fnc_obj_(); + } catch (const std::bad_alloc &e) { + rc_ = Status(StatusCode::kOutOfMemory, __LINE__, __FILE__, e.what()); + } catch (const std::exception &e) { + rc_ = Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, e.what()); + } +``` +Note that +```cpp +Status Task::Run(); +``` +is not returning the Status of running the entry function func_obj_. It merely indicates if the spawn is successful or not. This function returns immediately. + +Another thing to point out that Task::Run() is not designed to re-run the thread repeatedly, say after it has returned. Result will be unexpected if a Task object is re-run. + +For the function +```cpp +Status Task::Join(WaitFlag wf = WaitFlag::kBlocking); +``` +where +```cpp +enum class WaitFlag : int { kBlocking, kNonBlocking }; +``` +is also not returning the Status of running the entry function func_obj_ like the function Run(). It can return some other unexpected error while waiting for the thread to return. + +This function blocks (kBlocking) by default until the spawned thread returns. + +As mentioned previously, use the function GetTaskErrorIfAny() to fetch the result from running the entry function func_obj_. + +The non-blocking version (kNonBlocking) of Join allows us to force the thread to return if timed out. + +```cpp +while (thrd_.wait_for(std::chrono::seconds(1)) != std::future_status::ready) { + // Do something if the thread is blocked on a conditional variable +} +``` +The main use of this form of Join() is after we have interrupted the thread. + +A design alternative is to use +```cpp +std::future +``` +to spawn the thread asynchronously and we can get the result using std::future::get(). 
But get() can only be called once and it is then more convenient to save the returned result in the rc_ member for unlimited number of retrieval. As we shall see later, the value of rc_ will be propagated to high level classes like TaskGroup, master thread. + +Currently it is how the thread is defined in Task class +```cpp +std::future thrd_; +``` +and spawned by this line of code. +```cpp +thrd_ = std::async(std::launch::async, std::ref(*this)); +``` +Every thread can access its own Task object using the FindMe() function. +```cpp +Task * TaskManager::FindMe(); +``` + +There are other attributes of Task such as interrupt which we will cover later in this document. + +## TaskGroup +The first helper in managing Task objects is TaskGroup. Technically speaking a TaskGroup is a collection of related Tasks. As of this writing, every Task must belong to a TaskGroup. We spawn a thread using the following function +```cpp +Status TaskGroup::CreateAsyncTask(const std::string &my_name, const std::function &f, Task **pTask = nullptr); +``` +The created Task object is added to the TaskGroup object. In many cases, user do not need to get a reference to the newly created Task object. But the CreateAsyncTask can return one if requested. + +There is no other way to add a Task object to a TaskGroup other than by calling TaskGroup::CreateAsyncTask. As a result, no Task object can belong to multiple TaskGroup's by design. Every Task object has a back pointer to the TaskGroup it belongs to : +```cpp +TaskGroup *Task::MyTaskGroup(); +``` + +Task objects in the same TaskGroup will form a linked list with newly created Task object appended to the end of the list. + +Globally we support multiple TaskGroups's running concurrently. TaskManager (discussed in the next section) will chain all Task objects from all TaskGroup's in a single LRU linked list. + +###### HandShaking +As of this writing, the following handshaking logic is required. 
Suppose a thread T1 create another thread, say T2 by calling TaskGroup::CreateAsyncTask. T1 will block on a WaitPost area until T2 post back signalling T1 can resume. +```cpp +// Entry logic of T2 +auto *myTask = TaskManager::FindMe(); +myTask->Post(); +``` +If T2 is going to spawn more threads, say T3 and T4, it is *highly recommended* that T2 wait for T3 and T4 to post before it posts back to T1. + +The purpose of the handshake is to provide a way for T2 to synchronize with T1 if necessary. + +TaskGroup provides similar functions as Task but at a group level. +```cpp +void TaskGroup::interrupt_all() noexcept; +``` +This interrupt all the threads currently running in the TaskGroup. The function returns immediately. We will cover more details on the mechanism of interrupt later in this document. +```cpp +Status TaskGroup::join_all(Task::WaitFlag wf = Task::WaitFlag::kBlocking); +``` +This performs Task::Join() on all the threads in the group. This is a blocking call by default. +```cpp +Status TaskGroup::GetTaskErrorIfAny(); +``` +A TaskGroup does not save records for all the Task::rc_ for all the threads in this group. Only the first error is saved. For example, if thread T1 reports error rc1 and later on T2 reports error rc2, only rc1 is saved in the TaskGroup and rc2 is ignored. TaskGroup::GetTaskErrorIfAny() will return rc1 in this case. +```cpp +int size() const noexcept; +``` +This returns the size of the TaskGroup. + +## TaskManager +TaskManager is a singleton, meaning there is only one such class object. It is created by another Services singleton object which we will cover it in the later section. +```cpp +TaskManager &TaskManager::GetInstance() +``` +provides the method to access the singleton. -When the semaphore count becomes zero, indicating that no more resources are present, threads trying to decrement the semaphore block wait until the count becomes greater than zero. +TaskManager manages all the TaskGroups and all the Tasks objects ever created. 
+```cpp + List lru_; + List free_lst_; + std::set grp_list_; +``` +As mentioned previously, all the Tasks in the same TaskGroup are linked in a linked list local to this TaskGroup. At the TaskManager level, all Task objects from all the TaskGroups are linked in the lru_ list. -Two operations are provided -* `P`(). Decrement the semaphore count. If the count is 0, the current thread is blocked. -* `V`(). Increment the semaphore count. Wake up one of the threads that are currently blocked. Note that the current implementation wakes up one of the blocked threads instead of waking up all of them. +When a thread finished its job and returned, its corresponding Task object is saved for reuse in the free_lst_. When a new thread is created, TaskManager will first look into the free_lst_ before allocating memory for the new Task object. -# List -It is a doubly linked structure used solely by Buffer Manager. List can used for general purpose. The reason we use a home grown linked list because Buffer Manager manages several linked lists and an element can simultaneously in more than one list. Using STL C++ container is not as efficient as the home grown linked list. +```cpp + std::shared_ptr master_; +``` +The master thread itself also has a corresponding **fake** Task object in the TaskManager singleton object. But this fake Task is not in any of the List -# Consumer/Producer Queue -The header file Queue.h contains a generic implementation of producer/consumer queue. The problem describes two processes, the producer and the consumer, who share a common, fixed-size buffer used as a queue. The producer's job is to generate data, put it into the buffer, and start again. At the same time, the consumer is consuming the data (i.e., removing it from the buffer), one piece at a time. 
+###### Passing error to the master thread +```cpp +void TaskManager::InterruptGroup(Task &); +void TaskManager::InterruptMaster(const Status &); +Status Status::GetMasterThreadRc(); +``` +When a thread encounters some unexpected error, it performs the following actions before returning +* It saves the error rc in the TaskGroup it belongs (assuming it is the first error reported in the TaskGroup). +* It interrupts every other threads in the TaskGroup by calling TaskManager::InterruptGroup. +* It interrupts the master thread and copy the error rc to the TaskManager::master_::rc_ by calling TaskManager::InterruptMaster(rc). However, because there can be many TaskGroups running in parallel or back to back, if the TaskManager::master_::rc_ is already set to some error from earlier TaskGroup run but not yet retrieved, the old error code will **not** be overwritten by the new error code. -It has the following template signature +Master thread can query the result using TaskGroup::GetTaskErrorIfAny or TaskManager::GetMasterThreadRc. The first form is the *preferred* method. For the second form, TaskManager::master_::rc_ will be reset to OK() once retrieved such that future call of TaskManager::InterruptMaster() will populate the error to the master thread again. + +###### WatchDog +TaskManager will spawn an additional thread with "Watchdog" as name catcher. It executes the following function once startup +```cpp +Status TaskManager::WatchDog() { + TaskManager::FindMe()->Post(); + errno_t err = sem_wait(&sem_); + if (err == -1) { + RETURN_STATUS_UNEXPECTED("Errno = " + std::to_string(errno)); + } + // We are woken up by control-c and we are going to stop all threads that are running. + // In addition, we also want to prevent new thread from creating. This can be done + // easily by calling the parent function. 
+ RETURN_IF_NOT_OK(ServiceStop()); + return Status::OK(); +} ``` - template - class Queue { +Its main purpose is to handle Control-C and stop all the threads from running by interrupting all of them. We will cover more on the function call ServiceStop() when we reach the section about Service class. + +WatchDog has its own TaskGroup to follow the protocol but it is not in the set of all the TaskGroup. +## Interrupt +C++ std::thread and std::async do not provide a way to stop a thread. So we implement interrupt mechanism to stop a thread from running and exit. + +The initial design can be considered as a polling method. A bit or a flag may be set in some global shared area. The running thread will periodically check this bit/flag. If it is set, interrupt has been sent and the thread will quit. This method has a requirement that even if the thread is waiting on a std::conditional_variable, it can't do an unconditional wait() call. That is, it must do a wait_for() with a time out. Once returned from the wait_for() call, the thread must check if it is woken up due to time out or due to the condition is satisfied. + +The cons of this approach is the performance cost and we design a pushing method approach. + +To begin with we define an abstract class that describe objects that are interruptible. + +```cpp +class IntrpResource { ... }; +``` +It has two states: +```cpp + enum class State : int { kRunning, kInterrupted }; ``` -_SIZE_ is the capacity of the queue. -_T_ is the object class that represents the data that are produced and consumed by the producer and consumer respectively. +either it is in the state of running or being interrupted. +There are two virtual functions that any class inherit can override +```cpp +virtual Status Interrupt(); +virtual void ResetIntrpState(); +``` +Interrupt() in the base class change the state of the object to kInterrupted. ResetIntrpState() is doing the opposite to reset the state. 
Any class that inherits the base class can implement its own Interrupt(), for example, we will later on see how a CondVar class (a wrapper for std::condition_variable) deals with interrupt on its own. -Initially the Queue is empty and all consumers are blocked. +All related IntrpResource can register to a +```cpp +class IntrpService {...} +``` +It provides the public method +```cpp + void InterruptAll() noexcept; +``` +which goes through all registered IntrpResource objects and call the corresponding Interrupt(). -The implementation of Queue is based on counting semaphore above. +A IntrpResource is always associated with a TaskGroup: +```cpp +class TaskGroup { + ... + std::shared_ptr intrp_svc_; + ... +}; +``` -The following operations are provided -* void `push_back`(const T&) used by producer to add data to the queue. -* T `pop_front`() used by consumer to retrieve the data from the queue. +As of this writing, both push and poll methods are used. There are still a few places (e.g. a busy while loop) where a thread must periodically check for interrupt. +## CondVar +A CondVar class is a wrapper of std::condition_variable +```cpp + std::condition_variable cv_; +``` +and is interruptible : +```cpp +class CondVar : public IntrpResource { ... } +``` +It overrides the Interrupt() method with its own +```cpp +void CondVar::Interrupt() { + IntrpResource::Interrupt(); + cv_.notify_all(); +} +``` +It provides a Wait() method and is equivalent to std::condition_variable::wait. +```cpp +Status Wait(std::unique_lock *lck, const std::function &pred); +``` +The main difference is Wait() is interruptible. Thread returning from Wait must check Status return code if it is being interrupted. -# Memory Pool -Two different kinds of memory pools are provided. While they behave differently, they have identical interfaces -* void * `allocate`(size_t reqSize). 
It allocates memory from the pool where reqSize is the size of memory requested -* void `deallocate`(void *p) returns the memory previously acquired by allocate pointed to by p back to the memory pool -* void `Reallocate`(void **pp, size_t oldSize, size_t newSize). Enlarge or shrink the memory acquired previously by allocate to the new size. The old pointer is passed in and a new pointer (or maybe the same ond) is returned. +Note that once a CondVar is interrupted, its state remains interrupted until it is reset. +## WaitPost +A WaitPost is an implementation of Event. In brief, it consists of a boolean state and provides methods to synchronize running threads. +* Wait(). If the boolean state is false, the calling threads will block until the boolean state becomes true or an interrupt has occurred. +* Set(). Change the boolean state to true. All blocking threads will be released. +* Clear(). Reset the boolean state back to false. -C++ operator **new** and **delete** are also overloaded to make use of the customized memory pools. +WaitPost is implemented on top of CondVar and hence is interruptible, that is, callers of +```cpp +Status Wait(); +``` +must check the return Status for interrupt. -Both functions allocate and deallocate can throw `std::bad_alloc` if running out of memory from the arena. It is user's responsibility to catch the out of memory exception. +The initial boolean state is false when a WaitPost object is created. Note that once a Set() call is invoked, the boolean state remains true until it is reset. +## List +A List is the implementation of a doubly linked list. It is not thread safe and so the user must provide methods to serialize the access to the list. -An allocator header file Allocator.h is created to provided additional support to hook into the C++ STL container such as vector or map to allocate memory from the customized memory pools. +The main feature of List is that it allows an element to be inserted into multiple Lists.
Take the Task class as an example. It can be in its TaskGroup list and at the same time linked in the global TaskManager task list. When a Task is done, it will be in the free list. +```cpp +class Task { + ... + Node node; + Node group; + Node free; + ... +}; +class TaskGroup { + ... + List grp_list_; + ... +}; +class TaskManager { + ... + List lru_; + List free_lst_; + ... +}; +``` +where Node is defined as +```cpp +template +struct Node { + using value_type = T; + using pointer = T *; + pointer prev; + pointer next; -## BuddyArena -The first kind of memory pool is BuddyArena. The corresponding header file is BuddyArena.h. + Node() { + prev = nullptr; + next = nullptr; + } +}; +``` +The constructor of the List class will take Node<> as input so it will follow this Node element to form a doubly linked chain. For example, List lru_ takes Task::node in its constructor while TaskGroup::grp_list_ takes Task::group in its constructor. This way we allow a Task to appear in two distinct linked lists. + +## Queue +A Queue is a thread safe solution to the producer-consumer problem. Every queue is of finite capacity and its size must be provided to the constructor of the Queue. A few methods are provided +* Add(). It appends an element to the queue and will be blocked if the queue is full or an interrupt has occurred. +* EmplaceBack(). Same as an Add() but constructs the element in place. +* PopFront(). Remove the first element from the queue and will be blocked if the queue is empty or an interrupt has occurred. + +Queue is implemented on top of the CondVar class and hence is interruptible. So callers of the above functions must check the Status return code for interrupt. + +## Locking +C++11 does not provide any shared lock support. So we implement some simple locking classes for our own benefit. +###### SpinLock +It is a simple exclusive lock based on CAS (compare and swap). The caller repeatedly tries (and hence the name spinning) to acquire the lock until successful.
It is best used when the critical section is very short. + +SpinLock is not interruptible. -BuddyArena is a general purpose arena and the constructor takes K (in unit of MB) as input. The default value is 4096 which is 4G if no value is given to the constructor. +There is a helper class LockGuard to ensure the lock is released if it is acquired. -BuddyArena is implemented based on Buddy System. +###### RWLock +It is a simple Read Write Lock where the implementation favors writers. A reader will acquire the lock in S (share) mode while a writer will acquire the lock in X (exclusive) mode. X mode is not compatible with S and X. S is compatible with S but not X. In addition, we provide the following functions +* Upgrade(). Upgrade an S lock to an X lock. +* Downgrade(). Downgrade an X lock to an S lock. -## CircularPool -The second kind of memory pool is CircularPool. The corresponding header file is CircularPool.h. +RWLock is not interruptible. -CircularPool is built upon multiple BuddyArena. Initially there is one BuddyArena. More BuddyArena are gradually added to the memory pool as needed until it reaches the specified maximum capacity. There is no guarantee the newly added BuddyArena is contiguous. Maximum size of allocated block in CircularPool is determined by the maximum block allowed by a BuddyArena. By default the maximum capacity is 32G and each BuddyArena is 4G.. The constructor takes unit of GB as input. +Like the LockGuard helper class, there are helper classes SharedLock and UniqueLock to release the lock when the lock goes out of scope. -There are one important assumption of this kind of memory pool -* Allocated memory is not kept for the whole duration of the memory pool and will be released soon. +## Treap +A Treap is the combination of BST (Binary Search Tree) and a heap. Each key is given a priority. The priority for any non-leaf node is greater than or equal to the priority of its children.
+ +Treap supports the following basic operations +* To search for a given key value. Standard binary search algorithm is applied, ignoring the priorities. +* To insert a new key X into the treap. Heap properties of the tree are maintained by tree rotation. +* To delete a key from a treap. Heap properties of the tree are maintained by tree rotation. + +## MemoryPool +A MemoryPool is an abstract class to allow memory blocks to be dynamically allocated from a designated memory region. Any class that implements MemoryPool must provide the following implementations. +```cpp + // Allocate a block of size n + virtual Status Allocate(size_t, void **) = 0; + + // Enlarge or shrink a block from oldSz to newSz + virtual Status Reallocate(void **, size_t old_sz, size_t new_sz) = 0; + + // Free a pointer + virtual void Deallocate(void *) = 0; +``` +There are several implementations of MemoryPool +###### Arena +Arena is a fixed size memory region which is allocated up front. Each Allocate() will sub-allocate a block from this region. -User allocates memory from the _logical_ end of the pool while allocated memory will be returned to the _logical_ head of the pool. When a new BuddyArena is added to the pool, it will become the new logical end. When a BuddyArena becomes full, the next BuddyArena (in a round robin fashion) will become the new tail. +Internally free blocks are organized into a Treap where the address of the block is the key and its block size is the priority. So the top of the tree is the biggest free block that can be found. Memory allocation is always fast and at a constant cost. Contiguous free blocks are merged into one single free block. A similar algorithm is used to enlarge a block to avoid memory copy. +The main advantage of Arena is that we do not need to free individual memory blocks and can simply free the whole region instead. +###### CircularPool +It is still an experimental class. It consists of one single Arena or multiple Arenas.
To allocate memory we circle through the Arenas before a new Arena is added. It has an assumption that memory is not kept for too long and will be released at some point in the future, and memory allocation strategy is based on this assumption. +## B+ tree +We also provide B+ tree support. Compared to std::map, we provide the following additional features +* Thread safe +* Concurrent insert/update/search support. + +As of this writing, no delete support has been implemented yet. +## Service +Many of the internal classes inherit from a Service abstract class. A Service class, simply speaking, provides a service. A Service class consists of four states +```cpp +enum class STATE : int { kStartInProg = 1, kRunning, kStopInProg, kStopped }; +``` +Any class that inherits from Service class must implement the following two methods. +```cpp + virtual Status DoServiceStart() = 0; + virtual Status DoServiceStop() = 0; +``` +###### Service::ServiceStart() +This function brings up the service and moves the state to kRunning. This function is thread safe. If another thread is bringing up the same service at the same time, only one of them will drive the service up. ServiceStart() will call DoServiceStart() provided by the child class when the state reaches kStartInProg. +An example will be TaskManager which inherits from Service. Its implementation of DoServiceStart will be to spawn off the WatchDog thread. +###### Service::ServiceStop() +This function shuts down the service and moves the state to kStopped. This function is thread safe. If another thread is bringing down the same service at the same time, only one of them will drive the service down. ServiceStop() will call DoServiceStop() provided by the child class when the state reaches kStopInProg. +As an example, both TaskManager and TaskGroup during service shutdown will generate interrupts to all the threads. +###### State checking +Another important use of Service is to synchronize operations.
For example, TaskGroup::CreateAsyncTask will return interrupt error if the current state of TaskGroup is not kRunning. This way we can assure no new thread is allowed to be created and added to a TaskGroup while the TaskGroup is going out of scope. Without this state check, we can have a Task running without its TaskGroup, and may run into a situation where the Task is blocked on a CondVar and not returning. +## Services +Services is a singleton and is the first and only singleton created as a result of calling +```cpp +mindspore::dataset::GlobalInit(); +``` +The first thing the Services singleton does is to create a small 16M circular memory pool. This pool is used by many important classes to ensure basic operation will not fail due to out of memory. The most important example is TaskManager. Each Task memory is allocated from this memory pool. + +The next thing Services does is to spawn other singletons in some specific order. One of the problems of multiple singletons is we have very limited control on the order of creation and destruction of singletons. Sometimes we need to control which singleton to allocate first and which one to deallocate last. One good example is logger. Logger is usually the last one to shutdown. + +Services singleton has a requirement on the list of singletons it brings up. They must inherit the Service class. Services singleton will bring each one up by calling the corresponding ServiceStart() function. The destructor of Services singleton will call ServiceStop() to bring down these singletons. TaskManager is a good example. It is invoked by Services singleton. + +Services singleton also provides other useful services like +* return the current hostname +* return the current username +* generate a random string + +## Path +Path class provides many operating system specific functions to shield the user from writing functions for different platforms. As of this writing, the following functions are provided.
+```cpp + bool Exists(); + bool IsDirectory(); + Status CreateDirectory(); + Status CreateDirectories(); + std::string Extension() const; + std::string ParentPath(); +``` +Simple "/" operators are also provided to allow folders and/or files to be concatenated and work on all platforms including Windows. diff --git a/mindspore/ccsrc/dataset/util/auto_index.h b/mindspore/ccsrc/dataset/util/auto_index.h index 2b4c2d6883..11a2e90b00 100644 --- a/mindspore/ccsrc/dataset/util/auto_index.h +++ b/mindspore/ccsrc/dataset/util/auto_index.h @@ -48,7 +48,7 @@ class AutoIndexObj : public BPlusTree { // @return Status insert(const value_type &val, key_type *key = nullptr) { key_type my_inx = inx_.fetch_add(1); - if (key) { + if (key != nullptr) { *key = my_inx; } return my_tree::DoInsert(my_inx, val); diff --git a/mindspore/ccsrc/dataset/util/btree.h b/mindspore/ccsrc/dataset/util/btree.h index df7cb8516f..ccf642e366 100644 --- a/mindspore/ccsrc/dataset/util/btree.h +++ b/mindspore/ccsrc/dataset/util/btree.h @@ -40,8 +40,6 @@ struct BPlusTreeTraits { static constexpr slot_type kLeafSlots = 256; // Number of slots in each inner node of the tree static constexpr slot_type kInnerSlots = 128; - // If kAppendMode is true, we will split high instead of 50/50 split - static constexpr bool kAppendMode = false; }; /// Implementation of B+ tree @@ -123,19 +121,14 @@ class BPlusTree { std::unique_ptr DoUpdate(const key_type &key, const value_type &new_value); std::unique_ptr DoUpdate(const key_type &key, std::unique_ptr &&new_value); - void PopulateNumKeys(); - - key_type KeyAtPos(uint64_t inx); - // Statistics struct tree_stats { std::atomic size_; uint32_t leaves_; uint32_t inner_nodes_; uint32_t level_; - bool num_keys_array_valid_; - tree_stats() : size_(0), leaves_(0), inner_nodes_(0), level_(0), num_keys_array_valid_(false) {} + tree_stats() : size_(0), leaves_(0), inner_nodes_(0), level_(0) {} }; private: @@ -160,10 +153,6 @@ class BPlusTree { Node lru_; }; - uint64_t 
PopulateNumKeys(BaseNode *n); - - key_type KeyAtPos(BaseNode *n, uint64_t inx); - // This control block keeps track of all the nodes we traverse on insert. // To maximize concurrency, internal nodes are latched S. If a node split // is required, we must releases all the latches and redo it again and change @@ -255,7 +244,6 @@ class BPlusTree { slot_type slot_dir_[traits::kInnerSlots] = {0}; key_type keys_[traits::kInnerSlots] = {0}; BaseNode *data_[traits::kInnerSlots + 1] = {nullptr}; - uint64_t num_keys_[traits::kInnerSlots + 1] = {0}; slot_type slotuse_; }; @@ -391,7 +379,6 @@ class BPlusTree { Iterator operator--(int); bool operator==(const Iterator &x) const { return (x.cur_ == cur_) && (x.slot_ == slot_); } - bool operator!=(const Iterator &x) const { return (x.cur_ != cur_) || (x.slot_ != slot_); } private: @@ -441,7 +428,6 @@ class BPlusTree { ConstIterator operator--(int); bool operator==(const ConstIterator &x) const { return (x.cur_ == cur_) && (x.slot_ == slot_); } - bool operator!=(const ConstIterator &x) const { return (x.cur_ != cur_) || (x.slot_ != slot_); } private: @@ -451,20 +437,17 @@ class BPlusTree { }; Iterator begin(); - Iterator end(); ConstIterator begin() const; - ConstIterator end() const; ConstIterator cbegin() const; - ConstIterator cend() const; // Locate the entry with key - ConstIterator Search(const key_type &key) const; - Iterator Search(const key_type &key); + std::pair Search(const key_type &key) const; + std::pair Search(const key_type &key); value_type operator[](key_type key); }; diff --git a/mindspore/ccsrc/dataset/util/btree_impl.tpp b/mindspore/ccsrc/dataset/util/btree_impl.tpp index 63117a0097..8148a8d12c 100644 --- a/mindspore/ccsrc/dataset/util/btree_impl.tpp +++ b/mindspore/ccsrc/dataset/util/btree_impl.tpp @@ -23,41 +23,39 @@ template typename BPlusTree::IndexRc BPlusTree::InnerNode::Sort() { // Build an inverse map. 
Basically it means keys[i] should be relocated to keys[inverse[i]]; slot_allocator alloc(this->alloc_); - slot_type *inverse = nullptr; try { - inverse = alloc.allocate(traits::kInnerSlots); + // We use a unique_ptr with a custom deleter to ensure the memory will be released when this + // function returns. + std::unique_ptr> memGuard( + alloc.allocate(traits::kInnerSlots), [&alloc](slot_type *p) { alloc.deallocate(p, traits::kInnerSlots); }); + slot_type *inverse = memGuard.get(); + for (slot_type i = 0; i < slotuse_; i++) { + inverse[slot_dir_[i]] = i; + } + for (slot_type i = 0; i < slotuse_; i++) { + while (inverse[i] != i) { + slot_type j = inverse[i]; + slot_type k = inverse[j]; + // Swap the key + std::swap(keys_[j], keys_[i]); + // Swap the pointers. + if ((j + 1) >= traits::kInnerSlots + 1 || (i + 1) >= traits::kInnerSlots + 1) { + return IndexRc::kUnexpectedError; + } + std::swap(data_[j + 1], data_[i + 1]); + // one key in order. + inverse[j] = j; + // continue to move + inverse[i] = k; + } + slot_dir_[i] = i; + } + return IndexRc::kOk; } catch (std::bad_alloc &e) { return IndexRc::kOutOfMemory; } catch (std::exception &e) { return IndexRc::kUnexpectedError; } - - for (slot_type i = 0; i < slotuse_; i++) { - inverse[slot_dir_[i]] = i; - } - for (slot_type i = 0; i < slotuse_; i++) { - while (inverse[i] != i) { - slot_type j = inverse[i]; - slot_type k = inverse[j]; - // Swap the key - std::swap(keys_[j], keys_[i]); - // Swap the pointers. - if ((j + 1) >= traits::kInnerSlots + 1 || (i + 1) >= traits::kInnerSlots + 1) { - return IndexRc::kUnexpectedError; - } - std::swap(data_[j + 1], data_[i + 1]); - // one key in order. - inverse[j] = j; - // continue to move - inverse[i] = k; - } - slot_dir_[i] = i; - } - if (inverse != nullptr) { - alloc.deallocate(inverse, traits::kInnerSlots); - inverse = nullptr; - } - return IndexRc::kOk; } template @@ -117,41 +115,39 @@ template typename BPlusTree::IndexRc BPlusTree::LeafNode::Sort() { // Build an inverse map.
Basically it means keys[i] should be relocated to keys[inverse[i]]; slot_allocator alloc(this->alloc_); - slot_type *inverse = nullptr; try { - inverse = alloc.allocate(traits::kLeafSlots); + // We use a unique_ptr with a custom deleter to ensure the memory will be released when this + // function returns. + std::unique_ptr> memGuard( + alloc.allocate(traits::kLeafSlots), [&alloc](slot_type *p) { alloc.deallocate(p, traits::kLeafSlots); }); + slot_type *inverse = memGuard.get(); + for (slot_type i = 0; i < slotuse_; i++) { + inverse[slot_dir_[i]] = i; + } + for (slot_type i = 0; i < slotuse_; i++) { + while (inverse[i] != i) { + slot_type j = inverse[i]; + slot_type k = inverse[j]; + // Swap the key + if (j >= traits::kLeafSlots || i >= traits::kLeafSlots) { + return IndexRc::kUnexpectedError; + } + std::swap(keys_[j], keys_[i]); + // Swap the shared pointers + std::swap(data_[j], data_[i]); + // one key in order. + inverse[j] = j; + // continue to move + inverse[i] = k; + } + slot_dir_[i] = i; + } + return IndexRc::kOk; } catch (std::bad_alloc &e) { return IndexRc::kOutOfMemory; } catch (std::exception &e) { return IndexRc::kUnexpectedError; } - - for (slot_type i = 0; i < slotuse_; i++) { - inverse[slot_dir_[i]] = i; - } - for (slot_type i = 0; i < slotuse_; i++) { - while (inverse[i] != i) { - slot_type j = inverse[i]; - slot_type k = inverse[j]; - // Swap the key - if (j >= traits::kLeafSlots || i >= traits::kLeafSlots) { - return IndexRc::kUnexpectedError; - } - std::swap(keys_[j], keys_[i]); - // Swap the shared pointers - std::swap(data_[j], data_[i]); - // one key in order.
- inverse[j] = j; - // continue to move - inverse[i] = k; - } - slot_dir_[i] = i; - } - if (inverse != nullptr) { - alloc.deallocate(inverse, traits::kLeafSlots); - inverse = nullptr; - } - return IndexRc::kOk; } template @@ -273,26 +269,17 @@ typename BPlusTree::IndexRc BPlusTree::LeafInsertK RETURN_IF_BAD_RC(rc); leaf_nodes_.InsertAfter(node, new_leaf); *split_node = new_leaf; - if (slot == node->slotuse_ && traits::kAppendMode) { - // Split high. Good for bulk load and keys are in asending order on insert - *split_key = key; - // Just insert the new key to the new leaf. No further need to move the keys - // from one leaf to the other. - rc = new_leaf->InsertIntoSlot(nullptr, 0, key, std::move(value)); + // 50/50 split + rc = node->Split(new_leaf); + RETURN_IF_BAD_RC(rc); + *split_key = new_leaf->keys_[0]; + if (LessThan(key, *split_key)) { + rc = node->InsertIntoSlot(nullptr, slot, key, std::move(value)); RETURN_IF_BAD_RC(rc); } else { - // 50/50 split - rc = node->Split(new_leaf); + slot -= node->slotuse_; + rc = new_leaf->InsertIntoSlot(nullptr, slot, key, std::move(value)); RETURN_IF_BAD_RC(rc); - *split_key = new_leaf->keys_[0]; - if (LessThan(key, *split_key)) { - rc = node->InsertIntoSlot(nullptr, slot, key, std::move(value)); - RETURN_IF_BAD_RC(rc); - } else { - slot -= node->slotuse_; - rc = new_leaf->InsertIntoSlot(nullptr, slot, key, std::move(value)); - RETURN_IF_BAD_RC(rc); - } } } return rc; @@ -313,25 +300,18 @@ typename BPlusTree::IndexRc BPlusTree::InnerInsert rc = AllocateInner(&new_inner); RETURN_IF_BAD_RC(rc); *split_node = new_inner; - if (slot == node->slotuse_ && traits::kAppendMode) { - *split_key = key; - new_inner->data_[0] = node->data_[node->slotuse_]; - rc = new_inner->InsertIntoSlot(0, key, ptr); + rc = node->Split(new_inner, split_key); + RETURN_IF_BAD_RC(rc); + if (LessThan(key, *split_key)) { + // Need to readjust the slot position since the split key is no longer in the two children. 
+ slot = FindSlot(node, key); + rc = node->InsertIntoSlot(slot, key, ptr); RETURN_IF_BAD_RC(rc); } else { - rc = node->Split(new_inner, split_key); + // Same reasoning as above + slot = FindSlot(new_inner, key); + rc = new_inner->InsertIntoSlot(slot, key, ptr); RETURN_IF_BAD_RC(rc); - if (LessThan(key, *split_key)) { - // Need to readjust the slot position since the split key is no longer in the two children. - slot = FindSlot(node, key); - rc = node->InsertIntoSlot(slot, key, ptr); - RETURN_IF_BAD_RC(rc); - } else { - // Same reasoning as above - slot = FindSlot(new_inner, key); - rc = new_inner->InsertIntoSlot(slot, key, ptr); - RETURN_IF_BAD_RC(rc); - } } } return rc; @@ -381,8 +361,7 @@ typename BPlusTree::IndexRc BPlusTree::InsertKeyVa } template -typename BPlusTree::IndexRc BPlusTree::Locate(RWLock *parent_lock, - bool forUpdate, +typename BPlusTree::IndexRc BPlusTree::Locate(RWLock *parent_lock, bool forUpdate, BPlusTree::BaseNode *top, const key_type &key, BPlusTree::LeafNode **ln, @@ -485,9 +464,6 @@ Status BPlusTree::DoInsert(const key_type &key, std::unique_ptr BPlusTree::DoUpdate(const key_type &key, std:: } } -template -void BPlusTree::PopulateNumKeys() { - // Start from the root and we calculate how many leaf nodes as pointed to by each inner node. - // The results are stored in the numKeys array in each inner node. - (void)PopulateNumKeys(root_); - // Indicate the result is accurate since we have the tree locked exclusive. 
- stats_.num_keys_array_valid_ = true; -} - -template -uint64_t BPlusTree::PopulateNumKeys(BPlusTree::BaseNode *n) { - if (n->is_leafnode()) { - auto *leaf = static_cast(n); - return leaf->slotuse_; - } else { - auto *inner = static_cast(n); - uint64_t num_keys = 0; - for (auto i = 0; i < inner->slotuse_ + 1; i++) { - inner->num_keys_[i] = PopulateNumKeys(inner->data_[i]); - num_keys += inner->num_keys_[i]; - } - return num_keys; - } -} - -template -typename BPlusTree::key_type BPlusTree::KeyAtPos(uint64_t inx) { - if (stats_.num_keys_array_valid_ == false) { - // We need exclusive access to the tree. If concurrent insert is going on, it is hard to get accurate numbers - UniqueLock lck(&rw_lock_); - // Check again. - if (stats_.num_keys_array_valid_ == false) { - PopulateNumKeys(); - } - } - // Now we know how many keys each inner branch contains, we can now traverse the correct node in log n time. - return KeyAtPos(root_, inx); -} - -template -typename BPlusTree::key_type BPlusTree::KeyAtPos(BPlusTree::BaseNode *n, - uint64_t inx) { - if (n->is_leafnode()) { - auto *leaf = static_cast(n); - return leaf->keys_[leaf->slot_dir_[inx]]; - } else { - auto *inner = static_cast(n); - if ((inx + 1) > inner->num_keys_[0]) { - inx -= inner->num_keys_[0]; - } else { - return KeyAtPos(inner->data_[0], inx); - } - for (auto i = 0; i < inner->slotuse_; i++) { - if ((inx + 1) > inner->num_keys_[inner->slot_dir_[i] + 1]) { - inx -= inner->num_keys_[inner->slot_dir_[i] + 1]; - } else { - return KeyAtPos(inner->data_[inner->slot_dir_[i] + 1], inx); - } - } - } - // If we get here, inx is way too big. Instead of throwing exception, we will just return the default value - // of key_type whatever it is. 
- return key_type(); -} } // namespace dataset } // namespace mindspore #endif diff --git a/mindspore/ccsrc/dataset/util/btree_iterator.tpp b/mindspore/ccsrc/dataset/util/btree_iterator.tpp index ef3a47f176..91ba2acd7a 100644 --- a/mindspore/ccsrc/dataset/util/btree_iterator.tpp +++ b/mindspore/ccsrc/dataset/util/btree_iterator.tpp @@ -286,7 +286,8 @@ typename BPlusTree::ConstIterator &BPlusTree::Cons } template -typename BPlusTree::ConstIterator BPlusTree::Search(const key_type &key) const { +std::pair::ConstIterator, bool> BPlusTree::Search( + const key_type &key) const { if (root_ != nullptr) { LeafNode *leaf = nullptr; slot_type slot; @@ -294,21 +295,15 @@ typename BPlusTree::ConstIterator BPlusTree::Searc // Lock the tree in S, pass the lock to Locate which will unlock it for us underneath. myLock->LockShared(); IndexRc rc = Locate(myLock, false, root_, key, &leaf, &slot); - if (rc == IndexRc::kOk) { - // All locks from the tree to the parent of leaf are all gone. We still have a S lock - // on the leaf. The unlock will be handled by the iterator when it goes out of scope. - return ConstIterator(leaf, slot, true); - } else { - MS_LOG(DEBUG) << "Key not found. rc = " << static_cast(rc) << "."; - return cend(); - } + bool find = (rc == IndexRc::kOk); + return std::make_pair(ConstIterator(leaf, slot, find), find); } else { - return cend(); + return std::make_pair(cend(), false); } } template -typename BPlusTree::Iterator BPlusTree::Search(const key_type &key) { +std::pair::Iterator, bool> BPlusTree::Search(const key_type &key) { if (root_ != nullptr) { LeafNode *leaf = nullptr; slot_type slot; @@ -316,23 +311,17 @@ typename BPlusTree::Iterator BPlusTree::Search(con // Lock the tree in S, pass the lock to Locate which will unlock it for us underneath. myLock->LockShared(); IndexRc rc = Locate(myLock, false, root_, key, &leaf, &slot); - if (rc == IndexRc::kOk) { - // All locks from the tree to the parent of leaf are all gone. 
We still have a S lock - // on the leaf. The unlock will be handled by the iterator when it goes out of scope. - return Iterator(leaf, slot, true); - } else { - MS_LOG(DEBUG) << "Key not found. rc = " << static_cast(rc) << "."; - return end(); - } + bool find = (rc == IndexRc::kOk); + return std::make_pair(Iterator(leaf, slot, find), find); } else { - return end(); + return std::make_pair(end(), false); } } template typename BPlusTree::value_type BPlusTree::operator[](key_type key) { - Iterator it = Search(key); - return it.value(); + auto r = Search(key); + return r.first.value(); } template diff --git a/mindspore/ccsrc/dataset/util/queue.h b/mindspore/ccsrc/dataset/util/queue.h index b97e6a5c28..9a51565861 100644 --- a/mindspore/ccsrc/dataset/util/queue.h +++ b/mindspore/ccsrc/dataset/util/queue.h @@ -230,6 +230,8 @@ class QueueList { std::unique_ptr> &operator[](const int index) { return queue_list_[index]; } + const std::unique_ptr> &operator[](const int index) const { return queue_list_[index]; } + ~QueueList() = default; private: diff --git a/mindspore/ccsrc/dataset/util/random.h b/mindspore/ccsrc/dataset/util/random.h index 6c70d6c7ef..957a4214a8 100644 --- a/mindspore/ccsrc/dataset/util/random.h +++ b/mindspore/ccsrc/dataset/util/random.h @@ -19,13 +19,16 @@ #if defined(_WIN32) || defined(_WIN64) #include #endif +#include #include #include #include #include +#include #include "dataset/core/config_manager.h" #include "dataset/core/global_context.h" +#include "utils/log_adapter.h" namespace mindspore { namespace dataset { @@ -35,6 +38,17 @@ inline std::mt19937 GetRandomDevice() { rand_s(&number); std::mt19937 random_device{static_cast(number)}; #else + int i = 0; + while (i < 5) { + try { + std::mt19937 random_device{std::random_device("/dev/urandom")()}; + return random_device; + } catch (const std::exception &e) { + MS_LOG(WARNING) << "Get std::random_device failed, retry: " << i << ", error: " << e.what(); + 
std::this_thread::sleep_for(std::chrono::milliseconds(10)); + i++; + } + } std::mt19937 random_device{std::random_device("/dev/urandom")()}; #endif return random_device; diff --git a/mindspore/ccsrc/dataset/util/status.cc b/mindspore/ccsrc/dataset/util/status.cc index 84d8ee582c..27e9dfbc83 100644 --- a/mindspore/ccsrc/dataset/util/status.cc +++ b/mindspore/ccsrc/dataset/util/status.cc @@ -45,6 +45,9 @@ std::string CodeAsString(const StatusCode c) { case StatusCode::kDuplicateKey: s = "Duplicate key"; break; + case StatusCode::kProfilingError: + s = "Error encountered while profiling"; + break; case StatusCode::kUnexpectedError: default: s = "Unexpected error"; diff --git a/mindspore/ccsrc/dataset/util/status.h b/mindspore/ccsrc/dataset/util/status.h index 38ed1fef89..7a480f4239 100644 --- a/mindspore/ccsrc/dataset/util/status.h +++ b/mindspore/ccsrc/dataset/util/status.h @@ -70,6 +70,9 @@ enum class StatusCode : char { kPythonInterpreterFailure = 7, kTDTPushFailure = 8, kFileNotExist = 9, + kProfilingError = 10, + kBoundingBoxOutOfBounds = 11, + kBoundingBoxInvalidShape = 12, // Make this error code the last one. Add new error code above it. kUnexpectedError = 127 }; diff --git a/mindspore/ccsrc/dataset/util/task.cc b/mindspore/ccsrc/dataset/util/task.cc index d9e0e73243..f00f26f5ce 100644 --- a/mindspore/ccsrc/dataset/util/task.cc +++ b/mindspore/ccsrc/dataset/util/task.cc @@ -69,7 +69,7 @@ void Task::ShutdownGroup() { // Wake up watch dog and shutdown the engine. 
vg->rc_ = rc_; rcLock.unlock(); TaskManager::InterruptMaster(rc_); - TaskManager::InterruptGroup(*gMyTask); + TaskManager::InterruptGroup(*this); } } } diff --git a/mindspore/ccsrc/debug/anf_ir_dump.cc b/mindspore/ccsrc/debug/anf_ir_dump.cc index 1fd3096e7c..fc32e0fb5f 100644 --- a/mindspore/ccsrc/debug/anf_ir_dump.cc +++ b/mindspore/ccsrc/debug/anf_ir_dump.cc @@ -111,9 +111,15 @@ void DumpGlobalInfoEntry(const FuncGraphPtr &graph, std::ostringstream &buffer) } buffer << "#IR entry : @" << graph->ToString() << "." << graph->debug_info()->get_id() << std::endl; - buffer << "#flags :" << std::endl; - for (const auto &flag : graph->flags()) { - buffer << flag.first << " : " << flag.second << std::endl; + buffer << "#attrs :" << std::endl; + for (const auto &attr : graph->attrs()) { + buffer << attr.first << " : "; + if (attr.second->isa()) { + buffer << GetValue(attr.second); + } else if (attr.second->isa()) { + buffer << GetValue(attr.second); + } + buffer << std::endl; } } @@ -417,10 +423,16 @@ void DumpSubgraph(const OrderedMap fout << std::endl; for (const auto &sg : *sub_graphs) { - fout << "subgraph flag:" << std::endl; + fout << "subgraph attr:" << std::endl; MS_EXCEPTION_IF_NULL(sg.first); - for (const auto &flag : sg.first->flags()) { - fout << flag.first << " : " << flag.second << std::endl; + for (const auto &attr : sg.first->attrs()) { + fout << attr.first << " : "; + if (attr.second->isa()) { + fout << GetValue(attr.second); + } else if (attr.second->isa()) { + fout << GetValue(attr.second); + } + fout << std::endl; } fout << "subgraph @" << sg.first->ToString() << "."; fout << sg.first->debug_info()->get_id() << "("; diff --git a/mindspore/ccsrc/debug/anf_ir_utils.cc b/mindspore/ccsrc/debug/anf_ir_utils.cc index 274cd43914..2b8e61ab15 100644 --- a/mindspore/ccsrc/debug/anf_ir_utils.cc +++ b/mindspore/ccsrc/debug/anf_ir_utils.cc @@ -30,6 +30,7 @@ #include "pipeline/parse/python_adapter.h" #include "pipeline/parse/resolve.h" #include 
"operator/composite/composite.h" +#include "operator/composite/map.h" #include "utils/ordered_map.h" #include "utils/ordered_set.h" #include "utils/utils.h" @@ -190,6 +191,8 @@ std::string AnfExporter::GetMultitypeFuncGraphText(const prim::MultitypeFuncGrap * ├── MultitypeGraph * ├── HyperMap * │ └── HyperMapPy + * ├── Map + * │ └── MapPy * ├── Tail * ├── MakeTupleGradient * ├── GradOperation @@ -208,17 +211,25 @@ std::string AnfExporter::GetMetaFuncGraphText(const MetaFuncGraphPtr &meta_func_ oss << GetMultitypeFuncGraphText(mt_func_graph); } else if (meta_func_graph ->isa()) { // this statement must before 'meta_graph->isa()' - prim::HyperMapPyPtr hyper_map = meta_func_graph->cast(); - MS_EXCEPTION_IF_NULL(hyper_map); + auto hyper_map = meta_func_graph->cast(); if (hyper_map->GetFnLeaf() != nullptr) { oss << "{fn_leaf=" << GetMetaFuncGraphText(hyper_map->GetFnLeaf()) << "}"; } } else if (meta_func_graph->isa()) { - prim::HyperMapPtr hyper_map = meta_func_graph->cast(); - MS_EXCEPTION_IF_NULL(hyper_map); + auto hyper_map = meta_func_graph->cast(); if (hyper_map->GetFnLeaf() != nullptr) { oss << "{fn_leaf=" << GetMetaFuncGraphText(hyper_map->GetFnLeaf()) << "}"; } + } else if (meta_func_graph->isa()) { // this statement must before 'meta_graph->isa()' + auto map = meta_func_graph->cast(); + if (map->GetFnLeaf() != nullptr) { + oss << "{fn_leaf=" << GetMetaFuncGraphText(map->GetFnLeaf()) << "}"; + } + } else if (meta_func_graph->isa()) { + auto map = meta_func_graph->cast(); + if (map->GetFnLeaf() != nullptr) { + oss << "{fn_leaf=" << GetMetaFuncGraphText(map->GetFnLeaf()) << "}"; + } } else if (meta_func_graph->isa()) { prim::GradOperationPtr grad_op = meta_func_graph->cast(); oss << "{get_all=" << grad_op->get_all_ << ", get_by_list=" << grad_op->get_by_list_ @@ -1555,7 +1566,7 @@ class IrParser { return lexer_.GetNextToken(); } else if (type == "Tuple") { return ParseTypeVector(func_graph, lexer_.GetNextToken(), type, ptr); - } else if (type == "Array") { + } 
else if (type == "Tensor") { return ParseTypeArray(func_graph, lexer_.GetNextToken(), ptr); } else if (type == "List") { return ParseTypeVector(func_graph, lexer_.GetNextToken(), type, ptr); @@ -1971,7 +1982,11 @@ class IrParser { MS_LOG(EXCEPTION) << "Cast to type 'PrimitivePyPtr' error"; } } else { - ptr = std::make_shared(id.substr(strlen("PrimitivePy::")), py_obj); + auto len = strlen("PrimitivePy::"); + if (id.size() < len) { + return TOK_ERROR; + } + ptr = std::make_shared(id.substr(len), py_obj); } *val_ptr = ptr; @@ -1988,7 +2003,7 @@ class IrParser { return next; } - Token ParseValueGraphAndNamespace(const std::string &id, ValuePtr *val_ptr) { + Token ParseValueGraphAndNamespace(const std::string &id, ValuePtr *const val_ptr) { if (Match(id, "MultitypeFuncGraph::")) { std::string name = id.substr(strlen("MultitypeFuncGraph::")); auto mt_func_graph = std::make_shared(name); @@ -2028,7 +2043,7 @@ class IrParser { } } - Token ParseValueBasic(const FuncGraphPtr &func_graph, const std::string &id, ValuePtr *val_ptr, + Token ParseValueBasic(const FuncGraphPtr &func_graph, const std::string &id, ValuePtr *const val_ptr, AnfNodePtr *const node_ptr = nullptr) { if (id == "None") { *val_ptr = std::make_shared(); diff --git a/mindspore/ccsrc/debug/anf_ir_utils.h b/mindspore/ccsrc/debug/anf_ir_utils.h index 6c8601c4af..4503692eb9 100644 --- a/mindspore/ccsrc/debug/anf_ir_utils.h +++ b/mindspore/ccsrc/debug/anf_ir_utils.h @@ -91,12 +91,12 @@ class AnfExporter { std::string GetMetaFuncGraphText(const MetaFuncGraphPtr &meta_func_graph); std::string GetAnfNodeText(const FuncGraphPtr &func_graph, const AnfNodePtr &node, const std::map &apply_map); - void ExportOneFuncGraph(std::ofstream &ofs, const FuncGraphPtr &func_graph); + virtual void ExportOneFuncGraph(std::ofstream &ofs, const FuncGraphPtr &func_graph); void OutputParameters(std::ofstream &ofs, const std::vector ¶meters, OrderedMap *param_map); void OutputStatementComment(std::ofstream &ofs, const CNodePtr &node); - 
void OutputCNodes(std::ofstream &ofs, const std::vector &nodes, const FuncGraphPtr &func_graph); + virtual void OutputCNodes(std::ofstream &ofs, const std::vector &nodes, const FuncGraphPtr &func_graph); int param_index; OrderedSet func_graph_set{}; @@ -118,6 +118,8 @@ std::string GetFuncGraphProtoString(const FuncGraphPtr &func_graph); void DumpIRProto(const FuncGraphPtr &func_graph, const std::string &suffix); std::string GetOnnxProtoString(const FuncGraphPtr &func_graph); + +std::string GetBinaryProtoString(const FuncGraphPtr &func_graph); } // namespace mindspore #endif // MINDSPORE_CCSRC_DEBUG_ANF_IR_UTILS_H_ diff --git a/mindspore/ccsrc/debug/info.cc b/mindspore/ccsrc/debug/info.cc index 406bd11fab..f58522cf33 100644 --- a/mindspore/ccsrc/debug/info.cc +++ b/mindspore/ccsrc/debug/info.cc @@ -126,10 +126,10 @@ int64_t DebugInfo::debug_id() { } int64_t DebugInfo::unique_id_through_copy() const { - TraceInfoPtr trace_info = const_cast(this)->trace_info(); - if (trace_info != nullptr) { - if (trace_info->isa() && trace_info->debug_info() != nullptr) { - return trace_info->debug_info()->unique_id_through_copy(); + auto info = trace_info(); + if (info != nullptr) { + if (info->isa() && info->debug_info() != nullptr) { + return info->debug_info()->unique_id_through_copy(); } } return unique_id(); diff --git a/mindspore/ccsrc/debug/info.h b/mindspore/ccsrc/debug/info.h index 9ed216277e..c09c6031b3 100644 --- a/mindspore/ccsrc/debug/info.h +++ b/mindspore/ccsrc/debug/info.h @@ -118,7 +118,7 @@ class TraceContext { void set_location(const LocationPtr &loc) { location_ = loc; } LocationPtr location() { return location_; } void set_trace_info(const TraceInfoPtr &trace_info) { trace_info_ = trace_info; } - TraceInfoPtr trace_info() { return trace_info_; } + TraceInfoPtr trace_info() const { return trace_info_; } void set_func_name(const std::string &func_name) { func_name_ = func_name; } std::string func_name() { return func_name_; } }; @@ -139,7 +139,7 @@ class DebugInfo 
: public Base { std::string get_id() { return std::to_string(debug_id()); } void set_trace_info(const TraceInfoPtr &trace_info) { trace_info_ = trace_info; } - TraceInfoPtr trace_info() { return trace_info_; } + TraceInfoPtr trace_info() const { return trace_info_; } void set_location(const LocationPtr &loc) { location_ = loc; } virtual LocationPtr location() { return location_; } std::string name() { return name_; } diff --git a/mindspore/ccsrc/debug/trace.cc b/mindspore/ccsrc/debug/trace.cc index e5507a8c2b..e12a7b1209 100644 --- a/mindspore/ccsrc/debug/trace.cc +++ b/mindspore/ccsrc/debug/trace.cc @@ -37,6 +37,11 @@ namespace mindspore { // namespace to support debug trace infomation namespace trace { +using abstract::AbstractBasePtr; +using abstract::AnalysisContextPtr; +using abstract::AnalysisEnginePtr; +using abstract::AnfNodeConfigPtr; + std::string GetAbstractStr(const abstract::AbstractBasePtr &abs) { if (abs == nullptr) { return "Null Abstract"; @@ -117,8 +122,23 @@ class AnalyzedFuncGraphExporter : public AnfExporter { void ExportFuncGraph(const std::string &filename, const std::vector &node_cfgs); + void ExportOneFuncGraph(std::ofstream &ofs, const FuncGraphPtr &func_graph); + void OutputCNodes(std::ofstream &ofs, const std::vector &nodes, const FuncGraphPtr &func_graph); + void OutputCNode(std::ofstream &ofs, const CNodePtr &cnode, const FuncGraphPtr &func_graph, int *idx, + std::map *const apply_map); + private: std::string GetNodeType(const AnfNodePtr &nd) override; + AbstractBasePtr GetNodeAbstract(const AnfNodePtr &nd); + AnfNodeConfigPtr GetFordwardConfigPtr(const AnfNodeConfigPtr &cfg); + AnalysisContextPtr ProcessFuncGraphCall(const CNodePtr &node); + + // key: context, val: whether the context has already been printed + std::unordered_map context_map_; + std::vector context_vec_; + + AnalysisContextPtr cur_ctx_ = nullptr; + AnalysisEnginePtr engine_ = nullptr; }; std::unordered_map CalcTaggedFuncGraphs() { @@ -139,17 +159,20 @@ void 
OutputAnalyzedGraphWithType() { } std::string AnalyzedFuncGraphExporter::GetNodeType(const AnfNodePtr &node) { - if (node_cfg_ == nullptr) { + if (cur_ctx_ == nullptr) { return AnfExporter::GetNodeType(node); } - auto ctx = node_cfg_->context(); - auto engine = node_cfg_->engine(); - auto cfg = engine->MakeConfig(node, ctx); - auto eval_result = engine->cache().GetValue(cfg); - if (eval_result == nullptr || eval_result->abstract() == nullptr) { + + MS_EXCEPTION_IF_NULL(engine_); + auto cfg = engine_->MakeConfig(node, cur_ctx_); + auto ret = engine_->cache().GetValue(cfg); + if (ret == nullptr) { + return "Undefined"; + } + auto abs = ret->abstract(); + if (abs == nullptr) { return "Undefined"; } - auto abs = eval_result->abstract(); auto dtype = abs->BuildType(); auto shape = abs->BuildShape(); std::ostringstream oss; @@ -163,6 +186,176 @@ std::string AnalyzedFuncGraphExporter::GetNodeType(const AnfNodePtr &node) { return oss.str(); } +AbstractBasePtr AnalyzedFuncGraphExporter::GetNodeAbstract(const AnfNodePtr &node) { + if (cur_ctx_ == nullptr) { + return nullptr; + } + MS_EXCEPTION_IF_NULL(engine_); + auto cfg = engine_->MakeConfig(node, cur_ctx_); + auto ret = engine_->cache().GetValue(cfg); + return ret == nullptr ? 
nullptr : ret->abstract(); +} + +AnfNodeConfigPtr AnalyzedFuncGraphExporter::GetFordwardConfigPtr(const AnfNodeConfigPtr &cfg) { + AnfNodeConfigPtr cur_cfg = cfg; + auto iter = engine_->anfnode_config_map().find(cur_cfg); + while (iter != engine_->anfnode_config_map().end()) { + auto node = cur_cfg->node(); + cur_cfg = iter->second; + MS_LOG(DEBUG) << "Get forword node: " << node.get() << "[" << node->ToString() << "] --> " << cur_cfg->node().get() + << "[" << cur_cfg->node()->ToString() << "]"; + iter = engine_->anfnode_config_map().find(cur_cfg); + } + return cur_cfg; +} + +AnalysisContextPtr AnalyzedFuncGraphExporter::ProcessFuncGraphCall(const CNodePtr &node) { + if (node == nullptr) { + return nullptr; + } + auto cfg = engine_->MakeConfig(node, cur_ctx_); + cfg = GetFordwardConfigPtr(cfg); + auto cnode = dyn_cast(cfg->node()); + if (cnode == nullptr) { + MS_LOG(DEBUG) << "CNode is nullptr"; + return nullptr; + } + const auto &inputs = cnode->inputs(); + auto op_abs = GetNodeAbstract(inputs[0]); + if (op_abs == nullptr) { + MS_LOG(DEBUG) << "Abstract of inputs[0] of cnode " << cnode->ToString() << " is nullptr"; + return nullptr; + } + + if (!op_abs->isa() && !op_abs->isa()) { + MS_LOG(DEBUG) << "Inputs[0] of cnode " << cnode->ToString() << " is of type " << op_abs->type_name() + << ", not function, ignore it"; + return nullptr; + } + + auto evaluator = engine_->GetEvaluatorFor(dyn_cast(op_abs)); + if (!evaluator->isa()) { + MS_LOG(DEBUG) << "Evaluator for inputs[0] of cnode " << cnode->ToString() << " is of type " + << evaluator->type_name() << ", not BaseFuncGraphEvaluator, ignore it."; + return nullptr; + } + + auto base_fg_evaluator = dyn_cast(evaluator); + auto ctx = base_fg_evaluator->graph_context(); + if (ctx != nullptr && context_map_.insert({ctx, false}).second) { + MS_LOG(DEBUG) << "Add new context, ctx.addr = " << ctx.get() << "ctx = " << ctx->ToString(); + context_vec_.push_back(ctx); + } + return ctx; +} + +void 
AnalyzedFuncGraphExporter::OutputCNode(std::ofstream &ofs, const CNodePtr &cnode, const FuncGraphPtr &func_graph, + int *idx, std::map *const apply_map) { + auto &inputs = cnode->inputs(); + std::string op_text = GetAnfNodeText(func_graph, inputs[0], *apply_map); + // non-return node + if (cnode != func_graph->get_return()) { + int apply_idx = (*idx)++; + (*apply_map)[cnode] = apply_idx; + std::string type_info = GetNodeType(cnode); + if (type_info == "Undefined") { + ofs << " %" << apply_idx << " = " << op_text << "("; + } else { + ofs << " %" << apply_idx << " : " << type_info << " = " << op_text << "("; + } + } else { + ofs << " " << op_text << "("; + } + + for (size_t i = 1; i < inputs.size(); ++i) { + if (i != 1) { + ofs << ", "; + } + AnfNodePtr arg = inputs[i]; + ofs << GetAnfNodeText(func_graph, arg, *apply_map); + } + ofs << ")"; + + // process function graph call + auto ctx = ProcessFuncGraphCall(cnode); + + // output comment + OutputStatementComment(ofs, cnode); + if (ctx != nullptr) { + ofs << " @ctx.addr=" << ctx.get(); + } + ofs << "\n"; + + if (label_manage::GetGlobalTraceLabelType() == label_manage::TraceLabelType::kWithUniqueId) { + ofs << trace::GetDebugInfo(cnode->debug_info(), " # ", kSourceLineTipDiscard) << "#" + << label_manage::Label(cnode->debug_info()) << "\n"; + } else { + ofs << trace::GetDebugInfo(cnode->debug_info(), " # ", kSourceLineTipDiscard) << "\n"; + } +} + +void AnalyzedFuncGraphExporter::OutputCNodes(std::ofstream &ofs, const std::vector &nodes, + const FuncGraphPtr &func_graph) { + if (func_graph == nullptr) { + return; + } + + int idx = 1; + std::map apply_map; + for (const AnfNodePtr &node : nodes) { + MS_EXCEPTION_IF_NULL(node); + if (!node->isa()) { + continue; + } + + auto iter = tagged_cnodes_.find(node); + if (iter != tagged_cnodes_.end()) { + ofs << "\n#------------------------> " << iter->second << "\n"; + } + + auto cnode = node->cast(); + OutputCNode(ofs, cnode, func_graph, &idx, &apply_map); + } +} + +void 
AnalyzedFuncGraphExporter::ExportOneFuncGraph(std::ofstream &ofs, const FuncGraphPtr &func_graph) { + if (func_graph == nullptr) { + return; + } + + std::vector nodes = TopoSort(func_graph->get_return(), SuccIncoming, AlwaysInclude); + std::vector parameters = func_graph->parameters(); + OrderedMap param_map; + + ofs << "# [No." << (exported.size() + 1) << "] " << func_graph->DumpText() << "." + << func_graph->debug_info()->get_id(); + if (cur_ctx_ != nullptr) { + ofs << " @ctx.addr=" << cur_ctx_.get(); + } + ofs << "\n"; + if (label_manage::GetGlobalTraceLabelType() == label_manage::TraceLabelType::kWithUniqueId) { + ofs << trace::GetDebugInfo(func_graph->debug_info(), "# ", kSourceLineTipDiscard) << "#" + << label_manage::Label(func_graph->debug_info()) << "\n"; + } else { + ofs << trace::GetDebugInfo(func_graph->debug_info(), "# ", kSourceLineTipDiscard) << "\n"; + } + ofs << "funcgraph fg_" << func_graph->debug_info()->get_id(); + // output name of parent of graph if exists + if (func_graph->parent() != nullptr) { + ofs << "[fg_" << func_graph->parent()->debug_info()->get_id() << "]"; + } + ofs << "(\n"; + + OutputParameters(ofs, parameters, ¶m_map); + + exported[func_graph] = param_map; + ofs << (!parameters.empty() ? 
" " : "") << ") {\n"; + + OutputCNodes(ofs, nodes, func_graph); + + ofs << "}\n"; +} + void AnalyzedFuncGraphExporter::ExportFuncGraph(const std::string &filename, const std::vector &node_cfgs) { if (node_cfgs.empty()) { @@ -170,6 +363,9 @@ void AnalyzedFuncGraphExporter::ExportFuncGraph(const std::string &filename, return; } + context_map_.clear(); + context_vec_.clear(); + std::ofstream ofs(filename); if (!ofs.is_open()) { MS_LOG(ERROR) << "Open file '" << filename << "' failed!"; @@ -181,32 +377,47 @@ void AnalyzedFuncGraphExporter::ExportFuncGraph(const std::string &filename, // first output graph on the analysis stack for (const auto &node_cfg : node_cfgs) { - auto fg = node_cfg->context()->func_graph(); - // the graph is already output, skip it - if (exported.find(fg) != exported.end()) { + auto ctx = node_cfg->context(); + if (engine_ == nullptr) { + engine_ = node_cfg->engine(); + } + if (context_map_.insert({ctx, false}).second) { + context_vec_.push_back(ctx); + } + // the graph has already been printed + if (context_map_[ctx]) { continue; } - // set node_cfg info for getting type - node_cfg_ = node_cfg; + context_map_[ctx] = true; + + auto fg = ctx->func_graph(); + + // set current context + cur_ctx_ = ctx; tagged_cnodes_ = tagged_func_graphs[fg]; ExportOneFuncGraph(ofs, fg); ofs << "\n\n"; } - node_cfg_ = nullptr; tagged_cnodes_.clear(); // print seperator between function graphs on analyzed graph call stack and others ofs << "#===============================================================================\n\n\n"; // second output other graphs - while (!func_graph_set.empty()) { - FuncGraphPtr fg = *func_graph_set.begin(); - ExportOneFuncGraph(ofs, fg); + size_t ctx_idx = 0; + while (ctx_idx < context_vec_.size()) { + auto ctx = context_vec_[ctx_idx++]; + if (context_map_[ctx]) { + continue; + } + context_map_[ctx] = true; + cur_ctx_ = ctx; + ExportOneFuncGraph(ofs, ctx->func_graph()); ofs << "\n\n"; - (void)func_graph_set.erase(fg); } - ofs << "# num 
of total function graphs: " << exported.size(); + + ofs << "# num of total function graphs: " << context_map_.size() << "\n"; ofs.close(); } diff --git a/mindspore/ccsrc/debug/trace_info.h b/mindspore/ccsrc/debug/trace_info.h index 19c07bdbbc..cf4f0c080a 100644 --- a/mindspore/ccsrc/debug/trace_info.h +++ b/mindspore/ccsrc/debug/trace_info.h @@ -281,6 +281,16 @@ class TraceForceBool : public TraceInfo { TraceInfoPtr clone() override { return std::make_shared(*shared_from_base()); } }; +class TraceForceWhileCond : public TraceInfo { + public: + explicit TraceForceWhileCond(const DebugInfoPtr &info) : TraceInfo(info, "force_while_cond", "") {} + MS_DECLARE_PARENT(TraceForceWhileCond, TraceInfo); + ~TraceForceWhileCond() override = default; + TraceInfoPtr clone() override { + return std::make_shared(*shared_from_base()); + } +}; + class TraceExpandJ : public TraceInfo { public: explicit TraceExpandJ(const DebugInfoPtr &info) : TraceInfo(info, "expand_j", "") {} diff --git a/mindspore/ccsrc/device/CMakeLists.txt b/mindspore/ccsrc/device/CMakeLists.txt index 2ade0f0ef3..7178a01ce6 100644 --- a/mindspore/ccsrc/device/CMakeLists.txt +++ b/mindspore/ccsrc/device/CMakeLists.txt @@ -14,6 +14,17 @@ endif () if (ENABLE_CPU) file(GLOB_RECURSE CPU_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "cpu/*.cc") + if (NOT ENABLE_MPI) + list(REMOVE_ITEM CPU_SRC_LIST "cpu/mpi/mpi_adapter.cc") + endif () +endif () + +if (ENABLE_MPI) + # _ms_mpi + set_property(SOURCE "gpu/mpi/mpi_initializer.cc" + PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_DEVICE) + pybind11_add_module(_ms_mpi "gpu/mpi/mpi_initializer.cc") + target_link_libraries(_ms_mpi PRIVATE mindspore::pybind11_module mindspore::ompi) endif () # gpu @@ -39,11 +50,6 @@ if (ENABLE_GPU) set_property(SOURCE ${GPU_COLLECTIVE_SRCS} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_DEVICE) add_library(gpu_collective SHARED ${GPU_COLLECTIVE_SRCS}) - # _ms_mpi - set_property(SOURCE 
"gpu/mpi/mpi_initializer.cc" - PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_DEVICE) - pybind11_add_module(_ms_mpi "gpu/mpi/mpi_initializer.cc") - target_link_libraries(_ms_mpi PRIVATE mindspore::pybind11_module mindspore::ompi) target_link_libraries(gpu_collective PRIVATE mindspore::ompi mindspore::nccl) endif () diff --git a/mindspore/ccsrc/device/ascend/ascend_device_address.cc b/mindspore/ccsrc/device/ascend/ascend_device_address.cc index c9fb6bacd3..a47c482c0e 100644 --- a/mindspore/ccsrc/device/ascend/ascend_device_address.cc +++ b/mindspore/ccsrc/device/ascend/ascend_device_address.cc @@ -92,10 +92,29 @@ bool SyncDeviceToHostAndFloatToFloat64(void *dst, size_t dst_size, const void *s return true; } +void AscendDeviceAddress::SyncStream() const { + MS_LOG(INFO) << "Start!"; + auto ms_context = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(ms_context); + if (ms_context->execution_mode() != kPynativeMode) { + MS_LOG(INFO) << "Finish!"; + return; + } + auto device_id = ms_context->device_id(); + auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id); + MS_EXCEPTION_IF_NULL(runtime_instance); + auto ret = runtime_instance->SyncStream(); + if (!ret) { + MS_LOG(EXCEPTION) << "Sync stream error!"; + } + MS_LOG(INFO) << "Finish!"; +} + bool AscendDeviceAddress::SyncDeviceToHost(const std::vector &shape, size_t size, mindspore::TypeId type, void *host_ptr) const { MS_LOG(INFO) << "SyncDeviceToHost, Device(format:" << format_ << ", type_id:" << TypeIdLabel(type_id_) << ", size:" << size_ << "), Host(type_id:" << TypeIdLabel(type) << ", size:" << size << ")"; + SyncStream(); bool sync_ok = false; std::vector host_shape; (void)std::transform(shape.begin(), shape.end(), std::back_inserter(host_shape), IntToSize); @@ -186,6 +205,7 @@ bool AscendDeviceAddress::SyncHostToDevice(const std::vector &shape, size_t const void *host_ptr) const { MS_LOG(INFO) << "SyncHostToDevice, Device(format:" << 
format_ << ", type_id:" << TypeIdLabel(type_id_) << ", size:" << size_ << "), Host(type_id:" << TypeIdLabel(type) << ", size:" << size << ")"; + SyncStream(); bool sync_ok = false; std::vector host_shape; (void)std::transform(shape.begin(), shape.end(), std::back_inserter(host_shape), IntToSize); diff --git a/mindspore/ccsrc/device/ascend/ascend_device_address.h b/mindspore/ccsrc/device/ascend/ascend_device_address.h index 93746082c1..364f9e95fd 100644 --- a/mindspore/ccsrc/device/ascend/ascend_device_address.h +++ b/mindspore/ccsrc/device/ascend/ascend_device_address.h @@ -35,6 +35,7 @@ class AscendDeviceAddress : public DeviceAddress { ~AscendDeviceAddress() override; bool SyncDeviceToHost(const std::vector &shape, size_t size, TypeId type, void *host_ptr) const override; bool SyncHostToDevice(const std::vector &shape, size_t size, TypeId type, const void *host_ptr) const override; + DeviceAddressType DeviceType() const override { return DeviceAddressType::kAscend; } #ifdef ENABLE_DUMP_E2E bool DumpMemToFile(bool dump_mode, const std::string &filepath, const std::string &host_fmt, const std::vector &host_shape, TypeId host_type) const; @@ -43,6 +44,7 @@ class AscendDeviceAddress : public DeviceAddress { bool SyncDeviceToHostAndConvertFormat(const std::vector &shape, size_t size, TypeId type, void *host_ptr) const; bool ConvertFormatAndSyncHostToDevice(const std::vector &shape, size_t size, TypeId type, const void *host_ptr) const; + void SyncStream() const; }; using AscendDeviceAddressPtr = std::shared_ptr; } // namespace ascend diff --git a/mindspore/ccsrc/device/ascend/ascend_kernel_runtime.cc b/mindspore/ccsrc/device/ascend/ascend_kernel_runtime.cc index 6ffa835204..fb2a3f350b 100644 --- a/mindspore/ccsrc/device/ascend/ascend_kernel_runtime.cc +++ b/mindspore/ccsrc/device/ascend/ascend_kernel_runtime.cc @@ -15,7 +15,6 @@ */ #include "device/ascend/ascend_kernel_runtime.h" - #include #include #include @@ -24,7 +23,9 @@ #include #include 
"device/ascend/ascend_device_address.h" +#include "device/cpu/mpi/mpi_adapter.h" #include "utils/context/ms_context.h" +#include "utils/mpi/mpi_config.h" #include "device/ascend/profiling/profiling_manager.h" #include "hccl/hcom.h" #include "common/trans.h" @@ -51,6 +52,38 @@ namespace mindspore { namespace device { namespace ascend { static const size_t PRAMATER_OUTPUT_INDEX = 0; +namespace { +std::string GetRankId() { + std::string rank_id_str; +#ifdef ENABLE_MPI + auto mpi_config_ptr = MpiConfig::GetInstance(); + MS_EXCEPTION_IF_NULL(mpi_config_ptr); + if (mpi_config_ptr->enable_mpi()) { + int rank_id = device::cpu::MPIAdapter::Instance().GetRankId(); + const char *offset = std::getenv("RANK_OFFSET"); + if (offset != nullptr) { + try { + int rank_offset = std::stoi(offset); + rank_id += rank_offset; + } catch (std::invalid_argument) { + MS_LOG(EXCEPTION) << "stoi invalid argument:" << offset; + } catch (std::out_of_range) { + MS_LOG(EXCEPTION) << "stoi out_of_range:" << offset; + } + } + rank_id_str = std::to_string(rank_id); + } else { + rank_id_str = std::getenv("RANK_ID"); + } +#else + rank_id_str = std::getenv("RANK_ID"); +#endif + if (rank_id_str.empty()) { + MS_LOG(ERROR) << "get hccl rankid failed, please set env RANK_ID"; + } + return rank_id_str; +} +} // namespace AscendKernelRuntime::~AscendKernelRuntime() { graph_model_map_.clear(); } @@ -65,13 +98,13 @@ void AscendKernelRuntime::ClearGraphModelMap() { } void AscendKernelRuntime::ClearGraphRuntimeResource(uint32_t graph_id) { - MS_LOG(INFO) << "clear graph:" << graph_id << " runtime resource"; + MS_LOG(DEBUG) << "clear graph:" << graph_id << " runtime resource"; auto iter = graph_model_map_.find(graph_id); if (iter == graph_model_map_.end()) { - MS_LOG(WARNING) << "GraphId:" << graph_id << " not found"; + MS_LOG(DEBUG) << "GraphId:" << graph_id << " not found"; return; } - MS_LOG(INFO) << "Ge UnloadModel " << iter->first; + MS_LOG(DEBUG) << "Ge UnloadModel " << iter->first; auto ret = 
ge::model_runner::ModelRunner::Instance().UnloadModel(iter->first); if (!ret) { MS_LOG(ERROR) << "UnloadModel failed"; @@ -124,6 +157,12 @@ bool AscendKernelRuntime::Init() { } #endif + // Start up profiling before rtSetDevice + ret = ProfilingManager::GetInstance().StartupProfiling(device_id_); + if (!ret) { + MS_EXCEPTION(DeviceProcessError) << "StartupProfiling failed."; + } + ret = InitDevice(); if (!ret) { return ret; @@ -132,11 +171,6 @@ bool AscendKernelRuntime::Init() { MS_EXCEPTION_IF_NULL(mem_manager_); mem_manager_->MallocDeviceMemory(); - ret = ProfilingManager::GetInstance().StartupProfiling(device_id_); - if (!ret) { - MS_EXCEPTION(DeviceProcessError) << "StartupProfiling failed."; - } - initialized_ = true; return ret; } @@ -259,6 +293,15 @@ bool AscendKernelRuntime::DumpData(mindspore::session::KernelGraph *graph) { return true; } +bool AscendKernelRuntime::NodeOutputDeviceAddressExist(const AnfNodePtr &kernel, size_t index) { + if (AnfAlgo::OutputAddrExist(kernel, index)) { + auto address = AnfAlgo::GetOutputAddr(kernel, index); + MS_EXCEPTION_IF_NULL(address); + return address->DeviceType() == DeviceAddressType::kAscend; + } + return false; +} + DeviceAddressPtr AscendKernelRuntime::CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format, TypeId type_id) { return std::make_shared(device_ptr, device_size, format, type_id); @@ -284,38 +327,34 @@ bool AscendKernelRuntime::GenTask(const session::KernelGraph *graph) { vector> task_info_list; auto anf_node_list = graph->execution_order(); TaskGenerator::GenTasks(anf_node_list, &task_info_list, graph->graph_id()); - // Store the task_info_list auto insert_ret = task_map_.insert(std::make_pair(graph->graph_id(), task_info_list)); if (!insert_ret.second) { MS_LOG(EXCEPTION) << "Duplicate GraphId! Please check in ascend_session."; } - // Graph may have no compute node, such TensorAddGrad. 
if (task_info_list.empty()) { MS_LOG(WARNING) << "graph " << graph->graph_id() << " have no compute node"; return true; } - - AscendStreamAssign &stream_assign_instance = AscendStreamAssign::GetInstance(); + AscendStreamAssign &assign_instance = AscendStreamAssign::GetInstance(); + AscendStreamMng &stream_manager = AscendStreamMng::GetInstance(); AscendLabelAssign &label_assign_instance = AscendLabelAssign::GetInstance(); // the streams' flag not HEAD_STREAM std::vector wait_active_stream_list; - stream_assign_instance.GetWaitStreams(&wait_active_stream_list); - auto force_copy_stream_list = stream_assign_instance.hcom_streams(); - - MS_LOG(INFO) << "call DavinciModel total stream num:" << stream_assign_instance.GetTotalStreamNum() - << ", total event num:" << stream_assign_instance.total_event_num() + assign_instance.GetWaitStreams(&wait_active_stream_list); + std::vector force_copy_stream_list; + assign_instance.GetHcomStreams(&force_copy_stream_list); + MS_LOG(INFO) << "call DavinciModel total stream num:" << stream_manager.GetCurAllocStreamNum() + << ", total event num:" << assign_instance.total_event_num() << ", total label num:" << label_assign_instance.GetLabelNum(NOT_NULL(graph)) << ", wait_active_stream_list size:" << wait_active_stream_list.size() << ", force_copy_stream_list size:" << force_copy_stream_list.size(); - std::vector> empty_list; std::shared_ptr model = std::make_shared( task_info_list, empty_list, empty_list, empty_list, empty_list, wait_active_stream_list, force_copy_stream_list, 0, - 0, 0, 0, 0, 0, stream_assign_instance.GetTotalStreamNum(), label_assign_instance.GetLabelNum(NOT_NULL(graph)), - stream_assign_instance.total_event_num(), 0); - + 0, 0, 0, 0, 0, stream_manager.GetCurAllocStreamNum(), label_assign_instance.GetLabelNum(NOT_NULL(graph)), + assign_instance.total_event_num(), 0); auto ret = graph_model_map_.insert(std::make_pair(graph->graph_id(), model)); if (!ret.second) { MS_LOG(EXCEPTION) << "Duplicate GraphId! 
Please check in ascend_session."; @@ -356,7 +395,8 @@ bool AscendKernelRuntime::LoadTask(const session::KernelGraph *graph) { } if (ProfilingManager::GetInstance().IsProfiling()) { auto task_ids = ge::model_runner::ModelRunner::Instance().GetTaskIdList(model_iter->first); - ProfilingUtils::ReportProfilingData(task_ids, NOT_NULL(graph)); + auto stream_ids = ge::model_runner::ModelRunner::Instance().GetStreamIdList(model_iter->first); + ProfilingUtils::ReportProfilingData(task_ids, stream_ids, NOT_NULL(graph)); } return true; } @@ -486,30 +526,23 @@ bool AscendKernelRuntime::HcclInit() { if (!context_ptr->IsTsdOpened()) { MS_LOG(EXCEPTION) << "Hccl dependent tsd is not open"; } - MS_LOG(INFO) << "do hcom init"; auto config_path_str = std::getenv("MINDSPORE_HCCL_CONFIG_PATH"); if (config_path_str == nullptr) { config_path_str = std::getenv("RANK_TABLE_FILE"); if (config_path_str == nullptr) { MS_LOG(ERROR) << "get hccl json config failed, please set env MINDSPORE_HCCL_CONFIG_PATH or RANK_TABLE_FILE"; + return false; } - return false; } + std::string rank_id_str = GetRankId(); auto full_path = realpath(config_path_str, nullptr); if (full_path == nullptr) { MS_LOG(ERROR) << "file path " << config_path_str << " does not exist"; return false; } - - const char *identify = std::getenv("RANK_ID"); - if (identify == nullptr) { - MS_LOG(ERROR) << "get hccl rankid failed, please set env RANK_ID"; - free(full_path); - return false; - } - MS_LOG(INFO) << "MINDSPORE_HCCL_CONFIG_PATH : " << full_path << ", RANK_ID: " << identify; - hcclResult_t res = hcom_init(full_path, identify); + MS_LOG(INFO) << "MINDSPORE_HCCL_CONFIG_PATH : " << full_path << ", RANK_ID: " << rank_id_str; + hcclResult_t res = hcom_init(full_path, rank_id_str.c_str()); free(full_path); if (res != HCCL_SUCCESS) { MS_LOG(ERROR) << "hcom init failed, res is " << static_cast(res); diff --git a/mindspore/ccsrc/device/ascend/ascend_kernel_runtime.h b/mindspore/ccsrc/device/ascend/ascend_kernel_runtime.h index 
336cfdc9f2..28076f95b7 100644 --- a/mindspore/ccsrc/device/ascend/ascend_kernel_runtime.h +++ b/mindspore/ccsrc/device/ascend/ascend_kernel_runtime.h @@ -41,11 +41,12 @@ class AscendKernelRuntime : public KernelRuntime { bool RunTask(const session::KernelGraph *graph) override; bool LoadTask(const session::KernelGraph *graph) override; void ClearGraphRuntimeResource(uint32_t graph_id) override; + bool SyncStream() override; protected: DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format, TypeId type_id) override; - bool SyncStream() override; + bool NodeOutputDeviceAddressExist(const AnfNodePtr &node, size_t index) override; private: bool InitDevice(); diff --git a/mindspore/ccsrc/device/ascend/ascend_label_assign.cc b/mindspore/ccsrc/device/ascend/ascend_label_assign.cc index 9908b5d03d..7af615f448 100644 --- a/mindspore/ccsrc/device/ascend/ascend_label_assign.cc +++ b/mindspore/ccsrc/device/ascend/ascend_label_assign.cc @@ -33,11 +33,9 @@ static void UpdateLabelGoto(NotNull node) { if (node->size() <= kLabelGotoLabelId) { MS_LOG(EXCEPTION) << "Node " << node->DebugString() << " has invalid input size " << node->size(); } - auto label_set = AnfAlgo::GetCNodePrimitive(node->input(kLabelGotoLabelId)); - MS_EXCEPTION_IF_NULL(label_set); - auto value = label_set->GetAttr(kAttrLabelIndex); - MS_EXCEPTION_IF_NULL(value); - uint32_t goto_label_id = GetValue(value); + + auto input = node->input(kLabelGotoLabelId); + uint32_t goto_label_id = AnfAlgo::GetNodeAttr(input, kAttrLabelIndex); AnfAlgo::SetNodeAttr(kAttrLabelIndex, MakeValue(goto_label_id), node.get()); MS_LOG(INFO) << "Node " << node->DebugString() << " goto label id " << goto_label_id; node->set_inputs({node->input(0)}); @@ -57,16 +55,12 @@ static void UpdateLabelSwitch(NotNull node) { break; } - auto label_set = AnfAlgo::GetCNodePrimitive(input); - MS_EXCEPTION_IF_NULL(label_set); - auto value = label_set->GetAttr(kAttrLabelIndex); - MS_EXCEPTION_IF_NULL(value); - 
uint32_t goto_label_id = GetValue(value); + uint32_t goto_label_id = AnfAlgo::GetNodeAttr(input, kAttrLabelIndex); label_list.push_back(goto_label_id); MS_LOG(INFO) << "Switch " << node->DebugString() << " case " << i - kLabelSwitchLabelId << ": id " << goto_label_id; } AnfAlgo::SetNodeAttr(kAttrLabelSwitchList, MakeValue>(label_list), node.get()); - node->set_inputs({node->input(0), node->input(1)}); + node->set_inputs({node->input(kAnfPrimitiveIndex), node->input(kFirstDataInputIndex)}); } static void AssignLabelForLabelSet(NotNull> graph, NotNull label_id, @@ -154,8 +148,8 @@ uint32_t AscendLabelAssign::GetLabelNum(NotNull gr std::lock_guard lock(label_num_mutex_); auto iter = label_num_.find(graph.get()); if (iter == label_num_.end()) { - MS_LOG(WARNING) << "Graph " << graph->ToString() << " has not assigned label."; - return 1; + MS_LOG(DEBUG) << "Graph " << graph->ToString() << " has not assigned label, defalut is 0."; + return 0; } return iter->second; } diff --git a/mindspore/ccsrc/device/ascend/ascend_memory_manager.cc b/mindspore/ccsrc/device/ascend/ascend_memory_manager.cc index 4c7b897cac..42c611c3af 100644 --- a/mindspore/ccsrc/device/ascend/ascend_memory_manager.cc +++ b/mindspore/ccsrc/device/ascend/ascend_memory_manager.cc @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - +#include #include "device/ascend/ascend_memory_manager.h" #include "device/ascend/ascend_memory_pool.h" #include "utils/context/ms_context.h" @@ -21,25 +21,52 @@ namespace mindspore { namespace device { namespace ascend { -const uint64_t kAscendDeviceMemGB = 26; -const uint64_t kAscendMemPoolGB = 4; -const uint64_t kAscendDeviceMemSize = (kAscendDeviceMemGB << 30); -const uint64_t kAscendMemPoolSize = (kAscendMemPoolGB << 30); +constexpr uint64_t kAscendDeviceMemGB = 26; +constexpr uint64_t kAscendMemPoolGB = 4; +constexpr uint64_t kMemSizeGB = 30; +constexpr uint64_t kMaxMemSizeGB = 30; +constexpr uint64_t kAscendDeviceMemSize = (kAscendDeviceMemGB << kMemSizeGB); +constexpr uint64_t kAscendMemPoolSize = (kAscendMemPoolGB << kMemSizeGB); void AscendMemoryManager::MallocDeviceMemory() { - device_mem_size_ = kAscendDeviceMemSize; + auto context_mem = GetDeviceMemSizeFromContext(); + device_mem_size_ = context_mem == 0 ? kAscendDeviceMemSize : context_mem; static_mem_offset_ = device_mem_size_; auto ret = rtMalloc(reinterpret_cast(&device_mem_base_), static_mem_offset_, RT_MEMORY_HBM); if (ret != RT_ERROR_NONE) { MS_EXCEPTION(DeviceProcessError) << "rtMalloc mem size[" << static_mem_offset_ << "] fail, ret[" << ret << "]"; } - device_mem_pool_size_ = kAscendMemPoolSize; - ret = rtMalloc(reinterpret_cast(&device_mem_pool_base_), device_mem_pool_size_, RT_MEMORY_HBM); - if (ret != RT_ERROR_NONE) { - MS_EXCEPTION(DeviceProcessError) << "rtMalloc mem size[" << device_mem_pool_size_ << "] fail, ret[" << ret << "]"; + + if (context_mem == 0) { + device_mem_pool_size_ = kAscendMemPoolSize; + ret = rtMalloc(reinterpret_cast(&device_mem_pool_base_), device_mem_pool_size_, RT_MEMORY_HBM); + if (ret != RT_ERROR_NONE) { + MS_EXCEPTION(DeviceProcessError) << "rtMalloc mem size[" << device_mem_pool_size_ << "] fail, ret[" << ret << "]"; + } + AscendMemoryPool::GetInstance().set_device_mem_pool_base(device_mem_pool_base_); + 
AscendMemoryPool::GetInstance().set_device_mem_pool_size(device_mem_pool_size_); + } +} + +uint64_t AscendMemoryManager::GetDeviceMemSizeFromContext() { + auto context = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(context); + auto variable_memory_max_size = context->variable_memory_max_size(); + if (variable_memory_max_size == "0") { + return 0; + } + MS_LOG(INFO) << "context variable_memory_max_size:" << variable_memory_max_size; + auto pos = variable_memory_max_size.find('*'); + if (pos == std::string::npos) { + MS_LOG(EXCEPTION) << "Invalid variable_memory_max_size"; + } + auto gb_str = variable_memory_max_size.substr(0, pos); + auto gb_var = std::stoull(gb_str); + MS_LOG(INFO) << "variable_memory_max_size(GB):" << gb_var; + if (gb_var > kMaxMemSizeGB || gb_var == 0) { + MS_LOG(EXCEPTION) << "Invalid allocate memory size:" << gb_var << " which should be in (0-30]GB"; } - AscendMemoryPool::GetInstance().set_device_mem_pool_base(device_mem_pool_base_); - AscendMemoryPool::GetInstance().set_device_mem_pool_size(device_mem_pool_size_); + return gb_var << kMemSizeGB; } void AscendMemoryManager::FreeDeviceMemory() { diff --git a/mindspore/ccsrc/device/ascend/ascend_memory_manager.h b/mindspore/ccsrc/device/ascend/ascend_memory_manager.h index 90c8b2dfca..7fdd8f553e 100644 --- a/mindspore/ccsrc/device/ascend/ascend_memory_manager.h +++ b/mindspore/ccsrc/device/ascend/ascend_memory_manager.h @@ -32,6 +32,8 @@ class AscendMemoryManager : public MemoryManager { private: uint8_t *device_mem_pool_base_{nullptr}; uint64_t device_mem_pool_size_{0}; + + uint64_t GetDeviceMemSizeFromContext(); }; } // namespace ascend } // namespace device diff --git a/mindspore/ccsrc/device/ascend/ascend_stream_assign.cc b/mindspore/ccsrc/device/ascend/ascend_stream_assign.cc index 5e15055a08..f0bad6b492 100644 --- a/mindspore/ccsrc/device/ascend/ascend_stream_assign.cc +++ b/mindspore/ccsrc/device/ascend/ascend_stream_assign.cc @@ -33,238 +33,220 @@ namespace device { namespace ascend { 
const uint32_t kHcomMaxTask = 5; const uint32_t kCommonMaxTask = 350; -const uint32_t kIndependFirstStreamId = 1024; -bool AscendStreamAssign::IsHcom(const CNodePtr &apply_kernel) { - MS_EXCEPTION_IF_NULL(apply_kernel); - return AnfAlgo::GetKernelType(apply_kernel) == HCCL_KERNEL; -} - -void AscendStreamAssign::ResetNew() { - total_common_stream_num_ = 0; - total_independ_stream_num_ = 0; - total_event_num_ = 0; - first_physic_id_ = UINT32_MAX; - first_logic_id_ = UINT32_MAX; - independent_id_ = kIndependFirstStreamId; - logic_to_independent_map_.clear(); - processed_logic_id_.clear(); - logic_to_physic_map_.clear(); - independent_before_physic_id_.clear(); - inner_parallel_streams_.clear(); - processed_parallel_streams_.clear(); - hcom_stream_list_.clear(); - need_first_active_streams_.clear(); -} - -void AscendStreamAssign::AssignIndependentStreamId(const CNodePtr &cur_cnode_ptr, uint32_t processing_logic_id) { - MS_EXCEPTION_IF_NULL(cur_cnode_ptr); - auto it = logic_to_independent_map_.find(processing_logic_id); - if (it == logic_to_independent_map_.end()) { - (void)logic_to_independent_map_.insert(std::make_pair(processing_logic_id, independent_id_)); - AnfAlgo::SetStreamId(independent_id_, cur_cnode_ptr.get()); - independent_id_++; - } else { - AnfAlgo::SetStreamId(it->second, cur_cnode_ptr.get()); - } - - if (first_physic_id_ == UINT32_MAX) { - auto res = std::find(independent_before_physic_id_.begin(), independent_before_physic_id_.end(), - AnfAlgo::GetStreamId(cur_cnode_ptr)); - if (res == independent_before_physic_id_.end()) { - independent_before_physic_id_.push_back(AnfAlgo::GetStreamId(cur_cnode_ptr)); - } - } -} - -void AscendStreamAssign::AssignCommonStreamId(const CNodePtr &cur_cnode_ptr, CNodePtr *pre_cnode_ptr, - uint32_t *cur_index, uint32_t *cur_stream_id) { - MS_EXCEPTION_IF_NULL(cur_cnode_ptr); - MS_EXCEPTION_IF_NULL(*pre_cnode_ptr); - bool over_max_hcom_task = (IsHcom(cur_cnode_ptr) && (*cur_index) % kHcomMaxTask == 0); - bool 
over_max_common_task = (!IsHcom(cur_cnode_ptr) && (*cur_index) % kCommonMaxTask == 0); - bool pre_common_cur_hcom = (IsHcom(cur_cnode_ptr) && !IsHcom(*pre_cnode_ptr)); - bool pre_hcom_cur_common = (!IsHcom(cur_cnode_ptr) && IsHcom(*pre_cnode_ptr)); - if (over_max_hcom_task || over_max_common_task || pre_common_cur_hcom || pre_hcom_cur_common) { - *cur_index = 0; - ++(*cur_stream_id); - } - - if (over_max_hcom_task || pre_common_cur_hcom) { - hcom_stream_list_.emplace_back(*cur_stream_id); - } - ++(*cur_index); - AnfAlgo::SetStreamId(*cur_stream_id, cur_cnode_ptr.get()); - *pre_cnode_ptr = cur_cnode_ptr; -} - -bool AscendStreamAssign::IsProcessed(uint32_t logic_id) { - auto it = std::find(processed_logic_id_.begin(), processed_logic_id_.end(), logic_id); - if (it == processed_logic_id_.end()) { - return false; - } - - return true; -} +void AscendStreamAssign::AssignStream(const shared_ptr &graph_ptr) { + if (IsTaskSink()) { + Reset(); + ReorderIndependentOrders(graph_ptr); + AssignAllNodesStream(graph_ptr); + UpdateAtomicAddrCleanStreamId(graph_ptr); + FindHcomParallelStreams(graph_ptr); + InsertStreamActive(graph_ptr); + InsertSendRecvForHcomParallel(graph_ptr); + InsertSendRecvForIndependent(graph_ptr); + UpdateEventId(graph_ptr); + GetNeedActiveStreams(graph_ptr); + graph_ptr->PrintGraphExecuteOrder(); + CheckStreamAssign(graph_ptr); + MS_LOG(INFO) << "after finish stream assign"; -void AscendStreamAssign::RecordIdMap(uint32_t logic_id, uint32_t physic_id) { - auto it = logic_to_physic_map_.find(logic_id); - if (it == logic_to_physic_map_.end()) { - MS_LOG(INFO) << "New logic_id[" << logic_id << "] to physic_id[" << physic_id << "]"; - (void)logic_to_physic_map_.insert(std::make_pair(logic_id, physic_id)); + // Get info for D Model + AscendStreamMng &stream_manager = AscendStreamMng::GetInstance(); + generator::IRModelUtil::GetInstance().set_event_num(total_event_num()); + 
generator::IRModelUtil::GetInstance().set_stream_num(stream_manager.GetCurAllocStreamNum()); + // Init to 1,temporarily + generator::IRModelUtil::GetInstance().set_batch_num(1); } } -void AscendStreamAssign::RecordFirstCommonOp(const CNodePtr &cur_cnode_ptr, uint32_t cur_node_logic_id, - uint32_t cur_stream_id) { - AnfAlgo::SetStreamId(cur_stream_id, cur_cnode_ptr.get()); - RecordIdMap(cur_node_logic_id, cur_stream_id); - first_physic_id_ = cur_stream_id; - first_logic_id_ = cur_node_logic_id; -} +// section 0 +void AscendStreamAssign::CheckStreamAssign(const shared_ptr &graph_ptr) { + MS_EXCEPTION_IF_NULL(graph_ptr); + std::set streams; + uint32_t max_stream = 0; + uint32_t min_stream = kInvalidStreamId; + const std::vector &cnode_ptr_list = graph_ptr->execution_order(); + for (size_t i = 0; i < cnode_ptr_list.size(); ++i) { + CNodePtr cur_cnode_ptr = cnode_ptr_list[i]; + MS_EXCEPTION_IF_NULL(cur_cnode_ptr); + uint32_t stream_id = AnfAlgo::GetStreamId(cur_cnode_ptr); + if (stream_id == kInvalidStreamId) { + MS_LOG(EXCEPTION) << "node [" << AnfAlgo::GetCNodeName(cur_cnode_ptr) << "] had not been assigned streams"; + } -uint32_t AscendStreamAssign::GetLogicId(const CNodePtr &cur_cnode_ptr) { - uint32_t logic_id = AnfAlgo::GetStreamDistinctionLabel(cur_cnode_ptr.get()); - if (logic_id == kInvalidDistincLabel) { - MS_LOG(EXCEPTION) << "node[" << cur_cnode_ptr->DebugString() << "] logic id is invalid"; + streams.emplace(stream_id); + if (stream_id > max_stream) { + max_stream = stream_id; + } + if (stream_id < min_stream) { + min_stream = stream_id; + } } - return logic_id; -} -void AscendStreamAssign::SetCommonStreamNum(uint32_t cur_stream_id) { - if (first_physic_id_ == UINT32_MAX) { - MS_LOG(INFO) << "cur common node size is zero"; - total_common_stream_num_ = 0; - } else { - total_common_stream_num_ = cur_stream_id + 1; + if (!streams.empty()) { + if (min_stream != 0) { + MS_LOG(EXCEPTION) << "before stream assign, assigned stream should start from 0, now is from " 
<< min_stream; + } + if (max_stream != (streams.size() - 1)) { + MS_LOG(EXCEPTION) << "before stream assign, assigned stream should be consecutive"; + } } } +// section 1 void AscendStreamAssign::AssignAllNodesStream(const shared_ptr &graph_ptr) { MS_EXCEPTION_IF_NULL(graph_ptr); auto cnode_ptr_list = graph_ptr->execution_order(); CNodePtr pre_cnode_ptr = nullptr; uint32_t cur_index = 0; uint32_t cur_stream_id = 0; - uint32_t processing_logic_id = UINT32_MAX; + bool exit_independent = false; + AscendStreamMng &stream_manager = AscendStreamMng::GetInstance(); for (size_t i = 0; i < cnode_ptr_list.size(); ++i) { CNodePtr cur_cnode_ptr = cnode_ptr_list[i]; MS_EXCEPTION_IF_NULL(cur_cnode_ptr); - // get logic id - uint32_t cur_node_logic_id = GetLogicId(cur_cnode_ptr); + if (AnfAlgo::GetStreamId(cur_cnode_ptr) != kInvalidStreamId) { + continue; + } if (IsIndependentNode(cur_cnode_ptr)) { - AssignIndependentStreamId(cur_cnode_ptr, cur_node_logic_id); + exit_independent = true; continue; } + // first common node, only exe one time if (pre_cnode_ptr == nullptr) { - RecordFirstCommonOp(cur_cnode_ptr, cur_node_logic_id, cur_stream_id); - processing_logic_id = cur_node_logic_id; + uint32_t cur_stream_num = stream_manager.GetCurAllocStreamNum(); + if (cur_stream_num == 0) { + cur_stream_id = stream_manager.ApplyNewStream(); + } else { + cur_stream_id = stream_manager.GetCurAllocStream(); + } ++cur_index; pre_cnode_ptr = cur_cnode_ptr; + AnfAlgo::SetStreamId(cur_stream_id, cur_cnode_ptr.get()); + if (IsHcom(cur_cnode_ptr)) { + hcom_stream_list_.emplace(cur_stream_id); + } continue; } - // 1.has been processed - if (IsProcessed(cur_node_logic_id)) { - continue; - } + AssignCommonStreamId(cur_cnode_ptr, &pre_cnode_ptr, &cur_index, &cur_stream_id); + } - if (cur_node_logic_id == processing_logic_id) { - AssignCommonStreamId(cur_cnode_ptr, &pre_cnode_ptr, &cur_index, &cur_stream_id); - } else { - // 1.find other same logic id - for (size_t j = i; j < cnode_ptr_list.size(); ++j) { - 
CNodePtr cnode_ptr = cnode_ptr_list[j]; - MS_EXCEPTION_IF_NULL(cnode_ptr); - uint32_t logic_id = AnfAlgo::GetStreamDistinctionLabel(cnode_ptr.get()); - if (logic_id == processing_logic_id) { - AssignCommonStreamId(cnode_ptr, &pre_cnode_ptr, &cur_index, &cur_stream_id); - } + if (exit_independent) { + uint32_t first_independent_stream_id = stream_manager.ApplyNewStream(); + for (size_t i = 0; i < cnode_ptr_list.size(); ++i) { + CNodePtr cur_cnode_ptr = cnode_ptr_list[i]; + MS_EXCEPTION_IF_NULL(cur_cnode_ptr); + if (AnfAlgo::GetStreamId(cur_cnode_ptr) != kInvalidStreamId) { + continue; + } + if (IsIndependentNode(cur_cnode_ptr)) { + AssignIndependentStreamId(cur_cnode_ptr); } - // 2.after deal: - processed_logic_id_.push_back(processing_logic_id); - cur_cnode_ptr = cnode_ptr_list[i]; - // 3. new stream - ++cur_stream_id; - AnfAlgo::SetStreamId(cur_stream_id, cur_cnode_ptr.get()); - cur_index = 1; - - pre_cnode_ptr = cur_cnode_ptr; - processing_logic_id = cur_node_logic_id; - RecordIdMap(processing_logic_id, cur_stream_id); } + MS_LOG(INFO) << "independent start from :" << first_independent_stream_id; } - SetCommonStreamNum(cur_stream_id); - total_independ_stream_num_ = independent_id_ - kIndependFirstStreamId; - MS_LOG(INFO) << "stream nums:common:" << total_common_stream_num_ << ",independ:" << total_independ_stream_num_; + MS_LOG(INFO) << "total stream nums:" << stream_manager.GetCurAllocStreamNum(); } -void AscendStreamAssign::TransLogicToPhysic(const vector &logic_ids, vector *physic_ids) { - for (auto &id : logic_ids) { - auto it = logic_to_physic_map_.find(id); - if (it != logic_to_physic_map_.end()) { - MS_LOG(INFO) << "logic id[" << id << "] to physic id[" << it->second << "]"; - (*physic_ids).push_back(it->second); +void AscendStreamAssign::AssignIndependentStreamId(const CNodePtr &cur_cnode_ptr) { + MS_EXCEPTION_IF_NULL(cur_cnode_ptr); + AscendStreamMng &stream_manager = AscendStreamMng::GetInstance(); + uint32_t cur_independent_id = 
stream_manager.GetCurAllocStream(); + auto it = independent_stream_map_.find(cur_independent_id); + if (it == independent_stream_map_.end()) { + AnfAlgo::SetStreamId(cur_independent_id, cur_cnode_ptr.get()); + independent_stream_map_.emplace(cur_independent_id, 1); + } else { + if (it->second < kCommonMaxTask) { + AnfAlgo::SetStreamId(it->first, cur_cnode_ptr.get()); + it->second++; } else { - MS_LOG(EXCEPTION) << "logic id[" << id << "] has no correspond physic id"; + cur_independent_id = stream_manager.ApplyNewStream(); + AnfAlgo::SetStreamId(cur_independent_id, cur_cnode_ptr.get()); + independent_stream_map_.emplace(cur_independent_id, 1); } + } +} + +bool AscendStreamAssign::IsIndependentNode(const CNodePtr &node_ptr) { + MS_EXCEPTION_IF_NULL(node_ptr); + if (AnfAlgo::GetKernelType(node_ptr) != AICPU_KERNEL) { + return false; + } - auto it_independ = logic_to_independent_map_.find(id); - if (it_independ != logic_to_independent_map_.end()) { - MS_LOG(INFO) << "logic id[" << id << "] to independent id[" << it_independ->second << "]"; - (*physic_ids).push_back(it_independ->second); + if (AnfAlgo::GetCNodeName(node_ptr) == kGetNextOpName) { + MS_LOG(INFO) << "GetNext should not be independent node"; + return false; + } + + uint32_t input_nums = AnfAlgo::GetInputTensorNum(node_ptr); + if (input_nums == 0) { + MS_LOG(INFO) << "node " << node_ptr->fullname_with_scope() << " is independent, as inputs nums is zero"; + return true; + } + + const std::vector &inputs = node_ptr->inputs(); + for (size_t i = 1; i < inputs.size(); i++) { + if (!inputs[i]->isa()) { + return false; } } + MS_LOG(INFO) << "node " << node_ptr->fullname_with_scope() << " is independent, as inputs is all value node"; + return true; } -void AscendStreamAssign::UpdateStreamActive(const CNodePtr &active_ptr) { - MS_LOG(INFO) << "start update outter active op[" << active_ptr->DebugString() << "] "; - MS_EXCEPTION_IF_NULL(active_ptr); - auto primitive = AnfAlgo::GetCNodePrimitive(active_ptr); - 
MS_EXCEPTION_IF_NULL(primitive); - vector active_logic_ids = GetValue>(primitive->GetAttr(kAttrActiveStreamList)); - // out StreamAcitve active physic stream is not parallel now, if parallel, should deal here. - vector active_physic_ids; - TransLogicToPhysic(active_logic_ids, &active_physic_ids); - ValuePtr active_physic_value = MakeValue>(active_physic_ids); - AnfAlgo::SetNodeAttr(kAttrActiveStreamList, active_physic_value, active_ptr); -} +void AscendStreamAssign::AssignCommonStreamId(const CNodePtr &cur_cnode_ptr, CNodePtr *pre_cnode_ptr, + uint32_t *cur_index, uint32_t *cur_stream_id) { + MS_EXCEPTION_IF_NULL(cur_cnode_ptr); + MS_EXCEPTION_IF_NULL(pre_cnode_ptr); + MS_EXCEPTION_IF_NULL(*pre_cnode_ptr); + AscendStreamMng &stream_manager = AscendStreamMng::GetInstance(); + bool over_max_hcom_task = (IsHcom(cur_cnode_ptr) && (*cur_index) % kHcomMaxTask == 0); + bool over_max_common_task = (!IsHcom(cur_cnode_ptr) && (*cur_index) % kCommonMaxTask == 0); + bool pre_common_cur_hcom = (IsHcom(cur_cnode_ptr) && !IsHcom(*pre_cnode_ptr)); + bool pre_hcom_cur_common = (!IsHcom(cur_cnode_ptr) && IsHcom(*pre_cnode_ptr)); + if (over_max_hcom_task || over_max_common_task || pre_common_cur_hcom || pre_hcom_cur_common) { + *cur_index = 0; + *cur_stream_id = stream_manager.ApplyNewStream(); + } -void AscendStreamAssign::UpdateStreamSwitch(const CNodePtr &switch_ptr, const CNodePtr &active_ptr) { - MS_LOG(INFO) << "start update switch op[" << switch_ptr->DebugString() << "]"; - MS_EXCEPTION_IF_NULL(switch_ptr); - MS_EXCEPTION_IF_NULL(active_ptr); - auto primitive = AnfAlgo::GetCNodePrimitive(switch_ptr); - MS_EXCEPTION_IF_NULL(primitive); - auto true_logic_id = GetValue(primitive->GetAttr(kAttrTrueBranchStream)); - MS_LOG(INFO) << "streamswtich stream id[" << AnfAlgo::GetStreamId(switch_ptr) << "], true_logic_id[" << true_logic_id - << "]"; - vector logic_ids{true_logic_id}; - vector physic_ids; - TransLogicToPhysic(logic_ids, &physic_ids); - if (physic_ids.empty()) { - 
MS_LOG(EXCEPTION) << "stream switch true logic id[" << true_logic_id << "] has no physical id"; + ++(*cur_index); + AnfAlgo::SetStreamId(*cur_stream_id, cur_cnode_ptr.get()); + *pre_cnode_ptr = cur_cnode_ptr; + + // record ll hcom streams as hcom stream has different stream flag + if (IsHcom(cur_cnode_ptr)) { + auto it = std::find(hcom_stream_list_.begin(), hcom_stream_list_.end(), *cur_stream_id); + if (it == hcom_stream_list_.end()) { + MS_LOG(INFO) << "hcom stream id:" << *cur_stream_id; + hcom_stream_list_.emplace(*cur_stream_id); + } } - ValuePtr true_index = MakeValue(physic_ids[0]); - AnfAlgo::SetNodeAttr(kAttrTrueBranchStream, true_index, switch_ptr); +} - MS_LOG(INFO) << "start update StreamActive op[" << active_ptr->DebugString() << "]"; - AnfAlgo::SetStreamId(physic_ids[0], active_ptr.get()); - vector active_ids; - for (size_t i = 0; i < physic_ids.size(); i++) { - if (i == 0) { - MS_LOG(INFO) << "StreamActive op self stream id[" << physic_ids[i] << "]"; - } else { - MS_LOG(INFO) << "StreamActive op active stream id[" << physic_ids[i] << "]"; - active_ids.emplace_back(physic_ids[i]); +// section 2: +void AscendStreamAssign::UpdateAtomicAddrCleanStreamId(const shared_ptr &graph_ptr) { + MS_LOG(INFO) << "start"; + MS_EXCEPTION_IF_NULL(graph_ptr); + const std::vector &cnode_ptr_list = graph_ptr->execution_order(); + for (size_t i = 0; i < cnode_ptr_list.size(); ++i) { + CNodePtr cur_cnode_ptr = cnode_ptr_list[i]; + MS_EXCEPTION_IF_NULL(cur_cnode_ptr); + // update AtomicAddrClean stream same witch the next node + if (i > 0 && AnfAlgo::GetCNodeName(cnode_ptr_list[i - 1]) == kAtomicAddrCleanOpName) { + MS_LOG(INFO) << "update AtomicAddrClean stream id from[" << AnfAlgo::GetStreamId(cnode_ptr_list[i - 1]) + << "] to [" << AnfAlgo::GetStreamId(cur_cnode_ptr) << "]"; + AnfAlgo::SetStreamId(AnfAlgo::GetStreamId(cur_cnode_ptr), cnode_ptr_list[i - 1].get()); } } - AnfAlgo::SetNodeAttr(kAttrActiveStreamList, MakeValue>(active_ids), active_ptr); + MS_LOG(INFO) << 
"end"; } -void AscendStreamAssign::FindAllReduceParallel(const shared_ptr &graph_ptr) { +// section 3 +void AscendStreamAssign::FindHcomParallelStreams(const shared_ptr &graph_ptr) { MS_EXCEPTION_IF_NULL(graph_ptr); CNodePtr cur_cnode_ptr = nullptr; CNodePtr pre_cnode_ptr = nullptr; @@ -280,9 +262,9 @@ void AscendStreamAssign::FindAllReduceParallel(const shared_ptr{pre_stream_id, cur_stream_id}); } @@ -291,45 +273,107 @@ void AscendStreamAssign::FindAllReduceParallel(const shared_ptr &graph_ptr) { +// section 4 +void AscendStreamAssign::UpdateStreamSwitch(const std::shared_ptr &graph_ptr, + const CNodePtr &switch_ptr, const vector &independent_stream, + vector *orders) { + MS_EXCEPTION_IF_NULL(orders); + orders->emplace_back(switch_ptr); + auto primitive = AnfAlgo::GetCNodePrimitive(switch_ptr); + MS_EXCEPTION_IF_NULL(primitive); + auto value_ptr = primitive->GetAttr(kStreamNeedActivedFirst); + if (value_ptr == nullptr) { + return; + } + + auto need_active = GetValue(value_ptr); + if (!need_active) { + return; + } + + MS_LOG(INFO) << "start update switch op[" << switch_ptr->DebugString() << "]"; + MS_EXCEPTION_IF_NULL(switch_ptr); + auto true_stream_id = GetValue(primitive->GetAttr(kAttrTrueBranchStream)); + MS_LOG(INFO) << "streamswtich stream id[" << AnfAlgo::GetStreamId(switch_ptr) << "], true_logic_id[" << true_stream_id + << "]"; + + CNodePtr active_ptr = KernelAdjust::GetInstance().CreateStreamActiveOp(graph_ptr); + MS_LOG(INFO) << "start update StreamActive op[" << active_ptr->DebugString() << "]"; + AnfAlgo::SetStreamId(true_stream_id, active_ptr.get()); + AnfAlgo::SetNodeAttr(kAttrActiveStreamList, MakeValue>(independent_stream), active_ptr); + independent_stream_activated_ = true; + + // update processed stream + for (auto &item : independent_stream) { + processed_streams_.emplace(item); + } + + orders->emplace_back(active_ptr); +} // namespace ascend + +void AscendStreamAssign::InsertStreamActive(const std::shared_ptr &graph_ptr) { MS_LOG(INFO) << 
"start"; MS_EXCEPTION_IF_NULL(graph_ptr); - auto cnode_ptr_list = graph_ptr->execution_order(); - vector cnodes = cnode_ptr_list; - uint32_t cur_event_id = 0; - auto it = cnodes.begin(); - while (it != cnodes.end() && (it + 1) != cnodes.end()) { - MS_EXCEPTION_IF_NULL(*it); - MS_EXCEPTION_IF_NULL(*(it + 1)); - if (IsHcom(*it) && !IsHcom(*(it + 1))) { - CNodePtr send_cnode_ptr = CreateSendApplyKernel(graph_ptr, cur_event_id, AnfAlgo::GetStreamId(*it)); - it = cnodes.insert(it + 1, send_cnode_ptr); + std::vector update_cnode_list; + CNodePtr cur_cnode_ptr = nullptr; + CNodePtr pre_cnode_ptr = nullptr; + uint32_t pre_stream_id = UINT32_MAX; + std::vector independent_stream; + MS_LOG(INFO) << "independent stream size:" << independent_stream_map_.size(); + for (auto item : independent_stream_map_) { + independent_stream.emplace_back(item.first); + } - auto target = FindTargetOp(it, cnodes.end(), *(it - 1)); - if (target == cnodes.end()) { - MS_LOG(WARNING) << "hcom node[" << (*(it - 1))->fullname_with_scope() - << "] can't find target for insert recv op, no insert send/recv"; - it = cnodes.erase(it); - continue; - } + bool independent_flag = !(independent_stream.empty()); - // deal recv op - uint32_t stream_id = AnfAlgo::GetStreamId(*target); - CNodePtr recv_cnode_ptr = CreateRecvApplyKernel(graph_ptr, cur_event_id, stream_id); - (void)cnodes.insert(target, recv_cnode_ptr); - ++cur_event_id; + const std::vector &cnode_ptr_list = graph_ptr->execution_order(); + for (size_t i = 0; i < cnode_ptr_list.size(); ++i) { + cur_cnode_ptr = cnode_ptr_list[i]; + MS_EXCEPTION_IF_NULL(cur_cnode_ptr); + uint32_t cur_stream_id = AnfAlgo::GetStreamId(cur_cnode_ptr); + if (IsIndependentNode(cur_cnode_ptr)) { + update_cnode_list.emplace_back(cur_cnode_ptr); + continue; } - ++it; + + bool inner_active = false; + if (pre_cnode_ptr != nullptr) { + inner_active = pre_stream_id != cur_stream_id && AnfAlgo::GetCNodeName(pre_cnode_ptr) != kStreamSwitchOpName && + 
AnfAlgo::GetCNodeName(pre_cnode_ptr) != kSendOpName; + } + + bool processed = IsProcessedStream(cur_stream_id); + // 1)inner stream assign, need insert active op + if (inner_active && !processed) { + MS_LOG(INFO) << "Inner insert active op, self stream id[" << pre_stream_id << "]"; + CNodePtr active_ptr = KernelAdjust::GetInstance().CreateStreamActiveOp(graph_ptr); + // 1.set stream id + AnfAlgo::SetStreamId(pre_stream_id, active_ptr.get()); + // 2.set active stream ids + std::vector active_index_list; + GetParallelStream(cur_stream_id, pre_stream_id, &active_index_list); + AnfAlgo::SetNodeAttr(kAttrActiveStreamList, MakeValue>(active_index_list), active_ptr); + update_cnode_list.emplace_back(active_ptr); + } + + if (independent_flag && (AnfAlgo::GetCNodeName(cur_cnode_ptr) == kStreamSwitchOpName)) { + MS_LOG(INFO) << "Insert StreamActive op after FP StreamSwitch for stream parallel"; + UpdateStreamSwitch(graph_ptr, cur_cnode_ptr, independent_stream, &update_cnode_list); + } else { + update_cnode_list.emplace_back(cur_cnode_ptr); + } + + processed_streams_.emplace(cur_stream_id); + pre_stream_id = cur_stream_id; + pre_cnode_ptr = cur_cnode_ptr; } - graph_ptr->set_execution_order(cnodes); - total_event_num_ = cur_event_id; - MS_LOG(INFO) << "after insert send/recv for hcom parallel, total event nums[" << total_event_num_ << "]"; + graph_ptr->set_execution_order(update_cnode_list); MS_LOG(INFO) << "end"; } -bool AscendStreamAssign::IsProcessedParallelStream(uint32_t stream_id) { - auto it = std::find(processed_parallel_streams_.begin(), processed_parallel_streams_.end(), stream_id); - if (it != processed_parallel_streams_.end()) { +bool AscendStreamAssign::IsProcessedStream(uint32_t stream_id) { + auto it = std::find(processed_streams_.begin(), processed_streams_.end(), stream_id); + if (it != processed_streams_.end()) { return true; } return false; @@ -337,8 +381,9 @@ bool AscendStreamAssign::IsProcessedParallelStream(uint32_t stream_id) { void 
AscendStreamAssign::GetParallelStream(uint32_t cur_stream_id, uint32_t stream_acitve_id, vector *parallel_streams) { + MS_EXCEPTION_IF_NULL(parallel_streams); for (size_t i = 0; i < inner_parallel_streams_.size(); i++) { - auto cur_parallel_streams = inner_parallel_streams_[i]; + const auto &cur_parallel_streams = inner_parallel_streams_[i]; auto it = std::find(cur_parallel_streams.begin(), cur_parallel_streams.end(), cur_stream_id); if (it != cur_parallel_streams.end()) { MS_LOG(INFO) << "stream id:" << cur_stream_id << " is parallel stream"; @@ -349,74 +394,118 @@ void AscendStreamAssign::GetParallelStream(uint32_t cur_stream_id, uint32_t stre continue; } (*parallel_streams).emplace_back(cur_parallel_streams[j]); + processed_streams_.emplace(cur_parallel_streams[j]); } - - // record processed parallel streams - (void)std::copy((*parallel_streams).begin(), (*parallel_streams).end(), - std::back_inserter(processed_parallel_streams_)); return; } } + processed_streams_.emplace(cur_stream_id); (*parallel_streams).push_back(cur_stream_id); } -void AscendStreamAssign::InsertActiveNew(const std::shared_ptr &graph_ptr) { +// section5 +void AscendStreamAssign::InsertSendRecvForDiffHcom(const shared_ptr &graph_ptr) { MS_LOG(INFO) << "start"; MS_EXCEPTION_IF_NULL(graph_ptr); - std::vector update_cnode_list; - CNodePtr cur_cnode_ptr = nullptr; - CNodePtr pre_cnode_ptr = nullptr; - uint32_t pre_stream_id = UINT32_MAX; - auto cnode_ptr_list = graph_ptr->execution_order(); - for (size_t i = 0; i < cnode_ptr_list.size(); ++i) { - cur_cnode_ptr = cnode_ptr_list[i]; - MS_EXCEPTION_IF_NULL(cur_cnode_ptr); - uint32_t cur_stream_id = AnfAlgo::GetStreamId(cur_cnode_ptr); - if (cur_stream_id >= kIndependFirstStreamId) { - update_cnode_list.emplace_back(cur_cnode_ptr); + vector fusion_hcom_index; + vector orders; + for (size_t i = 0; i < cnode_ptr_list.size(); i++) { + auto cur_cnode = cnode_ptr_list[i]; + if (IsFusionHcom(cur_cnode)) { + fusion_hcom_index.emplace_back(i); + } + } + if 
(fusion_hcom_index.size() < 2) { + MS_LOG(INFO) << "fusion hcom size is less than 2, no need insert event between them"; + return; + } + uint32_t first_index = fusion_hcom_index[0]; + uint32_t last_index = fusion_hcom_index[fusion_hcom_index.size() - 1]; + uint32_t cur_event_id = total_event_num_; + uint32_t pre_hcom_stream_id = kInvalidStreamId; + std::copy(cnode_ptr_list.begin(), cnode_ptr_list.begin() + first_index, std::back_inserter(orders)); + for (size_t i = first_index; i <= last_index; i++) { + auto cur_cnode = cnode_ptr_list[i]; + auto it = std::find(fusion_hcom_index.begin(), fusion_hcom_index.end(), i); + if (it == fusion_hcom_index.end()) { + orders.emplace_back(cur_cnode); continue; } - - bool inner_active = pre_stream_id != cur_stream_id && pre_stream_id < cur_stream_id && - AnfAlgo::GetCNodeName(pre_cnode_ptr) != kStreamSwitchOpName && - AnfAlgo::GetCNodeName(pre_cnode_ptr) != kStreamActiveOpName && - AnfAlgo::GetCNodeName(pre_cnode_ptr) != kSendOpName; - bool processed = IsProcessedParallelStream(cur_stream_id); - // 1)inner stream assign, need insert active op - if (inner_active && !processed) { - MS_LOG(INFO) << "Inner insert active op, self stream id[" << pre_stream_id << "]"; - CNodePtr active_ptr = KernelAdjust::GetInstance().CreateStreamActiveOp(graph_ptr); - update_cnode_list.emplace_back(active_ptr); - // 1.set stream id - AnfAlgo::SetStreamId(pre_stream_id, active_ptr.get()); - // 2.set active stream ids - std::vector active_index_list; - GetParallelStream(cur_stream_id, pre_stream_id, &active_index_list); - AnfAlgo::SetNodeAttr(kAttrActiveStreamList, MakeValue>(active_index_list), active_ptr); + auto cur_hcom_stream_id = AnfAlgo::GetStreamId(cur_cnode); + if (cur_hcom_stream_id == pre_hcom_stream_id) { + orders.emplace_back(cur_cnode); + continue; } - // inner_active is not a if/else relationship with the next if/else. 
such as:StreamActive(S7)-->StreamActive(S8) - if (AnfAlgo::GetCNodeName(cur_cnode_ptr) == kStreamActiveOpName && - AnfAlgo::GetStreamDistinctionLabel(cur_cnode_ptr.get()) != UINT32_MAX) { - // 2)outter stream assign, update active op - update_cnode_list.emplace_back(cur_cnode_ptr); - UpdateStreamActive(cur_cnode_ptr); - } else if (AnfAlgo::GetCNodeName(cur_cnode_ptr) == kStreamSwitchOpName) { - // 3)update switch op - MS_LOG(INFO) << "Insert active op after switch"; - CNodePtr active_ptr = KernelAdjust::GetInstance().CreateStreamActiveOp(graph_ptr); - update_cnode_list.emplace_back(cur_cnode_ptr); - update_cnode_list.emplace_back(active_ptr); - UpdateStreamSwitch(cur_cnode_ptr, active_ptr); + if (i == first_index) { + // first fusion hcom + orders.emplace_back(cur_cnode); + auto send = CreateSendApplyKernel(graph_ptr, cur_event_id, cur_hcom_stream_id); + orders.emplace_back(send); + } else if (i == last_index) { + // last fusion hcom + auto recv = CreateRecvApplyKernel(graph_ptr, cur_event_id, cur_hcom_stream_id); + orders.emplace_back(recv); + orders.emplace_back(cur_cnode); + cur_event_id++; } else { - update_cnode_list.emplace_back(cur_cnode_ptr); + auto recv = CreateRecvApplyKernel(graph_ptr, cur_event_id, cur_hcom_stream_id); + orders.emplace_back(recv); + cur_event_id++; + orders.emplace_back(cur_cnode); + auto send = CreateSendApplyKernel(graph_ptr, cur_event_id, cur_hcom_stream_id); + orders.emplace_back(send); } + pre_hcom_stream_id = cur_hcom_stream_id; + } + std::copy(cnode_ptr_list.begin() + last_index + 1, cnode_ptr_list.end(), std::back_inserter(orders)); + graph_ptr->set_execution_order(orders); + total_event_num_ = cur_event_id; + MS_LOG(INFO) << "after indsert between allreduce, total event nums[" << total_event_num_ << "]\n end"; +} - pre_stream_id = cur_stream_id; - pre_cnode_ptr = cur_cnode_ptr; +void AscendStreamAssign::InsertSendRecvForHcomParallel(const shared_ptr &graph_ptr) { + MS_LOG(INFO) << "start"; + MS_EXCEPTION_IF_NULL(graph_ptr); + 
auto cnode_ptr_list = graph_ptr->execution_order(); + vector cnodes = cnode_ptr_list; + uint32_t cur_event_id = 0; + auto it = cnodes.begin(); + while (it != cnodes.end() && (it + 1) != cnodes.end()) { + MS_EXCEPTION_IF_NULL(*it); + MS_EXCEPTION_IF_NULL(*(it + 1)); + if (IsHcom(*it) && !IsHcom(*(it + 1))) { + bool is_fusion = IsFusionHcom(*it); + if (!is_fusion) { + ++it; + continue; + } + CNodePtr send_cnode_ptr = CreateSendApplyKernel(graph_ptr, cur_event_id, AnfAlgo::GetStreamId(*it)); + it = cnodes.insert(it + 1, send_cnode_ptr); + + auto target = FindTargetOp(it, cnodes.end(), *(it - 1)); + if (target == cnodes.end()) { + MS_LOG(WARNING) << "hcom node[" << (*(it - 1))->fullname_with_scope() + << "] can't find target for insert recv op, no insert send/recv"; + it = cnodes.erase(it); + continue; + } + + // deal recv op + uint32_t stream_id = AnfAlgo::GetStreamId(*target); + CNodePtr recv_cnode_ptr = CreateRecvApplyKernel(graph_ptr, cur_event_id, stream_id); + (void)cnodes.insert(target, recv_cnode_ptr); + ++cur_event_id; + } + ++it; } - graph_ptr->set_execution_order(update_cnode_list); + graph_ptr->set_execution_order(cnodes); + total_event_num_ = cur_event_id; + MS_LOG(INFO) << "after insert send/recv for hcom parallel, total event nums[" << total_event_num_ << "]"; + + // Insert Send/Recv between Hcom(such as:AllReduce1 Send1 Common Recv1 AllReduce2) + InsertSendRecvForDiffHcom(graph_ptr); MS_LOG(INFO) << "end"; } @@ -451,70 +540,23 @@ void AscendStreamAssign::UpdateEventId(const shared_ptr &g } } -void AscendStreamAssign::UpdateStreamId(const shared_ptr &graph_ptr) { - MS_LOG(INFO) << "start"; - MS_EXCEPTION_IF_NULL(graph_ptr); - CNodePtr cur_cnode_ptr = nullptr; - auto cnode_ptr_list = graph_ptr->execution_order(); - for (size_t i = 0; i < cnode_ptr_list.size(); ++i) { - cur_cnode_ptr = cnode_ptr_list[i]; - MS_EXCEPTION_IF_NULL(cur_cnode_ptr); - uint32_t cur_stream_id = AnfAlgo::GetStreamId(cur_cnode_ptr); - if (cur_stream_id < kIndependFirstStreamId) { - 
if (AnfAlgo::GetCNodeName(cur_cnode_ptr) == kStreamActiveOpName) { - auto primitive = AnfAlgo::GetCNodePrimitive(cur_cnode_ptr); - MS_EXCEPTION_IF_NULL(primitive); - vector active_ids = GetValue>(primitive->GetAttr(kAttrActiveStreamList)); - for (size_t j = 0; j < active_ids.size(); j++) { - if (active_ids[j] >= kIndependFirstStreamId) { - active_ids[j] = active_ids[j] - kIndependFirstStreamId + total_common_stream_num_; - } - } - ValuePtr active_value = MakeValue>(active_ids); - AnfAlgo::SetNodeAttr(kAttrActiveStreamList, active_value, cur_cnode_ptr); - } - } else { - uint32_t update_id = cur_stream_id - kIndependFirstStreamId + total_common_stream_num_; - AnfAlgo::SetStreamId(update_id, cur_cnode_ptr.get()); - } - - // update AtomicAddrClean stream same witch the next node - if (i > 0 && AnfAlgo::GetCNodeName(cnode_ptr_list[i - 1]) == "AtomicAddrClean") { - MS_LOG(INFO) << "update AtomicAddrClean stream id from[" << AnfAlgo::GetStreamId(cnode_ptr_list[i - 1]) - << "] to [" << AnfAlgo::GetStreamId(cur_cnode_ptr) << "]"; - AnfAlgo::SetStreamId(AnfAlgo::GetStreamId(cur_cnode_ptr), cnode_ptr_list[i - 1].get()); - } - } - - // update logic_to_independent_map_ - for (auto &indep : logic_to_independent_map_) { - if (indep.second >= kIndependFirstStreamId) { - indep.second = indep.second - kIndependFirstStreamId + total_common_stream_num_; - } - } - - // update independent_before_physic_id_ - for (auto &id : independent_before_physic_id_) { - if (id >= kIndependFirstStreamId) { - id = id - kIndependFirstStreamId + total_common_stream_num_; - } - } - - // update independent_id_ - independent_id_ = independent_id_ - kIndependFirstStreamId + total_common_stream_num_; - MS_LOG(INFO) << "end"; -} - void AscendStreamAssign::GetNeedActiveStreams(const shared_ptr &graph_ptr) { MS_EXCEPTION_IF_NULL(graph_ptr); CNodePtr cur_cnode_ptr = nullptr; auto cnode_ptr_list = graph_ptr->execution_order(); + // 1)stream witch kStreamNeedActivedFirst attr should be actived; for (size_t i = 0; 
i < cnode_ptr_list.size(); ++i) { cur_cnode_ptr = cnode_ptr_list[i]; MS_EXCEPTION_IF_NULL(cur_cnode_ptr); + ValuePtr value_ptr = nullptr; auto primitive = AnfAlgo::GetCNodePrimitive(cur_cnode_ptr); - MS_EXCEPTION_IF_NULL(primitive); - auto value_ptr = primitive->GetAttr(kStreamNeedActivedFirst); + if (primitive != nullptr) { + value_ptr = primitive->GetAttr(kStreamNeedActivedFirst); + } else { + auto func_graph = AnfAlgo::GetCNodeFuncGraphPtr(cur_cnode_ptr); + MS_EXCEPTION_IF_NULL(func_graph); + value_ptr = func_graph->get_attr(kStreamNeedActivedFirst); + } if (value_ptr == nullptr) { continue; } @@ -526,29 +568,15 @@ void AscendStreamAssign::GetNeedActiveStreams(const shared_ptr &graph_ptr) { - if (IsTaskSink()) { - ResetNew(); - ReorderIndependentOrders(graph_ptr); - AssignAllNodesStream(graph_ptr); - FindAllReduceParallel(graph_ptr); - InsertActiveNew(graph_ptr); - InsertSendRecvForHcomParallel(graph_ptr); - InsertSendRecvForIndependent(graph_ptr); - UpdateStreamId(graph_ptr); - UpdateEventId(graph_ptr); - GetNeedActiveStreams(graph_ptr); - MS_LOG(INFO) << "after finish stream assign"; - PrintGraphExeOrders(graph_ptr); + // 2)first stream 0 should be actived first; + need_first_active_streams_.emplace_back(0); - // Get info for D Model - generator::IRModelUtil::GetInstance().set_event_num(total_event_num()); - generator::IRModelUtil::GetInstance().set_stream_num(total_common_stream_num() + total_independ_stream_num()); - // Init to 1,temporarily - generator::IRModelUtil::GetInstance().set_batch_num(1); + // 3)independent stream:if has not been activate, push to need active vector + if (!independent_stream_activated_) { + for (auto &item : independent_stream_map_) { + need_first_active_streams_.emplace_back(item.first); + } } } @@ -659,33 +687,6 @@ void AscendStreamAssign::InsertSendRecvForIndependent(const shared_ptrfullname_with_scope() << " is independent, as inputs nums is zero"; - return true; - } - - auto inputs = node_ptr->inputs(); - for (size_t i = 1; i 
< inputs.size(); i++) { - if (!inputs[i]->isa()) { - return false; - } - } - MS_LOG(INFO) << "node " << node_ptr->fullname_with_scope() << " is independent, as inputs is all value node"; - return true; -} - bool AscendStreamAssign::IsTaskSink() { auto ms_context = MsContext::GetInstance(); MS_EXCEPTION_IF_NULL(ms_context); @@ -699,63 +700,60 @@ bool AscendStreamAssign::IsTaskSink() { } void AscendStreamAssign::GetWaitStreams(vector *wait_active_stream_list) { - if (total_common_stream_num_ == 0) { + MS_EXCEPTION_IF_NULL(wait_active_stream_list); + AscendStreamMng &stream_manager = AscendStreamMng::GetInstance(); + uint32_t total_stream_num = stream_manager.GetCurAllocStreamNum(); + if (total_stream_num == 0) { MS_LOG(INFO) << "total_common_stream_num is zero"; return; } // common stream:active first common stream - MS_LOG(INFO) << "active physic id[" << first_physic_id_ << "]"; - for (uint32_t i = first_physic_id_ + 1; i < total_common_stream_num_; i++) { + for (uint32_t i = 0; i < total_stream_num; i++) { auto it = std::find(need_first_active_streams_.begin(), need_first_active_streams_.end(), i); if (it == need_first_active_streams_.end()) { MS_LOG(INFO) << "wait common stream id = " << i; (*wait_active_stream_list).push_back(i); } } +} - // all independ stream id before first physical stream id should be actived - auto it = logic_to_independent_map_.find(first_logic_id_); - if (it != logic_to_independent_map_.end()) { - uint32_t independent_id = it->second; - auto res = std::find(independent_before_physic_id_.begin(), independent_before_physic_id_.end(), independent_id); - if (res == independent_before_physic_id_.end()) { - // first physical to independ id may be not in independent_before_physic_id_ - independent_before_physic_id_.push_back(independent_id); - } - MS_LOG(INFO) << "active independent id[" << independent_id << "]"; +bool AscendStreamAssign::IsHcom(const CNodePtr &apply_kernel) { + MS_EXCEPTION_IF_NULL(apply_kernel); + return 
AnfAlgo::GetKernelType(apply_kernel) == HCCL_KERNEL; +} + +bool AscendStreamAssign::IsFusionHcom(const CNodePtr &cur_cnode_ptr) { + MS_EXCEPTION_IF_NULL(cur_cnode_ptr); + bool is_hcom = IsHcom(cur_cnode_ptr); + if (!is_hcom) { + return false; } - uint32_t max_before_physic = 0; - for (size_t i = 0; i < independent_before_physic_id_.size(); i++) { - if (independent_before_physic_id_[i] > max_before_physic) { - max_before_physic = independent_before_physic_id_[i]; - } - MS_LOG(INFO) << "independent id[" << independent_before_physic_id_[i] << "] before first physic is active"; + if (!AnfAlgo::HasNodeAttr(kAttrFusion, cur_cnode_ptr)) { + return false; } - for (uint32_t i = 0; i < total_independ_stream_num_; i++) { - if (i + total_common_stream_num_ <= max_before_physic) { - continue; - } - // all wait streams should not in need_first_active_streams_ - auto iter = - std::find(need_first_active_streams_.begin(), need_first_active_streams_.end(), i + total_common_stream_num_); - if (iter == need_first_active_streams_.end()) { - MS_LOG(INFO) << "wait independent stream id:" << i + total_common_stream_num_; - (*wait_active_stream_list).push_back(i + total_common_stream_num_); - } + if (AnfAlgo::GetNodeAttr(cur_cnode_ptr, kAttrFusion) == 0) { + return false; + } + + return true; +} + +void AscendStreamAssign::GetHcomStreams(std::vector *streams) { + MS_EXCEPTION_IF_NULL(streams); + for (const auto &stream : hcom_stream_list_) { + (*streams).emplace_back(stream); } } -uint32_t AscendStreamAssign::GetTotalStreamNum() const { return total_common_stream_num_ + total_independ_stream_num_; } void AscendStreamAssign::ReorderIndependentOrders(const shared_ptr &graph_ptr) { MS_EXCEPTION_IF_NULL(graph_ptr); CNodePtr cur_cnode_ptr = nullptr; std::vector exe_orders; std::vector independents; std::vector others; - auto cnode_ptr_list = graph_ptr->execution_order(); MS_LOG(INFO) << "before reorder, graph orders size:" << cnode_ptr_list.size(); for (size_t i = 0; i < cnode_ptr_list.size(); 
++i) { @@ -767,68 +765,52 @@ void AscendStreamAssign::ReorderIndependentOrders(const shared_ptrset_execution_order(exe_orders); + if (others.empty() || independents.empty()) { + MS_LOG(INFO) << "independent or others is empty, no need reorder"; return; } - if (independents.empty()) { - std::copy(others.begin(), others.end(), std::back_inserter(exe_orders)); - graph_ptr->set_execution_order(exe_orders); - return; - } - - std::vector processed; + std::set processed; for (size_t i = 0; i < others.size(); i++) { auto begin = others.begin() + i; auto end = begin + 1; bool flag = false; for (size_t j = 0; j < independents.size(); j++) { auto cur_independent = independents[j]; - auto it = std::find(processed.begin(), processed.end(), cur_independent); + auto it = std::find(processed.begin(), processed.end(), cur_independent.get()); if (it != processed.end()) { continue; } - auto res = FindTargetOp(begin, end, cur_independent); if (res != end) { flag = true; exe_orders.emplace_back(cur_independent); exe_orders.emplace_back(*begin); - processed.emplace_back(cur_independent); + processed.emplace(cur_independent.get()); break; } } - if (!flag) { exe_orders.emplace_back(*begin); } } - MS_LOG(INFO) << "after reorder, graph orders size:" << exe_orders.size(); + if (processed.size() != independents.size()) { + MS_LOG(WARNING) << "processed independent nodes size is not equal to exiting independent nodes size"; + return; + } + graph_ptr->set_execution_order(exe_orders); } -void AscendStreamAssign::PrintGraphExeOrders(const shared_ptr &graph_ptr) { - MS_EXCEPTION_IF_NULL(graph_ptr); - auto cnode_ptr_list = graph_ptr->execution_order(); - for (size_t i = 0; i < cnode_ptr_list.size(); ++i) { - CNodePtr cur_cnode_ptr = cnode_ptr_list[i]; - MS_EXCEPTION_IF_NULL(cur_cnode_ptr); - if (AnfAlgo::GetCNodeName(cur_cnode_ptr) == kSendOpName || AnfAlgo::GetCNodeName(cur_cnode_ptr) == kRecvOpName) { - auto primitive = AnfAlgo::GetCNodePrimitive(cur_cnode_ptr); - MS_LOG(INFO) << "node name[" << 
AnfAlgo::GetCNodeName(cur_cnode_ptr) << "], logic id[" - << AnfAlgo::GetStreamDistinctionLabel(cur_cnode_ptr.get()) << "], stream id[" - << AnfAlgo::GetStreamId(cur_cnode_ptr) << "], event_id[" - << GetValue(primitive->GetAttr(kAttrEventId)) << "]"; - } else { - MS_LOG(INFO) << "node name[" << cur_cnode_ptr->fullname_with_scope() << "], logic id[" - << AnfAlgo::GetStreamDistinctionLabel(cur_cnode_ptr.get()) << "], stream id[" - << AnfAlgo::GetStreamId(cur_cnode_ptr) << "]"; - } - } +void AscendStreamAssign::Reset() { + total_event_num_ = 0; + independent_stream_activated_ = false; + independent_stream_map_.clear(); + processed_streams_.clear(); + hcom_stream_list_.clear(); + need_first_active_streams_.clear(); + inner_parallel_streams_.clear(); } } // namespace ascend } // namespace device diff --git a/mindspore/ccsrc/device/ascend/ascend_stream_assign.h b/mindspore/ccsrc/device/ascend/ascend_stream_assign.h old mode 100755 new mode 100644 index b6f6bfd479..bb918cfc79 --- a/mindspore/ccsrc/device/ascend/ascend_stream_assign.h +++ b/mindspore/ccsrc/device/ascend/ascend_stream_assign.h @@ -19,6 +19,8 @@ #include #include +#include +#include #include #include #include @@ -36,6 +38,36 @@ using std::shared_ptr; using std::unordered_map; using std::unordered_set; using std::vector; +using CnodeKey = void *; +const uint32_t kInvalidStreamId = UINT32_MAX; +class AscendStreamMng { + public: + static AscendStreamMng &GetInstance() { + static AscendStreamMng instance; + return instance; + } + + void Reset() { + cur_stream_id = 0; + cur_stream_num = 0; + } + uint32_t ApplyNewStream() { + if (!cur_stream_num) { + cur_stream_num++; + return cur_stream_id; + } + cur_stream_num++; + cur_stream_id++; + return cur_stream_id; + } + + uint32_t GetCurAllocStream() { return cur_stream_id; } + uint32_t GetCurAllocStreamNum() { return cur_stream_num; } + + private: + uint32_t cur_stream_num{0}; + uint32_t cur_stream_id{0}; +}; class AscendStreamAssign { public: @@ -47,22 +79,11 @@ class 
AscendStreamAssign { AscendStreamAssign(const AscendStreamAssign &) = delete; AscendStreamAssign &operator=(const AscendStreamAssign &) = delete; - uint32_t GetTotalStreamNum() const; - // new stream policy - uint32_t total_common_stream_num() const { return total_common_stream_num_; } - uint32_t total_independ_stream_num() const { return total_independ_stream_num_; } uint32_t total_event_num() const { return total_event_num_; } + void GetHcomStreams(std::vector *streams); - void InsertActiveNew(const std::shared_ptr &graph_ptr); - void AssignAllNodesStream(const std::shared_ptr &graph_ptr); - void ResetNew(); - void AssignStreamNew(const std::shared_ptr &graph_ptr); - bool IsIndependentNode(const CNodePtr &node_ptr); - const std::unordered_map &logic_to_independent_map() { return logic_to_independent_map_; } - const std::unordered_map &logic_to_physic_map() { return logic_to_physic_map_; } - const std::vector> &inner_parallel_streams() { return inner_parallel_streams_; } + void AssignStream(const std::shared_ptr &graph_ptr); void GetWaitStreams(vector *wait_active_stream_list); - const std::vector &hcom_streams() { return hcom_stream_list_; } CNodePtr CreateSendApplyKernel(const std::shared_ptr &graph_ptr, uint32_t event_id, uint32_t stream_id); CNodePtr CreateRecvApplyKernel(const std::shared_ptr &graph_ptr, uint32_t event_id, @@ -71,49 +92,41 @@ class AscendStreamAssign { private: AscendStreamAssign() = default; ~AscendStreamAssign() = default; - - vector::iterator FindTargetOp(vector::iterator begin, vector::iterator end, - const CNodePtr &node); - - bool IsHcom(const CNodePtr &apply_kernel); - bool IsProcessed(uint32_t logic_id); - void TransLogicToPhysic(const vector &logic_ids, vector *physic_ids); + void Reset(); + void CheckStreamAssign(const std::shared_ptr &graph_ptr); + void AssignAllNodesStream(const std::shared_ptr &graph_ptr); void AssignCommonStreamId(const CNodePtr &cur_cnode_ptr, CNodePtr *pre_cnode_ptr, uint32_t *cur_index, uint32_t 
*cur_stream_id); - void RecordIdMap(uint32_t logic_id, uint32_t physic_id); - void UpdateStreamActive(const CNodePtr &active_ptr); - void UpdateStreamSwitch(const CNodePtr &switch_ptr, const CNodePtr &active_ptr); - bool IsTaskSink(); - void AssignIndependentStreamId(const CNodePtr &cur_cnode_ptr, uint32_t deal_logic_id); - void UpdateStreamId(const std::shared_ptr &graph_ptr); - void UpdateEventId(const std::shared_ptr &graph_ptr); - void PrintGraphExeOrders(const std::shared_ptr &graph_ptr); - void RecordFirstCommonOp(const CNodePtr &cur_cnode_ptr, uint32_t cur_node_logic_id, uint32_t cur_stream_id); - uint32_t GetLogicId(const CNodePtr &cur_cnode_ptr); - void SetCommonStreamNum(uint32_t cur_stream_id); - void FindAllReduceParallel(const std::shared_ptr &graph_ptr); - bool IsProcessedParallelStream(uint32_t stream_id); - void GetParallelStream(uint32_t cur_stream_id, uint32_t stream_acitve_id, std::vector *parallel_streams); + void AssignIndependentStreamId(const CNodePtr &cur_cnode_ptr); + void UpdateAtomicAddrCleanStreamId(const std::shared_ptr &graph_ptr); + void FindHcomParallelStreams(const std::shared_ptr &graph_ptr); + void InsertStreamActive(const std::shared_ptr &graph_ptr); + void UpdateStreamSwitch(const std::shared_ptr &graph_ptr, const CNodePtr &switch_ptr, + const vector &independent_stream, vector *orders); void InsertSendRecvForIndependent(const std::shared_ptr &graph_ptr); void InsertSendRecvForHcomParallel(const std::shared_ptr &graph_ptr); + void InsertSendRecvForDiffHcom(const shared_ptr &graph_ptr); + void UpdateEventId(const std::shared_ptr &graph_ptr); void GetNeedActiveStreams(const std::shared_ptr &graph_ptr); void ReorderIndependentOrders(const std::shared_ptr &graph_ptr); - uint32_t total_common_stream_num_{0}; - uint32_t total_independ_stream_num_{0}; - uint32_t total_event_num_{0}; + bool IsTaskSink(); + bool IsFusionHcom(const CNodePtr &cur_cnode_ptr); + bool IsHcom(const CNodePtr &cur_cnode_ptr); + bool IsIndependentNode(const 
CNodePtr &node_ptr); + bool IsProcessedStream(uint32_t stream_id); + vector::iterator FindTargetOp(vector::iterator begin, vector::iterator end, + const CNodePtr &node); + void GetParallelStream(uint32_t cur_stream_id, uint32_t stream_acitve_id, std::vector *parallel_streams); - uint32_t first_physic_id_{UINT32_MAX}; - uint32_t first_logic_id_{UINT32_MAX}; - uint32_t independent_id_{UINT32_MAX}; - vector processed_logic_id_{}; - std::unordered_map logic_to_physic_map_{}; // key:logic id, value: first physic id - std::unordered_map logic_to_independent_map_{}; // key:logic id, value: dependent id - std::vector independent_before_physic_id_{}; // record independent id before first physic id - std::vector> inner_parallel_streams_{}; - std::vector processed_parallel_streams_{}; - std::vector hcom_stream_list_{}; + uint32_t total_event_num_{0}; + bool independent_stream_activated_{false}; + std::map independent_stream_map_{}; + std::set processed_streams_{}; + std::set hcom_stream_list_{}; std::vector need_first_active_streams_{}; + std::vector> inner_parallel_streams_{}; + // new policy end }; } // namespace ascend diff --git a/mindspore/ccsrc/device/ascend/kernel_build_ascend.cc b/mindspore/ccsrc/device/ascend/kernel_build_ascend.cc index afce5f3607..81d5be6731 100644 --- a/mindspore/ccsrc/device/ascend/kernel_build_ascend.cc +++ b/mindspore/ccsrc/device/ascend/kernel_build_ascend.cc @@ -26,10 +26,12 @@ #include "kernel/kernel.h" #include "kernel/tbe/tbe_kernel_build.h" #include "kernel/tbe/tbe_kernel_parallel_build.h" +#include "kernel/akg/ascend/akg_ascend_kernel_build.h" #include "kernel/aicpu/aicpu_kernel_build.h" #include "kernel/hccl/hccl_kernel_build.h" #include "kernel/rts/rt_kernel_build.h" #include "kernel/tbe/tbe_utils.h" +#include "kernel/common_utils.h" #include "operator/ops.h" #include "session/anf_runtime_algorithm.h" #include "./common.h" @@ -62,9 +64,36 @@ static kernel::KernelModPtr SerialCompileImpl(const AnfNodePtr &anf_node) { return 
kernel_mod_ptr; } +static bool KernelPreBuildParallelCompile(const mindspore::session::KernelGraph *kernel_graph_ptr) { + MS_EXCEPTION_IF_NULL(kernel_graph_ptr); + std::vector tbe_nodes; + for (const auto &anf_node : kernel_graph_ptr->execution_order()) { + MS_EXCEPTION_IF_NULL(anf_node); + if (!AnfAlgo::IsRealKernel(anf_node)) { + continue; + } + KernelType kernel_type = AnfAlgo::GetKernelType(anf_node); + switch (kernel_type) { + case KernelType::TBE_KERNEL: { + if (AnfAlgo::GetKernelMod(anf_node) == nullptr && + AnfAlgo::GetFusionType(anf_node) == kernel::FusionType::DYNAMIC) { + tbe_nodes.push_back(anf_node); + } + break; + } + default: { + break; + } + } + } + bool ret = kernel::TbeOpParallelPreBuild(tbe_nodes); + return ret; +} + static bool KernelBuildParallelCompile(const mindspore::session::KernelGraph *kernel_graph_ptr) { MS_EXCEPTION_IF_NULL(kernel_graph_ptr); std::vector tbe_nodes; + std::vector akg_nodes; std::vector other_nodes; for (const auto &anf_node : kernel_graph_ptr->execution_order()) { MS_EXCEPTION_IF_NULL(anf_node); @@ -79,43 +108,52 @@ static bool KernelBuildParallelCompile(const mindspore::session::KernelGraph *ke } break; } + case KernelType::AKG_KERNEL: { + akg_nodes.push_back(anf_node); + break; + } default: { other_nodes.push_back(anf_node); break; } } } - bool ret = kernel::TbeOpParallelBuild(tbe_nodes); + bool tbe_ret = kernel::TbeOpParallelBuild(tbe_nodes); + bool akg_ret = kernel::AkgAscendKernelParallelBuild(akg_nodes); + auto bin_map = kernel::tbe::KernelMeta::GetInstance(); + (void)bin_map->ReadIndex(kernel::kCceKernelMeta); for (const auto &anf_node : other_nodes) { kernel::KernelModPtr kernel_mod_ptr = SerialCompileImpl(anf_node); MS_EXCEPTION_IF_NULL(kernel_mod_ptr); AnfAlgo::SetKernelMod(kernel_mod_ptr, anf_node.get()); } - return ret; + return tbe_ret && akg_ret; } -static std::vector CalCleanZerosSize(const CNodePtr &pre_node) { +static std::vector CalCleanZerosSize(const CNodePtr &pre_node) { 
MS_EXCEPTION_IF_NULL(pre_node); - std::vector clean_size_list; + auto kernel_mod = AnfAlgo::GetKernelMod(pre_node); + MS_EXCEPTION_IF_NULL(kernel_mod); + std::vector clean_size_list; // clean output - if (AnfAlgo::HasNodeAttr(kAttrAutomicOutputIndexs, pre_node)) { - auto clean_output_indexs = AnfAlgo::GetNodeAttr>(pre_node, kAttrAutomicOutputIndexs); - for (auto index : clean_output_indexs) { - TypeId output_type_id = AnfAlgo::GetOutputDeviceDataType(pre_node, index); - size_t type_size = GetTypeByte(TypeIdToType(output_type_id)); - std::vector shape = AnfAlgo::GetOutputDeviceShape(pre_node, index); - auto size = std::accumulate(shape.begin(), shape.end(), type_size, std::multiplies()); - clean_size_list.push_back((size + kMemAlignSize + 31) / kMemAlignSize * kMemAlignSize); + if (AnfAlgo::HasNodeAttr(kAttrAtomicOutputIndexs, pre_node)) { + auto output_indexs = AnfAlgo::GetNodeAttr>(pre_node, kAttrAtomicOutputIndexs); + auto output_men_size = kernel_mod->GetOutputSizeList(); + for (auto index : output_indexs) { + auto clean_item = (output_men_size.at(index) + kMemAlignSize + 31) / kMemAlignSize * kMemAlignSize; + clean_size_list.emplace_back(clean_item); } } // clean workspace - auto workspaces_size = 0; - if (AnfAlgo::HasNodeAttr(kAttrAutomicWorkspaceSize, pre_node)) { - workspaces_size = AnfAlgo::GetNodeAttr(pre_node, kAttrAutomicWorkspaceSize); - clean_size_list.push_back(workspaces_size); + if (AnfAlgo::HasNodeAttr(kAttrAtomicWorkspaceIndexs, pre_node)) { + auto workspace_indexs = AnfAlgo::GetNodeAttr>(pre_node, kAttrAtomicWorkspaceIndexs); + auto workspace_men_sizes = kernel_mod->GetWorkspaceSizeList(); + for (const auto &index : workspace_indexs) { + auto clean_item = (workspace_men_sizes.at(index) + kMemAlignSize + 31) / kMemAlignSize * kMemAlignSize; + clean_size_list.emplace_back(clean_item); + } } - MS_LOG(INFO) << "clear output size:" << clean_size_list.size() << ", workspace size:" << workspaces_size - << ",pre_node:" << pre_node->fullname_with_scope(); 
+ MS_LOG(INFO) << "clear output size:" << clean_size_list.size() << ",pre_node:" << pre_node->fullname_with_scope(); return clean_size_list; } @@ -139,12 +177,12 @@ static void AddTbeClearZeroNode(mindspore::session::KernelGraph *const kernel_gr builder->SetKernelType(KernelType::TBE_KERNEL); AnfAlgo::SetSelectKernelBuildInfo(builder->Build(), clear_zero.get()); auto clean_size = CalCleanZerosSize(pre_node); - AnfAlgo::SetNodeAttr(kAttrAutomicAddMemSize, MakeValue(clean_size), clear_zero); + AnfAlgo::SetNodeAttr(kAttrAtomicAddMemSize, MakeValue(clean_size), clear_zero); AnfAlgo::SetStreamDistinctionLabel(AnfAlgo::GetStreamDistinctionLabel(pre_node.get()), clear_zero.get()); new_nodes->push_back(clear_zero); } -bool IsAtomicNode(const CNodePtr &kernel_node) { +static bool IsAtomicNode(const CNodePtr &kernel_node) { MS_EXCEPTION_IF_NULL(kernel_node); auto kernel_mod = AnfAlgo::GetKernelMod(kernel_node); MS_EXCEPTION_IF_NULL(kernel_mod); @@ -152,40 +190,50 @@ bool IsAtomicNode(const CNodePtr &kernel_node) { if (parameters_indexs.empty()) { return false; } - auto atomic_flag = false; size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); - auto workspace_size_list = kernel_mod->GetWorkspaceSizeList(); size_t workspace_num = kernel_mod->GetWorkspaceSizeList().size(); - if (input_num + workspace_num + output_num > parameters_indexs.size()) { - size_t lossNum = (input_num + workspace_num + output_num) - parameters_indexs.size(); - for (size_t i = 0; i < lossNum; i++) { - parameters_indexs.push_back(0); - } + size_t param_num = parameters_indexs.size(); + size_t total_num = input_num + workspace_num + output_num; + MS_LOG(INFO) << "parameters size: " << param_num << ", input & workspace & output num: " << total_num; + size_t pad_index = param_num; + for (; pad_index < total_num; ++pad_index) { + parameters_indexs.emplace_back(0); } - std::vector clean_output_indexs; - // in parameters data sort as 
input->workspace->output - size_t index = 0; - while (index < output_num) { - if (parameters_indexs[input_num + workspace_num + index] == 1) { - atomic_flag = true; - clean_output_indexs.push_back(index); + // process input + for (size_t j = 0; j < input_num; ++j) { + if (parameters_indexs.at(j) == 1) { + MS_LOG(EXCEPTION) << "Atomic addr clean does't support clean input address, input index: " << j; } - index++; } - if (atomic_flag) { - AnfAlgo::SetNodeAttr(kAttrAutomicOutputIndexs, MakeValue(clean_output_indexs), kernel_node); + // process output + std::vector output_indexs; + for (size_t i = 0; i < output_num; ++i) { + auto param_output = parameters_indexs.at(input_num + workspace_num + i); + if (param_output == 1) { + output_indexs.emplace_back(i); + MS_LOG(INFO) << "Atomic clear output index: " << i; + } } - for (size_t i = 0; i < workspace_num; ++i) { - if (parameters_indexs[input_num + i] == 1) { - atomic_flag = true; - AnfAlgo::SetNodeAttr(kAttrAutomicWorkspaceSize, - MakeValue(std::accumulate(workspace_size_list.begin(), workspace_size_list.end(), 0)), - kernel_node); - break; + AnfAlgo::SetNodeAttr(kAttrAtomicOutputIndexs, MakeValue(output_indexs), kernel_node); + // process workspace + std::vector workspace_indexs; + for (size_t k = 0; k < workspace_num; ++k) { + auto param_workspace = parameters_indexs.at(input_num + k); + if (param_workspace == 1) { + workspace_indexs.emplace_back(k); + MS_LOG(INFO) << "Atomic clear workspace index: " << k; } } - return atomic_flag; + AnfAlgo::SetNodeAttr(kAttrAtomicWorkspaceIndexs, MakeValue(workspace_indexs), kernel_node); + + return !(workspace_indexs.empty() && output_indexs.empty()); +} + +bool KernelPreBuild(const mindspore::session::KernelGraph *kernel_graph_ptr) { + MS_EXCEPTION_IF_NULL(kernel_graph_ptr); + bool ret = device::ascend::KernelPreBuildParallelCompile(kernel_graph_ptr); + return ret; } bool KernelBuild(const mindspore::session::KernelGraph *kernel_graph_ptr) { @@ -202,7 +250,7 @@ void 
KernelBuildPreprocess(mindspore::session::KernelGraph *kernel_graph) { for (const auto &anf_node : kernel_graph->execution_order()) { std::string apply_function_name = AnfAlgo::GetCNodeName(anf_node); if (apply_function_name == prim::kPrimMaxPoolGrad->name() && - AnfAlgo::GetKernelType(anf_node) == KernelType::AUTO_DIFF_KERNEL) { + AnfAlgo::GetKernelType(anf_node) == KernelType::AKG_KERNEL) { auto clear_zero_prim = std::make_shared(kClearZeroOpName); MS_EXCEPTION_IF_NULL(clear_zero_prim); auto new_value_node = NewValueNode(clear_zero_prim); diff --git a/mindspore/ccsrc/device/ascend/kernel_build_ascend.h b/mindspore/ccsrc/device/ascend/kernel_build_ascend.h index 5dea36a183..d987b6ce7a 100644 --- a/mindspore/ccsrc/device/ascend/kernel_build_ascend.h +++ b/mindspore/ccsrc/device/ascend/kernel_build_ascend.h @@ -22,6 +22,10 @@ namespace mindspore { namespace device { namespace ascend { +/** + * @brief kernel pre build for ascend. + */ +bool KernelPreBuild(const mindspore::session::KernelGraph *kernel_graph_ptr); /** * @brief kernel build for ascend. 
*/ diff --git a/mindspore/ccsrc/device/ascend/kernel_select_ascend.cc b/mindspore/ccsrc/device/ascend/kernel_select_ascend.cc index 6e6e7419fd..4e56721fe0 100644 --- a/mindspore/ccsrc/device/ascend/kernel_select_ascend.cc +++ b/mindspore/ccsrc/device/ascend/kernel_select_ascend.cc @@ -15,18 +15,27 @@ */ #include "device/ascend/kernel_select_ascend.h" + #include #include #include #include +#include #include -#include "kernel/oplib/oplib.h" -#include "kernel/kernel_query.h" +#include +#include + +#include "common/utils.h" +#include "debug/anf_ir_dump.h" +#include "operator/ops.h" +#include "ir/func_graph.h" +#include "utils/context/ms_context.h" #include "session/anf_runtime_algorithm.h" +#include "device/kernel_info.h" +#include "kernel/common_utils.h" +#include "kernel/kernel_query.h" +#include "kernel/oplib/oplib.h" #include "kernel/kernel_build_info.h" -#include "utils/context/ms_context.h" -#include "operator/ops.h" -#include "debug/anf_ir_dump.h" namespace mindspore { namespace device { @@ -45,7 +54,6 @@ enum MatchCountPriority : int { MATCH_COUNT_PRIORITY_END }; -const size_t kMaxCount = 0xffffffff; const int kUnSupportMixedDataTypeIndex = -1; bool MatchInferOutputDataType(const CNodePtr &cnode, const kernel::KernelBuildInfo &kernel_build_info) { @@ -73,7 +81,7 @@ string GetPriorityMatchFormat(const CNodePtr &cnode) { for (size_t index = 0; index < AnfAlgo::GetInputTensorNum(cnode); ++index) { auto pre_output_format = AnfAlgo::GetPrevNodeOutputFormat(cnode, index); if (AnfAlgo::IsFeatureMapInput(cnode, index) && - kNeedTransFormatSet.find(pre_output_format) != kNeedTransFormatSet.end()) { + kHWSpecialFormatSet.find(pre_output_format) != kHWSpecialFormatSet.end()) { priority_matched_format = !is_init ? 
pre_output_format : priority_matched_format; is_init = true; } @@ -91,14 +99,14 @@ string GetPriorityMatchFormat(const CNodePtr &cnode) { return priority_matched_format; } /** - * compare two vector by priority, select a better vector, like compare two num, first compare highest num location, + * Compare two vector by priority, select a better vector, like compare two num, first compare highest num location, * if equal then next num location * example:[3,1,1,1] > [2,2,2,2] > [2,2,1,2] > [2,1,1,3] */ bool PriorityChooseItem(const std::vector &cur_item, std::vector *best_item) { MS_EXCEPTION_IF_NULL(best_item); if (cur_item.size() != best_item->size()) { - MS_LOG(ERROR) << "item size should be same!"; + MS_LOG(ERROR) << "Item size should be same!"; return false; } // Update the best_item by comparing the cur_item and best_item @@ -124,12 +132,23 @@ void UpdateCurMatchCounts(const kernel::KernelBuildInfo &kernel_build_info, cons } auto pri_match_format = GetPriorityMatchFormat(kernel_node); for (size_t input_index = 0; input_index < AnfAlgo::GetInputTensorNum(kernel_node); ++input_index) { + auto input_anf_node = kernel_node->input(input_index + 1); + // we do not take ValueNode into consideration in graph kernel. + if (kernel_build_info.kernel_type() == KernelType::AKG_KERNEL) { + if (input_anf_node->isa() && AnfAlgo::GetOutputDeviceDataType(input_anf_node, 0) == kTypeUnknown) { + continue; + } + } auto base_score = AnfAlgo::IsFeatureMapInput(kernel_node, input_index) ? kFeatureMapBaseScore : kWegihtBaseScore; if (kernel_build_info.GetInputFormat(input_index) == AnfAlgo::GetPrevNodeOutputFormat(kernel_node, input_index)) { (*cur_kernelinfo_match_counts)[MATCH_FORMAT_COUNT] += base_score; } - if (kernel_build_info.GetInputDeviceType(input_index) == - AnfAlgo::GetPrevNodeOutputDeviceDataType(kernel_node, input_index)) { + // we match output fix precision first. 
+ auto prev_device_type = AnfAlgo::GetPrevNodeOutputPrecision(kernel_node, input_index); + if (prev_device_type == kTypeUnknown) { + prev_device_type = AnfAlgo::GetPrevNodeOutputDeviceDataType(kernel_node, input_index); + } + if (kernel_build_info.GetInputDeviceType(input_index) == prev_device_type) { (*cur_kernelinfo_match_counts)[MATCH_DTYPE_COUNT] += base_score; } if (kernel_build_info.GetInputFormat(input_index) == pri_match_format) { @@ -149,40 +168,6 @@ void UpdateCurMatchCounts(const kernel::KernelBuildInfo &kernel_build_info, cons } } -void SetTensorDeviceInfo(const kernel::KernelBuildInfo &selected_kernel_info, const CNodePtr &kernel_node) { - MS_EXCEPTION_IF_NULL(kernel_node); - for (size_t input_index = 0; input_index < AnfAlgo::GetInputTensorNum(kernel_node); ++input_index) { - auto input_kernel_node = AnfAlgo::GetInputNode(kernel_node, input_index); - MS_EXCEPTION_IF_NULL(input_kernel_node); - auto input_with_index = AnfAlgo::VisitKernel(input_kernel_node, 0); - MS_EXCEPTION_IF_NULL(input_with_index.first); - auto real_input_node = input_with_index.first; - if (real_input_node->isa()) { - continue; - } - std::shared_ptr builder = - std::make_shared(); - bool is_ref = false; - auto op_info = mindspore::kernel::OpLib::FindOp(AnfAlgo::GetCNodeName(kernel_node), kernel::kTBE); - if (op_info != nullptr) { - is_ref = op_info->is_ref(); - } - MS_EXCEPTION_IF_NULL(MsContext::GetInstance()); - if (MsContext::GetInstance()->execution_mode() == kPynativeMode && - AnfAlgo::GetOutputDeviceDataType(real_input_node, 0) != kTypeUnknown) { - continue; - } - // we set special device info of a input tensor. 
- if (AnfAlgo::GetOutputDeviceDataType(real_input_node, 0) == kTypeUnknown || is_ref) { - std::vector output_format = {selected_kernel_info.GetInputFormat(input_index)}; - builder->SetOutputsFormat(output_format); - std::vector output_type = {AnfAlgo::GetOutputInferDataType(real_input_node, 0)}; - builder->SetOutputsDeviceType(output_type); - AnfAlgo::SetSelectKernelBuildInfo(builder->Build(), real_input_node.get()); - } - } -} - void AddSupportMixedPrecisionDataTypeIndex(TypeId data_type, std::vector *support_index) { MS_EXCEPTION_IF_NULL(support_index); int index = kUnSupportMixedDataTypeIndex; @@ -221,6 +206,7 @@ void AddNodeInputDataType(const CNodePtr &kernel_node, size_t input_index, std::vector *node_mix_precision_datatype) { AnfNodePtr cur_input = AnfAlgo::GetInputNode(kernel_node, input_index); MS_EXCEPTION_IF_NULL(cur_input); + MS_EXCEPTION_IF_NULL(node_mix_precision_datatype); TypeId input_origin_type = AnfAlgo::GetPrevNodeOutputInferDataType(kernel_node, input_index); AddSupportMixedPrecisionDataTypeIndex(input_origin_type, node_mix_precision_datatype_index); node_mix_precision_datatype->push_back(input_origin_type); @@ -229,6 +215,7 @@ void AddNodeInputDataType(const CNodePtr &kernel_node, size_t input_index, void AddNodeOutputDataType(const CNodePtr &kernel_node, size_t output_index, std::vector *node_mix_precision_datatype_index, std::vector *node_mix_precision_datatype) { + MS_EXCEPTION_IF_NULL(node_mix_precision_datatype); auto output_origin_type = AnfAlgo::GetOutputInferDataType(kernel_node, output_index); AddSupportMixedPrecisionDataTypeIndex(output_origin_type, node_mix_precision_datatype_index); node_mix_precision_datatype->push_back(output_origin_type); @@ -239,12 +226,12 @@ void CheckDataTypeInputs(const std::vector &node_mix_precision_datatype_ind const std::map> &kernel_support_datatypes, std::map> *kernel_match_datatype_idx) { if (node_mix_precision_datatype_index.size() != node_mix_precision_datatype.size()) { - MS_LOG(EXCEPTION) << "node 
datatype index size " << node_mix_precision_datatype_index.size() << " != datatype size " + MS_LOG(EXCEPTION) << "Node datatype index size " << node_mix_precision_datatype_index.size() << " != datatype size " << node_mix_precision_datatype.size(); } MS_EXCEPTION_IF_NULL(kernel_match_datatype_idx); if (kernel_support_datatypes.size() != kernel_match_datatype_idx->size()) { - MS_LOG(EXCEPTION) << "kernel datatype index size " << kernel_match_datatype_idx->size() << " != datatype size " + MS_LOG(EXCEPTION) << "Kernel datatype index size " << kernel_match_datatype_idx->size() << " != datatype size " << kernel_support_datatypes.size(); } } @@ -265,10 +252,10 @@ bool RaiseDataTypePrecisionSelect(const std::vector &node_mix_precision_dat if (node_mix_precision_datatype_index[i] == kUnSupportMixedDataTypeIndex) { auto find_iter = kernel_support_datatypes.find(iter->first); if (find_iter == kernel_support_datatypes.end()) { - MS_LOG(EXCEPTION) << "kernel datatype index:%lu can not be found " << iter->first; + MS_LOG(EXCEPTION) << "Kernel datatype index:%lu can not be found " << iter->first; } if (i >= find_iter->second.size()) { - MS_LOG(EXCEPTION) << "node index " << i << "kernel datatype size " << find_iter->second.size(); + MS_LOG(EXCEPTION) << "Node index " << i << "kernel datatype size " << find_iter->second.size(); } if (node_mix_precision_datatype[i] != find_iter->second[i]) { iter = kernel_match_datatype_idx->erase(iter); @@ -279,7 +266,7 @@ bool RaiseDataTypePrecisionSelect(const std::vector &node_mix_precision_dat } auto datatype_indexes = iter->second; if (i >= datatype_indexes.size()) { - MS_LOG(EXCEPTION) << "node datatype index: " << i << " kernel support size " << datatype_indexes.size(); + MS_LOG(EXCEPTION) << "Node datatype index: " << i << " kernel support size " << datatype_indexes.size(); } if (datatype_indexes[i] < node_mix_precision_datatype_index[i]) { iter = kernel_match_datatype_idx->erase(iter); @@ -293,8 +280,12 @@ bool 
RaiseDataTypePrecisionSelect(const std::vector &node_mix_precision_dat bool CanDataTypeReduce(const std::vector &datatype_indexes, int check_index, const std::vector &node_mix_precision_datatype_index) { - return datatype_indexes[check_index] != kUnSupportMixedDataTypeIndex && - datatype_indexes[check_index] <= node_mix_precision_datatype_index[check_index]; + auto check_index_tmp = IntToSize(check_index); + if (check_index_tmp < datatype_indexes.size() && check_index_tmp < node_mix_precision_datatype_index.size()) { + return datatype_indexes[check_index] != kUnSupportMixedDataTypeIndex && + datatype_indexes[check_index] <= node_mix_precision_datatype_index[check_index]; + } + MS_LOG(EXCEPTION) << "Check index " << check_index << "is outof range"; } bool RaiseOrReduceDataTypePrecisionSelect(const std::vector &node_mix_precision_datatype_index, @@ -313,10 +304,10 @@ bool RaiseOrReduceDataTypePrecisionSelect(const std::vector &node_mix_preci if (node_mix_precision_datatype_index[i] == kUnSupportMixedDataTypeIndex) { auto find_iter = kernel_support_datatypes.find(iter->first); if (find_iter == kernel_support_datatypes.end()) { - MS_LOG(EXCEPTION) << "kernel datatype index:%lu can not be found " << iter->first; + MS_LOG(EXCEPTION) << "Kernel datatype index:%lu can not be found " << iter->first; } if (i >= find_iter->second.size()) { - MS_LOG(EXCEPTION) << "node index " << i << " >= kernel datatype size " << find_iter->second.size(); + MS_LOG(EXCEPTION) << "Node index " << i << " >= kernel datatype size " << find_iter->second.size(); } if (node_mix_precision_datatype[i] != find_iter->second[i]) { iter = kernel_match_datatype_idx->erase(iter); @@ -327,7 +318,7 @@ bool RaiseOrReduceDataTypePrecisionSelect(const std::vector &node_mix_preci } auto datatype_indexes = iter->second; if (i >= datatype_indexes.size()) { - MS_LOG(EXCEPTION) << "index " << i << "> kernel datatype indexes size " << datatype_indexes.size(); + MS_LOG(EXCEPTION) << "Index " << i << "> kernel datatype 
indexes size " << datatype_indexes.size(); } if (!CanDataTypeReduce(datatype_indexes, i, node_mix_precision_datatype_index)) { iter = kernel_match_datatype_idx->erase(iter); @@ -397,9 +388,9 @@ void PrintRaiseOrReducePrecisionSelectedInfo(const CNodePtr &cnode, std::ostringstream buffer; buffer << cnode->DebugString(); if (precision_reduce) { - buffer << " reduce precision, node datatype: \n"; + buffer << " Reduce precision, node datatype: \n"; } else { - buffer << " raise precision, node datatype: \n"; + buffer << " Raise precision, node datatype: \n"; } PrintInputAndOutputInferType(buffer, cnode); buffer << ", select kernel:" << selected_kernel_build_info->ToString(); @@ -415,8 +406,8 @@ std::shared_ptr ChooseMatchedKernelInfo( size_t selected_index = 0; for (size_t info_index = 0; info_index < kernel_info_list.size(); ++info_index) { std::vector cur_kernel_info_match_counts = {0, 0, 0, 0, 0}; - auto kernel_build_info = *(kernel_info_list[info_index]); - std::shared_ptr kernel_info_ptr = kernel_info_list[info_index]; + auto kernel_info_ptr = kernel_info_list[info_index]; + MS_EXCEPTION_IF_NULL(kernel_info_ptr); UpdateCurMatchCounts(*kernel_info_ptr, kernel_node, &cur_kernel_info_match_counts); // Currently the selection policy is the match format count first, and then is datatype counts. 
if (PriorityChooseItem(cur_kernel_info_match_counts, &most_match_counts)) { @@ -467,6 +458,51 @@ std::vector> FilterRaisedOrReducePrecis } } // namespace +void SetTensorDeviceInfo(const kernel::KernelBuildInfo &selected_kernel_info, const CNodePtr &kernel_node) { + MS_EXCEPTION_IF_NULL(kernel_node); + for (size_t input_index = 0; input_index < AnfAlgo::GetInputTensorNum(kernel_node); ++input_index) { + auto input_kernel_node = AnfAlgo::GetInputNode(kernel_node, input_index); + MS_EXCEPTION_IF_NULL(input_kernel_node); + auto input_with_index = AnfAlgo::VisitKernel(input_kernel_node, 0); + MS_EXCEPTION_IF_NULL(input_with_index.first); + auto real_input_node = input_with_index.first; + if (real_input_node->isa()) { + continue; + } + if (real_input_node->isa() && !AnfAlgo::IsParameterWeight(real_input_node->cast())) { + continue; + } + auto builder = std::make_shared(); + if (IsValueNode(input_kernel_node) && + AnfAlgo::GetOutputDeviceDataType(input_kernel_node, 0) == kTypeUnknown) { + std::vector output_format = {selected_kernel_info.GetInputFormat(input_index)}; + builder->SetOutputsFormat(output_format); + std::vector output_type = {selected_kernel_info.GetInputDeviceType(input_index)}; + builder->SetOutputsDeviceType(output_type); + AnfAlgo::SetSelectKernelBuildInfo(builder->Build(), input_kernel_node.get()); + continue; + } + // we set special device info of a input tensor. 
+ bool is_ref = false; + auto op_info = kernel::OpLib::FindOp(AnfAlgo::GetCNodeName(kernel_node), kernel::kTBE); + if (op_info != nullptr) { + is_ref = op_info->is_ref(); + } + MS_EXCEPTION_IF_NULL(MsContext::GetInstance()); + if (MsContext::GetInstance()->execution_mode() == kPynativeMode && + AnfAlgo::GetOutputDeviceDataType(real_input_node, 0) != kTypeUnknown) { + continue; + } + if (AnfAlgo::GetOutputDeviceDataType(real_input_node, 0) == kTypeUnknown || is_ref) { + std::vector output_format = {selected_kernel_info.GetInputFormat(input_index)}; + builder->SetOutputsFormat(output_format); + std::vector output_type = {selected_kernel_info.GetInputDeviceType(input_index)}; + builder->SetOutputsDeviceType(output_type); + AnfAlgo::SetSelectKernelBuildInfo(builder->Build(), real_input_node.get()); + } + } +} + KernelSelectStatus SetMatchedKernelInfo(const CNodePtr &kernel_node, const std::vector> &kernel_info_list) { MS_EXCEPTION_IF_NULL(kernel_node); @@ -498,11 +534,17 @@ KernelSelectStatus SetMatchedKernelInfo(const CNodePtr &kernel_node, return select_status; } -KernelSelectStatus SelectKernelInfo(const CNodePtr &kernel_node) { +KernelSelectStatus SelectKernelInfo(const CNodePtr &kernel_node, KernelType kernel_type) { std::vector> kernel_info_list; std::vector> aicpu_kernel_info_list; MS_EXCEPTION_IF_NULL(kernel_node); - kernel::KernelQuery(kernel_node, &kernel_info_list); + if (AnfAlgo::IsGraphKernel(kernel_node)) { + auto func_graph = GetValueNode(kernel_node->input(kAnfPrimitiveIndex)); + MS_EXCEPTION_IF_NULL(func_graph); + SelectGraphKernelInfo(kernel_node, func_graph); + return kStatusAllMatched; + } + kernel::KernelQuery(kernel_node, &kernel_info_list, kernel_type); auto select_status = SetMatchedKernelInfo(kernel_node, kernel_info_list); // If aicore not find valid kernel info reloading aicpu kernel info list to find it if (select_status == kNoMatched) { @@ -516,12 +558,12 @@ KernelSelectStatus SelectKernelInfo(const CNodePtr &kernel_node) { if 
(select_status == kNoMatched) { std::ostringstream buffer; PrintInputAndOutputInferType(buffer, kernel_node); - MS_LOG(WARNING) << ">>> candidates kernel info list:"; + MS_LOG(WARNING) << ">>> Candidates kernel info list:"; for (size_t index = 0; index < kernel_info_list.size(); ++index) { - MS_LOG(WARNING) << "kernel [" << index << "] :" << kernel_info_list[index]->ToString(); + MS_LOG(WARNING) << "Kernel [" << index << "] :" << kernel_info_list[index]->ToString(); } for (size_t index = 0; index < aicpu_kernel_info_list.size(); ++index) { - MS_LOG(WARNING) << "kernel [" << (kernel_info_list.size() + index) + MS_LOG(WARNING) << "Kernel [" << (kernel_info_list.size() + index) << "] :" << aicpu_kernel_info_list[index]->ToString(); } MS_LOG(WARNING) << " <<<"; diff --git a/mindspore/ccsrc/device/ascend/kernel_select_ascend.h b/mindspore/ccsrc/device/ascend/kernel_select_ascend.h index c4c777c18a..7b7a7b9fb9 100644 --- a/mindspore/ccsrc/device/ascend/kernel_select_ascend.h +++ b/mindspore/ccsrc/device/ascend/kernel_select_ascend.h @@ -27,7 +27,10 @@ enum KernelSelectStatus { kStatusReducePrecision = 1, kStatusRaisePrecision = 2, }; -KernelSelectStatus SelectKernelInfo(const CNodePtr &kernel_node); +KernelSelectStatus SelectKernelInfo(const CNodePtr &kernel_node, + KernelType kernel_type = KernelType::UNKNOWN_KERNEL_TYPE); +void SetTensorDeviceInfo(const kernel::KernelBuildInfo &selected_kernel_info, const CNodePtr &kernel_node); +void SelectGraphKernelInfo(const CNodePtr &kernel_node, const FuncGraphPtr &func_graph); } // namespace ascend } // namespace device } // namespace mindspore diff --git a/mindspore/ccsrc/device/ascend/kernel_select_graph_kernel.cc b/mindspore/ccsrc/device/ascend/kernel_select_graph_kernel.cc new file mode 100644 index 0000000000..b57ed1cd1b --- /dev/null +++ b/mindspore/ccsrc/device/ascend/kernel_select_graph_kernel.cc @@ -0,0 +1,516 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 
(the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "device/ascend/kernel_select_ascend.h" +#include "session/anf_runtime_algorithm.h" +#include "device/kernel_info.h" +#include "ir/func_graph.h" +#include "kernel/common_utils.h" +#include "kernel/kernel_query.h" +#include "kernel/kernel_build_info.h" + +namespace mindspore { +namespace device { +namespace ascend { + +TypeId GetPrimitivePrecision(const CNodePtr &cnode) { + auto primitive = AnfAlgo::GetCNodePrimitive(cnode); + MS_EXCEPTION_IF_NULL(primitive); + + TypeId except_type = kTypeUnknown; + if (primitive->GetAttr(kAttrFixPrecision) != nullptr) { + auto strExceptDtype = GetValue(primitive->GetAttr(kAttrFixPrecision)); + if (strExceptDtype == "float16") { + except_type = kNumberTypeFloat16; + } else if (strExceptDtype == "float32") { + except_type = kNumberTypeFloat32; + } else { + MS_LOG(EXCEPTION) << "The fix precision must be float16 or float32, but got" << strExceptDtype; + } + } + + return except_type; +} + +void ResetKernelBuildInfo(const CNodePtr &kernel_node) { + size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); + for (size_t input_index = 0; input_index < input_num; ++input_index) { + auto input_kernel_node = AnfAlgo::GetInputNode(kernel_node, input_index); + MS_EXCEPTION_IF_NULL(input_kernel_node); + auto kernel_with_index = AnfAlgo::VisitKernel(input_kernel_node, 0); + if (!kernel::IsWeightBoundary(kernel_with_index.first)) { + continue; + } + // reset format and dtype. 
+ kernel::KernelBuildInfo::KernelBuildInfoBuilder builder; + builder.SetOutputsFormat(std::vector{kOpFormat_DEFAULT}); + builder.SetOutputsDeviceType(std::vector{kTypeUnknown}); + AnfAlgo::SetSelectKernelBuildInfo(builder.Build(), input_kernel_node.get()); + } +} + +void UpdateKernelInfo(const std::vector &node_list) { + for (size_t i = 0; i < node_list.size(); ++i) { + // select nodes in subgraph. + auto anf_node = node_list[i]; + MS_EXCEPTION_IF_NULL(anf_node); + auto cnode = anf_node->cast(); + MS_EXCEPTION_IF_NULL(cnode); + auto fix_precision_type = GetPrimitivePrecision(cnode); + if (fix_precision_type != kTypeUnknown) { + std::vector> kernel_info_list; + kernel::KernelQuery(cnode, &kernel_info_list, KernelType::AKG_KERNEL); + + for (size_t index = 0; index < kernel_info_list.size(); ++index) + // only math the first input + if (kernel_info_list[index]->GetInputDeviceType(0) == fix_precision_type && + kernel_info_list[index]->GetInputFormat(0) == AnfAlgo::GetPrevNodeOutputFormat(cnode, 0) && + AnfAlgo::GetInputDeviceDataType(cnode, 0) != fix_precision_type) { + auto selected_kernel_info_ptr = kernel_info_list[index]; + ResetKernelBuildInfo(cnode); + AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_info_ptr, cnode.get()); + SetTensorDeviceInfo(*selected_kernel_info_ptr, cnode); + break; + } + } + } +} + +bool CanConvertDefaultShapeToNZ(const std::vector &shape) { + for (size_t i = 1; i <= shape.size(); ++i) { + if (i > 2) { + break; + } + if (shape[shape.size() - i] != 1 && shape[shape.size() - i] % kCubeSize != 0) { + return false; + } + } + return true; +} + +std::vector DefaultToFracNZAxis(const std::vector &ori_shape, const std::vector &axis) { + std::vector frac_nz_axis = axis; + auto shape_len = ori_shape.size(); + for (size_t i = 0; i < axis.size(); ++i) { + auto axis_idx = (frac_nz_axis[i] + shape_len) % shape_len; + if (axis_idx == shape_len - 1) { + frac_nz_axis[i] = axis_idx - 1; + frac_nz_axis.push_back(axis_idx + 2); + } else if (axis_idx == 
shape_len - 2) { + frac_nz_axis[i] = axis_idx + 1; + frac_nz_axis.push_back(axis_idx + 2); + } else { + frac_nz_axis[i] = axis_idx; + } + } + return frac_nz_axis; +} + +std::vector GetReducedFracNZShape(const std::vector &ori_shape, const std::vector &axis, + bool keep_dims) { + std::vector result; + std::set positive_idx; + for (const auto &a : axis) { + positive_idx.insert(a >= 0 ? a : ori_shape.size() + a); + } + for (size_t i = 0; i < ori_shape.size(); ++i) { + if (positive_idx.count(i) == 0) { + result.push_back(ori_shape[i]); + } else if (keep_dims) { + result.push_back(1); + } + } + return result; +} + +void UpdateFracNZReduceOp(const CNodePtr &cnode) { + MS_EXCEPTION_IF_NULL(cnode); + auto input_format = AnfAlgo::GetPrevNodeOutputFormat(cnode, 0); + if (input_format == kOpFormat_FRAC_NZ) { + // Clone primitive to modify it + auto prim = GetCNodePrimitive(cnode); + auto new_prim = std::make_shared(*prim); + auto new_prim_node = NewValueNode(new_prim); + cnode->set_input(0, new_prim_node); + + auto axis_value = new_prim->GetAttr(kAttrAxis); + std::vector default_axis; + if (axis_value->isa()) { + auto value_list = dyn_cast(axis_value); + for (const auto &item : value_list->value()) { + if (item->isa()) { + default_axis.push_back(GetValue(item)); + } + } + } else if (axis_value->isa()) { + auto value_tuple = dyn_cast(axis_value); + for (const auto &item : value_tuple->value()) { + if (item->isa()) { + default_axis.push_back(GetValue(item)); + } + } + } else { + MS_LOG(ERROR) << "Axis attr type is not correct!"; + } + auto infer_shape = AnfAlgo::GetPrevNodeOutputInferShape(cnode, 0); + std::vector frac_nz_axis = DefaultToFracNZAxis(infer_shape, default_axis); + AnfAlgo::SetNodeAttr(kAttrAxis, MakeValue>(frac_nz_axis), cnode); + auto output_shape = AnfAlgo::GetOutputInferShape(cnode, 0); + if (output_shape.size() == 1) { + AnfAlgo::SetNodeAttr(kAttrOutputDefault, MakeValue(true), cnode); + } + } +} + +void GetDefaultFormat(const CNodePtr &kernel_node, 
std::string *default_format, bool *use_same_format) { + MS_EXCEPTION_IF_NULL(kernel_node); + MS_EXCEPTION_IF_NULL(default_format); + MS_EXCEPTION_IF_NULL(use_same_format); + std::unordered_map all_input_formats; + size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); + for (size_t i = 0; i < input_num; ++i) { + auto input_kernel_node = AnfAlgo::VisitKernel(kernel_node->input(i + 1), 0).first; + MS_EXCEPTION_IF_NULL(input_kernel_node); + if (!input_kernel_node->isa()) { + auto pre_format = AnfAlgo::GetPrevNodeOutputFormat(kernel_node, i); + ++all_input_formats[pre_format]; + continue; + } + auto para = input_kernel_node->cast(); + MS_EXCEPTION_IF_NULL(para); + if (AnfAlgo::GetOutputDeviceDataType(para, 0) != kTypeUnknown) { + auto pre_format = AnfAlgo::GetOutputFormat(para, 0); + ++all_input_formats[pre_format]; + continue; + } + *use_same_format = false; + } + + if (all_input_formats.empty()) { + // all inputs are parameter. + *default_format = kOpFormat_NC1HWC0; + } else { + std::vector> pairs; + for (auto iter = all_input_formats.begin(); iter != all_input_formats.end(); ++iter) { + pairs.push_back(std::make_pair(iter->first, iter->second)); + } + auto cmp_func = [](const std::pair &a, const std::pair &b) { + if (a.second != b.second) { + return a.second > b.second; + } else if (a.first == kOpFormat_DEFAULT) { + return a.second + 1 > b.second; + } else if (b.first == kOpFormat_DEFAULT) { + return a.second > b.second + 1; + } + return a.second > b.second; + }; + std::sort(pairs.begin(), pairs.end(), cmp_func); + *default_format = pairs.begin()->first; + } + + for (size_t i = 0; i < input_num; ++i) { + auto input_kernel_node = AnfAlgo::VisitKernel(kernel_node->input(i + 1), 0).first; + MS_EXCEPTION_IF_NULL(input_kernel_node); + if (!input_kernel_node->isa() || + AnfAlgo::GetOutputDeviceDataType(input_kernel_node, 0) != kTypeUnknown) { + continue; + } + auto weight_infer_shape = AnfAlgo::GetOutputInferShape(input_kernel_node, 0); + if 
(weight_infer_shape.size() < 2 && *default_format == kOpFormat_FRAC_NZ) { + *default_format = kOpFormat_DEFAULT; + *use_same_format = true; + break; + } + } +} + +void UpdateGraphKernelInputsKernelInfo(const CNodePtr &kernel_node, const std::vector &input_list, + const std::string &default_format, bool use_same_format, + std::vector *graph_input_format, + std::vector *graph_input_type) { + MS_EXCEPTION_IF_NULL(graph_input_format); + MS_EXCEPTION_IF_NULL(graph_input_type); + // We set same format to all inputs of graph kernel subgraph, and process this latter. + // We set dtype to inputs of graph kernel subgraph same as infer dtypes. + size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); + for (size_t i = 0; i < input_num; ++i) { + auto input_kernel_node = AnfAlgo::VisitKernel(kernel_node->input(i + 1), 0).first; + MS_EXCEPTION_IF_NULL(input_kernel_node); + if (use_same_format) { + bool can_convert = true; + if (default_format == kOpFormat_FRAC_NZ) { + auto infer_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, i); + if (!CanConvertDefaultShapeToNZ(infer_shape)) { + MS_LOG(WARNING) << "Shape can't be converted to frac nz shape, so use default format instead"; + can_convert = false; + } + } + if (can_convert) { + graph_input_format->push_back(default_format); + } else { + graph_input_format->push_back(kOpFormat_DEFAULT); + } + graph_input_type->push_back(AnfAlgo::GetPrevNodeOutputDeviceDataType(kernel_node, i)); + continue; + } + + if (!input_kernel_node->isa()) { + // subgraph parameter from output of other nodes. + graph_input_format->push_back(AnfAlgo::GetPrevNodeOutputFormat(kernel_node, i)); + graph_input_type->push_back(AnfAlgo::GetPrevNodeOutputDeviceDataType(kernel_node, i)); + continue; + } + + auto para = input_kernel_node->cast(); + MS_EXCEPTION_IF_NULL(para); + if (AnfAlgo::GetOutputDeviceDataType(para, 0) != kTypeUnknown) { + // parameter already selected. 
+ graph_input_format->push_back(AnfAlgo::GetOutputFormat(para, 0)); + graph_input_type->push_back(AnfAlgo::GetOutputDeviceDataType(para, 0)); + continue; + } + + // weight parameter. + graph_input_format->push_back(default_format); + graph_input_type->push_back(AnfAlgo::GetOutputInferDataType(input_kernel_node, 0)); + } + + for (size_t i = 0; i < input_num; ++i) { + kernel::KernelBuildInfo::KernelBuildInfoBuilder builder; + std::vector outputs_format = {(*graph_input_format)[i]}; + std::vector outputs_device_type = {(*graph_input_type)[i]}; + builder.SetOutputsFormat(outputs_format); + builder.SetOutputsDeviceType(outputs_device_type); + AnfAlgo::SetSelectKernelBuildInfo(builder.Build(), input_list[i].get()); + } +} + +void UpdateEquivFormat(const std::vector> &output_index, + const std::vector &node_list, const FuncGraphPtr &func_graph, + const FuncGraphManagerPtr &mng) { + MS_EXCEPTION_IF_NULL(mng); + for (size_t i = 0; i < node_list.size(); ++i) { + // select nodes in subgraph. + auto anf_node = node_list[i]; + MS_EXCEPTION_IF_NULL(anf_node); + auto cnode = anf_node->cast(); + MS_EXCEPTION_IF_NULL(cnode); + cnode->set_kernel_info(std::make_shared()); + SelectKernelInfo(cnode, KernelType::AKG_KERNEL); + // Update ReduceSum + if (!IsPrimitiveCNode(cnode, prim::kPrimReduceSum)) { + continue; + } + UpdateFracNZReduceOp(cnode); + // If ReduceSum's output is 1d and not Default format, convert it to Default format + auto out_format = AnfAlgo::GetOutputFormat(cnode, 0); + if (out_format == kOpFormat_DEFAULT || !AnfAlgo::HasNodeAttr(kAttrOutputDefault, cnode)) { + continue; + } + auto infer_shape = AnfAlgo::GetOutputInferShape(cnode, 0); + // Insert EquivFormat node, then select kernel info again + std::vector trans_inputs; + trans_inputs.push_back(NewValueNode(prim::kPrimEquivFormat)); + trans_inputs.push_back(cnode); + CNodePtr trans_node = func_graph->NewCNode(trans_inputs); + AnfAlgo::SetOutputInferTypeAndShape({AnfAlgo::GetPrevNodeOutputInferDataType(cnode, 0)}, + 
{AnfAlgo::GetOutputInferShape(cnode, 0)}, trans_node.get()); + AnfAlgo::SetNodeAttr(kAttrInputNames, MakeValue>({"x"}), trans_node); + + if (trans_node->kernel_info() == nullptr) { + trans_node->set_kernel_info(std::make_shared()); + } + SelectKernelInfo(trans_node, KernelType::AKG_KERNEL); + mng->Replace(cnode, trans_node); + } +} + +void UpdateFormatsAndDtypes(const CNodePtr &kernel_node, const std::vector &node_list, + const std::vector &input_list, const FuncGraphManagerPtr &mng, + const std::string &default_format, std::vector *graph_input_format, + std::vector *graph_input_type) { + MS_EXCEPTION_IF_NULL(kernel_node); + MS_EXCEPTION_IF_NULL(mng); + MS_EXCEPTION_IF_NULL(graph_input_format); + MS_EXCEPTION_IF_NULL(graph_input_type); + // update graph input format and dtype use inner ops. + size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); + if (graph_input_format->size() != input_num) { + MS_LOG(EXCEPTION) << "Graph input format size is not equal to input num of cnode[" << kernel_node->DebugString() + << "], [%" << graph_input_format->size() << "] != [%" << input_num << "]"; + } + std::vector need_update(input_num, false); + auto &node_users = mng->node_users(); + for (size_t i = 0; i < input_num; ++i) { + auto &input = input_list[i]; + auto iter = node_users.find(input); + if (iter == node_users.end() || iter->second.empty()) { + continue; + } + for (auto &node_user : iter->second) { + if (node_user.first->kernel_info() == nullptr || + node_user.first->kernel_info()->select_kernel_build_info() == nullptr) { + // maybe not a real kernel. + continue; + } + auto user_format = AnfAlgo::GetInputFormat(node_user.first, IntToSize(node_user.second - 1)); + if (user_format != (*graph_input_format)[i]) { + MS_LOG(WARNING) << "Users of input: [" << i << "][" << input->DebugString(2) << " of [" + << kernel_node->DebugString() + << "] selected different format. 
we use defult: " << default_format; + (*graph_input_format)[i] = default_format; + need_update[i] = true; + } + + if (kernel_node->input(i + 1)->isa()) { + auto user_dtype = AnfAlgo::GetInputDeviceDataType(node_user.first, IntToSize(node_user.second - 1)); + if (user_dtype != (*graph_input_type)[i]) { + TypeId default_dtype = AnfAlgo::GetOutputInferDataType(input, 0); + MS_LOG(WARNING) << "Users of input: [" << i << "][" << input->DebugString(2) << " of [" + << kernel_node->DebugString() + << "] selected different dtype. we use default: " << TypeIdLabel(default_dtype); + (*graph_input_type)[i] = default_dtype; + need_update[i] = true; + } + } + } + } + + for (size_t i = 0; i < input_num; ++i) { + if (!need_update[i]) { + continue; + } + need_update[i] = false; + + MS_LOG(DEBUG) << "Update input format: " << i << " of: [" << kernel_node->DebugString() + << "] to: " << (*graph_input_format)[i]; + MS_LOG(DEBUG) << "Update input dtype: " << i << " of: [" << kernel_node->DebugString() + << "] to: " << TypeIdLabel((*graph_input_type)[i]); + kernel::KernelBuildInfo::KernelBuildInfoBuilder builder; + std::vector outputs_format = {(*graph_input_format)[i]}; + std::vector outputs_device_type = {(*graph_input_type)[i]}; + builder.SetOutputsFormat(outputs_format); + builder.SetOutputsDeviceType(outputs_device_type); + AnfAlgo::SetSelectKernelBuildInfo(builder.Build(), input_list[i].get()); + } + + ResetKernelBuildInfo(kernel_node); + // select nodes in subgraph again. 
+ for (size_t i = 0; i < node_list.size(); ++i) { + auto anf_node = node_list[i]; + MS_EXCEPTION_IF_NULL(anf_node); + auto cnode = anf_node->cast(); + MS_EXCEPTION_IF_NULL(cnode); + kernel::KernelBuildInfo::KernelBuildInfoBuilder builder; + size_t cnode_input_num = AnfAlgo::GetInputTensorNum(cnode); + for (size_t j = 0; j < cnode_input_num; ++j) { + auto input_node = cnode->input(j + 1); + MS_EXCEPTION_IF_NULL(input_node); + if (!IsValueNode(input_node)) { + continue; + } + // reset format and dtype of const tensor. + builder.SetOutputsFormat(std::vector{kOpFormat_DEFAULT}); + builder.SetOutputsDeviceType(std::vector{kTypeUnknown}); + AnfAlgo::SetSelectKernelBuildInfo(builder.Build(), input_node.get()); + } + SelectKernelInfo(node_list[i]->cast(), KernelType::AKG_KERNEL); + } +} + +void SetGraphKernelInfo(const CNodePtr &kernel_node, const std::vector> &output_index, + const std::vector &graph_input_format, + const std::vector &graph_input_type) { + MS_EXCEPTION_IF_NULL(kernel_node); + std::vector graph_output_format; + std::vector graph_output_type; + for (size_t i = 0; i < output_index.size(); ++i) { + auto const &output = output_index[i]; + graph_output_format.push_back(AnfAlgo::GetOutputFormat(output.first, output.second)); + TypeId output_type(kTypeUnknown); + if (output.first->isa()) { + output_type = AnfAlgo::GetCNodeOutputPrecision(output.first); + } + if (output_type == kTypeUnknown) { + output_type = AnfAlgo::GetOutputDeviceDataType(output.first, output.second); + } + graph_output_type.push_back(output_type); + } + + kernel::KernelBuildInfo::KernelBuildInfoBuilder graph_info_builder; + graph_info_builder.SetInputsFormat(graph_input_format); + graph_info_builder.SetInputsDeviceType(graph_input_type); + graph_info_builder.SetOutputsFormat(graph_output_format); + graph_info_builder.SetOutputsDeviceType(graph_output_type); + graph_info_builder.SetProcessor(kernel::Processor::AICORE); + graph_info_builder.SetKernelType(KernelType::AKG_KERNEL); + 
graph_info_builder.SetFusionType(kernel::FusionType::OPAQUE); + auto graph_selected_info = graph_info_builder.Build(); + MS_EXCEPTION_IF_NULL(graph_selected_info); + AnfAlgo::SetSelectKernelBuildInfo(graph_selected_info, kernel_node.get()); + SetTensorDeviceInfo(*graph_selected_info, kernel_node); +} + +void SelectGraphKernelInfo(const CNodePtr &kernel_node, const FuncGraphPtr &func_graph) { + MS_EXCEPTION_IF_NULL(kernel_node); + MS_EXCEPTION_IF_NULL(func_graph); + + // collect input info of funcgraph + std::vector node_list; + std::vector input_list; + std::vector output_list; + kernel::GetValidKernelNodes(func_graph, &node_list, &input_list, &output_list); + if (input_list.size() != kernel_node->inputs().size() - 1) { + MS_EXCEPTION(ArgumentError) << "Input num of funcgraph[" << func_graph->ToString() << "] not equal input of cnode[" + << kernel_node->DebugString() << "], [%" << input_list.size() << "] != [" + << kernel_node->inputs().size() << "]"; + } + + std::string default_format; + bool use_same_format = true; + GetDefaultFormat(kernel_node, &default_format, &use_same_format); + MS_LOG(DEBUG) << "GraphKernel[" << func_graph->ToString() << "] use same input format[" << default_format + << "] for ParameterWeight."; + + std::vector graph_input_format; + std::vector graph_input_type; + UpdateGraphKernelInputsKernelInfo(kernel_node, input_list, default_format, use_same_format, &graph_input_format, + &graph_input_type); + + auto mng = func_graph->manager(); + if (mng == nullptr) { + mng = Manage(func_graph, true); + } + auto output_index = kernel::GetOutputIndex(node_list, input_list, output_list); + UpdateEquivFormat(output_index, node_list, func_graph, mng); + node_list.clear(); + input_list.clear(); + output_list.clear(); + kernel::GetValidKernelNodes(func_graph, &node_list, &input_list, &output_list); + + // update graph input format and dtype use inner ops. 
+ UpdateFormatsAndDtypes(kernel_node, node_list, input_list, mng, default_format, &graph_input_format, + &graph_input_type); + + // set fix_precision for kernel when the me prim has fix_precision attr + UpdateKernelInfo(node_list); + + output_index = kernel::GetOutputIndex(node_list, input_list, output_list); + SetGraphKernelInfo(kernel_node, output_index, graph_input_format, graph_input_type); +} +} // namespace ascend +} // namespace device +} // namespace mindspore diff --git a/mindspore/ccsrc/device/ascend/profiling/profiling_manager.cc b/mindspore/ccsrc/device/ascend/profiling/profiling_manager.cc index 6cf3cad62f..fec1aac685 100644 --- a/mindspore/ccsrc/device/ascend/profiling/profiling_manager.cc +++ b/mindspore/ccsrc/device/ascend/profiling/profiling_manager.cc @@ -28,6 +28,7 @@ #include "utils/context/ms_context.h" #include "common/utils.h" #include "utils/convert_utils.h" +#include "runtime/base.h" using std::vector; using Json = nlohmann::json; @@ -120,7 +121,6 @@ bool ProfilingManager::StartupProfiling(uint32_t device_id) { MS_LOG(ERROR) << "Register profiling Engine failed."; return false; } - auto context = MsContext::GetInstance(); MS_EXCEPTION_IF_NULL(context); const string prof_options_str = context->profiling_options(); @@ -129,7 +129,6 @@ bool ProfilingManager::StartupProfiling(uint32_t device_id) { MS_LOG(WARNING) << "Profiling is enabled, but profiling option is not set!"; return true; } - // current one docker only use one device` Json p_device; // JOBID @@ -148,7 +147,6 @@ bool ProfilingManager::StartupProfiling(uint32_t device_id) { // only one device, but sProfMgrStartUp API require for device list Json devices; devices[0] = p_device; - Json startCfg; startCfg["startCfg"] = devices; @@ -156,8 +154,12 @@ bool ProfilingManager::StartupProfiling(uint32_t device_id) { std::stringstream ss; ss << startCfg; std::string cfg = ss.str(); - MS_LOG(INFO) << "profiling config " << cfg; + auto ret = rtProfilerStart(); + if (ret != RT_ERROR_NONE) { + 
MS_LOG(INFO) << "Call rtProfilerStart failed, ret:" << ret; + return false; + } // call profiling startup API ProfMgrCfg prof_cfg = {cfg}; @@ -169,7 +171,7 @@ bool ProfilingManager::StartupProfiling(uint32_t device_id) { return true; } -bool ProfilingManager::StopProfiling() const { +bool ProfilingManager::StopProfiling() { MS_LOG(INFO) << "StopProfiling"; if (!IsProfiling()) { MS_LOG(INFO) << "No need profiling. please export PROFILING_MODE and in train mode."; @@ -180,12 +182,20 @@ bool ProfilingManager::StopProfiling() const { MS_LOG(INFO) << "report data end, ret = " << reporter->Flush(); } + auto rt_ret = rtProfilerStop(); + if (rt_ret != RT_ERROR_NONE) { + MS_LOG(ERROR) << "Call rtProfilerStop failed"; + return false; + } + if (prof_handle_ != nullptr) { int result = ProfMgrStop(prof_handle_); if (result != 0) { MS_LOG(ERROR) << "ProfMgr stop return fail:" << result << "."; + prof_handle_ = nullptr; return false; } + prof_handle_ = nullptr; } return true; diff --git a/mindspore/ccsrc/device/ascend/profiling/profiling_manager.h b/mindspore/ccsrc/device/ascend/profiling/profiling_manager.h index f0c25d7f8a..c30c6898ea 100644 --- a/mindspore/ccsrc/device/ascend/profiling/profiling_manager.h +++ b/mindspore/ccsrc/device/ascend/profiling/profiling_manager.h @@ -39,7 +39,7 @@ class ProfilingManager { uint64_t GetJobId() const; bool ReportProfilingData(const map &op_taskId_map) const; bool StartupProfiling(uint32_t device_id); - bool StopProfiling() const; + bool StopProfiling(); inline bool IsProfiling() const { auto context = MsContext::GetInstance(); diff --git a/mindspore/ccsrc/device/ascend/profiling/profiling_utils.cc b/mindspore/ccsrc/device/ascend/profiling/profiling_utils.cc index 62e18793b2..131a22805d 100644 --- a/mindspore/ccsrc/device/ascend/profiling/profiling_utils.cc +++ b/mindspore/ccsrc/device/ascend/profiling/profiling_utils.cc @@ -14,7 +14,6 @@ * limitations under the License. 
*/ -#include #include "device/ascend/profiling/reporter/graph_desc_reporter.h" #include "device/ascend/profiling/profiling_utils.h" #include "kernel/kernel.h" @@ -24,6 +23,7 @@ #include "utils/utils.h" #include "device/ascend/profiling/reporter/task_desc_reporter.h" #include "utils/context/ms_context.h" +#include "device/ascend/profiling/reporter/point_reporter.h" namespace mindspore { namespace device { @@ -33,8 +33,9 @@ constexpr char kCustomNode[] = "PROFILING_CUSTOM_"; constexpr char kFpStartNode[] = "PROFILING_FP_START"; constexpr char kBpEndNode[] = "PROFILING_BP_END"; constexpr char kIterEndNode[] = "PROFILING_ITER_END"; -std::unordered_map> ProfilingUtils::graph_profiling_cnode_; -std::unordered_map> ProfilingUtils::graph_kernel_name_; +std::map> ProfilingUtils::graph_profiling_cnode_; +std::map> ProfilingUtils::graph_kernel_name_; +std::map>> ProfilingUtils::graph_point_; uint32_t ProfilingUtils::custom_node_index_ = 1; ProfilingTraceInfo ProfilingUtils::GetProfilingTraceFromEnv(NotNull graph_ptr) { @@ -102,6 +103,7 @@ std::string ProfilingUtils::GetTraceBegin(const std::vector &cnode_exe void ProfilingUtils::GetCNodeOutputRealNode(const std::string &node_name, const std::vector &cnode_exec_order, NotNull *> getnext_outputs) { for (const auto &cnode : cnode_exec_order) { + MS_EXCEPTION_IF_NULL(cnode); for (const auto &input : cnode->inputs()) { auto prev_cnode = AnfAlgo::VisitKernel(input, 0); if (!prev_cnode.first->isa()) { @@ -203,6 +205,17 @@ NotNull ProfilingUtils::CreateProfilingCNode(const ProfilingContent &p return NOT_NULL(cnode_ptr); } +void ProfilingUtils::SaveProfilingPoint(uint32_t graph_id, const std::string &node_name, uint32_t point_id) { + std::shared_ptr prof_desc_ptr = std::make_shared(node_name, point_id); + auto iter = graph_point_.find(graph_id); + if (iter == graph_point_.end()) { + std::vector> tmp_vect = {prof_desc_ptr}; + graph_point_.insert({graph_id, tmp_vect}); + } else { + iter->second.emplace_back(prof_desc_ptr); + } +} + void 
ProfilingUtils::ProfilingTraceFpStart(const mindspore::AnfNodePtr &anf_node, const ProfilingTraceInfo &profiling_trace_info, NotNull graph_ptr, @@ -213,6 +226,8 @@ void ProfilingUtils::ProfilingTraceFpStart(const mindspore::AnfNodePtr &anf_node ProfilingContent fp_profiling_content = {false, kProfilingFpStartLogId, 0}; auto fp_profiling_node = CreateProfilingCNodeWithStream(anf_node, fp_profiling_content, graph_ptr); kernel_list->emplace_back(fp_profiling_node); + // insert ProfDesc + SaveProfilingPoint(graph_ptr->graph_id(), anf_node->fullname_with_scope(), kProfilingFpStartLogId); } } @@ -244,13 +259,16 @@ void ProfilingUtils::ProfilingCustomOp(const AnfNodePtr &anf_node, const Profili } MS_LOG(INFO) << "Profiling Match CustomOp:" << anf_node->fullname_with_scope(); // custom op profiling job start from 3. - ProfilingContent front_profiling_content = {false, 2 * custom_node_index_ + 1, 0}; + auto custom_point_id = 2 * custom_node_index_ + 1; + ProfilingContent front_profiling_content = {false, custom_point_id, 0}; CNodePtr front_node = CreateProfilingCNodeWithStream(anf_node, front_profiling_content, graph_ptr); kernel_list->insert(kernel_list->end() - 1, front_node); + SaveProfilingPoint(graph_ptr->graph_id(), anf_node->fullname_with_scope(), custom_point_id); - ProfilingContent back_profiling_content = {false, 2 * custom_node_index_ + 2, 0}; + ProfilingContent back_profiling_content = {false, custom_point_id + 1, 0}; CNodePtr back_node = CreateProfilingCNodeWithStream(anf_node, back_profiling_content, graph_ptr); kernel_list->insert(kernel_list->end(), back_node); + SaveProfilingPoint(graph_ptr->graph_id(), anf_node->fullname_with_scope(), custom_point_id + 1); ++custom_node_index_; } @@ -263,6 +281,7 @@ void ProfilingUtils::ProfilingTraceBpEnd(const AnfNodePtr &anf_node, const Profi ProfilingContent bp_end_profiling_content = {false, kProfilingBpEndLogId, 0}; CNodePtr bp_end_node = CreateProfilingCNodeWithStream(anf_node, bp_end_profiling_content, graph_ptr); 
kernel_list->emplace_back(bp_end_node); + SaveProfilingPoint(graph_ptr->graph_id(), anf_node->fullname_with_scope(), kProfilingBpEndLogId); } } @@ -276,6 +295,7 @@ void ProfilingUtils::ProfilingTraceEnd(const AnfNodePtr &anf_node, const Profili ProfilingContent bp_end_profiling_content = {true, kProfilingIterEndLogId, 0}; CNodePtr bp_kernel_ptr = CreateProfilingCNodeWithStream(anf_node, bp_end_profiling_content, graph_ptr); kernel_list->emplace_back(bp_kernel_ptr); + SaveProfilingPoint(graph_ptr->graph_id(), anf_node->fullname_with_scope(), kProfilingIterEndLogId); } } @@ -302,7 +322,7 @@ bool ProfilingUtils::ValidComputeGraph(NotNull gra return false; } -void ProfilingUtils::ReportProfilingData(const std::vector &task_ids, +void ProfilingUtils::ReportProfilingData(const std::vector &task_ids, const std::vector &stream_ids, NotNull graph) { if (!ValidComputeGraph(graph)) { MS_LOG(WARNING) << "Not a valid compute graph:" << graph->graph_id(); @@ -319,11 +339,24 @@ void ProfilingUtils::ReportProfilingData(const std::vector &task_ids, MS_EXCEPTION_IF_NULL(context); TaskDescReporter task_reporter(context->device_id(), "vm.task_desc_info", ret->second); task_reporter.set_task_ids(task_ids); + task_reporter.set_stream_ids(stream_ids); task_reporter.ReportData(); GraphDescReporter graph_reporter(context->device_id(), "vm.graph_desc_info", ret->second); graph_profiling_cnode_.erase(ret); graph_reporter.ReportData(); + + // Report profiling point + auto point_iter = graph_point_.find(graph->graph_id()); + if (point_iter == graph_point_.end()) { + MS_LOG(ERROR) << "Graph id not found in graph_point"; + return; + } + PointReporter point_reporter(context->device_id(), "vm.point"); + for (const auto &point : point_iter->second) { + point_reporter.AddReportData(point); + } + point_reporter.ReportData(); } } // namespace ascend } // namespace device diff --git a/mindspore/ccsrc/device/ascend/profiling/profiling_utils.h b/mindspore/ccsrc/device/ascend/profiling/profiling_utils.h 
index 39ea80a2e9..a3c7739447 100644 --- a/mindspore/ccsrc/device/ascend/profiling/profiling_utils.h +++ b/mindspore/ccsrc/device/ascend/profiling/profiling_utils.h @@ -16,6 +16,7 @@ #ifndef MINDSPORE_MINDSPORE_CCSRC_DEVICE_ASCEND_PROFILING_PROFILING_UTILS_H_ #define MINDSPORE_MINDSPORE_CCSRC_DEVICE_ASCEND_PROFILING_PROFILING_UTILS_H_ +#include #include #include #include @@ -23,6 +24,7 @@ #include #include "session/kernel_graph.h" #include "utils/contract.h" +#include "device/ascend/profiling/reporter/profiling_desc.h" namespace mindspore { namespace device { @@ -87,7 +89,8 @@ class ProfilingUtils { // Mapping task_id and kernel name for device to generate the time cost of specific kernel. // Device calculate the time cost of the task which is marked by task id. // But we need data of (kernel name , time cost) - static void ReportProfilingData(const std::vector &task_ids, NotNull graph); + static void ReportProfilingData(const std::vector &task_ids, const std::vector &stream_ids, + NotNull graph); // Get profiling trace point from envs. 
// export PROFILING_FP_START='full name of the first cnode to execute' @@ -103,7 +106,7 @@ class ProfilingUtils { NotNull graph_ptr, NotNull *> kernel_list); - static std::unordered_map> graph_kernel_name() { return graph_kernel_name_; } + static std::map> graph_kernel_name() { return graph_kernel_name_; } inline static constexpr char kProfiling[] = "Profiling"; inline static constexpr char kNotify[] = "notify"; @@ -125,10 +128,12 @@ class ProfilingUtils { NotNull *> getnext_outputs); static bool ValidComputeGraph(NotNull graph_ptr); + static void SaveProfilingPoint(uint32_t graph_id, const std::string &node_name, uint32_t point_id); // graph id --> (kernel name list) - static std::unordered_map> graph_profiling_cnode_; - static std::unordered_map> graph_kernel_name_; + static std::map> graph_profiling_cnode_; + static std::map> graph_kernel_name_; + static std::map>> graph_point_; static uint32_t custom_node_index_; }; } // namespace ascend diff --git a/mindspore/ccsrc/device/ascend/profiling/reporter/desc_reporter.cc b/mindspore/ccsrc/device/ascend/profiling/reporter/desc_reporter.cc index bf61471827..cf80c07ca9 100644 --- a/mindspore/ccsrc/device/ascend/profiling/reporter/desc_reporter.cc +++ b/mindspore/ccsrc/device/ascend/profiling/reporter/desc_reporter.cc @@ -42,22 +42,22 @@ void DescReporter::ReportByLine(const std::string &data, const std::string &file report_data.data = (unsigned char *)data.c_str() + cur_size; auto ret = memcpy_s(report_data.tag, MSPROF_ENGINE_MAX_TAG_LEN + 1, file_name.c_str(), file_name.length()); if (ret != 0) { - MS_LOG(EXCEPTION) << "memcpy_s report data tag failed"; + MS_LOG(EXCEPTION) << "Memcpy_s report data tag failed"; } auto report_ret = reporter->Report(&report_data); if (report_ret != 0) { - MS_LOG(EXCEPTION) << "report data failed"; + MS_LOG(EXCEPTION) << "Report data failed"; } if (report_size == 0) { - MS_LOG(WARNING) << "report_size is 0"; + MS_LOG(WARNING) << "Report_size is 0"; break; } cur_size += report_size; } } 
-void DescReporter::ReportData() { - for (const auto &desc : prof_desc_) { +void DescReporter::ReportAllLine() { + for (const auto &desc : prof_desc_list_) { auto data = desc->ToString(); ReportByLine(data, file_name_); } diff --git a/mindspore/ccsrc/device/ascend/profiling/reporter/desc_reporter.h b/mindspore/ccsrc/device/ascend/profiling/reporter/desc_reporter.h index b8f0cd2f25..c8e1b3ed62 100644 --- a/mindspore/ccsrc/device/ascend/profiling/reporter/desc_reporter.h +++ b/mindspore/ccsrc/device/ascend/profiling/reporter/desc_reporter.h @@ -32,16 +32,17 @@ namespace ascend { class DescReporter { public: virtual ~DescReporter() = 0; - DescReporter(int device_id, std::string file_name, std::vector cnode_list) - : device_id_(device_id), file_name_(std::move(file_name)), cnode_list_(std::move(cnode_list)) {} - virtual void ReportData(); + DescReporter(int device_id, std::string file_name) : device_id_(device_id), file_name_(std::move(file_name)) {} + + virtual void ReportData() = 0; protected: void ReportByLine(const std::string &data, const std::string &file_name) const; + void ReportAllLine(); + int device_id_; std::string file_name_; - std::vector cnode_list_; - std::vector> prof_desc_; + std::vector> prof_desc_list_; }; } // namespace ascend } // namespace device diff --git a/mindspore/ccsrc/device/ascend/profiling/reporter/graph_desc_reporter.cc b/mindspore/ccsrc/device/ascend/profiling/reporter/graph_desc_reporter.cc index f4f4b3362c..1f2d1570bb 100644 --- a/mindspore/ccsrc/device/ascend/profiling/reporter/graph_desc_reporter.cc +++ b/mindspore/ccsrc/device/ascend/profiling/reporter/graph_desc_reporter.cc @@ -24,12 +24,13 @@ namespace device { namespace ascend { void GraphDescReporter::ReportData() { for (const auto &node : cnode_list_) { - if (AnfAlgo::GetKernelType(node) != TBE_KERNEL) { + if (AnfAlgo::GetKernelType(node) != TBE_KERNEL && AnfAlgo::GetKernelType(node) != AKG_KERNEL) { MS_LOG(WARNING) << "Skip non tbe kernel"; continue; } std::vector 
input_data_list; std::vector output_data_list; + MS_EXCEPTION_IF_NULL(node); auto op_name = node->fullname_with_scope(); auto op_type = AnfAlgo::GetCNodeName(node); auto input_size = AnfAlgo::GetInputTensorNum(node); @@ -56,9 +57,9 @@ void GraphDescReporter::ReportData() { } auto graph_desc = std::make_shared(op_name, op_type, input_data_list, output_data_list); - prof_desc_.emplace_back(graph_desc); + prof_desc_list_.emplace_back(graph_desc); } - DescReporter::ReportData(); + ReportAllLine(); } } // namespace ascend } // namespace device diff --git a/mindspore/ccsrc/device/ascend/profiling/reporter/graph_desc_reporter.h b/mindspore/ccsrc/device/ascend/profiling/reporter/graph_desc_reporter.h index 3c48a90efe..10f78092f2 100644 --- a/mindspore/ccsrc/device/ascend/profiling/reporter/graph_desc_reporter.h +++ b/mindspore/ccsrc/device/ascend/profiling/reporter/graph_desc_reporter.h @@ -28,9 +28,12 @@ namespace ascend { class GraphDescReporter : public DescReporter { public: GraphDescReporter(uint32_t device_id, const std::string &file_name, std::vector cnode_list) - : DescReporter(device_id, file_name, std::move(cnode_list)) {} + : DescReporter(device_id, file_name), cnode_list_(std::move(cnode_list)) {} ~GraphDescReporter() override = default; void ReportData() override; + + private: + std::vector cnode_list_; }; } // namespace ascend } // namespace device diff --git a/mindspore/ccsrc/device/ascend/profiling/reporter/point_reporter.cc b/mindspore/ccsrc/device/ascend/profiling/reporter/point_reporter.cc new file mode 100644 index 0000000000..0024ab9c22 --- /dev/null +++ b/mindspore/ccsrc/device/ascend/profiling/reporter/point_reporter.cc @@ -0,0 +1,29 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "device/ascend/profiling/reporter/point_reporter.h" + +namespace mindspore { +namespace device { +namespace ascend { +void PointReporter::ReportData() { ReportAllLine(); } + +void PointReporter::AddReportData(const std::shared_ptr &prof_desc) { + prof_desc_list_.emplace_back(prof_desc); +} +} // namespace ascend +} // namespace device +} // namespace mindspore diff --git a/mindspore/ccsrc/device/ascend/profiling/reporter/point_reporter.h b/mindspore/ccsrc/device/ascend/profiling/reporter/point_reporter.h new file mode 100644 index 0000000000..ae12672df6 --- /dev/null +++ b/mindspore/ccsrc/device/ascend/profiling/reporter/point_reporter.h @@ -0,0 +1,37 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_MINDSPORE_CCSRC_DEVICE_ASCEND_PROFILING_REPORTER_POINT_REPORTER_H_ +#define MINDSPORE_MINDSPORE_CCSRC_DEVICE_ASCEND_PROFILING_REPORTER_POINT_REPORTER_H_ + +#include +#include +#include "device/ascend/profiling/reporter/desc_reporter.h" + +namespace mindspore { +namespace device { +namespace ascend { +class PointReporter : public DescReporter { + public: + PointReporter(uint32_t device_id, const std::string &file_name) : DescReporter(device_id, file_name) {} + ~PointReporter() override = default; + void ReportData() override; + void AddReportData(const std::shared_ptr &prof_desc); +}; +} // namespace ascend +} // namespace device +} // namespace mindspore +#endif // MINDSPORE_MINDSPORE_CCSRC_DEVICE_ASCEND_PROFILING_REPORTER_POINT_REPORTER_H_ diff --git a/mindspore/ccsrc/device/ascend/profiling/reporter/profiling_desc.cc b/mindspore/ccsrc/device/ascend/profiling/reporter/profiling_desc.cc index f28f133e1a..082cb81e42 100644 --- a/mindspore/ccsrc/device/ascend/profiling/reporter/profiling_desc.cc +++ b/mindspore/ccsrc/device/ascend/profiling/reporter/profiling_desc.cc @@ -66,6 +66,12 @@ std::string GraphDesc::ToString() { return desc; } +std::string PointDesc::ToString() { + std::string desc; + desc.append(std::to_string(point_id_)).append(" ").append(op_name_).append("\n"); + return desc; +} + std::string GraphDesc::DataShapeToString(const std::vector &shape) { std::ostringstream oss; oss << "\""; diff --git a/mindspore/ccsrc/device/ascend/profiling/reporter/profiling_desc.h b/mindspore/ccsrc/device/ascend/profiling/reporter/profiling_desc.h index 852bcf116b..6d0ed45bef 100644 --- a/mindspore/ccsrc/device/ascend/profiling/reporter/profiling_desc.h +++ b/mindspore/ccsrc/device/ascend/profiling/reporter/profiling_desc.h @@ -71,6 +71,16 @@ class GraphDesc : public ProfDesc { std::vector output_data_list_; [[nodiscard]] static std::string DataShapeToString(const std::vector &shape); }; + +class PointDesc : public ProfDesc { + public: + 
PointDesc(std::string op_name, uint32_t point_id) : ProfDesc(std::move(op_name)), point_id_(point_id) {} + ~PointDesc() override = default; + std::string ToString() override; + + private: + uint32_t point_id_; +}; } // namespace ascend } // namespace device } // namespace mindspore diff --git a/mindspore/ccsrc/device/ascend/profiling/reporter/task_desc_reporter.cc b/mindspore/ccsrc/device/ascend/profiling/reporter/task_desc_reporter.cc index 8f59e72613..0bd66e31ef 100644 --- a/mindspore/ccsrc/device/ascend/profiling/reporter/task_desc_reporter.cc +++ b/mindspore/ccsrc/device/ascend/profiling/reporter/task_desc_reporter.cc @@ -31,7 +31,7 @@ void TaskDescReporter::ReportData() { size_t task_index = 0; for (const auto &node : cnode_list_) { - if (AnfAlgo::GetKernelType(node) != TBE_KERNEL) { + if (AnfAlgo::GetKernelType(node) != TBE_KERNEL && AnfAlgo::GetKernelType(node) != AKG_KERNEL) { MS_LOG(WARNING) << "Skip non tbe kernel"; ++task_index; continue; @@ -40,11 +40,21 @@ void TaskDescReporter::ReportData() { auto ascend_kernel_mod = dynamic_cast(kernel_mod); MS_EXCEPTION_IF_NULL(node); MS_EXCEPTION_IF_NULL(ascend_kernel_mod); - auto desc_ptr = std::make_shared(node->fullname_with_scope(), task_ids_[task_index++], - ascend_kernel_mod->block_dim(), ascend_kernel_mod->stream_id()); - prof_desc_.emplace_back(desc_ptr); + // Check task_id and stream_id valid + CheckStreamTaskValid(task_index, task_index); + auto desc_ptr = std::make_shared(node->fullname_with_scope(), task_ids_[task_index], + ascend_kernel_mod->block_dim(), stream_ids_[task_index]); + prof_desc_list_.emplace_back(desc_ptr); + ++task_index; + } + ReportAllLine(); +} + +void TaskDescReporter::CheckStreamTaskValid(uint32_t task_id, uint32_t stream_id) { + if (task_id >= task_ids_.size() || stream_id >= stream_ids_.size()) { + MS_LOG(EXCEPTION) << "Index invalid. 
task_id:" << task_id << ", task_ids.size:" << task_ids_.size() + << ", stream_id:" << stream_id << ", stream_ids.size:" << stream_ids_.size(); } - DescReporter::ReportData(); } } // namespace ascend } // namespace device diff --git a/mindspore/ccsrc/device/ascend/profiling/reporter/task_desc_reporter.h b/mindspore/ccsrc/device/ascend/profiling/reporter/task_desc_reporter.h index c1f70cacaf..087c691a5f 100644 --- a/mindspore/ccsrc/device/ascend/profiling/reporter/task_desc_reporter.h +++ b/mindspore/ccsrc/device/ascend/profiling/reporter/task_desc_reporter.h @@ -28,13 +28,17 @@ namespace ascend { class TaskDescReporter : public DescReporter { public: TaskDescReporter(int device_id, const std::string &file_name, std::vector cnode_list) - : DescReporter(device_id, file_name, std::move(cnode_list)) {} + : DescReporter(device_id, file_name), cnode_list_(std::move(cnode_list)) {} ~TaskDescReporter() override = default; void ReportData() override; void set_task_ids(const std::vector &task_ids) { task_ids_ = task_ids; } + void set_stream_ids(const std::vector &stream_ids) { stream_ids_ = stream_ids; } private: std::vector task_ids_; + std::vector stream_ids_; + void CheckStreamTaskValid(uint32_t task_id, uint32_t stream_id); + std::vector cnode_list_; }; } // namespace ascend } // namespace device diff --git a/mindspore/ccsrc/device/ascend/tasksink/runtime_utils.cc b/mindspore/ccsrc/device/ascend/tasksink/runtime_utils.cc index 20084c0927..603dd989e5 100644 --- a/mindspore/ccsrc/device/ascend/tasksink/runtime_utils.cc +++ b/mindspore/ccsrc/device/ascend/tasksink/runtime_utils.cc @@ -54,13 +54,13 @@ bool RuntimeUtils::HcomDistribute(const std::shared_ptr &task_info MS_EXCEPTION_IF_NULL(task_info); hcclResult_t ret; static uint32_t task_counter = 0; - + auto hccl_group = task_info->group(); if (task_info->hccl_type() == kBroadcastOpName) { // call hcom broadcast interface to run op const string tag_broadcast = kHcomBroadcast + std::to_string(task_counter++) + kUnderline + 
std::to_string(0); ret = hcom_broadcast(tag_broadcast.c_str(), reinterpret_cast(task_info->input_data_addr()), static_cast(task_info->count()), static_cast(task_info->data_type()), - static_cast(task_info->root_id()), task_info->group().c_str(), stream); + static_cast(task_info->root_id()), hccl_group.c_str(), stream); if (ret != HCCL_SUCCESS) { MS_LOG(ERROR) << "hcom_broadcast fail, return ret: " << static_cast(ret); return false; @@ -70,7 +70,7 @@ bool RuntimeUtils::HcomDistribute(const std::shared_ptr &task_info const string tag_all_gather = kHcomAllGather + std::to_string(task_counter++) + kUnderline + std::to_string(0); ret = hcom_all_gather(tag_all_gather.c_str(), reinterpret_cast(task_info->input_data_addr()), reinterpret_cast(task_info->output_data_addr()), static_cast(task_info->count()), - static_cast(task_info->data_type()), task_info->group().c_str(), stream); + static_cast(task_info->data_type()), hccl_group.c_str(), stream); if (ret != HCCL_SUCCESS) { MS_LOG(ERROR) << "hcom_all_gather fail, return ret: " << ret; return false; @@ -81,7 +81,7 @@ bool RuntimeUtils::HcomDistribute(const std::shared_ptr &task_info ret = hcom_all_reduce(tag_all_reduce.c_str(), reinterpret_cast(task_info->input_data_addr()), reinterpret_cast(task_info->output_data_addr()), static_cast(task_info->count()), static_cast(task_info->data_type()), - static_cast(task_info->op_type()), task_info->group().c_str(), stream); + static_cast(task_info->op_type()), hccl_group.c_str(), stream); if (ret != HCCL_SUCCESS) { MS_LOG(ERROR) << "hcom_all_reduce fail, return ret: " << ret; return false; @@ -93,7 +93,7 @@ bool RuntimeUtils::HcomDistribute(const std::shared_ptr &task_info ret = hcom_reduce_scatter(tag_reduce_scatter.c_str(), reinterpret_cast(task_info->input_data_addr()), reinterpret_cast(task_info->output_data_addr()), static_cast(task_info->count()), static_cast(task_info->data_type()), - static_cast(task_info->op_type()), task_info->group().c_str(), stream); + 
static_cast(task_info->op_type()), hccl_group.c_str(), stream); if (ret != HCCL_SUCCESS) { MS_LOG(ERROR) << "hcom_reduce_scatter fail, return ret: " << ret; return false; diff --git a/mindspore/ccsrc/device/ascend/tasksink/task_generator.cc b/mindspore/ccsrc/device/ascend/tasksink/task_generator.cc index 18da966575..0cdf751801 100644 --- a/mindspore/ccsrc/device/ascend/tasksink/task_generator.cc +++ b/mindspore/ccsrc/device/ascend/tasksink/task_generator.cc @@ -43,13 +43,43 @@ bool TaskGenerator::GenTasks(const std::vector &anf_node_list, std::ve void TaskGenerator::LaunchAddrCleanKernel(const CNodePtr &anf_node_ptr, AddressPtrList *kernel_inputs) { MS_EXCEPTION_IF_NULL(anf_node_ptr); if (anf_node_ptr->inputs().size() != 2) { - MS_LOG(EXCEPTION) << "atomic Addr clean Node Input nodes not equal 2."; + // akg process + // set atomic clean addr + if (AnfAlgo::HasNodeAttr(kAttrAtomicOutputIndexs, anf_node_ptr)) { + auto clean_output_indexs = AnfAlgo::GetNodeAttr>(anf_node_ptr, kAttrAtomicOutputIndexs); + auto graph = anf_node_ptr->func_graph(); + MS_EXCEPTION_IF_NULL(graph); + auto manager = graph->manager(); + MS_EXCEPTION_IF_NULL(manager); + auto node_users = manager->node_users(); + if (node_users[anf_node_ptr].empty()) { + MS_LOG(EXCEPTION) << "Node users of " << anf_node_ptr->ToString() << " is empty."; + } + auto depend_node = node_users[anf_node_ptr].pop().first; + if (!IsPrimitiveCNode(depend_node, prim::kPrimDepend)) { + MS_LOG(EXCEPTION) << "Checking Depend node failed"; + } + if (node_users[depend_node].empty()) { + MS_LOG(EXCEPTION) << "Node users of " << depend_node->ToString() << " is empty."; + } + auto post_node = node_users[depend_node].pop().first; + for (auto index : clean_output_indexs) { + auto device_address = AnfAlgo::GetOutputAddr(post_node, index); + kernel::AddressPtr input = std::make_shared(); + input->addr = device_address->ptr_; + MS_EXCEPTION_IF_NULL(input->addr); + input->size = device_address->size_; + kernel_inputs->push_back(input); + 
} + MS_LOG(DEBUG) << "AtomicAddClean clean output size: " << clean_output_indexs.size(); + } + return; } MS_EXCEPTION_IF_NULL(anf_node_ptr->inputs()[1]); auto pre_node = (anf_node_ptr->inputs()[1])->cast(); // set clean output addr - if (AnfAlgo::HasNodeAttr(kAttrAutomicOutputIndexs, pre_node)) { - auto clean_output_indexs = AnfAlgo::GetNodeAttr>(pre_node, kAttrAutomicOutputIndexs); + if (AnfAlgo::HasNodeAttr(kAttrAtomicOutputIndexs, pre_node)) { + auto clean_output_indexs = AnfAlgo::GetNodeAttr>(pre_node, kAttrAtomicOutputIndexs); for (auto index : clean_output_indexs) { auto device_address = AnfAlgo::GetOutputAddr(pre_node, index); kernel::AddressPtr input = std::make_shared(); @@ -59,13 +89,13 @@ void TaskGenerator::LaunchAddrCleanKernel(const CNodePtr &anf_node_ptr, AddressP input->size = device_address->size_; kernel_inputs->push_back(input); } - MS_LOG(INFO) << "AtomicAddClean clean output size:" << clean_output_indexs.size(); + MS_LOG(DEBUG) << "AtomicAddClean clean output size:" << clean_output_indexs.size(); } // set clean workspace address - if (AnfAlgo::HasNodeAttr(kAttrAutomicWorkspaceSize, pre_node)) { - auto clean_workspaces = AnfAlgo::GetNodeAttr(pre_node, kAttrAutomicWorkspaceSize); - if (clean_workspaces != 0) { - auto device_address = AnfAlgo::GetWorkspaceAddr(pre_node, 0); + if (AnfAlgo::HasNodeAttr(kAttrAtomicWorkspaceIndexs, pre_node)) { + auto clean_workspace_indexs = AnfAlgo::GetNodeAttr>(pre_node, kAttrAtomicWorkspaceIndexs); + for (const auto &index : clean_workspace_indexs) { + auto device_address = AnfAlgo::GetWorkspaceAddr(pre_node, index); kernel::AddressPtr workspace = std::make_shared(); MS_EXCEPTION_IF_NULL(workspace); workspace->addr = device_address->ptr_; @@ -73,9 +103,8 @@ void TaskGenerator::LaunchAddrCleanKernel(const CNodePtr &anf_node_ptr, AddressP workspace->size = device_address->size_; kernel_inputs->push_back(workspace); } - MS_LOG(INFO) << "AtomicAddClean clean workspace size" << clean_workspaces; } - auto clear_mems = 
AnfAlgo::GetNodeAttr>(anf_node_ptr, kAttrAutomicAddMemSize); + auto clear_mems = AnfAlgo::GetNodeAttr>(anf_node_ptr, kAttrAtomicAddMemSize); if (kernel_inputs->size() != clear_mems.size()) { MS_LOG(EXCEPTION) << "AtomicAddClean kernel inputs size not equal clear memory size,kerenl_inputs size:" << kernel_inputs->size() << ",clean mem size" << clear_mems.size(); diff --git a/mindspore/ccsrc/device/cpu/cpu_device_address.cc b/mindspore/ccsrc/device/cpu/cpu_device_address.cc index 56e9b6d36e..09ab0da12b 100644 --- a/mindspore/ccsrc/device/cpu/cpu_device_address.cc +++ b/mindspore/ccsrc/device/cpu/cpu_device_address.cc @@ -22,10 +22,30 @@ namespace device { namespace cpu { bool CPUDeviceAddress::SyncDeviceToHost(const std::vector & /*shape*/, size_t size, TypeId type, void *host_ptr) const { - if (type == kNumberTypeFloat16) { + if (ptr_ == nullptr) { + MS_LOG(ERROR) << "The pointer ptr_ is null!"; + return false; + } + + if (host_ptr == ptr_) { + MS_LOG(DEBUG) << "host_ptr is equal to ptr_, request ignored."; + return true; + } + + if (type == type_id_) { + auto ret_code = memcpy_s(host_ptr, size, ptr_, size_); + if (ret_code != EOK) { + MS_LOG(ERROR) << "Failed to copy tensor!"; + return false; + } + } else if (type == kNumberTypeFloat16) { FloatToHalf(host_ptr, ptr_, size / 2); } else if (type == kNumberTypeFloat64) { FloatToDouble(host_ptr, ptr_, size / sizeof(double)); + } else { + MS_LOG(ERROR) << "Types not match. 
Device type: " << TypeIdLabel(type_id_) << ", host type: " << TypeIdLabel(type) + << "!"; + return false; } return true; } diff --git a/mindspore/ccsrc/device/cpu/cpu_device_address.h b/mindspore/ccsrc/device/cpu/cpu_device_address.h index 9d51abe625..a041567f47 100644 --- a/mindspore/ccsrc/device/cpu/cpu_device_address.h +++ b/mindspore/ccsrc/device/cpu/cpu_device_address.h @@ -34,6 +34,7 @@ class CPUDeviceAddress : public DeviceAddress { bool SyncDeviceToHost(const std::vector &shape, size_t size, TypeId type, void *host_ptr) const override; bool SyncHostToDevice(const std::vector &shape, size_t size, TypeId type, const void *host_ptr) const override; + DeviceAddressType DeviceType() const override { return DeviceAddressType::kCPU; } }; } // namespace cpu } // namespace device diff --git a/mindspore/ccsrc/device/cpu/cpu_kernel_runtime.cc b/mindspore/ccsrc/device/cpu/cpu_kernel_runtime.cc index 67328f04c2..6725dff524 100644 --- a/mindspore/ccsrc/device/cpu/cpu_kernel_runtime.cc +++ b/mindspore/ccsrc/device/cpu/cpu_kernel_runtime.cc @@ -21,18 +21,37 @@ #include #include #include +#include #include "kernel/kernel.h" #include "device/cpu/cpu_device_address.h" #include "utils/context/ms_context.h" #include "utils/config_manager.h" #include "common/utils.h" #include "session/anf_runtime_algorithm.h" +#include "session/session_basic.h" #include "operator/ops.h" namespace mindspore { namespace device { namespace cpu { const size_t INIT_NODE_REF = 1; +namespace { +TypeId GetCPUSupportOutputTypeId(const TypeId type_id) { + TypeId support_type_id = type_id; + if (type_id == kNumberTypeUInt32) { + support_type_id = kNumberTypeInt32; + } + if (type_id == kNumberTypeFloat || type_id == kNumberTypeFloat16 || type_id == kNumberTypeFloat32 || + type_id == kNumberTypeFloat64) { + support_type_id = kNumberTypeFloat32; + } + if (support_type_id != kNumberTypeInt32 && support_type_id != kNumberTypeFloat32) { + MS_LOG(EXCEPTION) << "Check output type failed."; + } + return 
support_type_id; +} +} // namespace + void CPUKernelRuntime::AssignKernelAddress(session::KernelGraph *kernel_graph) { AssignValueNodeAddress(kernel_graph); AssignInputNodeAddress(kernel_graph); @@ -121,23 +140,25 @@ DeviceAddressPtr CPUKernelRuntime::CreateDeviceAddress(void *device_ptr, size_t return std::make_shared(device_ptr, device_size, format, type_id); } -BaseRef CPUKernelRuntime::CreatTensorForOutput(const AnfNodePtr &input_node, size_t index, - const std::unordered_map &input_map) { +BaseRef CPUKernelRuntime::CreatTensorForOutput(const session::KernelWithIndex &kernel_with_index, + const std::unordered_map &input_map, + std::set *bound_addresses, + std::vector *need_sync_outputs) { + auto &input_node = kernel_with_index.first; + auto index = kernel_with_index.second; MS_EXCEPTION_IF_NULL(input_node); - if (input_node->isa() && AnfAlgo::GetCNodeName(input_node) == prim::kPrimMakeTuple->name()) { - auto cnode = input_node->cast(); - MS_EXCEPTION_IF_NULL(cnode); - VectorRef ret; - for (size_t i = 1; i < cnode->inputs().size(); i++) { - auto item_with_index = AnfAlgo::VisitKernelWithReturnType(cnode->input(i), 0); - auto out = CreatTensorForOutput(item_with_index.first, item_with_index.second, input_map); - ret.push_back(out); - } - return ret; - } if (input_node->isa()) { auto node = input_node->cast(); MS_EXCEPTION_IF_NULL(node); + if (AnfAlgo::GetCNodeName(input_node) == prim::kPrimMakeTuple->name()) { + VectorRef ret; + for (size_t i = 1; i < node->inputs().size(); i++) { + auto item_with_index = AnfAlgo::VisitKernelWithReturnType(node->input(i), 0); + auto out = CreatTensorForOutput(item_with_index, input_map, bound_addresses, need_sync_outputs); + ret.push_back(out); + } + return ret; + } size_t output_size = AnfAlgo::GetOutputTensorNum(node); if (index >= output_size) { MS_LOG(EXCEPTION) << "Invalid input index " << index; @@ -148,20 +169,17 @@ BaseRef CPUKernelRuntime::CreatTensorForOutput(const AnfNodePtr &input_node, siz std::vector temp_shape; 
(void)temp_shape.insert(temp_shape.end(), shape.begin(), shape.end()); TypeId type_id = AnfAlgo::GetOutputInferDataType(node, index); - if (type_id == kNumberTypeUInt32) { - type_id = kNumberTypeInt32; - } - if (type_id == kNumberTypeFloat || type_id == kNumberTypeFloat16 || type_id == kNumberTypeFloat32 || - type_id == kNumberTypeFloat64) { - type_id = kNumberTypeFloat32; - } - if (type_id != kNumberTypeInt32 && type_id != kNumberTypeFloat32) { - MS_LOG(EXCEPTION) << "Check output type failed."; - } + type_id = GetCPUSupportOutputTypeId(type_id); tensor::TensorPtr tensor = std::make_shared(type_id, temp_shape); MS_EXCEPTION_IF_NULL(tensor); - address->ptr_ = tensor->data_c(true); - address->ref_count_ = INIT_NODE_REF; + if (bound_addresses->find(address) != bound_addresses->end()) { + tensor->set_device_address(address); + need_sync_outputs->emplace_back(tensor); + } else { + address->ptr_ = tensor->data_c(true); + address->ref_count_ = INIT_NODE_REF; + (void)bound_addresses->insert(address); + } tensor->set_dirty(false); return tensor; } else if (input_node->isa() || input_node->isa()) { @@ -174,7 +192,8 @@ BaseRef CPUKernelRuntime::CreatTensorForOutput(const AnfNodePtr &input_node, siz } void CPUKernelRuntime::BindInputOutput(const session::KernelGraph *kernel_graph, - const std::vector &inputs, VectorRef *outputs) { + const std::vector &inputs, VectorRef *outputs, + std::vector *need_sync_outputs) { MS_EXCEPTION_IF_NULL(kernel_graph); MS_EXCEPTION_IF_NULL(outputs); // bind input ptr @@ -182,20 +201,23 @@ void CPUKernelRuntime::BindInputOutput(const session::KernelGraph *kernel_graph, if (input_nodes.size() != inputs.size()) { MS_LOG(EXCEPTION) << "Input size not equal to input node size!"; } - std::unordered_map input_map; size_t input_idx = 0; - size_t type_size = sizeof(float); for (auto &item : input_nodes) { MS_EXCEPTION_IF_NULL(item); input_map[item.get()] = inputs[input_idx]; if (item->isa()) { auto address = AnfAlgo::GetMutableOutputAddr(item, 0); auto 
tensor = inputs[input_idx]; + auto tensor_address = tensor->device_address(); MS_EXCEPTION_IF_NULL(address); MS_EXCEPTION_IF_NULL(tensor); + if (tensor_address != nullptr && tensor_address != address) { + (void)tensor->data_sync(); + } std::vector data_shape = tensor->shape(); - size_t tensor_size = std::accumulate(data_shape.begin(), data_shape.end(), type_size, std::multiplies()); + size_t tensor_size = + std::accumulate(data_shape.begin(), data_shape.end(), sizeof(float), std::multiplies()); if (tensor->data_type() == kNumberTypeFloat32 || tensor->data_type() == kNumberTypeInt32) { address->ptr_ = tensor->data_c(false); } else { @@ -211,12 +233,12 @@ void CPUKernelRuntime::BindInputOutput(const session::KernelGraph *kernel_graph, } input_idx++; } - // new output and bind ptr + std::set bound_addresses; auto output_nodes = kernel_graph->outputs(); for (const auto &item : output_nodes) { - auto item_with_index = AnfAlgo::VisitKernelWithReturnType(item, 0); - auto out = CreatTensorForOutput(item_with_index.first, item_with_index.second, input_map); + auto item_with_index = AnfAlgo::VisitKernelWithReturnType(item, 0, true); + auto out = CreatTensorForOutput(item_with_index, input_map, &bound_addresses, need_sync_outputs); outputs->push_back(std::move(out)); } } @@ -234,9 +256,18 @@ void CPUKernelRuntime::AddRuntimeAddress(DeviceAddress *address, std::vectorpush_back(input); } +void CPUKernelRuntime::IncreaseSummaryRefCount(const session::NamedSummaryOutputs &summary_outputs) { + resource_manager_.IncreaseSummaryRefCount(summary_outputs); +} + +void CPUKernelRuntime::DecreaseSummaryRefCount(const session::NamedSummaryOutputs &summary_outputs) { + resource_manager_.DecreaseSummaryRefCount(summary_outputs); +} + bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph) { MS_EXCEPTION_IF_NULL(kernel_graph); - resource_manager_.ResetAddressRefCount(kernel_graph); + resource_manager_.IncreaseAddressRefCount(kernel_graph); + auto kernels = 
kernel_graph->execution_order(); for (const auto &kernel : kernels) { std::vector kernel_inputs; diff --git a/mindspore/ccsrc/device/cpu/cpu_kernel_runtime.h b/mindspore/ccsrc/device/cpu/cpu_kernel_runtime.h index 28e61c1479..27dcefdba9 100644 --- a/mindspore/ccsrc/device/cpu/cpu_kernel_runtime.h +++ b/mindspore/ccsrc/device/cpu/cpu_kernel_runtime.h @@ -20,9 +20,12 @@ #include #include #include +#include #include "device/kernel_runtime.h" #include "session/kernel_graph.h" +#include "session/session_basic.h" #include "device/cpu/cpu_resource_manager.h" +#include "session/anf_runtime_algorithm.h" #include "utils/any.h" namespace mindspore { namespace device { @@ -36,7 +39,9 @@ class CPUKernelRuntime : public KernelRuntime { bool Run(session::KernelGraph *graph) override; void AssignKernelAddress(session::KernelGraph *kernel_graph); void BindInputOutput(const session::KernelGraph *kernel_graph, const std::vector &inputs, - VectorRef *outputs); + VectorRef *outputs, std::vector *need_sync_outputs); + void IncreaseSummaryRefCount(const session::NamedSummaryOutputs &summary_outputs); + void DecreaseSummaryRefCount(const session::NamedSummaryOutputs &summary_outputs); protected: bool SyncStream() override { return true; }; @@ -44,8 +49,10 @@ class CPUKernelRuntime : public KernelRuntime { TypeId type_id) override; private: - BaseRef CreatTensorForOutput(const AnfNodePtr &input_node, size_t index, - const std::unordered_map &input_map); + BaseRef CreatTensorForOutput(const session::KernelWithIndex &kernel_with_index, + const std::unordered_map &input_map, + std::set *bound_addresses, + std::vector *need_sync_outputs); void AssignValueNodeAddress(session::KernelGraph *kernel_graph); void AssignInputNodeAddress(const session::KernelGraph *kernel_graph); void AssignKernelOutputAddress(const session::KernelGraph *kernel_graph); diff --git a/mindspore/ccsrc/device/cpu/cpu_resource_manager.cc b/mindspore/ccsrc/device/cpu/cpu_resource_manager.cc index 45b9ea5bed..c69ef35305 
100644 --- a/mindspore/ccsrc/device/cpu/cpu_resource_manager.cc +++ b/mindspore/ccsrc/device/cpu/cpu_resource_manager.cc @@ -76,7 +76,47 @@ void CPUResourceManager::MemFree(void *ptr) { } } -void CPUResourceManager::ResetAddressRefCount(const session::KernelGraph *graph) { +void CPUResourceManager::IncreaseSummaryRefCount(const session::NamedSummaryOutputs &summary_outputs) { + if (!dynamic_malloc_) { + return; + } + + if (summary_outputs.empty()) { + return; + } + + for (auto &output_item : summary_outputs) { + auto node = output_item.second.first; + size_t index = IntToSize(output_item.second.second); + auto address = AnfAlgo::GetMutableOutputAddr(node, index); + MS_EXCEPTION_IF_NULL(address); + address->ref_count_++; + } +} + +void CPUResourceManager::DecreaseSummaryRefCount(const session::NamedSummaryOutputs &summary_outputs) { + if (!dynamic_malloc_) { + return; + } + + if (summary_outputs.empty()) { + return; + } + + for (auto &output_item : summary_outputs) { + auto node = output_item.second.first; + size_t index = IntToSize(output_item.second.second); + auto address = AnfAlgo::GetMutableOutputAddr(node, index); + MS_EXCEPTION_IF_NULL(address); + address->ref_count_--; + if (address->ref_count_ == 0 && address->ptr_ != nullptr) { + MemFree(address->ptr_); + address->ptr_ = nullptr; + } + } +} + +void CPUResourceManager::IncreaseAddressRefCount(const session::KernelGraph *graph) { if (!dynamic_malloc_) { return; } diff --git a/mindspore/ccsrc/device/cpu/cpu_resource_manager.h b/mindspore/ccsrc/device/cpu/cpu_resource_manager.h index 96cf00f3d8..d130241464 100644 --- a/mindspore/ccsrc/device/cpu/cpu_resource_manager.h +++ b/mindspore/ccsrc/device/cpu/cpu_resource_manager.h @@ -19,6 +19,7 @@ #include #include #include "session/kernel_graph.h" +#include "session/session_basic.h" #include "device/device_address.h" #include "device/cpu/cpu_simple_mem_plan.h" namespace mindspore { @@ -31,10 +32,12 @@ class CPUResourceManager { void MemPlan(const 
session::KernelGraph *graph); void MemMalloc(const session::KernelGraph *graph); - void ResetAddressRefCount(const session::KernelGraph *graph); + void IncreaseAddressRefCount(const session::KernelGraph *graph); void DecreaseAddressRefCount(const AnfNodePtr &kernel); void *MemMalloc(size_t mem_size); void MemFree(void *ptr); + void IncreaseSummaryRefCount(const session::NamedSummaryOutputs &summary_outputs); + void DecreaseSummaryRefCount(const session::NamedSummaryOutputs &summary_outputs); private: void MemFree(); diff --git a/mindspore/ccsrc/device/cpu/kernel_select_cpu.cc b/mindspore/ccsrc/device/cpu/kernel_select_cpu.cc index 76e91e059a..9d72bcab89 100644 --- a/mindspore/ccsrc/device/cpu/kernel_select_cpu.cc +++ b/mindspore/ccsrc/device/cpu/kernel_select_cpu.cc @@ -71,9 +71,6 @@ void GetInputFormatsAndDtypes(const CNodePtr &kernel_node, std::vector *output_formats, std::vector *output_types) { size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); - if (kernel_attr.GetOutputSize() != output_num) { - MS_LOG(EXCEPTION) << "Output num is not equal!"; - } for (size_t output_index = 0; output_index < output_num; ++output_index) { output_formats->emplace_back(kernel_attr.GetOutputAttr(output_index).second); auto dtype = kernel_attr.GetOutputAttr(output_index).first; @@ -145,6 +142,11 @@ void SetKernelInfo(const CNodePtr &kernel_node) { ExpandKernelAttr(kernel_node, &kernel_attr); } if (IsInputFormatDtypeMatched(kernel_attr, input_formats, input_types, input_not_cnode_indexes)) { + size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); + if (kernel_attr.GetOutputSize() != output_num) { + MS_LOG(DEBUG) << "Output num is not equal!"; + continue; + } MS_LOG(INFO) << "Input format and dtype is matched, index: " << index; GetOutputFormatsAndDtypes(kernel_node, kernel_attr, &output_formats, &output_types); UpdatePrevNotCNodeFormatDtype(kernel_attr, input_not_cnode_indexes, kernel_node); diff --git a/mindspore/ccsrc/device/cpu/kernel_select_cpu.h 
b/mindspore/ccsrc/device/cpu/kernel_select_cpu.h index d2138ec66d..b707c55e2c 100644 --- a/mindspore/ccsrc/device/cpu/kernel_select_cpu.h +++ b/mindspore/ccsrc/device/cpu/kernel_select_cpu.h @@ -33,7 +33,7 @@ void SetKernelInfo(const CNodePtr &apply_kernel_ptr); class KernelAttr { public: using DataType = std::pair; - KernelAttr() = default; + KernelAttr() : all_same_(0) {} ~KernelAttr() = default; KernelAttr &AddInputAttr(const TypeId &ms_type, const std::string &format = kOpFormat_DEFAULT) { diff --git a/mindspore/ccsrc/device/cpu/mpi/mpi_adapter.cc b/mindspore/ccsrc/device/cpu/mpi/mpi_adapter.cc new file mode 100644 index 0000000000..0d49846bf7 --- /dev/null +++ b/mindspore/ccsrc/device/cpu/mpi/mpi_adapter.cc @@ -0,0 +1,259 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "device/cpu/mpi/mpi_adapter.h" +#include +#include "utils/mpi/mpi_config.h" +#include "utils/log_adapter.h" + +namespace mindspore { +namespace device { +namespace cpu { +namespace { +MPI_Op GetMpiOp(const std::string &op_type) { + if (op_type == "sum") { + return MPI_SUM; + } else if (op_type == "max") { + return MPI_MAX; + } else if (op_type == "min") { + return MPI_MIN; + } else if (op_type == "prod") { + return MPI_PROD; + } + MS_LOG(EXCEPTION) << "unsupport op_type:" << op_type; + return MPI_SUM; +} + +int GetScatterIndex(int rankid, const std::vector &ranks_group) { + int scatter_index = -1; + for (size_t i = 0; i < ranks_group.size(); ++i) { + if (ranks_group[i] == rankid) { + scatter_index = static_cast(i); + break; + } + } + if (scatter_index == -1) { + MS_LOG(EXCEPTION) << "process rankid " << rankid << " does not in the input rank group!"; + } + return scatter_index; +} +} // namespace + +MPIAdapter::MPIAdapter() : rank_id_(0), rank_size_(0), comm_group_world_(MPI_GROUP_NULL) { Init(); } + +MPIAdapter::~MPIAdapter() { + for (auto iter = ranks_group_.begin(); iter != ranks_group_.end(); ++iter) { + MPI_Group_free(&iter->second); + } + if (comm_group_world_ != MPI_GROUP_NULL) { + MPI_Group_free(&comm_group_world_); + } + int finalized; + MPI_Finalized(&finalized); + if (finalized == 0) { + MPI_Finalize(); + } +} + +MPIAdapter &MPIAdapter::Instance() { + static MPIAdapter instance; + return instance; +} + +int MPIAdapter::GetRankId() const { return rank_id_; } + +void MPIAdapter::Init() { + static bool init = false; + if (init) { + return; + } + auto mpi_config_ptr = MpiConfig::GetInstance(); + MS_EXCEPTION_IF_NULL(mpi_config_ptr); + if (!mpi_config_ptr->enable_mpi()) { + MS_LOG(EXCEPTION) << "MPI is disabled now!Please enable mpi with mpi config first."; + } + int init_flag = 0; + if (MPI_Initialized(&init_flag) != MPI_SUCCESS) { + MS_LOG(EXCEPTION) << "Check mpi initialized fail!"; + } + if (init_flag == 0) { + auto ret = 
MPI_Init(nullptr, nullptr); + if (ret != MPI_SUCCESS) { + MS_LOG(EXCEPTION) << "Failed to init mpi!"; + } + } + + MPI_Comm_group(MPI_COMM_WORLD, &comm_group_world_); + if (comm_group_world_ == MPI_GROUP_NULL) { + MS_LOG(EXCEPTION) << "comm_group_world_ init fail!"; + } + auto ret = MPI_Comm_rank(MPI_COMM_WORLD, &rank_id_); + if (ret != MPI_SUCCESS) { + MS_LOG(EXCEPTION) << "Failed to init mpi rank id!"; + } + + ret = MPI_Comm_size(MPI_COMM_WORLD, &rank_size_); + if (ret != MPI_SUCCESS) { + MS_LOG(EXCEPTION) << "Failed to init mpi rank size!rankid:" << rank_id_; + } + init = true; +} + +MPI_Group MPIAdapter::AddGroup(const std::vector &ranks) { + if (ranks.size() > static_cast(rank_size_) || ranks.empty()) { + MS_LOG(EXCEPTION) << "input rank size: " << ranks.size() << ", max rank size: " << rank_size_; + } + + if (std::find(ranks.begin(), ranks.end(), rank_id_) == ranks.end()) { + MS_LOG(ERROR) << "rankid:" << rank_id_ << " is not in the input group."; + return MPI_GROUP_NULL; + } + std::lock_guard lock(group_mutex_); + auto iter = ranks_group_.find(ranks); + if (iter != ranks_group_.end()) { + return iter->second; + } + const auto ranks_size = ranks.size(); + std::vector ranks_input(ranks_size, 0); + for (size_t i = 0; i < ranks_size; ++i) { + ranks_input[i] = ranks[i]; + } + + MPI_Group group = MPI_GROUP_NULL; + MPI_Group_incl(comm_group_world_, ranks.size(), ranks_input.data(), &group); + if (group == MPI_GROUP_NULL) { + MS_LOG(EXCEPTION) << "create mpi group fail!rankid:" << rank_id_; + } + + ranks_group_[ranks] = group; + MS_LOG(INFO) << "rank:" << rank_id_ << " add group:" << group; + return group; +} + +bool MPIAdapter::ReduceScatter(const float *input, float *output, const std::vector &ranks_group, size_t data_num, + const std::string &op_type) { + if (ranks_group.empty()) { + MS_LOG(ERROR) << "input rank group is empty!"; + return false; + } + + auto group = AddGroup(ranks_group); + if (group == MPI_GROUP_NULL) { + MS_LOG(EXCEPTION) << "Get mpi group 
fail!rankid:" << rank_id_; + } + MPI_Comm comm; + MPI_Comm_create_group(MPI_COMM_WORLD, group, 0, &comm); + if (comm == MPI_COMM_NULL) { + MS_LOG(EXCEPTION) << "create mpi comm fail!rankid:" << rank_id_; + } + std::vector receive_count(ranks_group.size(), 0); + for (size_t i = 0; i < ranks_group.size(); ++i) { + receive_count[i] = data_num; + } + + auto op = GetMpiOp(op_type); + auto ret = MPI_Reduce_scatter(input, output, receive_count.data(), MPI_FLOAT, op, comm); + bool result = true; + if (ret != MPI_SUCCESS) { + MS_LOG(ERROR) << "mpi reduce_scatter fail!ret = " << ret << ", rankid:" << rank_id_; + result = false; + } + + ret = MPI_Comm_free(&comm); + if (ret != MPI_SUCCESS) { + MS_LOG(WARNING) << "mpi comm free fail! ret = " << ret << ", rankid:" << rank_id_; + } + return result; +} + +bool MPIAdapter::ReduceScatterOverwriteInput(float *input, const std::vector &ranks_group, size_t input_data_num, + size_t output_size, const std::string &op_type, float *output) { + int scatter_index = GetScatterIndex(rank_id_, ranks_group); + auto group = AddGroup(ranks_group); + if (group == MPI_GROUP_NULL) { + MS_LOG(EXCEPTION) << "Get mpi group fail!rankid:" << rank_id_; + } + MPI_Comm comm; + MPI_Comm_create_group(MPI_COMM_WORLD, group, 0, &comm); + if (comm == MPI_COMM_NULL) { + MS_LOG(EXCEPTION) << "create mpi comm fail!rankid:" << rank_id_; + } + + MPI_Win window; + auto ret = MPI_Win_create(input, input_data_num * sizeof(float), sizeof(float), MPI_INFO_NULL, comm, &window); + if (ret != MPI_SUCCESS) { + MS_LOG(ERROR) << "mpi window create fail! 
ret = " << ret; + return false; + } + MPI_Win_fence(0, window); + for (size_t i = 0; i < ranks_group.size(); ++i) { + int remote_rank = ranks_group[i]; + if (rank_id_ == remote_rank) { + continue; + } + auto op = GetMpiOp(op_type); + ret = MPI_Accumulate(input + i * input_data_num, input_data_num, MPI_FLOAT, remote_rank, i * input_data_num, + input_data_num, MPI_FLOAT, op, window); + if (ret != MPI_SUCCESS) { + MS_LOG(EXCEPTION) << "mpi accumulate " << op_type << " fail!ret = " << ret; + } + } + MPI_Win_fence(0, window); + if (output != nullptr) { + auto data_size = input_data_num * sizeof(float); + if (output_size < data_size) { + MS_LOG(EXCEPTION) << "output buffer size " << output_size << " < input size " << data_size; + } + auto copy_ret = memcpy_s(output, output_size, input + scatter_index * input_data_num, data_size); + if (copy_ret != 0) { + MS_LOG(EXCEPTION) << "copy output memory fail!ret = " << copy_ret; + } + } + MPI_Win_free(&window); + MPI_Comm_free(&comm); + return true; +} + +bool MPIAdapter::AllGather(const float *input, float *output, const std::vector &ranks_group, size_t data_num) { + if (ranks_group.empty()) { + MS_LOG(ERROR) << "input rank group is empty!"; + return false; + } + auto group = AddGroup(ranks_group); + if (group == MPI_GROUP_NULL) { + MS_LOG(EXCEPTION) << "Get mpi group fail! rankid:" << rank_id_; + } + MPI_Comm comm; + MPI_Comm_create_group(MPI_COMM_WORLD, group, 0, &comm); + if (comm == MPI_COMM_NULL) { + MS_LOG(EXCEPTION) << "create mpi comm fail! 
rankid:" << rank_id_; + } + + auto ret = MPI_Allgather(input, data_num, MPI_FLOAT, output, data_num, MPI_FLOAT, comm); + bool result = true; + if (ret != MPI_SUCCESS) { + MS_LOG(ERROR) << "mpi allgater fail!ret = " << ret << ", rankid:" << rank_id_; + result = false; + } + ret = MPI_Comm_free(&comm); + if (ret != MPI_SUCCESS) { + MS_LOG(WARNING) << "mpi comm free fail!ret = " << ret << ",rankid:" << rank_id_; + } + return result; +} +} // namespace cpu +} // namespace device +} // namespace mindspore diff --git a/mindspore/ccsrc/device/cpu/mpi/mpi_adapter.h b/mindspore/ccsrc/device/cpu/mpi/mpi_adapter.h new file mode 100644 index 0000000000..8265e89eab --- /dev/null +++ b/mindspore/ccsrc/device/cpu/mpi/mpi_adapter.h @@ -0,0 +1,58 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_DEVICE_CPU_MPI_MPI_ADAPTER_H_ +#define MINDSPORE_CCSRC_DEVICE_CPU_MPI_MPI_ADAPTER_H_ +#ifdef ENABLE_MPI +#include +#include +#include +#include +#include + +namespace mindspore { +namespace device { +namespace cpu { +constexpr auto kOpTypeSum = "sum"; +class MPIAdapter { + public: + ~MPIAdapter(); + static MPIAdapter &Instance(); + int GetRankId() const; + bool ReduceScatter(const float *input, float *output, const std::vector &ranks_group, size_t data_num, + const std::string &op_type = kOpTypeSum); + bool ReduceScatterOverwriteInput(float *input, const std::vector &ranks_group, size_t input_data_num, + size_t output_size, const std::string &op_type = kOpTypeSum, + float *output = nullptr); + bool AllGather(const float *input, float *output, const std::vector &ranks_group, size_t data_num); + + private: + MPIAdapter(); + void Init(); + MPI_Group AddGroup(const std::vector &ranks); + + int rank_id_; + int rank_size_; + MPI_Group comm_group_world_; + // key:ranks group, value: mpi group + std::map, MPI_Group> ranks_group_; + std::mutex group_mutex_; +}; +} // namespace cpu +} // namespace device +} // namespace mindspore +#endif // ENABLE_MPI +#endif // MINDSPORE_CCSRC_DEVICE_CPU_MPI_MPI_ADAPTER_H_ diff --git a/mindspore/ccsrc/device/device_address.h b/mindspore/ccsrc/device/device_address.h index fd3188e0f2..e02d231dd5 100644 --- a/mindspore/ccsrc/device/device_address.h +++ b/mindspore/ccsrc/device/device_address.h @@ -48,6 +48,7 @@ class GPUMemoryManager; namespace mindspore { namespace device { enum class DeviceAddressStatus { kInDevice, kInHost, kInDeviceToHost, kInHostToDevice }; +enum class DeviceAddressType { kUnknown, kAscend, kCPU, kGPU }; class DeviceAddress { public: @@ -64,6 +65,7 @@ class DeviceAddress { TypeId type_id() const { return type_id_; } virtual void set_status(DeviceAddressStatus status) {} virtual DeviceAddressStatus status() const { return DeviceAddressStatus::kInDevice; } + virtual DeviceAddressType 
DeviceType() const { return DeviceAddressType::kUnknown; } protected: const void *ptr() const { return ptr_; } diff --git a/mindspore/ccsrc/device/gpu/distribution/collective_fake_init.h b/mindspore/ccsrc/device/gpu/distribution/collective_fake_init.h index 65467139c0..c8405f12f6 100644 --- a/mindspore/ccsrc/device/gpu/distribution/collective_fake_init.h +++ b/mindspore/ccsrc/device/gpu/distribution/collective_fake_init.h @@ -20,7 +20,6 @@ namespace mindspore { namespace device { namespace gpu { - class CollectiveFakeInitializer { public: CollectiveFakeInitializer() = default; diff --git a/mindspore/ccsrc/device/gpu/gpu_device_address.cc b/mindspore/ccsrc/device/gpu/gpu_device_address.cc index c4c1094293..24097f3637 100644 --- a/mindspore/ccsrc/device/gpu/gpu_device_address.cc +++ b/mindspore/ccsrc/device/gpu/gpu_device_address.cc @@ -15,9 +15,7 @@ */ #include "device/gpu/gpu_device_address.h" - #include - #include "device/gpu/gpu_device_manager.h" #include "utils/log_adapter.h" #include "utils/context/ms_context.h" @@ -28,6 +26,13 @@ namespace device { namespace gpu { bool GPUDeviceAddress::SyncDeviceToHost(const std::vector &, size_t size, TypeId, void *host_ptr) const { MS_EXCEPTION_IF_NULL(host_ptr); + auto &stream = GPUDeviceManager::GetInstance().default_stream(); + MS_EXCEPTION_IF_NULL(stream); + auto ret = GPUDeviceManager::GetInstance().SyncStream(stream); + if (!ret) { + MS_LOG(ERROR) << "SyncStream failed"; + return ret; + } if (size != size_) { MS_LOG(WARNING) << "SyncDeviceToHost ignored, host size: " << size << ", device size " << size_; return true; diff --git a/mindspore/ccsrc/device/gpu/gpu_device_address.h b/mindspore/ccsrc/device/gpu/gpu_device_address.h index f5c6b6e36b..4074cb6ce9 100644 --- a/mindspore/ccsrc/device/gpu/gpu_device_address.h +++ b/mindspore/ccsrc/device/gpu/gpu_device_address.h @@ -35,6 +35,7 @@ class GPUDeviceAddress : public DeviceAddress { bool SyncHostToDevice(const std::vector &shape, size_t size, TypeId type, const void 
*host_ptr) const override; void set_status(DeviceAddressStatus status) { status_ = status; } DeviceAddressStatus status() const { return status_; } + DeviceAddressType DeviceType() const override { return DeviceAddressType::kGPU; } private: DeviceAddressStatus status_{DeviceAddressStatus::kInDevice}; diff --git a/mindspore/ccsrc/device/gpu/gpu_kernel_build.cc b/mindspore/ccsrc/device/gpu/gpu_kernel_build.cc index f9d2cb878f..19d2284510 100644 --- a/mindspore/ccsrc/device/gpu/gpu_kernel_build.cc +++ b/mindspore/ccsrc/device/gpu/gpu_kernel_build.cc @@ -16,18 +16,17 @@ #include "device/gpu/gpu_kernel_build.h" #include #include "kernel/kernel.h" -#include "kernel/akg/akgkernelbuild.h" +#include "kernel/akg/akg_kernel_build.h" #include "kernel/akg/gpu/akg_gpu_kernel_build.h" #include "kernel/gpu/gpu_kernel_factory.h" #include "operator/ops.h" -#include "pybind11/stl.h" #include "session/anf_runtime_algorithm.h" namespace mindspore { namespace device { namespace gpu { -namespace py = pybind11; void GpuBuild(const KernelGraphPtr &kernel_graph) { kernel::KernelMeta *bin_map = kernel::KernelMeta::GetInstance(); + MS_EXCEPTION_IF_NULL(bin_map); bin_map->Initialize(); MS_EXCEPTION_IF_NULL(kernel_graph); auto kernels = kernel_graph->execution_order(); @@ -38,7 +37,7 @@ void GpuBuild(const KernelGraphPtr &kernel_graph) { continue; } - if (session::AnfRuntimeAlgorithm::GetKernelType(kernel) == KernelType::AUTO_DIFF_KERNEL) { + if (session::AnfRuntimeAlgorithm::GetKernelType(kernel) == KernelType::AKG_KERNEL) { auto gpu_kernel_ptr = kernel::AkgGpuKernelBuild(kernel); if (!gpu_kernel_ptr) { MS_LOG(EXCEPTION) << "Build akg kernel op[" << kernel_name << "] failed"; diff --git a/mindspore/ccsrc/device/gpu/gpu_kernel_runtime.cc b/mindspore/ccsrc/device/gpu/gpu_kernel_runtime.cc index 6c658f12e8..8095a503e3 100644 --- a/mindspore/ccsrc/device/gpu/gpu_kernel_runtime.cc +++ b/mindspore/ccsrc/device/gpu/gpu_kernel_runtime.cc @@ -39,6 +39,7 @@ bool GPUKernelRuntime::SyncStream() { return 
GPUDeviceManager::GetInstance().Syn bool GPUKernelRuntime::Init() { if (device_init_ == true) { + GPUMemoryAllocator::GetInstance().CheckMaxDeviceMemory(); return true; } auto ret = InitDevice(); @@ -105,7 +106,7 @@ void GPUKernelRuntime::ReleaseDeviceRes() { CHECK_OP_RET_WITH_EXCEPT(GpuBufferMgr::GetInstance().Destroy(), "Could not destroy gpu data queue."); } - // destroy remaining memory swap events and free host memory + // Destroy remaining memory swap events and free host memory. for (auto &item : mem_swap_map_) { auto &mem_swap_manager = item.second; MS_EXCEPTION_IF_NULL(mem_swap_manager); @@ -119,7 +120,10 @@ void GPUKernelRuntime::ReleaseDeviceRes() { if (mem_manager_ != nullptr) { mem_manager_->FreeDeviceMemory(); } - kernel::KernelMeta::GetInstance()->RemoveKernelCache(); + + kernel::KernelMeta *bin_map = kernel::KernelMeta::GetInstance(); + MS_EXCEPTION_IF_NULL(bin_map); + bin_map->RemoveKernelCache(); } void GPUKernelRuntime::AssignMemory(session::KernelGraph *graph) { @@ -171,7 +175,7 @@ bool GPUKernelRuntime::Run(session::KernelGraph *graph) { const uint64_t kUSecondInSecond = 1000000; uint64_t cost = kUSecondInSecond * static_cast(end_time.tv_sec - start_time.tv_sec); cost += static_cast(end_time.tv_usec - start_time.tv_usec); - MS_LOG(DEBUG) << "kernel runtime run graph in " << cost << " us"; + MS_LOG(DEBUG) << "GPU kernel runtime run graph in " << cost << " us"; return ret; } @@ -187,6 +191,8 @@ void GPUKernelRuntime::InitKernelRefCount(const session::KernelGraph *graph) { mem_reuse_util_ptr->SetReuseRefCount(); // Can't free the device address of graph output, so set the reference count of graph output specially. mem_reuse_util_ptr->SetGraphOutputRefCount(); + // Can't free the device address of summary nodes, so set the reference count of summary nodes specially. 
+ mem_reuse_util_ptr->SetSummaryNodesRefCount(); auto graph_id = graph->graph_id(); mem_reuse_util_map_[graph_id] = mem_reuse_util_ptr; } @@ -222,7 +228,7 @@ void GPUKernelRuntime::ClearKernelOutputAddress(const session::KernelGraph *grap continue; } - auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i); + auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i, false); if (device_address->ptr_) { mem_manager_->FreeMemFromMemPool(device_address); } @@ -233,6 +239,7 @@ void GPUKernelRuntime::ClearKernelOutputAddress(const session::KernelGraph *grap bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph) { MS_EXCEPTION_IF_NULL(graph); + MS_EXCEPTION_IF_NULL(mem_swap_manager_); auto graph_id = graph->graph_id(); auto mem_reuse_util_ptr = mem_reuse_util_map_[graph_id]; MS_EXCEPTION_IF_NULL(mem_reuse_util_ptr); @@ -277,11 +284,12 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph) { } bool GPUKernelRuntime::AddMemSwapTask(const AnfNodePtr &kernel) { + MS_EXCEPTION_IF_NULL(mem_swap_manager_); auto &mem_swap_info_list = mem_swap_manager_->QueryKernelMemSwapInfo(kernel); for (auto &mem_swap_info : mem_swap_info_list) { auto &kernel_exec_info = mem_swap_manager_->SearchKernelExecutionInfo(mem_swap_info.kernel_); const HostAddress &host_address = kernel_exec_info.host_addrs_[mem_swap_info.output_idx_]; - auto device_address = AnfAlgo::GetMutableOutputAddr(mem_swap_info.kernel_, mem_swap_info.output_idx_); + auto device_address = AnfAlgo::GetMutableOutputAddr(mem_swap_info.kernel_, mem_swap_info.output_idx_, false); if (mem_swap_info.swap_kind_ == SwapKind::kDeviceToHost) { mem_swap_manager_->AddMemSwapTask(SwapKind::kDeviceToHost, device_address, host_address); @@ -304,6 +312,7 @@ bool GPUKernelRuntime::AddMemSwapTask(const AnfNodePtr &kernel) { } bool GPUKernelRuntime::AttemptMallocMem(const DeviceAddressPtr &device_address, size_t size) { + MS_EXCEPTION_IF_NULL(mem_manager_); auto ret = 
mem_manager_->MallocMemFromMemPool(device_address, size); if (!ret) { if (!mem_swap_manager_->trigger_swap()) { @@ -327,6 +336,7 @@ bool GPUKernelRuntime::AttemptMallocMem(const DeviceAddressPtr &device_address, } void *GPUKernelRuntime::AttemptMallocMem(size_t size) { + MS_EXCEPTION_IF_NULL(mem_manager_); auto device_ptr = mem_manager_->MallocMemFromMemPool(size); if (!device_ptr) { if (!mem_swap_manager_->trigger_swap()) { @@ -367,8 +377,10 @@ bool GPUKernelRuntime::AllocKernelDynamicRes(const mindspore::kernel::KernelMod bool GPUKernelRuntime::AllocKernelInputDynamicRes(const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_inputs) { MS_EXCEPTION_IF_NULL(kernel); MS_EXCEPTION_IF_NULL(kernel_inputs); + MS_EXCEPTION_IF_NULL(mem_swap_manager_); for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) { - auto device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i); + // Graph may be all nop nodes and not remove nop node, so this can not skip nop node. + auto device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, false); MS_EXCEPTION_IF_NULL(device_address); if (mem_swap_manager_->trigger_swap()) { while (auto device_address_swap_in = mem_swap_manager_->UpdateSwapQueue(SwapKind::kHostToDevice)) { @@ -415,6 +427,7 @@ bool GPUKernelRuntime::AllocKernelOutputDynamicRes(const mindspore::kernel::Kern MS_EXCEPTION_IF_NULL(kernel); MS_EXCEPTION_IF_NULL(kernel_outputs); MS_EXCEPTION_IF_NULL(mem_manager_); + MS_EXCEPTION_IF_NULL(mem_swap_manager_); if (mem_swap_manager_->trigger_swap()) { while (auto device_address_swap_out = mem_swap_manager_->UpdateSwapQueue(SwapKind::kDeviceToHost)) { if (!mem_swap_manager_->FindInSwapInBlackList(device_address_swap_out->ptr_) && device_address_swap_out->ptr_) { @@ -425,7 +438,7 @@ bool GPUKernelRuntime::AllocKernelOutputDynamicRes(const mindspore::kernel::Kern } auto output_sizes = kernel_mod.GetOutputSizeList(); for (size_t i = 0; i < output_sizes.size(); ++i) { - auto device_address = 
AnfAlgo::GetMutableOutputAddr(kernel, i); + auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i, false); MS_EXCEPTION_IF_NULL(device_address); if (device_address->ptr_ == nullptr && !AttemptMallocMem(device_address, output_sizes[i])) { return false; @@ -444,7 +457,6 @@ bool GPUKernelRuntime::AllocKernelWorkspaceDynamicRes(const mindspore::kernel::K AddressPtrList *kernel_workspaces) { MS_EXCEPTION_IF_NULL(kernel); MS_EXCEPTION_IF_NULL(kernel_workspaces); - MS_EXCEPTION_IF_NULL(mem_manager_); auto workspace_sizes = kernel_mod.GetWorkspaceSizeList(); for (size_t i = 0; i < workspace_sizes.size(); ++i) { if (workspace_sizes[i] == 0) { @@ -478,14 +490,13 @@ void GPUKernelRuntime::AllocCommunicationOpDynamicRes(const session::KernelGraph void GPUKernelRuntime::AllocCommunicationOpInputDynamicRes(const mindspore::AnfNodePtr &kernel) { MS_EXCEPTION_IF_NULL(kernel); - MS_EXCEPTION_IF_NULL(mem_manager_); bool is_need_alloc_memory = false; bool is_need_free_memory = false; size_t total_size = 0; std::vector size_list; DeviceAddressPtrList addr_list; for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) { - auto device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i); + auto device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, false); MS_EXCEPTION_IF_NULL(device_address); if (device_address->ptr_ == nullptr) { is_need_alloc_memory = true; @@ -501,7 +512,6 @@ void GPUKernelRuntime::AllocCommunicationOpInputDynamicRes(const mindspore::AnfN void GPUKernelRuntime::AllocCommunicationOpOutputDynamicRes(const mindspore::AnfNodePtr &kernel) { MS_EXCEPTION_IF_NULL(kernel); - MS_EXCEPTION_IF_NULL(mem_manager_); bool is_need_alloc_memory = false; bool is_need_free_memory = false; size_t total_size = 0; @@ -511,7 +521,7 @@ void GPUKernelRuntime::AllocCommunicationOpOutputDynamicRes(const mindspore::Anf MS_EXCEPTION_IF_NULL(kernel_mod); auto output_sizes = kernel_mod->GetOutputSizeList(); for (size_t i = 0; i < output_sizes.size(); ++i) { - auto 
device_address = AnfAlgo::GetMutableOutputAddr(kernel, i); + auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i, false); MS_EXCEPTION_IF_NULL(device_address); if (device_address->ptr_ == nullptr) { is_need_alloc_memory = true; @@ -528,6 +538,7 @@ void GPUKernelRuntime::AllocCommunicationOpOutputDynamicRes(const mindspore::Anf void GPUKernelRuntime::AllocCommunicationOpMemory(bool is_need_alloc_memory, bool is_need_free_memory, const DeviceAddressPtrList addr_list, size_t total_size, std::vector size_list) { + MS_EXCEPTION_IF_NULL(mem_manager_); if (!is_need_alloc_memory) { return; } @@ -568,7 +579,7 @@ void GPUKernelRuntime::FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel, MS_LOG(EXCEPTION) << "Check dynamic reference count failed."; } if (kernel_ref_count_ptr->ref_count_dynamic_use_ == 0) { - auto device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i); + auto device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, false); mem_manager_->FreeMemFromMemPool(device_address); device_address->set_status(DeviceAddressStatus::kInDevice); } @@ -580,7 +591,7 @@ void GPUKernelRuntime::FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel, continue; } if (kernel_ref_count_ptr->ref_count_dynamic_use_ == 0) { - auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i); + auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i, false); mem_manager_->FreeMemFromMemPool(device_address); device_address->set_status(DeviceAddressStatus::kInDevice); } diff --git a/mindspore/ccsrc/device/gpu/gpu_memory_allocator.cc b/mindspore/ccsrc/device/gpu/gpu_memory_allocator.cc index 3a1a53c600..9137945661 100644 --- a/mindspore/ccsrc/device/gpu/gpu_memory_allocator.cc +++ b/mindspore/ccsrc/device/gpu/gpu_memory_allocator.cc @@ -14,25 +14,45 @@ * limitations under the License. 
*/ +#include #include "device/gpu/gpu_memory_allocator.h" #include "device/gpu/cuda_driver.h" #include "utils/log_adapter.h" +#include "utils/context/ms_context.h" +#include "utils/convert_utils_base.h" namespace mindspore { namespace device { namespace gpu { bool GPUMemoryAllocator::Init() { size_t total_size = total_mem_size(); - size_t free_size = free_mem_size(); - if (total_size > 0 && free_size > 0) { - MS_LOG(INFO) << "GPU device total memory size " << total_size << ", current free memory size " << free_size; + size_t free_size = CudaDriver::free_mem_size(); + auto context_ptr = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(context_ptr); + limited_device_memory_ = context_ptr->max_device_memory(); + available_device_memory_ = FloatToSize(limited_device_memory_ * 1024 * 1024 * 1024); + if (total_size > 0 && free_size > 0 && available_device_memory_ > 0) { + MS_LOG(INFO) << "GPU device total memory size " << total_size << ", current free memory size " << free_size + << ", set max available memory size " << available_device_memory_ << "."; } else { MS_LOG(EXCEPTION) << "GPU device memory error, total memory size " << total_size << ", current free memory size " - << free_size; + << free_size << ", set max available memory size " << available_device_memory_ << "."; } return true; } +void GPUMemoryAllocator::CheckMaxDeviceMemory() const { + auto context_ptr = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(context_ptr); + auto max_device_memory = context_ptr->max_device_memory(); + // Currently not support modifying the max device memory. 
+ if (limited_device_memory_ != max_device_memory) { + MS_LOG(EXCEPTION) + << "Can't change context param max_device_memory in runtime, currently effective max_device_memory(" + << limited_device_memory_ << "GB), set new max_device_memory(" << max_device_memory << "GB) failed."; + } +} + bool GPUMemoryAllocator::Finalize() { if (buffer_q_addr_ != nullptr) { if (!CudaDriver::FreeDeviceMem(buffer_q_addr_)) { @@ -64,13 +84,16 @@ size_t GPUMemoryAllocator::AllocDeviceMem(size_t size, DeviceMemPtr *addr) { if (alloc_size == 0) { MS_LOG(EXCEPTION) << "Alloc device memory[" << size << "] failed."; } - MS_LOG(INFO) << "Current free memory size[" << free_size << "], current alloc size[" << alloc_size << "]."; + total_used_device_memory_ += alloc_size; + available_device_memory_ -= alloc_size; + MS_LOG(INFO) << "Current free memory size[" << free_size - alloc_size << "], current alloc size[" << alloc_size + << "], total used size[" << total_used_device_memory_ << "]."; return alloc_size; } bool GPUMemoryAllocator::FreeDeviceMem(const DeviceMemPtr &addr) { return CudaDriver::FreeDeviceMem(addr); } -size_t GPUMemoryAllocator::free_mem_size() { return CudaDriver::free_mem_size(); } +size_t GPUMemoryAllocator::free_mem_size() { return std::min(CudaDriver::free_mem_size(), available_device_memory_); } size_t GPUMemoryAllocator::total_mem_size() { return CudaDriver::total_mem_size(); } } // namespace gpu diff --git a/mindspore/ccsrc/device/gpu/gpu_memory_allocator.h b/mindspore/ccsrc/device/gpu/gpu_memory_allocator.h index 36374bfaad..90d7791057 100644 --- a/mindspore/ccsrc/device/gpu/gpu_memory_allocator.h +++ b/mindspore/ccsrc/device/gpu/gpu_memory_allocator.h @@ -28,6 +28,7 @@ class GPUMemoryAllocator : public DynamicMemPoolBestFit { public: ~GPUMemoryAllocator() override = default; bool Init(); + void CheckMaxDeviceMemory() const; bool Finalize(); bool AllocBufferQueueMem(size_t size, DeviceMemPtr *addr); @@ -48,6 +49,10 @@ class GPUMemoryAllocator : public 
DynamicMemPoolBestFit { // Used to track address of data buffer queue. DeviceMemPtr buffer_q_addr_{nullptr}; + + float limited_device_memory_{0.0}; + size_t total_used_device_memory_{0}; + size_t available_device_memory_{0}; }; } // namespace gpu } // namespace device diff --git a/mindspore/ccsrc/device/gpu/gpu_memory_copy_manager.cc b/mindspore/ccsrc/device/gpu/gpu_memory_copy_manager.cc index 8443e4799f..80206f309d 100644 --- a/mindspore/ccsrc/device/gpu/gpu_memory_copy_manager.cc +++ b/mindspore/ccsrc/device/gpu/gpu_memory_copy_manager.cc @@ -104,12 +104,12 @@ DeviceAddressPtr GPUMemCopyManager::UpdateSwapInQueue() { return device_address; } -bool GPUMemCopyManager::AllocHostPinnedMem(size_t size, void **addr) { +bool GPUMemCopyManager::AllocHostPinnedMem(size_t size, void **addr) const { auto alloc_size = CudaDriver::AllocHostPinnedMem(size, addr); return alloc_size == size; } -void GPUMemCopyManager::FreeHostPinnedMem(void *addr) { CudaDriver::FreeHostPinnedMem(addr); } +void GPUMemCopyManager::FreeHostPinnedMem(void *addr) const { CudaDriver::FreeHostPinnedMem(addr); } void GPUMemCopyManager::ClearSwapQueue() { CHECK_OP_RET_WITH_EXCEPT(SyncMemCopyStream(SwapKind::kDeviceToHost), "Failed to sync swap out stream"); diff --git a/mindspore/ccsrc/device/gpu/gpu_memory_copy_manager.h b/mindspore/ccsrc/device/gpu/gpu_memory_copy_manager.h index a7cd8d4d8f..36ff273015 100644 --- a/mindspore/ccsrc/device/gpu/gpu_memory_copy_manager.h +++ b/mindspore/ccsrc/device/gpu/gpu_memory_copy_manager.h @@ -48,9 +48,9 @@ class GPUMemCopyManager : public MemCopyManager { DeviceAddressPtr UpdateSwapInQueue() override; - bool AllocHostPinnedMem(size_t size, void **addr) override; + bool AllocHostPinnedMem(size_t size, void **addr) const override; - void FreeHostPinnedMem(void *addr) override; + void FreeHostPinnedMem(void *addr) const override; void ClearSwapQueue() override; @@ -61,7 +61,6 @@ class GPUMemCopyManager : public MemCopyManager { std::queue> swap_in_queue_; }; using 
GPUMemCopyManagerPtr = std::shared_ptr; - } // namespace gpu } // namespace device } // namespace mindspore diff --git a/mindspore/ccsrc/device/gpu/gpu_stream_assign.cc b/mindspore/ccsrc/device/gpu/gpu_stream_assign.cc index 3594081cc7..42cdcf29ec 100644 --- a/mindspore/ccsrc/device/gpu/gpu_stream_assign.cc +++ b/mindspore/ccsrc/device/gpu/gpu_stream_assign.cc @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "device/gpu/gpu_stream_assign.h" #include #include #include @@ -21,7 +22,6 @@ #include "device/gpu/gpu_common.h" #include "device/gpu/kernel_info_setter.h" #include "device/gpu/gpu_device_manager.h" -#include "device/gpu/gpu_stream_assign.h" namespace mindspore { namespace device { @@ -36,18 +36,19 @@ void AssignGpuStream(const std::shared_ptr &kernel_graph) allreduce_kernels.emplace_back(kernel_node); } else { DeviceStream compute_stream = GPUDeviceManager::GetInstance().default_stream(); - AnfAlgo::SetNodeAttr("stream_id", MakeValue(reinterpret_cast(compute_stream)), kernel_node); + MS_EXCEPTION_IF_NULL(compute_stream); + AnfAlgo::SetNodeAttr(kAttrStreamId, MakeValue(reinterpret_cast(compute_stream)), kernel_node); } } if (allreduce_kernels.size() > 1) { - // Assign multiple streams only when there's Recv node for AllReduce. + // Assign multiple streams only when there're multiple AllReduce nodes. 
std::vector send_recv_pairs; if (FindAllReduceStreamSwitchPos(kernel_graph, &send_recv_pairs)) { DeviceStream comm_stream = nullptr; GPUDeviceManager::GetInstance().CreateStream(&comm_stream); std::transform( allreduce_kernels.begin(), allreduce_kernels.end(), allreduce_kernels.begin(), [&](CNodePtr allreduce_kernel) { - AnfAlgo::SetNodeAttr("stream_id", MakeValue(reinterpret_cast(comm_stream)), allreduce_kernel); + AnfAlgo::SetNodeAttr(kAttrStreamId, MakeValue(reinterpret_cast(comm_stream)), allreduce_kernel); return allreduce_kernel; }); InsertStreamSwitchNode(kernel_graph, send_recv_pairs); @@ -161,25 +162,28 @@ bool GenSendRecvCNodesForAllReduce(const std::shared_ptr & cudaEvent_t event = nullptr; CHECK_CUDA_RET_WITH_EXCEPT(cudaEventCreate(&event, cudaEventDisableTiming), "Creating cuda event failed."); - AnfAlgo::SetNodeAttr("record_event", MakeValue(reinterpret_cast(event)), *send_node); - AnfAlgo::SetNodeAttr("wait_event", MakeValue(reinterpret_cast(event)), *recv_node); + AnfAlgo::SetNodeAttr(kAttrRecordEvent, MakeValue(reinterpret_cast(event)), *send_node); + AnfAlgo::SetNodeAttr(kAttrWaitEvent, MakeValue(reinterpret_cast(event)), *recv_node); - uintptr_t send_stream = AnfAlgo::GetNodeAttr(mock_send_node, "stream_id"); - AnfAlgo::SetNodeAttr("record_event_stream", MakeValue(send_stream), *send_node); - uintptr_t recv_stream = AnfAlgo::GetNodeAttr(mock_recv_node, "stream_id"); - AnfAlgo::SetNodeAttr("wait_event_stream", MakeValue(recv_stream), *recv_node); + uintptr_t send_stream = AnfAlgo::GetNodeAttr(mock_send_node, kAttrStreamId); + AnfAlgo::SetNodeAttr(kAttrRecordEventStream, MakeValue(send_stream), *send_node); + uintptr_t recv_stream = AnfAlgo::GetNodeAttr(mock_recv_node, kAttrStreamId); + AnfAlgo::SetNodeAttr(kAttrWaitEventStream, MakeValue(recv_stream), *recv_node); return true; } CNodePtr CreateStreamSwitchNode(const std::shared_ptr &kernel_graph, const std::string &name) { auto op = std::make_shared(name); + MS_EXCEPTION_IF_NULL(op); auto apply = 
std::make_shared(op); + MS_EXCEPTION_IF_NULL(apply); std::vector input_list = {apply}; CNodePtr node = kernel_graph->NewCNode(input_list); MS_EXCEPTION_IF_NULL(node); kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder; AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_builder.Build(), node.get()); auto abstract_none = std::make_shared(); + MS_EXCEPTION_IF_NULL(abstract_none); node->set_abstract(abstract_none); SetKernelInfo(node); return node; diff --git a/mindspore/ccsrc/device/gpu/kernel_info_setter.cc b/mindspore/ccsrc/device/gpu/kernel_info_setter.cc index 2ba154b87b..42e76e2483 100644 --- a/mindspore/ccsrc/device/gpu/kernel_info_setter.cc +++ b/mindspore/ccsrc/device/gpu/kernel_info_setter.cc @@ -82,11 +82,16 @@ std::string SupportedTypeList(const CNodePtr &kernel_node) { (void)ParseMetadata(kernel_node, op_info_ptr, kernel::Processor::CUDA, &kernel_info_list); for (size_t i = 0; i < kernel_info_list.size(); i++) { auto supported_akg_type = kernel_info_list[i]->GetAllInputDeviceTypes(); - std::string supported_akg_type_list = "["; + auto supported_akg_type_out = kernel_info_list[i]->GetAllOutputDeviceTypes(); + std::string supported_akg_type_list = "in["; for (auto type : supported_akg_type) { supported_akg_type_list = supported_akg_type_list + mindspore::kernel::TypeId2String(type); } - supported_type_lists = supported_type_lists + supported_akg_type_list + "] "; + supported_type_lists = supported_type_lists + supported_akg_type_list + "], out["; + for (auto type : supported_akg_type_out) { + supported_akg_type_list = supported_akg_type_list + mindspore::kernel::TypeId2String(type); + } + supported_type_lists += "]; "; } return supported_type_lists; } @@ -179,7 +184,7 @@ void SetKernelInfo(const CNodePtr &kernel_node) { if (!result) { result = SelectAkgKernel(kernel_node, builder->Build()); - kernel_type = AUTO_DIFF_KERNEL; + kernel_type = AKG_KERNEL; } if (!result) { diff --git a/mindspore/ccsrc/device/gpu/mpi/mpi_initializer.cc 
b/mindspore/ccsrc/device/gpu/mpi/mpi_initializer.cc index f2dbd4491b..bcad74e5b5 100644 --- a/mindspore/ccsrc/device/gpu/mpi/mpi_initializer.cc +++ b/mindspore/ccsrc/device/gpu/mpi/mpi_initializer.cc @@ -24,10 +24,28 @@ namespace mindspore { namespace device { namespace gpu { MPIInitializer::MPIInitializer() { + int init_flag = 0; + if (MPI_Initialized(&init_flag) != MPI_SUCCESS) { + return; + } + if (init_flag == 0) { + auto ret = MPI_Init(nullptr, nullptr); + if (ret != MPI_SUCCESS) { + return; + } + } MPI_Comm_rank(MPI_COMM_WORLD, &rank_id_); MPI_Comm_size(MPI_COMM_WORLD, &rank_size_); } +MPIInitializer::~MPIInitializer() { + int finalized_flag = 0; + (void)MPI_Finalized(&finalized_flag); + if (finalized_flag == 0) { + (void)MPI_Finalize(); + } +} + MPIInitializer &MPIInitializer::GetInstance() { static MPIInitializer instance; return instance; diff --git a/mindspore/ccsrc/device/gpu/mpi/mpi_initializer.h b/mindspore/ccsrc/device/gpu/mpi/mpi_initializer.h index 00f3b9d713..bd0a4aa948 100644 --- a/mindspore/ccsrc/device/gpu/mpi/mpi_initializer.h +++ b/mindspore/ccsrc/device/gpu/mpi/mpi_initializer.h @@ -30,7 +30,7 @@ class MPIInitializer { private: MPIInitializer(); - ~MPIInitializer() = default; + ~MPIInitializer(); int rank_id_; int rank_size_; diff --git a/mindspore/ccsrc/device/kernel_adjust.cc b/mindspore/ccsrc/device/kernel_adjust.cc index 1bd384ff13..93007764af 100644 --- a/mindspore/ccsrc/device/kernel_adjust.cc +++ b/mindspore/ccsrc/device/kernel_adjust.cc @@ -37,24 +37,6 @@ namespace mindspore { namespace device { using device::ascend::ProfilingUtils; -void KernelAdjust::Reorder(const std::shared_ptr &kernel_graph_ptr) { - MS_EXCEPTION_IF_NULL(kernel_graph_ptr); - const std::vector &origin_cnode_list = kernel_graph_ptr->execution_order(); - std::vector momentum_list; - std::vector other_list; - for (const auto &cnode : origin_cnode_list) { - if (kOptOperatorSet.find(AnfAlgo::GetCNodeName(cnode)) != kOptOperatorSet.end()) { - 
momentum_list.emplace_back(cnode); - } else { - other_list.emplace_back(cnode); - } - } - std::vector new_order_list; - new_order_list.insert(new_order_list.end(), other_list.begin(), other_list.end()); - new_order_list.insert(new_order_list.end(), momentum_list.begin(), momentum_list.end()); - kernel_graph_ptr->set_execution_order(new_order_list); -} - void KernelAdjust::ReorderGetNext(const std::shared_ptr &kernel_graph_ptr) { MS_EXCEPTION_IF_NULL(kernel_graph_ptr); const std::vector &origin_cnode_list = kernel_graph_ptr->execution_order(); @@ -80,23 +62,6 @@ bool KernelAdjust::NeedInsertSwitch() { ConfigManager::GetInstance().iter_num() > 1); } -uint32_t KernelAdjust::FindFirstStreamSwitchLabel(const std::shared_ptr &kernel_graph_ptr) { - MS_EXCEPTION_IF_NULL(kernel_graph_ptr); - auto cnode_ptr_list = kernel_graph_ptr->execution_order(); - CNodePtr cur_cnode_ptr = nullptr; - uint32_t label = kInvalidDistincLabel; - for (uint32_t i = 0; i < cnode_ptr_list.size(); ++i) { - cur_cnode_ptr = cnode_ptr_list[i]; - MS_EXCEPTION_IF_NULL(cur_cnode_ptr); - if (AnfAlgo::GetCNodeName(cur_cnode_ptr) == kStreamSwitchOpName) { - label = AnfAlgo::GetStreamDistinctionLabel(cur_cnode_ptr.get()); - break; - } - } - - return label; -} - CNodePtr KernelAdjust::CreateSendApplyKernel(const std::shared_ptr &graph_ptr, uint32_t event_id) { MS_EXCEPTION_IF_NULL(graph_ptr); @@ -138,6 +103,8 @@ CNodePtr KernelAdjust::CreateRecvApplyKernel(const std::shared_ptr &kernel_graph_ptr) { + device::ascend::AscendStreamMng &stream_manager = device::ascend::AscendStreamMng::GetInstance(); + stream_manager.Reset(); if (!NeedInsertSwitch()) { return; } @@ -166,68 +133,62 @@ void KernelAdjust::InsertSwitchLoop(const std::shared_ptr if (orders.empty()) { MS_LOG(EXCEPTION) << "graph execution order is empty"; } - uint32_t first_cnode_stream_label = AnfAlgo::GetStreamDistinctionLabel(orders[0].get()); std::vector exec_order; - CNodePtr first_stream_switch_app = CreateStreamSwitchOp(kernel_graph_ptr, 
switch_loop_input); - MS_EXCEPTION_IF_NULL(first_stream_switch_app); - AnfAlgo::SetStreamDistinctionLabel(kFirstStreamSwitchLabel, first_stream_switch_app.get()); - AnfAlgo::SetNodeAttr(kAttrTrueBranchStream, MakeValue(kGetNextLabel), first_stream_switch_app); - - CNodePtr second_stream_switch_app = CreateStreamSwitchOp(kernel_graph_ptr, switch_loop_input); - MS_EXCEPTION_IF_NULL(second_stream_switch_app); - AnfAlgo::SetStreamDistinctionLabel(kSecondStreamSwitchLabel, second_stream_switch_app.get()); - AnfAlgo::SetNodeAttr(kAttrTrueBranchStream, MakeValue(first_cnode_stream_label), second_stream_switch_app); - // add attr "stream_need_active" - AnfAlgo::SetNodeAttr(kStreamNeedActivedFirst, MakeValue(true), second_stream_switch_app); - - CNodePtr first_stream_active_app = CreateStreamActiveOp(kernel_graph_ptr); - MS_EXCEPTION_IF_NULL(first_stream_active_app); - AnfAlgo::SetStreamDistinctionLabel(first_cnode_stream_label, first_stream_active_app.get()); - std::vector first_active_streams = {kFirstStreamSwitchLabel}; - AnfAlgo::SetNodeAttr(kAttrActiveStreamList, MakeValue>(first_active_streams), - first_stream_active_app); - - CNodePtr second_stream_active_app = CreateStreamActiveOp(kernel_graph_ptr); - MS_EXCEPTION_IF_NULL(second_stream_active_app); - // specific deal for common ctrl stream policy - uint32_t first_common_stream_switch_label = FindFirstStreamSwitchLabel(kernel_graph_ptr); - if (first_common_stream_switch_label == kInvalidDistincLabel) { - AnfAlgo::SetStreamDistinctionLabel(first_cnode_stream_label, second_stream_active_app.get()); - } else { - AnfAlgo::SetStreamDistinctionLabel(first_common_stream_switch_label, second_stream_active_app.get()); - } - std::vector second_active_streams = {kSecondStreamSwitchLabel}; - AnfAlgo::SetNodeAttr(kAttrActiveStreamList, MakeValue>(second_active_streams), - second_stream_active_app); + // getnext loop process + // getnext loop stream switch op + CNodePtr getnext_switch_app = CreateStreamSwitchOp(kernel_graph_ptr, 
switch_loop_input); + MS_EXCEPTION_IF_NULL(getnext_switch_app); + uint32_t getnext_switch_stream_id = stream_manager.ApplyNewStream(); + AnfAlgo::SetStreamId(getnext_switch_stream_id, getnext_switch_app.get()); + exec_order.push_back(getnext_switch_app); - CNodePtr assign_add_one = CreateStreamAssignAddnOP(kernel_graph_ptr, switch_loop_input); - MS_EXCEPTION_IF_NULL(assign_add_one); - AnfAlgo::SetStreamDistinctionLabel(first_cnode_stream_label, assign_add_one.get()); - - CNodePtr send = CreateSendApplyKernel(kernel_graph_ptr, kFirstEventId); - AnfAlgo::SetStreamDistinctionLabel(kGetNextLabel, send.get()); - CNodePtr recv = CreateRecvApplyKernel(kernel_graph_ptr, kFirstEventId); - AnfAlgo::SetStreamDistinctionLabel(first_cnode_stream_label, recv.get()); - - // reorder graph orders - exec_order.push_back(first_stream_switch_app); + // getnext op + uint32_t getnext_stream_id = stream_manager.ApplyNewStream(); size_t i = 0; for (; i < orders.size(); i++) { auto node = orders[i]; exec_order.push_back(node); - AnfAlgo::SetStreamDistinctionLabel(kGetNextLabel, exec_order[exec_order.size() - 1].get()); + AnfAlgo::SetStreamId(getnext_stream_id, exec_order[exec_order.size() - 1].get()); if (AnfAlgo::GetCNodeName(node) == kGetNextOpName) { break; } } + // update getnext loop stream switch true_branch_stream attr + AnfAlgo::SetNodeAttr(kAttrTrueBranchStream, MakeValue(getnext_stream_id), getnext_switch_app); + + // getnext loop send + CNodePtr send = CreateSendApplyKernel(kernel_graph_ptr, kFirstEventId); + AnfAlgo::SetStreamId(getnext_stream_id, send.get()); exec_order.push_back(send); - exec_order.push_back(second_stream_switch_app); + + // fpbp loop process + // fpbp loop stream switch + CNodePtr fpbp_switch_app = CreateStreamSwitchOp(kernel_graph_ptr, switch_loop_input); + MS_EXCEPTION_IF_NULL(fpbp_switch_app); + uint32_t fpbp_switch_stream_id = stream_manager.ApplyNewStream(); + AnfAlgo::SetStreamId(fpbp_switch_stream_id, fpbp_switch_app.get()); + 
AnfAlgo::SetNodeAttr(kStreamNeedActivedFirst, MakeValue(true), fpbp_switch_app); + exec_order.push_back(fpbp_switch_app); + + // fpbp loop recv + CNodePtr recv = CreateRecvApplyKernel(kernel_graph_ptr, kFirstEventId); + uint32_t fpbp_stream_id = stream_manager.ApplyNewStream(); + AnfAlgo::SetStreamId(fpbp_stream_id, recv.get()); exec_order.push_back(recv); + + // update fpbp loop stream switch true_branch_stream attr + AnfAlgo::SetNodeAttr(kAttrTrueBranchStream, MakeValue(fpbp_stream_id), fpbp_switch_app); + + // fpbp loop AssignAdd + CNodePtr assign_add_one = CreateStreamAssignAddnOP(kernel_graph_ptr, switch_loop_input); + MS_EXCEPTION_IF_NULL(assign_add_one); + AnfAlgo::SetStreamId(fpbp_stream_id, assign_add_one.get()); exec_order.push_back(assign_add_one); + // fpbp memcpy std::vector memcpy_list; std::vector before_list; std::vector after_list; @@ -244,12 +205,28 @@ void KernelAdjust::InsertSwitchLoop(const std::shared_ptr before_list.emplace_back(cur_cnode); } } - (void)std::copy(before_list.begin(), before_list.end(), std::back_inserter(exec_order)); (void)std::copy(memcpy_list.begin(), memcpy_list.end(), std::back_inserter(exec_order)); - exec_order.push_back(first_stream_active_app); + + // stream active to activate getnext loop + CNodePtr getnext_active_app = CreateStreamActiveOp(kernel_graph_ptr); + MS_EXCEPTION_IF_NULL(getnext_active_app); + std::vector getnext_active_streams = {getnext_switch_stream_id}; + AnfAlgo::SetNodeAttr(kAttrActiveStreamList, MakeValue>(getnext_active_streams), + getnext_active_app); + exec_order.push_back(getnext_active_app); + + // fpbp loop other ops (void)std::copy(after_list.begin(), after_list.end(), std::back_inserter(exec_order)); - exec_order.push_back(second_stream_active_app); + + // stream active to activate fpbp loop + CNodePtr fpbp_active_app = CreateStreamActiveOp(kernel_graph_ptr); + MS_EXCEPTION_IF_NULL(fpbp_active_app); + // specific deal for common ctrl stream policy + std::vector fpbp_active_streams = 
{fpbp_switch_stream_id}; + AnfAlgo::SetNodeAttr(kAttrActiveStreamList, MakeValue>(fpbp_active_streams), fpbp_active_app); + exec_order.push_back(fpbp_active_app); + kernel_graph_ptr->set_execution_order(exec_order); } diff --git a/mindspore/ccsrc/device/kernel_adjust.h b/mindspore/ccsrc/device/kernel_adjust.h index 87195ecfc4..1a7436b396 100644 --- a/mindspore/ccsrc/device/kernel_adjust.h +++ b/mindspore/ccsrc/device/kernel_adjust.h @@ -39,9 +39,9 @@ constexpr auto kZeroParamName = "zero"; constexpr auto kOneParamName = "one"; constexpr auto kStreamNeedActivedFirst = "stream_need_active_first"; -const uint32_t kFirstStreamSwitchLabel = kInvalidDistincLabel - 1; -const uint32_t kGetNextLabel = kInvalidDistincLabel - 2; -const uint32_t kSecondStreamSwitchLabel = kInvalidDistincLabel - 3; +const uint32_t kFirstStreamSwitchLabel = 0; +const uint32_t kGetNextLabel = 1; +const uint32_t kSecondStreamSwitchLabel = 2; const uint32_t kInvalidEventId = UINT32_MAX; const uint32_t kFirstEventId = kInvalidEventId / 2; namespace device { @@ -51,7 +51,7 @@ class KernelAdjust { static KernelAdjust instance; return instance; } - void Reorder(const std::shared_ptr &kernel_graph_ptr); + void InsertSwitchLoop(const std::shared_ptr &kernel_graph_ptr); bool StepLoadCtrlInputs(const std::shared_ptr &kernel_graph_ptr); void Profiling(NotNull kernel_graph_ptr); @@ -65,7 +65,6 @@ class KernelAdjust { void ReorderGetNext(const std::shared_ptr &kernel_graph_ptr); CNodePtr CreateRecvApplyKernel(const std::shared_ptr &graph_ptr, uint32_t event_id); CNodePtr CreateSendApplyKernel(const std::shared_ptr &graph_ptr, uint32_t event_id); - uint32_t FindFirstStreamSwitchLabel(const std::shared_ptr &kernel_graph_ptr); void CreateSwitchOpParameters(const std::shared_ptr &kernel_graph_ptr, std::map *switch_loop_input); CNodePtr CreateStreamSwitchOp(const std::shared_ptr &kernel_graph_ptr, diff --git a/mindspore/ccsrc/device/kernel_info.h b/mindspore/ccsrc/device/kernel_info.h index 33ddda83c9..84cfaa0fa3 
100644 --- a/mindspore/ccsrc/device/kernel_info.h +++ b/mindspore/ccsrc/device/kernel_info.h @@ -35,7 +35,7 @@ class KernelInfo { select_kernel_build_info_ = nullptr; output_address_list_ = {}; workspace_address_list_ = {}; - stream_id_ = 0; + stream_id_ = UINT32_MAX; stream_distinction_label_ = kInvalidDistincLabel; graph_id_ = kInvalidGraphId; } diff --git a/mindspore/ccsrc/device/kernel_runtime.cc b/mindspore/ccsrc/device/kernel_runtime.cc index 9a8e65b474..cc1e3ab8f3 100644 --- a/mindspore/ccsrc/device/kernel_runtime.cc +++ b/mindspore/ccsrc/device/kernel_runtime.cc @@ -102,6 +102,14 @@ bool KernelRuntime::RunTask(const session::KernelGraph *graph) { return false; } +bool KernelRuntime::NodeOutputDeviceAddressExist(const AnfNodePtr &kernel, size_t index) { + MS_EXCEPTION_IF_NULL(kernel); + if (AnfAlgo::OutputAddrExist(kernel, index)) { + return true; + } + return false; +} + size_t KernelRuntime::CountNodeDeviceMemorySize(const mindspore::AnfNodePtr &node, size_t output_index) { MS_EXCEPTION_IF_NULL(node); if (output_index >= AnfAlgo::GetOutputTensorNum(node)) { @@ -146,6 +154,34 @@ void KernelRuntime::RunOpAssignMemory(const std::vector &inpu UpdateRefNodeOutputMem(graph); } +void KernelRuntime::RunOpClearMemory(session::KernelGraph *graph) { + MS_EXCEPTION_IF_NULL(graph); + // clear input parameter memory resource + for (const auto &input_node : graph->inputs()) { + MS_EXCEPTION_IF_NULL(input_node); + AnfAlgo::SetOutputAddr(nullptr, 0, input_node.get()); + } + // clear input value node memory resource + for (const auto &value_node : graph->graph_value_nodes()) { + MS_EXCEPTION_IF_NULL(value_node); + AnfAlgo::SetOutputAddr(nullptr, 0, value_node.get()); + } + for (const auto &cnode : graph->execution_order()) { + MS_EXCEPTION_IF_NULL(cnode); + // clear output memory resource + for (size_t index = 0; index < AnfAlgo::GetOutputTensorNum(cnode); ++index) { + AnfAlgo::SetOutputAddr(nullptr, index, cnode.get()); + } + // clear workspace memory resource + auto 
kernel_mod = AnfAlgo::GetKernelMod(cnode); + MS_EXCEPTION_IF_NULL(kernel_mod); + auto workspace_lists = kernel_mod->GetWorkspaceSizeList(); + for (size_t index = 0; index < workspace_lists.size(); ++index) { + AnfAlgo::SetWorkspaceAddr(nullptr, index, cnode.get()); + } + } +} + void KernelRuntime::AssignStaticMemory(session::KernelGraph *graph) { AssignStaticMemoryInput(graph); AssignStaticMemoryValueNode(graph); @@ -182,6 +218,7 @@ void KernelRuntime::RunOpAssignInputMemory(const std::vector auto device_address = CreateDeviceAddress(nullptr, tensor_size, AnfAlgo::GetOutputFormat(item, index), output_type_id); MS_EXCEPTION_IF_NULL(device_address); + MS_EXCEPTION_IF_NULL(mem_manager_); auto ret = mem_manager_->MallocMemFromMemPool(device_address, tensor_size); if (!ret) { MS_LOG(EXCEPTION) << "Malloc device memory failed."; @@ -246,18 +283,37 @@ void KernelRuntime::AssignStaticMemoryInput(const session::KernelGraph *graph) { MS_EXCEPTION_IF_NULL(mem_manager_); auto graph_inputs = graph->inputs(); auto graph_valid_input = graph->valid_inputs(); - for (size_t i = 0; i < graph_inputs.size(); i++) { + std::vector need_alloc_nodes; + for (size_t i = 0; i < graph_inputs.size(); ++i) { auto item = graph_inputs[i]; MS_EXCEPTION_IF_NULL(item); - if (!item->isa()) { + if (i < graph_valid_input.size() && !graph_valid_input[i]) { continue; } - if (i < graph_valid_input.size() && !graph_valid_input[i]) { + + if (AnfAlgo::CheckPrimitiveType(item, prim::kPrimMakeTuple)) { + auto outs = AnfAlgo::GetAllOutput(item); + for (auto &out : outs) { + MS_EXCEPTION_IF_NULL(out); + if (!out->isa()) { + continue; + } + if (NodeOutputDeviceAddressExist(out, 0)) { + continue; + } + need_alloc_nodes.push_back(out); + } + } + if (!item->isa()) { continue; } - if (AnfAlgo::OutputAddrExist(item, 0)) { + if (NodeOutputDeviceAddressExist(item, 0)) { continue; } + need_alloc_nodes.push_back(item); + } + + for (auto &item : need_alloc_nodes) { auto output_size = AnfAlgo::GetOutputTensorNum(item); for 
(size_t index = 0; index < output_size; index++) { TypeId output_type_id = AnfAlgo::GetOutputDeviceDataType(item, index); @@ -431,7 +487,7 @@ void KernelRuntime::AssignNodeOutputMem(int flag, const AnfNodePtr &node, int in if ((kGetAllOuts != index) && (SizeToInt(i) != index)) { continue; } - if (AnfAlgo::OutputAddrExist(node, i)) { + if (NodeOutputDeviceAddressExist(node, i)) { MS_LOG(INFO) << "Already malloc index:" << i; continue; } @@ -493,7 +549,7 @@ void KernelRuntime::AssignStaticMemoryValueNode(session::KernelGraph *graph) { MS_EXCEPTION_IF_NULL(ms_context); for (auto &value_node : graph->graph_value_nodes()) { MS_EXCEPTION_IF_NULL(value_node); - if (AnfAlgo::OutputAddrExist(value_node, 0)) { + if (NodeOutputDeviceAddressExist(value_node, 0)) { MS_LOG(INFO) << "value_node[" << value_node->DebugString() << "] address already exist"; continue; } @@ -583,6 +639,7 @@ void KernelRuntime::GenLaunchArgs(const mindspore::kernel::KernelMod &kernel_mod for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) { auto real_input = AnfAlgo::GetRealInputIndex(kernel, i); auto device_address = AnfAlgo::GetPrevNodeOutputAddr(kernel, real_input); + MS_EXCEPTION_IF_NULL(device_address); kernel::AddressPtr input = std::make_shared(); MS_EXCEPTION_IF_NULL(input); input->addr = device_address->ptr_; @@ -619,8 +676,8 @@ void KernelRuntime::GenAddrCleanLaunchArgs(const CNodePtr &cnode, AddressPtrList MS_EXCEPTION_IF_NULL(cnode->inputs()[1]); auto pre_node = (cnode->inputs()[1])->cast(); // set clean output address - if (AnfAlgo::HasNodeAttr(kAttrAutomicOutputIndexs, pre_node)) { - auto clean_output_indexs = AnfAlgo::GetNodeAttr>(pre_node, kAttrAutomicOutputIndexs); + if (AnfAlgo::HasNodeAttr(kAttrAtomicOutputIndexs, pre_node)) { + auto clean_output_indexs = AnfAlgo::GetNodeAttr>(pre_node, kAttrAtomicOutputIndexs); for (auto index : clean_output_indexs) { auto device_address = AnfAlgo::GetOutputAddr(pre_node, index); kernel::AddressPtr input = std::make_shared(); @@ -633,10 
+690,10 @@ void KernelRuntime::GenAddrCleanLaunchArgs(const CNodePtr &cnode, AddressPtrList MS_LOG(INFO) << "AtomicAddClean clean output size:" << clean_output_indexs.size(); } // set clean workspace address - if (AnfAlgo::HasNodeAttr(kAttrAutomicWorkspaceSize, pre_node)) { - auto clean_workspaces = AnfAlgo::GetNodeAttr(pre_node, kAttrAutomicWorkspaceSize); - if (clean_workspaces != 0) { - auto device_address = AnfAlgo::GetWorkspaceAddr(pre_node, 0); + if (AnfAlgo::HasNodeAttr(kAttrAtomicWorkspaceIndexs, pre_node)) { + auto clean_workspaces_indexs = AnfAlgo::GetNodeAttr>(pre_node, kAttrAtomicWorkspaceIndexs); + for (const auto &index : clean_workspaces_indexs) { + auto device_address = AnfAlgo::GetWorkspaceAddr(pre_node, index); kernel::AddressPtr workspace = std::make_shared(); MS_EXCEPTION_IF_NULL(workspace); workspace->addr = device_address->ptr_; @@ -644,7 +701,6 @@ void KernelRuntime::GenAddrCleanLaunchArgs(const CNodePtr &cnode, AddressPtrList workspace->size = device_address->size_; kernel_inputs->emplace_back(workspace); } - MS_LOG(INFO) << "AtomicAddClean clean workspace size" << clean_workspaces; } } @@ -673,10 +729,6 @@ bool KernelRuntime::LaunchKernel(const session::KernelGraph *graph) { MS_LOG(ERROR) << "LaunchKernelMod failed!"; return false; } - if (!SyncStream()) { - MS_LOG(ERROR) << "SyncStream failed!"; - return false; - } return true; } diff --git a/mindspore/ccsrc/device/kernel_runtime.h b/mindspore/ccsrc/device/kernel_runtime.h index 668fb2580f..bfe857f61b 100644 --- a/mindspore/ccsrc/device/kernel_runtime.h +++ b/mindspore/ccsrc/device/kernel_runtime.h @@ -47,6 +47,7 @@ class KernelRuntime { virtual bool Init() = 0; virtual void AssignMemory(session::KernelGraph *graph); void RunOpAssignMemory(const std::vector &input_tensors, session::KernelGraph *graph); + void RunOpClearMemory(session::KernelGraph *graph); virtual bool Run(session::KernelGraph *graph); virtual bool DumpData(session::KernelGraph *graph); virtual bool RunTask(const 
session::KernelGraph *graph); @@ -55,6 +56,7 @@ class KernelRuntime { virtual void AssignStaticMemoryInput(const session::KernelGraph *graph); virtual void AssignStaticMemoryValueNode(session::KernelGraph *graph); virtual void ClearGraphRuntimeResource(uint32_t graph_id); + virtual bool SyncStream() = 0; #ifdef ENABLE_DUMP_E2E DumpConfPtr GetDumpConf(); @@ -67,7 +69,7 @@ class KernelRuntime { protected: virtual DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format, TypeId type_id) = 0; - virtual bool SyncStream() = 0; + virtual bool NodeOutputDeviceAddressExist(const AnfNodePtr &node, size_t index); void AssignStaticMemory(session::KernelGraph *graph); void AssignDynamicMemory(session::KernelGraph *graph); void ReuseAssignDynamicMemory(session::KernelGraph *graph); diff --git a/mindspore/ccsrc/device/kernel_runtime_manager.cc b/mindspore/ccsrc/device/kernel_runtime_manager.cc index ca6f386b50..0f95f3e79b 100644 --- a/mindspore/ccsrc/device/kernel_runtime_manager.cc +++ b/mindspore/ccsrc/device/kernel_runtime_manager.cc @@ -54,8 +54,9 @@ KernelRuntime *KernelRuntimeManager::GetSingleKernelRuntime(const std::string &d return runtime_iter->second.get(); } else if (runtime_map_.size() > 0) { auto cur_runtime_key = runtime_map_.begin()->first; - if (cur_runtime_key.rfind('_') != std::string::npos) { - auto cur_device_id = cur_runtime_key.substr(cur_runtime_key.rfind('_') + 1); + auto find_pos = cur_runtime_key.rfind('_'); + if (find_pos != std::string::npos) { + auto cur_device_id = cur_runtime_key.substr(find_pos + 1); MS_LOG(EXCEPTION) << "Can't change device id in runtime, already set device id: " << cur_device_id << ", set device id: " << device_id << " failed"; } diff --git a/mindspore/ccsrc/device/memory_manager.cc b/mindspore/ccsrc/device/memory_manager.cc index d2a38038c6..5efbcd8a36 100644 --- a/mindspore/ccsrc/device/memory_manager.cc +++ b/mindspore/ccsrc/device/memory_manager.cc @@ -68,6 +68,7 @@ uint8_t 
*MemoryManager::MallocOutputMem(const AnfNodePtr &node, size_t index, in } else if (flag == kDynamicMem) { ptr = MallocDynamicMem(size, false); } else if (flag == kReuseDynamicMem) { + MS_EXCEPTION_IF_NULL(mem_reuse_util_ptr_); ptr = mem_reuse_util_ptr_->GetNodeOutputPtr(node, index); } return ptr; @@ -75,6 +76,7 @@ uint8_t *MemoryManager::MallocOutputMem(const AnfNodePtr &node, size_t index, in uint8_t *MemoryManager::MallocWorkSpaceMem(const AnfNodePtr &node, size_t index, int flag, size_t size) { if (flag == kReuseDynamicMem) { + MS_EXCEPTION_IF_NULL(mem_reuse_util_ptr_); return mem_reuse_util_ptr_->GetNodeWorkSpacePtr(node, index); } return MallocDynamicMem(size, false); diff --git a/mindspore/ccsrc/ir/CMakeLists.txt b/mindspore/ccsrc/ir/CMakeLists.txt index 77bc1b7661..2a0b81ae04 100644 --- a/mindspore/ccsrc/ir/CMakeLists.txt +++ b/mindspore/ccsrc/ir/CMakeLists.txt @@ -1,3 +1,7 @@ file(GLOB_RECURSE _IR_SRC_LIST ./*.cc dtype/*.cc) +file(GLOB_RECURSE _IR_LITE_SRC_FILES + ./lite/tensor.cc + ) +list(REMOVE_ITEM _IR_SRC_LIST ${_IR_LITE_SRC_FILES}) set_property(SOURCE ${_IR_SRC_LIST} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_IR) add_library(_mindspore_ir_obj OBJECT ${_IR_SRC_LIST}) diff --git a/mindspore/ccsrc/ir/anf.cc b/mindspore/ccsrc/ir/anf.cc index 29a74b79ba..3b2402172b 100644 --- a/mindspore/ccsrc/ir/anf.cc +++ b/mindspore/ccsrc/ir/anf.cc @@ -26,6 +26,8 @@ #include "ir/func_graph.h" #include "ir/primitive_base.h" +#include "operator/ops.h" + namespace mindspore { // namespace to support intermediate representation definition CNode::CNode(const std::vector &inputs, const FuncGraphPtr &func_graph) @@ -106,10 +108,14 @@ std::string ValueNode::fullname_with_scope() { bool IsPrimitiveCNode(const AnfNodePtr &node, const PrimitivePtr &value) { MS_EXCEPTION_IF_NULL(node); auto cnode = node->cast(); - if (cnode != nullptr) { + if (cnode == nullptr) { + return false; + } + if (value != nullptr) { return cnode->IsApply(value); } - return 
false; + const auto &prim = GetValueNode(cnode->input(0)); + return prim != nullptr; } PrimitivePtr GetCNodePrimitive(const AnfNodePtr &node) { diff --git a/mindspore/ccsrc/ir/anf.h b/mindspore/ccsrc/ir/anf.h index c2db17aec5..95a018af06 100644 --- a/mindspore/ccsrc/ir/anf.h +++ b/mindspore/ccsrc/ir/anf.h @@ -124,6 +124,7 @@ class AnfNode : public Base { const KernelInfoDevice *kernel_info() const { return kernel_info_.get(); } KernelInfoDevice *kernel_info() { return kernel_info_.get(); } + const KernelInfoDevicePtr &kernel_info_ptr() { return kernel_info_; } void set_kernel_info(const KernelInfoDevicePtr &kernel_info) { kernel_info_ = kernel_info; } AbstractBasePtr abstract() const { return abstract_; } @@ -216,6 +217,7 @@ class CNode : public AnfNode { void set_stop_gradient(bool stop_gradient) { stop_gradient_ = stop_gradient; } std::string fullname_with_scope() override; + void set_fullname_with_scope(const std::string full_name) { fullname_with_scope_ = full_name; } std::string DebugString(int recursive_level = 1) const override; std::string DebugString(bool recursive) const override { return DebugString(recursive ? 
1 : 0); } @@ -395,9 +397,9 @@ static S GetValue(const ValuePtr &value) { std::string GetCNodeFuncName(CNodePtr cnode); // used to check whether an AnfNode is a cnode with a kind of Primitive as first input -bool IsPrimitiveCNode(const AnfNodePtr &node, const PrimitivePtr &value); +bool IsPrimitiveCNode(const AnfNodePtr &node, const PrimitivePtr &value = nullptr); -// used to check whether an AnfNode is a cnode with a Primitive as first input +// used to get PrimitivePtr from a cnode first input PrimitivePtr GetCNodePrimitive(const AnfNodePtr &node); // used to check whether an AnfNode is a valuenode having some Primitive value diff --git a/mindspore/ccsrc/ir/anf_extends.cc b/mindspore/ccsrc/ir/anf_extends.cc index 0345ad29f5..432ffdb606 100644 --- a/mindspore/ccsrc/ir/anf_extends.cc +++ b/mindspore/ccsrc/ir/anf_extends.cc @@ -70,7 +70,7 @@ std::string CNode::fullname_with_scope() { } fullname_with_scope_ = name; } else { - // cnode input 0 should be primitive ptr + // cnode input 0 should be primitive ptr or funcgraph ptr auto value_ptr = input(0)->cast(); if (value_ptr == nullptr) { MS_LOG(WARNING) << "Input 0 of cnode is not a value node, its type is " << input(0)->type_name() << "."; @@ -84,11 +84,23 @@ std::string CNode::fullname_with_scope() { return fullname_with_scope_; } - PrimitivePtr prim = GetValue(input_value); + auto prim = input_value->cast(); MS_EXCEPTION_IF_NULL(scope()); - MS_EXCEPTION_IF_NULL(prim); - fullname_with_scope_ = - scope()->name() + "/" + prim->name() + "-op" + id_generator::get_id(shared_from_base()); + fullname_with_scope_ = scope()->name() + "/"; + if (prim != nullptr) { + fullname_with_scope_ += prim->name(); + } else { + auto func_graph = input_value->cast(); + MS_EXCEPTION_IF_NULL(func_graph); + auto fg_flag = func_graph->get_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL); + if (fg_flag != nullptr) { + auto fg_name = GetValue(fg_flag); + fullname_with_scope_ += "GraphKernel_" + fg_name; + } else { + fullname_with_scope_ += 
func_graph->ToString(); + } + } + fullname_with_scope_ += "-op" + id_generator::get_id(shared_from_base()); } return fullname_with_scope_; diff --git a/mindspore/ccsrc/ir/dtype/number.h b/mindspore/ccsrc/ir/dtype/number.h index 3930f51d73..f8a746f8d6 100644 --- a/mindspore/ccsrc/ir/dtype/number.h +++ b/mindspore/ccsrc/ir/dtype/number.h @@ -77,9 +77,9 @@ class Bool : public Number { TypeId generic_type_id() const override { return kNumberTypeBool; } TypePtr DeepCopy() const override { return std::make_shared(); } - std::string ToString() const override { return "Bool_"; } - std::string ToReprString() const override { return "bool_"; } - std::string DumpText() const override { return "Bool_"; } + std::string ToString() const override { return "Bool"; } + std::string ToReprString() const override { return "bool"; } + std::string DumpText() const override { return "Bool"; } }; // Int diff --git a/mindspore/ccsrc/ir/dtype/type.h b/mindspore/ccsrc/ir/dtype/type.h index a4035abf50..bfe39af43c 100644 --- a/mindspore/ccsrc/ir/dtype/type.h +++ b/mindspore/ccsrc/ir/dtype/type.h @@ -34,65 +34,9 @@ #include "ir/base.h" #include "ir/named.h" +#include "ir/dtype/type_id.h" namespace mindspore { -// -// Supported meta type -// -enum TypeId : int { - kTypeUnknown = 0, - kMetaTypeBegin = kTypeUnknown, - kMetaTypeType, // Type - kMetaTypeAnything, - kMetaTypeObject, - kMetaTypeTypeType, // TypeType - kMetaTypeProblem, - kMetaTypeExternal, - kMetaTypeNone, - kMetaTypeNull, - kMetaTypeEllipsis, - kMetaTypeEnd, - // - // Object types - // - kObjectTypeBegin = kMetaTypeEnd, - kObjectTypeNumber, - kObjectTypeString, - kObjectTypeList, - kObjectTypeTuple, - kObjectTypeSlice, - kObjectTypeKeyword, - kObjectTypeTensorType, - kObjectTypeClass, - kObjectTypeDictionary, - kObjectTypeFunction, - kObjectTypeJTagged, - kObjectTypeSymbolicKeyType, - kObjectTypeEnvType, - kObjectTypeRefKey, - kObjectTypeRef, - kObjectTypeEnd, - // - // Number Types - // - kNumberTypeBegin = kObjectTypeEnd, - 
kNumberTypeBool, - kNumberTypeInt, - kNumberTypeInt8, - kNumberTypeInt16, - kNumberTypeInt32, - kNumberTypeInt64, - kNumberTypeUInt, - kNumberTypeUInt8, - kNumberTypeUInt16, - kNumberTypeUInt32, - kNumberTypeUInt64, - kNumberTypeFloat, - kNumberTypeFloat16, - kNumberTypeFloat32, - kNumberTypeFloat64, - kNumberTypeEnd -}; TypeId IntBitsToTypeId(const int nbits); TypeId UIntBitsToTypeId(const int nbits); diff --git a/mindspore/ccsrc/ir/dtype/type_id.h b/mindspore/ccsrc/ir/dtype/type_id.h new file mode 100644 index 0000000000..17862ad798 --- /dev/null +++ b/mindspore/ccsrc/ir/dtype/type_id.h @@ -0,0 +1,91 @@ +/** + * This is the C++ adaptation and derivative work of Myia (https://github.com/mila-iqia/myia/). + * + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_IR_DTYPE_TYPE_ID_H_ +#define MINDSPORE_CCSRC_IR_DTYPE_TYPE_ID_H_ + +#include +#include + +namespace mindspore { +// +// Supported meta type +// +enum TypeId : int { + kTypeUnknown = 0, + kMetaTypeBegin = kTypeUnknown, + kMetaTypeType, // Type + kMetaTypeAnything, + kMetaTypeObject, + kMetaTypeTypeType, // TypeType + kMetaTypeProblem, + kMetaTypeExternal, + kMetaTypeNone, + kMetaTypeNull, + kMetaTypeEllipsis, + kMetaTypeEnd, + // + // Object types + // + kObjectTypeBegin = kMetaTypeEnd, + kObjectTypeNumber, + kObjectTypeString, + kObjectTypeList, + kObjectTypeTuple, + kObjectTypeSlice, + kObjectTypeKeyword, + kObjectTypeTensorType, + kObjectTypeClass, + kObjectTypeDictionary, + kObjectTypeFunction, + kObjectTypeJTagged, + kObjectTypeSymbolicKeyType, + kObjectTypeEnvType, + kObjectTypeRefKey, + kObjectTypeRef, + kObjectTypeEnd, + // + // Number Types + // + kNumberTypeBegin = kObjectTypeEnd, + kNumberTypeBool, + kNumberTypeInt, + kNumberTypeInt8, + kNumberTypeInt16, + kNumberTypeInt32, + kNumberTypeInt64, + kNumberTypeUInt, + kNumberTypeUInt8, + kNumberTypeUInt16, + kNumberTypeUInt32, + kNumberTypeUInt64, + kNumberTypeFloat, + kNumberTypeFloat16, + kNumberTypeFloat32, + kNumberTypeFloat64, + kNumberTypeEnd +}; +// +// TypeId name map +// +const std::unordered_map type_name_map = { + {kNumberTypeBool, "Bool"}, {kNumberTypeInt8, "Int8"}, {kNumberTypeUInt8, "UInt8"}, + {kNumberTypeInt16, "Int16"}, {kNumberTypeInt32, "Int32"}, {kNumberTypeInt64, "Int64"}, + {kNumberTypeFloat16, "Float16"}, {kNumberTypeFloat32, "Float32"}, {kNumberTypeFloat64, "Float64"}}; +} // namespace mindspore +#endif // MINDSPORE_CCSRC_IR_DTYPE_TYPE_ID_H_ diff --git a/mindspore/ccsrc/ir/func_graph.cc b/mindspore/ccsrc/ir/func_graph.cc index d5d80eb2f0..cdca98fc61 100644 --- a/mindspore/ccsrc/ir/func_graph.cc +++ b/mindspore/ccsrc/ir/func_graph.cc @@ -34,7 +34,7 @@ namespace mindspore { * Methods of Graph */ FuncGraph::FuncGraph() - : flags_(), + : attrs_(), 
transforms_(), parameter_default_value_(), seen_(0), @@ -95,13 +95,27 @@ ParameterPtr FuncGraph::AddWeightParameter(const std::string &name) { return p; } -bool FuncGraph::has_flag(const std::string &flag) { - if (flags_.count(flag)) { - return flags_[flag]; +bool FuncGraph::has_flag(const std::string &key) { + auto iter = attrs_.find(key); + if (iter != attrs_.cend()) { + if (iter->second->isa()) { + return GetValue(iter->second); + } + MS_LOG(WARNING) << "key " << key << " is not a flag, please use has_attr function."; } return false; } +bool FuncGraph::has_attr(const std::string &key) { + auto iter = attrs_.find(key); + return !(iter == attrs_.cend()); +} + +ValuePtr FuncGraph::get_attr(const std::string &key) { + auto iter = attrs_.find(key); + return iter == attrs_.cend() ? nullptr : iter->second; +} + CNodePtr FuncGraph::NewCNode(const std::vector &inputs) { CNodePtr cnode = std::make_shared(inputs, shared_from_base()); if (has_flag(GRAPH_FLAG_HAS_EFFECT)) { diff --git a/mindspore/ccsrc/ir/func_graph.h b/mindspore/ccsrc/ir/func_graph.h index 1a367bde92..5f09dfe6b5 100644 --- a/mindspore/ccsrc/ir/func_graph.h +++ b/mindspore/ccsrc/ir/func_graph.h @@ -38,6 +38,32 @@ namespace mindspore { using BaseRefCounterMap = OrderedMap; using FuncGraphCounterMap = OrderedMap; +struct CNodeIndexHasher { + std::size_t operator()(const CNodeIndexPairPtr pair) const { + MS_EXCEPTION_IF_NULL(pair); + MS_EXCEPTION_IF_NULL(pair->first); + return hash_combine(pair->first->hash(), std::hash()(pair->second)); + } +}; + +struct CNodeIndexEqual { + bool operator()(const CNodeIndexPairPtr lhs, const CNodeIndexPairPtr rhs) const { + if (lhs == nullptr || rhs == nullptr) { + return false; + } + if (lhs == rhs) { + return true; + } + if (lhs->first != rhs->first) { + return false; + } + if (lhs->second != rhs->second) { + return false; + } + return true; + } +}; + template , class CounterEqual = std::equal_to> using CounterOrderedMap = OrderedMap; using AnfNodeCounterMap = 
CounterOrderedMap; @@ -48,6 +74,7 @@ using FuncGraphMap = OrderedMap; const char FUNC_GRAPH_FLAG_IGNORE_VALUES[] = "ignore_values"; const char FUNC_GRAPH_FLAG_DEFER_INLINE[] = "defer_inline"; const char FUNC_GRAPH_FLAG_CORE[] = "core"; +const char FUNC_GRAPH_ATTR_GRAPH_KERNEL[] = "graph_kernel"; const char FUNC_GRAPH_FLAG_SPECIALIZE_PARAMETER[] = "spec_param"; namespace abstract { @@ -57,9 +84,6 @@ class AbstractFunction; using AbstractFunctionPtr = std::shared_ptr; } // namespace abstract -class FuncGraphManager; -using FuncGraphManagerPtr = std::shared_ptr; - // ANF transform class // either a primitive or a func_graph class FuncGraphTransform { @@ -172,10 +196,19 @@ class FuncGraph : public FuncGraphBase { void set_is_generate(bool generated) { is_generated_ = generated; } bool is_generated() const { return is_generated_; } - bool has_flag(const std::string &flag); - std::unordered_map &flags() { return flags_; } - void set_flags(const std::unordered_map &flags) { flags_ = flags; } - void set_flags(const std::string &key, const bool value) { flags_[key] = value; } + std::unordered_map &attrs() { return attrs_; } + void set_attrs(const std::unordered_map &attrs) { + for (auto &attr : attrs) { + attrs_[attr.first] = attr.second; + } + } + bool has_flag(const std::string &key); + void set_flag(const std::string &key, bool flag) { attrs_[key] = MakeValue(flag); } + void erase_flag(const std::string &key) { (void)attrs_.erase(key); } + + bool has_attr(const std::string &key); + ValuePtr get_attr(const std::string &key); + void set_attr(const std::string &key, const ValuePtr &value) { attrs_[key] = value; } std::unordered_map &transforms() { return transforms_; } void set_transforms(const std::unordered_map &transforms) { @@ -294,7 +327,7 @@ class FuncGraph : public FuncGraphBase { std::unordered_map &make_ref_params() { return make_ref_params_; } - std::unordered_map flags_; + std::unordered_map attrs_; std::unordered_map transforms_; // parameter default value 
std::map parameter_default_value_; diff --git a/mindspore/ccsrc/ir/func_graph_cloner.cc b/mindspore/ccsrc/ir/func_graph_cloner.cc index 4622bf9ea2..4a0c69d99a 100644 --- a/mindspore/ccsrc/ir/func_graph_cloner.cc +++ b/mindspore/ccsrc/ir/func_graph_cloner.cc @@ -90,6 +90,7 @@ void Cloner::CloneCNode(const AnfNodePtr &node, const FuncGraphPtr &target) { new_node->set_abstract(old_node->abstract()); ScopePtr scope = (node->scope() != kDefaultScope) ? node->scope() : this->scope(); new_node->set_scope(scope); + new_node->set_kernel_info(old_node->kernel_info_ptr()); repl_node_[old_node] = new_node; nodes_.emplace_back(old_node, new_node); TraceManager::EndTrace(); @@ -211,7 +212,7 @@ void Cloner::SetFuncGraphInfo(const FuncGraphPtr &func_graph, FuncGraphPtr *cons MS_EXCEPTION_IF_NULL(target_func_graph); TraceManager::DebugTrace(func_graph->debug_info(), target_relation_); *target_func_graph = std::make_shared(); - (*target_func_graph)->set_flags(func_graph->flags()); + (*target_func_graph)->set_attrs(func_graph->attrs()); (*target_func_graph)->set_transforms(func_graph->transforms()); (*target_func_graph)->set_has_vararg(func_graph->has_vararg()); (*target_func_graph)->set_has_kwarg(func_graph->has_kwarg()); @@ -636,9 +637,14 @@ FuncGraphPtr TransformableClone(const FuncGraphPtr &func_graph, const TraceInfoP if (MsContext::GetInstance()->is_multi_graph_sink()) { if (func_graph->has_flag(FUNC_GRAPH_FLAG_IGNORE_VALUES)) { - new_func_graph->set_flags(FUNC_GRAPH_FLAG_IGNORE_VALUES, true); + new_func_graph->set_flag(FUNC_GRAPH_FLAG_IGNORE_VALUES, true); } } + + if (func_graph->has_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL)) { + new_func_graph->set_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL, func_graph->get_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL)); + } + return new_func_graph; } } // namespace mindspore diff --git a/mindspore/ccsrc/ir/func_graph_extends.cc b/mindspore/ccsrc/ir/func_graph_extends.cc index 14998a1eaa..ad7aa6ee0c 100644 --- a/mindspore/ccsrc/ir/func_graph_extends.cc +++ 
b/mindspore/ccsrc/ir/func_graph_extends.cc @@ -399,8 +399,8 @@ void FuncGraph::ReleaseFullOrderToEffectOrder() { depend_inputs.push_back(*iter); } } - set_flags(GRAPH_FLAG_HAS_EFFECT, false); - set_flags(GRAPH_FLAG_EFFECT_PATIAL_ORDER, true); + set_flag(GRAPH_FLAG_HAS_EFFECT, false); + set_flag(GRAPH_FLAG_EFFECT_PATIAL_ORDER, true); if (!depend_inputs.empty()) { SetEffectDepends(depend_inputs); } diff --git a/mindspore/ccsrc/minnie/param_value_minnie.h b/mindspore/ccsrc/ir/lite/param_value_lite.h similarity index 72% rename from mindspore/ccsrc/minnie/param_value_minnie.h rename to mindspore/ccsrc/ir/lite/param_value_lite.h index 684d8abd5d..2b249cfa4f 100644 --- a/mindspore/ccsrc/minnie/param_value_minnie.h +++ b/mindspore/ccsrc/ir/lite/param_value_lite.h @@ -14,18 +14,18 @@ * limitations under the License. */ -#ifndef MINDSPORE_CCSRC_MINNIE_PARAM_VALUE_MINNIE_H_ -#define MINDSPORE_CCSRC_MINNIE_PARAM_VALUE_MINNIE_H_ +#ifndef MINDSPORE_CCSRC_MINNIE_PARAM_VALUE_LITE_H_ +#define MINDSPORE_CCSRC_MINNIE_PARAM_VALUE_LITE_H_ #include #include "ir/anf.h" namespace mindspore { -class ParamValueMinnie : public ParamValue { +class ParamValueLite : public ParamValue { public: - ParamValueMinnie() : tensor_addr_(nullptr), tensor_size_(0) {} - virtual ~ParamValueMinnie() = default; + ParamValueLite() : tensor_addr_(nullptr), tensor_size_(0) {} + virtual ~ParamValueLite() = default; size_t tensor_size() const { return tensor_size_; } void set_tensor_size(size_t size) { tensor_size_ = size; } @@ -38,7 +38,6 @@ class ParamValueMinnie : public ParamValue { size_t tensor_size_; }; -using ParamValueMinniePtr = std::shared_ptr; - +using ParamValueLitePtr = std::shared_ptr; } // namespace mindspore -#endif // MINDSPORE_CCSRC_MINNIE_PARAM_VALUE_MINNIE_H_ +#endif // MINDSPORE_CCSRC_MINNIE_PARAM_VALUE_LITE_H_ diff --git a/mindspore/ccsrc/ir/lite/tensor.cc b/mindspore/ccsrc/ir/lite/tensor.cc new file mode 100644 index 0000000000..2957495aa4 --- /dev/null +++ 
b/mindspore/ccsrc/ir/lite/tensor.cc @@ -0,0 +1,152 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include "ir/lite/tensor.h" +#include "securec/include/securec.h" + +namespace mindspore { +namespace tensor { +#define kMaxMallocSize 1024 * 1024 * 100 +Tensor::Tensor(const TypeId data_type, const std::vector &shape) : MetaTensor(data_type, shape) {} + +Tensor::Tensor(const TypePtr &type_ptr, const std::vector &shape) : MetaTensor(type_ptr, shape) {} + +Tensor::Tensor(const Tensor &tensor) : MetaTensor(tensor) { + this->data_type_ = tensor.data_type_; + this->shape_ = tensor.shape_; + auto ret = CopyTensorData(tensor); + if (0 != ret) { + MS_LOG(EXCEPTION) << "CopyTensorData error"; + } +} + +int Tensor::CopyTensorData(const Tensor &srcTensor) { + if (srcTensor.data_ == nullptr) { + MS_LOG(ERROR) << "data of srcTensor is nullptr"; + return -1; + } + size_t data_size = this->Size(); + MS_ASSERT(data_size == tensor.Size()); + if (this->data_ == nullptr) { + if (data_size > kMaxMallocSize) { + MS_LOG(ERROR) << "Malloc size is too big while coping data, " << data_size << " bytes"; + return -1; + } + this->data_ = malloc(data_size); + } + memcpy_s(this->data_, data_size, tensor.data_, tensor.Size()); + return 0; +} + +Tensor::~Tensor() { + if (nullptr != this->data_) { + free(this->data_); + } +} + +Tensor &Tensor::operator=(const Tensor &tensor) { + if (&tensor == this) { + 
return *this; + } + this->shape_ = tensor.shape_; + this->data_type_ = tensor.data_type_; + auto ret = CopyTensorData(tensor); + if (0 != ret) { + MS_LOG(EXCEPTION) << "CopyTensorData error"; + } + return *this; +} + +bool Tensor::operator==(const Tensor &tensor) { + return data_ == tensor.data_ && shape_ == tensor.shape_ && data_type_ == tensor.data_type_; +} + +bool Tensor::operator==(const Value &other) const { + if (other.isa()) { + auto other_ = static_cast(other); + return *this == other_; + } else { + return false; + } +} +} // namespace tensor + +namespace inference { +MSTensor *MSTensor::CreateTensor(TypeId data_type, const std::vector &shape) { + return new Tensor(data_type, shape); +} + +Tensor::Tensor() { this->tensor_impl_ = std::make_shared(); } + +Tensor::Tensor(TypeId data_type, const std::vector &shape) { + this->tensor_impl_ = std::make_shared(data_type, shape); +} + +Tensor::Tensor(std::shared_ptr tensor_ptr) { this->tensor_impl_ = std::move(tensor_ptr); } + +TypeId Tensor::data_type() const { + MS_ASSERT(this->tensor_impl_ != nullptr); + return this->tensor_impl_->data_type(); +} + +TypeId Tensor::set_data_type(TypeId data_type) { + MS_ASSERT(this->tensor_impl_ != nullptr); + return this->tensor_impl_->set_data_type(data_type); +} + +std::vector Tensor::shape() const { + MS_ASSERT(this->tensor_impl_ != nullptr); + return this->tensor_impl_->shape(); +} + +size_t Tensor::set_shape(const std::vector &shape) { + MS_ASSERT(this->tensor_impl_ != nullptr); + return this->tensor_impl_->set_shape(shape); +} + +int Tensor::DimensionSize(size_t index) const { + MS_ASSERT(this->tensor_impl_ != nullptr); + return this->tensor_impl_->DimensionSize(index); +} + +int Tensor::ElementsNum() const { + MS_ASSERT(this->tensor_impl_ != nullptr); + return this->tensor_impl_->ElementsNum(); +} + +std::size_t Tensor::hash() const { + MS_ASSERT(this->tensor_impl_ != nullptr); + return this->tensor_impl_->hash(); +} + +std::shared_ptr Tensor::tensor() const { + 
MS_ASSERT(this->tensor_impl_ != nullptr); + return this->tensor_impl_; +} + +size_t Tensor::Size() const { + MS_ASSERT(this->tensor_impl_ != nullptr); + return this->tensor_impl_->Size(); +} + +void *Tensor::MutableData() const { + MS_ASSERT(this->tensor_impl_ != nullptr); + return this->tensor_impl_->data(); +} +} // namespace inference +} // namespace mindspore diff --git a/mindspore/ccsrc/ir/lite/tensor.h b/mindspore/ccsrc/ir/lite/tensor.h new file mode 100644 index 0000000000..0dcf5cc0ee --- /dev/null +++ b/mindspore/ccsrc/ir/lite/tensor.h @@ -0,0 +1,97 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_IR_LITE_TENSOR_H_ +#define MINDSPORE_CCSRC_IR_LITE_TENSOR_H_ + +#include +#include +#include "ir/meta_tensor.h" +#include "ir/dtype/type.h" + +namespace mindspore { +namespace tensor { +class Tensor : public MetaTensor { + public: + Tensor() : MetaTensor() {} + + Tensor(const TypeId data_type, const std::vector &shape); + + Tensor(const TypePtr &type_ptr, const std::vector &shape); + + Tensor(const Tensor &tensor); + + ~Tensor(); + + int CopyTensorData(const Tensor &srcTensor); + + MS_DECLARE_PARENT(Tensor, MetaTensor) + + virtual Tensor &operator=(const Tensor &tensor); + + virtual bool operator==(const Tensor &tensor); + + bool operator==(const Value &other) const override; + + size_t Size() const { return MetaTensor::ElementsNum() * GetTypeByte(TypeIdToType(this->data_type_)); } + + void *Data() const { return data_; } + + protected: + void *data_; +}; + +using TensorPtr = std::shared_ptr; +} // namespace tensor + +namespace inference { +class Tensor : public MSTensor { + public: + Tensor(); + + Tensor(TypeId data_type, const std::vector &shape); + + explicit Tensor(std::shared_ptr tensor_ptr); + + ~Tensor() = default; + + TypeId data_type() const override; + + TypeId set_data_type(const TypeId data_type) override; + + std::vector shape() const override; + + size_t set_shape(const std::vector &shape) override; + + int DimensionSize(size_t index) const override; + + int ElementsNum() const override; + + std::size_t hash() const override; + + std::shared_ptr tensor() const; + + size_t Size() const override; + + void *MutableData() const override; + + protected: + std::shared_ptr tensor_impl_; +}; +} // namespace inference +} // namespace mindspore + +#endif // MINDSPORE_CCSRC_IR_LITE_TENSOR_H_ diff --git a/mindspore/ccsrc/ir/manager.cc b/mindspore/ccsrc/ir/manager.cc index 4b9f0c22e9..291a752405 100644 --- a/mindspore/ccsrc/ir/manager.cc +++ b/mindspore/ccsrc/ir/manager.cc @@ -328,9 +328,6 @@ void 
FuncGraphManager::ProcessEdge(AnfNodePtr node, int index, AnfNodePtr inp, E DropEdge(node, index, inp); } else { MS_LOG(DEBUG) << "Add node " << node->ToString() << " input[" << index << "] " << inp->ToString(); - if (inp->func_graph() != nullptr) { - AddFuncGraph(inp->func_graph()); - } if (IsValueNode(inp)) { MS_LOG(DEBUG) << "Input[" << index << "] is const graph " << inp->ToString(); AddFuncGraph(GetValueNode(inp)); @@ -372,9 +369,8 @@ void FuncGraphManager::AcquireNodes(const std::vector &nodes) { for (auto &node : acq) { MS_EXCEPTION_IF_NULL(node); - FuncGraphPtr fg = node->func_graph(); + auto fg = node->func_graph(); if (fg != nullptr) { - AddFuncGraph(fg); fg->AddNode(node); } ProcessInputs(node, kIncEdge); @@ -468,7 +464,7 @@ void FuncGraphManager::MoveAllCNodeDropGraph(FuncGraphPtr source, FuncGraphPtr t } } -inline void FuncGraphManager::AddEdge(AnfNodePtr node, int index, AnfNodePtr input) { +void FuncGraphManager::AddEdge(AnfNodePtr node, int index, AnfNodePtr input) { auto fg = node->func_graph(); if (input->isa()) { fg->AddValueNode(input); @@ -489,7 +485,7 @@ inline void FuncGraphManager::AddEdge(AnfNodePtr node, int index, AnfNodePtr inp } } -inline void FuncGraphManager::DropEdge(AnfNodePtr node, int index, AnfNodePtr input) { +void FuncGraphManager::DropEdge(AnfNodePtr node, int index, AnfNodePtr input) { auto fg = node->func_graph(); if (input->isa()) { fg->DropValueNode(input); @@ -510,7 +506,7 @@ inline void FuncGraphManager::DropEdge(AnfNodePtr node, int index, AnfNodePtr in } } -inline void FuncGraphManager::MoveAllNodes(FuncGraphPtr source, FuncGraphPtr target) { +void FuncGraphManager::MoveAllNodes(FuncGraphPtr source, FuncGraphPtr target) { target->CopyNodes(source); target->CopyValueNodes(source); target->CopyFuncGraphCNodesIndex(source); @@ -637,103 +633,7 @@ void FuncGraphTransaction::Commit() { manager_->CommitChanges(changes); } -FuncGraphAnalysis::FuncGraphAnalysis(const FuncGraphManager *const manager) - : manager_(manager), 
include_func_graph_none_(false) {} - -DepCollector::DepCollector(const FuncGraphManager *const manager) : FuncGraphAnalysis(manager) { - MS_EXCEPTION_IF_NULL(manager_); -} - -void DepCollector::OnAddEdge(AnfNodePtr node, int index, AnfNodePtr inp) { OnModEdge(node, index, inp, kIncEdge); } - -void DepCollector::OnDropEdge(AnfNodePtr node, int index, AnfNodePtr inp) { OnModEdge(node, index, inp, kDecEdge); } - -template -bool CounterAnfNodeCollector::Inc(const FuncGraphPtr &func_graph, - const ValueT &key, int count) { - auto &d = count_nodes_map_[func_graph]; - if (d.count(key) == 0) { - d[key] = count; - return true; - } else { - d[key] += count; - } - return false; -} - -template -bool CounterAnfNodeCollector::Dec(const FuncGraphPtr &func_graph, - const ValueT &key, int count) { - MS_EXCEPTION_IF_NULL(func_graph); - auto &d = count_nodes_map_[func_graph]; - if (d.count(key) != 0) { - if (d[key] == count) { - (void)d.erase(key); - return true; - } else { - d[key] -= count; - if (d[key] < 0) { - MS_LOG(EXCEPTION) << "Count of key '" << key - << "' dec from 0. NodeInfo: " << trace::GetDebugInfo(func_graph->debug_info()); - } - } - } - return false; -} - -template -bool CounterAnfNodeCollector::Mod(const FuncGraphPtr &func_graph, - const ValueT &key, int count) { - if (count > 0) { - return Inc(func_graph, key, count); - } else if (count < 0) { - return Dec(func_graph, key, -count); - } else { - MS_LOG(EXCEPTION) << "Count of key '" << key - << "' cannot be 0. 
NodeInfo: " << trace::GetDebugInfo(func_graph->debug_info()); - } -} - -bool CounterFuncGraphCollector::Inc(const FuncGraphPtr &func_graph, const FuncGraphPtr &key, int count = 1) { - auto &d = count_func_graphs_map_[func_graph]; - if (d.count(key) == 0) { - d[key] = count; - return true; - } else { - d[key] += count; - } - return false; -} - -bool CounterFuncGraphCollector::Dec(const FuncGraphPtr &func_graph, const FuncGraphPtr &key, int count = 1) { - auto &d = count_func_graphs_map_[func_graph]; - if (d.count(key) != 0) { - if (d[key] == count) { - (void)d.erase(key); - return true; - } else { - d[key] -= count; - if (d[key] < 0) { - MS_LOG(EXCEPTION) << "Count of key '" << key->ToString() - << "' dec from 0. NodeInfo: " << trace::GetDebugInfo(func_graph->debug_info()); - } - } - } - return false; -} - -bool CounterFuncGraphCollector::Mod(const FuncGraphPtr &func_graph, const FuncGraphPtr &key, int count) { - if (count > 0) { - return Inc(func_graph, key, count); - } else if (count < 0) { - return Dec(func_graph, key, -count); - } else { - MS_LOG(EXCEPTION) << "Count of key '" << key->ToString() - << "' cannot be 0. 
NodeInfo: " << trace::GetDebugInfo(func_graph->debug_info()); - } -} - -DepComputer::DepComputer(const FuncGraphManager *const manager) : FuncGraphAnalysis(manager) { +DepComputer::DepComputer(const FuncGraphManager *const manager) : manager_(manager) { MS_EXCEPTION_IF_NULL(manager_); manager_->signals()->InvalidateComputer.connect(this, &DepComputer::OnInvalidateComputer); validate_ = false; @@ -843,16 +743,15 @@ void FVTotalComputer::RealRecompute() { for (auto &fg : manager->func_graphs()) { fv_total_analysis_[fg] = OrderedMap(); - count_nodes_map_[fg] = OrderedMap(); - count_func_graphs_map_[fg] = OrderedMap(); } for (auto &fg : manager->func_graphs()) { + // add all free variable nodes AnfNodeCounterMap items = fg->free_variables(); for (auto &iter : items) { auto curr = fg; while (curr != nullptr) { - (void)CounterAnfNodeCollector::Mod(curr, iter.first, iter.second); + fv_total_analysis_[curr][iter.first] = iter.second; curr = manager->parent(curr); if (curr != nullptr) { const AnfNodeSet &all_nodes = curr->nodes(); @@ -863,6 +762,7 @@ void FVTotalComputer::RealRecompute() { } } + // add all FGs of free variables auto &used = fg->func_graphs_used(); for (auto &iter : used) { auto p = manager->parent(iter.first); @@ -871,21 +771,11 @@ void FVTotalComputer::RealRecompute() { } auto curr = fg; while (curr != p) { - (void)CounterFuncGraphCollector::Mod(curr, iter.first, iter.second); + fv_total_analysis_[curr][iter.first] = iter.second; curr = manager->parent(curr); } } } - for (auto &fg : manager->func_graphs()) { - auto &fvp = count_nodes_map_[fg]; - auto &fvg = count_func_graphs_map_[fg]; - for (auto &item : fvp) { - fv_total_analysis_[fg][item.first] = item.second; - } - for (auto &item : fvg) { - fv_total_analysis_[fg][item.first] = item.second; - } - } } void FuncGraphsUsedTotalComputer::RealRecompute(FuncGraphPtr fg) { diff --git a/mindspore/ccsrc/ir/manager.h b/mindspore/ccsrc/ir/manager.h index e4e5a1fba8..5da3812d25 100644 --- 
a/mindspore/ccsrc/ir/manager.h +++ b/mindspore/ccsrc/ir/manager.h @@ -88,14 +88,6 @@ FuncGraphManagerPtr Manage(const std::vector &func_graphs, bool ma FuncGraphManagerPtr MakeManager(const std::vector &func_graphs = {}, bool manage = true); struct Signals { - Signal AddFuncGraph; - Signal DropFuncGraph; - Signal AddNode; - Signal DropNode; - Signal AddEdge; - Signal DropEdge; - Signal MoveAllCNode; - Signal InvalidateCollector; Signal InvalidateComputer; }; @@ -103,136 +95,15 @@ enum EdgeProcessDirection { kDecEdge = -1, kIncEdge = 1 }; using CNodeIndexPair = std::pair; using CNodeIndexPairPtr = std::shared_ptr; - -using FuncGraphToFuncGraphCounterMap = OrderedMap>; -template , class CollectorEqual = std::equal_to> -using FuncGraphToAnfNodeCounterMap = OrderedMap>; - -// analysis base class -class FuncGraphAnalysis { - public: - explicit FuncGraphAnalysis(const FuncGraphManager *const manager); - - virtual ~FuncGraphAnalysis() { manager_ = nullptr; } - - virtual size_t size() const { return 0; } - - virtual void OnAddFuncGraph(FuncGraphPtr) {} - - virtual void OnDropFuncGraph(FuncGraphPtr) {} - - virtual void OnMoveAllCNode(FuncGraphPtr, FuncGraphPtr) {} - - protected: - // subclass can reset their own member; - virtual void ExtraReset() {} - - virtual void OnAddNode(AnfNodePtr n) {} - - virtual void OnDropNode(AnfNodePtr n) {} - - virtual void OnAddEdge(AnfNodePtr, int, AnfNodePtr) {} - - virtual void OnDropEdge(AnfNodePtr, int, AnfNodePtr) {} - - const FuncGraphManager *manager_; - bool include_func_graph_none_; -}; - -using FuncGraphToAnfNodeMap = OrderedMap; - -struct CNodeIndexHasher { - std::size_t operator()(const CNodeIndexPairPtr pair) const { - MS_EXCEPTION_IF_NULL(pair); - MS_EXCEPTION_IF_NULL(pair->first); - return hash_combine(pair->first->hash(), std::hash()(pair->second)); - } -}; - -struct CNodeIndexEqual { - bool operator()(const CNodeIndexPairPtr lhs, const CNodeIndexPairPtr rhs) const { - if (lhs == nullptr || rhs == nullptr) { - return false; - 
} - if (lhs == rhs) { - return true; - } - if (lhs->first != rhs->first) { - return false; - } - if (lhs->second != rhs->second) { - return false; - } - return true; - } -}; - -// graphs analysis which compute in write, read needn't recompute -class DepCollector : public FuncGraphAnalysis { - public: - explicit DepCollector(const FuncGraphManager *manager); - ~DepCollector() override = default; - - void Reset() { ExtraReset(); } - void OnInvalidateCollector() { Reset(); } - - protected: - // inherit from FuncGraphAnalysis - void OnAddEdge(AnfNodePtr node, int index, AnfNodePtr inp) override; - void OnDropEdge(AnfNodePtr node, int index, AnfNodePtr inp) override; - // subclass can override; - virtual void OnModEdge(AnfNodePtr, int, AnfNodePtr, EdgeProcessDirection) {} -}; - -class CounterFuncGraphCollector : public DepCollector { - public: - explicit CounterFuncGraphCollector(const FuncGraphManager *m) : DepCollector(m) {} - ~CounterFuncGraphCollector() override = default; - FuncGraphToFuncGraphCounterMap &count_func_graphs_map() { return count_func_graphs_map_; } - // inherit from FuncGraphAnalysis - size_t size() const override { return count_func_graphs_map_.size(); } - void OnAddFuncGraph(FuncGraphPtr fg) final { count_func_graphs_map_[fg] = OrderedMap(); } - void OnDropFuncGraph(FuncGraphPtr fg) final { (void)count_func_graphs_map_.erase(fg); } - bool Inc(const FuncGraphPtr &func_graph, const FuncGraphPtr &key, int count); - bool Dec(const FuncGraphPtr &func_graph, const FuncGraphPtr &key, int count); - bool Mod(const FuncGraphPtr &func_graph, const FuncGraphPtr &key, int count); - - FuncGraphToFuncGraphCounterMap count_func_graphs_map_; - - protected: - void ExtraReset() override { count_func_graphs_map_.clear(); } -}; - -template , class CollectorEqual = std::equal_to> -class CounterAnfNodeCollector : public DepCollector { - public: - explicit CounterAnfNodeCollector(const FuncGraphManager *m) : DepCollector(m) {} - ~CounterAnfNodeCollector() override = 
default; - FuncGraphToAnfNodeCounterMap &count_nodes_map() { return count_nodes_map_; } - - size_t size() const override { return count_nodes_map_.size(); } - void OnAddFuncGraph(FuncGraphPtr fg) final { - count_nodes_map_[fg] = OrderedMap(); - } - void OnDropFuncGraph(FuncGraphPtr fg) final { (void)count_nodes_map_.erase(fg); } - - bool Inc(const FuncGraphPtr &func_graph, const ValueT &key, int count); - bool Dec(const FuncGraphPtr &func_graph, const ValueT &key, int count); - bool Mod(const FuncGraphPtr &func_graph, const ValueT &key, int count); - - FuncGraphToAnfNodeCounterMap count_nodes_map_; - - protected: - void ExtraReset() override { count_nodes_map_.clear(); } -}; - using FuncGraphToFuncGraphSetMap = OrderedMap; -// graphs analysis which need dynamic compute by DepCollector in each read -class DepComputer : public FuncGraphAnalysis { +// analysis base class, graphs analysis which need dynamic compute by DepCollector in each read +class DepComputer { public: explicit DepComputer(const FuncGraphManager *manager); - ~DepComputer() override = default; + virtual ~DepComputer() { manager_ = nullptr; } + + virtual size_t size() const { return 0; } void Reset() { ExtraReset(); @@ -250,15 +121,14 @@ class DepComputer : public FuncGraphAnalysis { bool IsValidate(const FuncGraphPtr &fg) { return func_graphs_validate_[fg]; } - void OnAddFuncGraph(FuncGraphPtr) final { Reset(); } - - void OnDropFuncGraph(FuncGraphPtr) final { Reset(); } - protected: + // subclass can reset their own member; + virtual void ExtraReset() {} // subclass do the real compute virtual void RealRecompute() {} virtual void RealRecompute(FuncGraphPtr) {} + const FuncGraphManager *manager_; bool validate_; OrderedMap func_graphs_validate_; @@ -345,12 +215,9 @@ class ScopeComputer final : public DepComputer { using FVTotalMap = OrderedMap>; -class FVTotalComputer final : public DepComputer, - public CounterAnfNodeCollector, - public CounterFuncGraphCollector { +class FVTotalComputer final : 
public DepComputer { public: - explicit FVTotalComputer(const FuncGraphManager *m) - : DepComputer(m), CounterAnfNodeCollector(m), CounterFuncGraphCollector(m) {} + explicit FVTotalComputer(const FuncGraphManager *m) : DepComputer(m) {} ~FVTotalComputer() override = default; FVTotalMap &fv_total_analysis() { return fv_total_analysis_; } diff --git a/mindspore/ccsrc/ir/meta_tensor.h b/mindspore/ccsrc/ir/meta_tensor.h index a85ef77e83..d78caf3b5d 100644 --- a/mindspore/ccsrc/ir/meta_tensor.h +++ b/mindspore/ccsrc/ir/meta_tensor.h @@ -29,7 +29,7 @@ // brief mindspore namespace. // -// mindspore namespace is the top level namespace of Mindsporeession project. +// mindspore namespace is the top level namespace of MindSpore project. // Other namespace should be a sub namespace of mindspore namespace in the ME project. namespace mindspore { diff --git a/mindspore/ccsrc/ir/optimizer_caller.h b/mindspore/ccsrc/ir/optimizer_caller.h new file mode 100644 index 0000000000..bd30454147 --- /dev/null +++ b/mindspore/ccsrc/ir/optimizer_caller.h @@ -0,0 +1,29 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_IR_OPTIMIZER_CALLER_H_ +#define MINDSPORE_CCSRC_IR_OPTIMIZER_CALLER_H_ + +#include "ir/anf.h" +#include "optimizer/opt.h" + +namespace mindspore { +class OptimizerCaller { + public: + virtual AnfNodePtr operator()(const opt::OptimizerPtr &, const AnfNodePtr &) { return nullptr; } +}; +} // namespace mindspore +#endif // MINDSPORE_CCSRC_IR_OPTIMIZER_CALLER_H_ diff --git a/mindspore/ccsrc/ir/param_value_py.h b/mindspore/ccsrc/ir/param_value_py.h index 6841f4c040..a03e34ac6e 100644 --- a/mindspore/ccsrc/ir/param_value_py.h +++ b/mindspore/ccsrc/ir/param_value_py.h @@ -28,7 +28,7 @@ namespace py = pybind11; class ParamValuePy : public ParamValue { public: ParamValuePy() : value_(py::none()) {} - explicit ParamValuePy(py::object value) : value_(value) {} + explicit ParamValuePy(const py::object &value) : value_(value) {} ~ParamValuePy() override = default; py::object value() { return value_; } diff --git a/mindspore/ccsrc/ir/pattern_matcher.h b/mindspore/ccsrc/ir/pattern_matcher.h new file mode 100644 index 0000000000..6605b9ce4c --- /dev/null +++ b/mindspore/ccsrc/ir/pattern_matcher.h @@ -0,0 +1,310 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_IR_PATTERN_MATCHER_H_ +#define MINDSPORE_CCSRC_IR_PATTERN_MATCHER_H_ + +#include +#include + +#include "ir/anf.h" +#include "operator/ops.h" + +namespace mindspore { + +/// +/// Base class for all recognizable patterns. +/// We implement an Expression Template approach using static polymorphism based on +/// the Curiously Recurring Template Pattern (CRTP) which "achieves a similar effect +/// to the use of virtual functions without the costs..." as described in: +/// https://en.wikipedia.org/wiki/Expression_templates and +/// https://en.wikipedia.org/wiki/Curiously_recurring_template_pattern +/// The TryCapture function tries to capture the pattern with the given node. +/// The GetNode function builds a new node using the captured values. +/// + +template +class PBase { + public: + bool CheckFunc(const opt::PredicateFuncType &func, const AnfNodePtr &node) { + return func(get_object().GetNode(node)); + } + + const T &get_object() const { return *static_cast(this); } + + template + bool TryCapture(const TN &value) const { + get_object().Reset(); + return get_object().TryCapture_(value); + } + + using Internal = T; +}; + +template +class PIsEqual { + public: + bool operator()(const T &lhs, const T &rhs) const { return lhs == rhs; } +}; + +template +class PatternNode : public PBase > { + public: + T GetNode(const AnfNodePtr &node) const { + if (!captured_) { + MS_EXCEPTION(ValueError) << "A Pattern wasn't captured for this Token before the call to GetNode."; + } + return captured_node_; + } + + bool TryCapture_(const T &node) const { + if (!captured_) { + captured_node_ = node; + captured_ = true; + return true; + } + return PIsEqual()(captured_node_, node); + } + + void Reset() const { captured_ = false; } + using Internal = const PatternNode &; + + protected: + mutable T captured_node_; + mutable bool captured_{false}; +}; + +template +class PBinOperation : public PBase > { + public: + PBinOperation(const PrimitivePtr &prim, const T 
&x, const T2 &y) : prim_(prim), x_(x), y_(y) {} + + AnfNodePtr GetNode(const AnfNodePtr &node) const { + AnfNodePtr lhs = x_.GetNode(node->func_graph()); + AnfNodePtr rhs = y_.GetNode(node->func_graph()); + AnfNodePtrList list = {prim_->cast(), lhs, rhs}; + return NewCNode(list, node->func_graph()); + } + + bool TryCapture_(const AnfNodePtr &node) const { + if (IsPrimitiveCNode(node, prim_)) { + auto cnode = node->cast(); + auto inputs = cnode->inputs(); + if (inputs.size() == 3) { + // Binary Prim assumes only two inputs + if (!x_.TryCapture_(inputs[1]) || !y_.TryCapture_(inputs[2])) { + return false; + } + return true; + } + } + return false; + } + + void Reset() const { + x_.Reset(); + y_.Reset(); + } + + private: + const PrimitivePtr prim_; + typename T::Internal x_; + typename T2::Internal y_; +}; + +/// +/// Helper functions to apply a pattern function on all elements of a tuple +/// +namespace tuple_utils { +template +struct apply_func_tuple_item { + template + static void apply(Func *func, const TTuple &tuple) { + (*func)(Index, std::get(tuple)); + apply_func_tuple_item<(Index + 1) == std::tuple_size::value, (Index + 1), Func>::apply(func, tuple); + } +}; + +template +struct apply_func_tuple_item { + template + static void apply(Func *func, const TTuple &tuple) {} +}; + +template +inline void apply_func_tuple(Func *func, const TTuple &tuple) { + apply_func_tuple_item::value == 0, 0, Func>::apply(func, tuple); +} + +struct PTupleResetCapture { + template + void operator()(size_t i, const T &pattern) const { + pattern.Reset(); + } +}; + +struct PTupleCapture { + explicit PTupleCapture(const AnfNodePtrList tuple) : tuple_(tuple) {} + + template + void operator()(size_t i, const TPattern &pattern) { + // Check if the first node is a Primitive + if (i == 0 && tuple_[i]->isa()) { + auto prim = tuple_[i]->cast(); + if (tuple_[i] != pattern.GetNode(tuple_[i])) { + captured_ = false; + } + } else { + captured_ = captured_ && pattern.TryCapture_(tuple_[i]); + } + } + 
+ const AnfNodePtrList tuple_; + bool captured_{true}; +}; + +struct PTupleGetNode { + explicit PTupleGetNode(const AnfNodePtr &node) : node_(node) {} + + template + void operator()(size_t, const TPattern &pattern) { + args_.push_back(pattern.GetNode(node_)); + } + + const AnfNodePtr &node_; + std::vector args_; +}; +} // namespace tuple_utils + +template +class PCNode : public PBase > { + public: + explicit PCNode(const TArgs &... args) : args_(args...) {} + + AnfNodePtr GetNode(const AnfNodePtr &node) const { + tuple_utils::PTupleGetNode get_node(node); + tuple_utils::apply_func_tuple(&get_node, args_); + return NewCNode(get_node.args_, node->func_graph()); + } + + bool TryCapture_(const AnfNodePtr &node) const { + if (node->isa()) { + auto cnode = node->cast(); + auto inputs = cnode->inputs(); + if (inputs.size() != sizeof...(TArgs)) { + return false; + } + tuple_utils::PTupleCapture capture_func(inputs); + tuple_utils::apply_func_tuple(&capture_func, args_); + return capture_func.captured_; + } + + return false; + } + + void Reset() const { + tuple_utils::PTupleResetCapture reset; + tuple_utils::apply_func_tuple(&reset, args_); + } + + private: + std::tuple args_; +}; + +template +class PPrimitive : public PBase > { + public: + explicit PPrimitive(const PrimitivePtr &prim, const TArgs &... args) : prim_(prim), args_(args...) 
{} + + AnfNodePtr GetNode(const AnfNodePtr &node) const { + tuple_utils::PTupleGetNode get_node(node); + tuple_utils::apply_func_tuple(&get_node, args_); + auto prim_cnode = get_node.args_; + prim_cnode.insert(prim_cnode.begin(), NewValueNode(prim_)); + return NewCNode(prim_cnode, node->func_graph()); + } + + bool TryCapture_(const AnfNodePtr &node) const { + if (IsPrimitiveCNode(node, prim_)) { + auto cnode = node->cast(); + auto inputs = cnode->inputs(); + if ((inputs.size() - 1) != sizeof...(TArgs)) { + return false; + } + + AnfNodePtrList rest(inputs.begin() + 1, inputs.end()); + tuple_utils::PTupleCapture capture_func(rest); + tuple_utils::apply_func_tuple(&capture_func, args_); + + return capture_func.captured_; + } + + return false; + } + + void Reset() const { + tuple_utils::PTupleResetCapture reset; + tuple_utils::apply_func_tuple(&reset, args_); + } + + private: + const PrimitivePtr prim_; + std::tuple args_; +}; + +// Macro for binary operation functions +#define BIN_OPERATION_PATTERN(Operator, MSPrimitive) \ + template \ + inline PBinOperation Operator(const PBase &x, const PBase &y) { \ + return PBinOperation(MSPrimitive, x.get_object(), y.get_object()); \ + } + +// Arithmetic operations +BIN_OPERATION_PATTERN(operator+, prim::kPrimTensorAdd); +BIN_OPERATION_PATTERN(operator*, prim::kPrimMul); + +// Macros for match and replace +#define MATCH_REPLACE(OrigNode, CaptureNode, ReplaceWith) \ + if ((CaptureNode).TryCapture(OrigNode)) { \ + return (ReplaceWith).GetNode(OrigNode); \ + } + +#define MATCH_REPLACE_IF(OrigNode, CaptureNode, ReplaceWith, Condition) \ + if ((CaptureNode).TryCapture(OrigNode) && (Condition)) { \ + return (ReplaceWith).GetNode(OrigNode); \ + } + +#define MATCH_REPLACE_IF_ELSE(OrigNode, CaptureNode, ReplaceWith, Condition, ElseNode) \ + if ((CaptureNode).TryCapture(OrigNode)) { \ + if ((Condition)) { \ + return (ReplaceWith).GetNode(OrigNode); \ + } \ + return (ElseNode).GetNode(OrigNode); \ + } + +#define 
MATCH_REPLACE_LAMBDA(OrigNode, CaptureNode, Lambda) \ + if ((CaptureNode).TryCapture(OrigNode)) { \ + return (Lambda)(); \ + } + +#define MATCH_REPLACE_LAMBDA_IF(OrigNode, CaptureNode, Lambda, Condition) \ + if ((CaptureNode).TryCapture(OrigNode) && (Condition)) { \ + return (Lambda)(); \ + } + +} // namespace mindspore + +#endif // #ifndef MINDSPORE_CCSRC_IR_PATTERN_MATCHER_H_ diff --git a/mindspore/ccsrc/ir/primitive.cc b/mindspore/ccsrc/ir/primitive.cc index 4be4489d5b..59497affd5 100644 --- a/mindspore/ccsrc/ir/primitive.cc +++ b/mindspore/ccsrc/ir/primitive.cc @@ -52,9 +52,6 @@ py::function PrimitivePy::GetBpropFunction() { return fn; } else { auto fn = GetBpropFunctionByObj(python_obj_); - if (fn.is_none()) { - MS_LOG(WARNING) << "Can't find bprop function for " << name(); - } return fn; } } @@ -75,7 +72,7 @@ py::function PrimitivePy::GetComputeFunction() { py::function vm_fn = get_fn(python_obj_); if (py::isinstance(vm_fn)) { - MS_LOG(DEBUG) << "Cannot find " << python_obj_.attr("__class__").attr("__name__").cast(); + MS_LOG(WARNING) << "Cannot find " << python_obj_.attr("__class__").attr("__name__").cast(); vm_fn = mindspore::GetComputeFunction(Primitive::name()); } return vm_fn; diff --git a/mindspore/ccsrc/ir/primitive.h b/mindspore/ccsrc/ir/primitive.h index 1dd867fd1f..257302c0c4 100644 --- a/mindspore/ccsrc/ir/primitive.h +++ b/mindspore/ccsrc/ir/primitive.h @@ -49,6 +49,8 @@ class PrimitivePy : public Primitive { void AddPyAttr(const py::str &name, const py::object &obj); py::dict GetAttrDict(); + void set_hook(const py::function &hook) { hook_ = hook; } + py::function hook() const { return hook_; } const bool parse_info_ = true; const py::object &GetPyObj() const { return python_obj_; } @@ -56,6 +58,7 @@ class PrimitivePy : public Primitive { private: py::object python_obj_; + py::function hook_; std::vector signatures_; }; diff --git a/mindspore/ccsrc/ir/primitive_base.h b/mindspore/ccsrc/ir/primitive_base.h index 78623f8542..b34c43d00e 100644 --- 
a/mindspore/ccsrc/ir/primitive_base.h +++ b/mindspore/ccsrc/ir/primitive_base.h @@ -89,11 +89,8 @@ class Primitive : public Named { return iter == attrs_.cend() ? nullptr : iter->second; } - void set_hook(const py::function &hook) { hook_ = hook; } - py::function hook() const { return hook_; } - const std::unordered_map &attrs() const { return attrs_; } - std::unordered_map &evaluate_added_attrs() { return evaluate_added_attrs_; } + const std::unordered_map &evaluate_added_attrs() const { return evaluate_added_attrs_; } // if Primitive has any attribute, for Primitives like scalar_add, return, etc, don't have any attribute. bool HasAttr() const { return !attrs_.empty(); } @@ -124,7 +121,6 @@ class Primitive : public Named { private: std::string instance_name_; - py::function hook_; bool is_base_; bool has_signature_; PrimType prim_type_; @@ -145,7 +141,10 @@ struct PrimitiveEqual { }; struct PrimitiveHasher { - std::size_t operator()(PrimitivePtr const &prim) const { return prim->Hash(); } + std::size_t operator()(PrimitivePtr const &prim) const { + MS_EXCEPTION_IF_NULL(prim); + return prim->Hash(); + } }; } // namespace mindspore #endif // MINDSPORE_CCSRC_IR_PRIMITIVE_BASE_H_ diff --git a/mindspore/ccsrc/ir/tensor.cc b/mindspore/ccsrc/ir/tensor.cc index 566f9396e6..e5212e922d 100644 --- a/mindspore/ccsrc/ir/tensor.cc +++ b/mindspore/ccsrc/ir/tensor.cc @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -28,9 +29,8 @@ #include "pipeline/static_analysis/abstract_value.h" namespace mindspore { - namespace tensor { - +static uint64_t count = 0; void DataBuf2Contiguous(const py::array &src, py::array *const dest) { if (dest == nullptr) { MS_LOG(EXCEPTION) << "Failed to copy data to a contiguous buffer as dest is nullptr!"; @@ -81,6 +81,7 @@ Tensor::Tensor(const Tensor &tensor, const TypePtr &data_type) : MetaTensor(tensor), device_address_(tensor.device_address_) { init(tensor.data_, data_type); dirty_ = tensor.is_dirty(); + id_ = tensor.id(); } 
Tensor &Tensor::operator=(const Tensor &tensor) { @@ -89,9 +90,14 @@ Tensor &Tensor::operator=(const Tensor &tensor) { dirty_ = tensor.is_dirty(); device_address_ = tensor.device_address(); data_ = tensor.data_; + id_ = tensor.id(); } return *this; } +Tensor &Tensor::AssignValue(const Tensor &tensor) { + *this = tensor; + return *this; +} bool Tensor::operator==(const Tensor &tensor) const { return (MetaTensor::operator==(tensor) && data_ == tensor.data_); @@ -208,6 +214,7 @@ void Tensor::init(const py::array &input, const TypeId &data_type) { data_ = input; } dirty_ = true; + id_ = std::to_string((uintptr_t)(this)) + std::to_string(count++); } void Tensor::init(TypeId data_type, const std::vector &shape, py::array *const data) { @@ -254,6 +261,7 @@ void Tensor::init(TypeId data_type, const std::vector &shape, py::array *co MS_LOG(EXCEPTION) << "Cannot construct Tensor because of unsupported data type: " << data_type << "."; break; } + id_ = std::to_string((uintptr_t)(this)) + std::to_string(count++); } TypePtr Tensor::SetDtype(const TypePtr type_ptr) { @@ -382,6 +390,28 @@ REGISTER_PYBIND_DEFINE(Tensor, ([](const py::module *m) { .def(py::init(), py::arg("input"), py::arg("dtype") = nullptr) .def(py::init(), py::arg("input"), py::arg("dtype") = nullptr) .def_readonly(PYTHON_TENSOR_FLAG, &Tensor::parse_info_) + .def_property_readonly("dtype", &Tensor::Dtype, R"mydelimiter( + Get the tensor's data type. + + Returns: + type, the data type of tensor. + + Examples: + >>> data = mindspore.Tensor(np.ones((2, 1), np.int32)) + >>> data.dtype + Int32 + )mydelimiter") + .def_property_readonly("shape", &Tensor::GetPyTupleShape, R"mydelimiter( + Get the tensor's shape. + + Returns: + tuple[int], the shape of tensor. + + Examples: + >>> data = mindspore.Tensor(np.ones((3, 3))) + >>> data.shape() + (3, 3) + )mydelimiter") .def("asnumpy", &Tensor::data_sync, R"mydelimiter( Convert tensor to numpy.ndarray. 
@@ -435,17 +465,6 @@ REGISTER_PYBIND_DEFINE(Tensor, ([](const py::module *m) { >>> data.dim() 2 )mydelimiter") - .def("dtype", &Tensor::Dtype, R"mydelimiter( - Get the tensor's data type. - - Returns: - type, the data type of tensor. - - Examples: - >>> data = mindspore.Tensor(np.ones((2, 1), np.int32)) - >>> data.dtype() - Int32 - )mydelimiter") .def("set_dtype", &Tensor::SetDtype, R"mydelimiter( Set the tensor's data type. @@ -457,16 +476,18 @@ REGISTER_PYBIND_DEFINE(Tensor, ([](const py::module *m) { >>> data.set_dtype(mindspore.int32) mindspore.int32 )mydelimiter") - .def("shape", &Tensor::GetPyTupleShape, R"mydelimiter( - Get the tensor's shape. + .def("assign_value", &Tensor::AssignValue, R"mydelimiter( + Assign another tensor value to this. - Returns: - tuple[int], the shape of tensor. + Arg: + value (:class:`mindspore.tensor`): The value tensor. Examples: - >>> data = mindspore.Tensor(np.ones((3, 3))) - >>> data.shape() - (3, 3) + >>> data = mindspore.Tensor(np.ones((1, 2), np.float32)) + >>> data2 = mindspore.Tensor(np.ones((2, 2), np.float32)) + >>> data.assign_value(data2) + >>> data.shape + (2, 2) )mydelimiter") .def("__str__", &Tensor::ToString) .def("__repr__", &Tensor::ToStringRepr) @@ -485,10 +506,86 @@ REGISTER_PYBIND_DEFINE(Tensor, ([](const py::module *m) { })); (void)py::class_>(*m, "MetaTensor") .def(py::init>(), py::arg("dtype"), py::arg("shape")) + .def(py::pickle( + [](const MetaTensor &t) { // __getstate__ + /* Return a tuple that fully encodes the state of the object */ + return py::make_tuple(static_cast(t.data_type()), t.shape()); + }, + [](const py::tuple &t) { // __setstate__ + if (t.size() != 2) { + throw std::runtime_error("Invalid state!"); + } + /* Create a new C++ instance */ + MetaTensor tensor(TypeId(t[0].cast()), t[1].cast>()); + return tensor; + })) .def_readonly(PYTHON_META_TENSOR_FLAG, &MetaTensor::parse_info_) - .def("dtype", &MetaTensor::Dtype, "Get the MetaTensor's dtype.") - .def("shape", &MetaTensor::shape, "Get the 
MetaTensor's shape."); + .def_property_readonly("dtype", &MetaTensor::Dtype, "Get the MetaTensor's dtype.") + .def_property_readonly("shape", &MetaTensor::shape, "Get the MetaTensor's shape."); })); - } // namespace tensor + +namespace inference { +MSTensor *MSTensor::CreateTensor(TypeId data_type, const std::vector &shape) { + return new Tensor(data_type, shape); +} + +Tensor::Tensor() { this->tensor_impl_ = std::make_shared(); } + +Tensor::Tensor(TypeId data_type, const std::vector &shape) { + this->tensor_impl_ = std::make_shared(data_type, shape); +} + +Tensor::Tensor(std::shared_ptr tensor_ptr) { this->tensor_impl_ = std::move(tensor_ptr); } + +TypeId Tensor::data_type() const { + MS_ASSERT(this->tensor_impl_ != nullptr); + return this->tensor_impl_->data_type(); +} + +TypeId Tensor::set_data_type(TypeId data_type) { + MS_ASSERT(this->tensor_impl_ != nullptr); + return this->tensor_impl_->set_data_type(data_type); +} + +std::vector Tensor::shape() const { + MS_ASSERT(this->tensor_impl_ != nullptr); + return this->tensor_impl_->shape(); +} + +size_t Tensor::set_shape(const std::vector &shape) { + MS_ASSERT(this->tensor_impl_ != nullptr); + return this->tensor_impl_->set_shape(shape); +} + +int Tensor::DimensionSize(size_t index) const { + MS_ASSERT(this->tensor_impl_ != nullptr); + return this->tensor_impl_->DimensionSize(index); +} + +int Tensor::ElementsNum() const { + MS_ASSERT(this->tensor_impl_ != nullptr); + return this->tensor_impl_->ElementsNum(); +} + +std::size_t Tensor::hash() const { + MS_ASSERT(this->tensor_impl_ != nullptr); + return this->tensor_impl_->hash(); +} + +std::shared_ptr Tensor::tensor() const { + MS_ASSERT(this->tensor_impl_ != nullptr); + return this->tensor_impl_; +} + +size_t Tensor::Size() const { + MS_ASSERT(this->tensor_impl_ != nullptr); + return this->tensor_impl_->data().nbytes(); +} + +void *Tensor::MutableData() const { + MS_ASSERT(this->tensor_impl_ != nullptr); + return this->tensor_impl_->data_c(true); +} +} // namespace 
inference } // namespace mindspore diff --git a/mindspore/ccsrc/ir/tensor.h b/mindspore/ccsrc/ir/tensor.h index 690fb83f55..1ce657143b 100644 --- a/mindspore/ccsrc/ir/tensor.h +++ b/mindspore/ccsrc/ir/tensor.h @@ -27,6 +27,7 @@ #include "Eigen/Core" #include "device/device_address.h" #include "ir/meta_tensor.h" +#include "include/ms_tensor.h" #include "utils/log_adapter.h" namespace py = pybind11; @@ -34,9 +35,7 @@ namespace py = pybind11; using float16 = Eigen::half; namespace pybind11 { - namespace detail { - // Similar to enums in `pybind11/numpy.h`. Determined by doing: // python3 -c 'import numpy as np; print(np.dtype(np.float16).num)' constexpr int NPY_FLOAT16 = 23; @@ -85,7 +84,6 @@ template <> struct type_caster : public npy_scalar_caster { static constexpr auto name = "float16"; }; - } // namespace detail } // namespace pybind11 @@ -93,10 +91,9 @@ using mindspore::device::DeviceAddress; using DeviceAddressPtr = std::shared_ptr; // brief mindspore namespace. // -// mindspore namespace is the top level namespace of Mindsporeession project. +// mindspore namespace is the top level namespace of MindSpore project. // Other namespace should be a sub namespace of mindspore namespace in the ME project. namespace mindspore { - // brief mindspore::tensor namespace // // A sub namespace in ME to support tensor related definition. @@ -177,6 +174,9 @@ class Tensor : public MetaTensor { // It is different from 'operator==' which just compare shape/type/address, it do real value comparison. 
bool ValueEqual(const Tensor &other) const; + // assgin value to this tensor + Tensor &AssignValue(const Tensor &tensor); + bool operator==(const Value &other) const override { if (other.isa()) { auto other_ = static_cast(other); @@ -219,6 +219,11 @@ class Tensor : public MetaTensor { // return The pointer to the object void *data_c(bool writable = false); + // brief Get Tensor data byte-size for c++ type + // + // return byte size of Tensor data + size_t Size() const { return this->data().nbytes(); } + // brief Get data type from tensor data. // // param buf The buffer info of the py::array data. @@ -263,16 +268,52 @@ class Tensor : public MetaTensor { DeviceAddressPtr device_address() const { return device_address_; } void set_device_address(const DeviceAddressPtr &device_address) { device_address_ = device_address; } py::array data_sync(); + std::string id() const { return id_; } private: bool dirty_{true}; + std::string id_{""}; DeviceAddressPtr device_address_{nullptr}; }; - using TensorPtr = std::shared_ptr; using TensorPtrList = std::vector>; - } // namespace tensor + +namespace inference { +class Tensor : public MSTensor { + public: + Tensor(); + + Tensor(TypeId data_type, const std::vector &shape); + + explicit Tensor(std::shared_ptr tensor_ptr); + + ~Tensor() = default; + + TypeId data_type() const override; + + TypeId set_data_type(const TypeId data_type) override; + + std::vector shape() const override; + + size_t set_shape(const std::vector &shape) override; + + int DimensionSize(size_t index) const override; + + int ElementsNum() const override; + + std::size_t hash() const override; + + std::shared_ptr tensor() const; + + size_t Size() const override; + + void *MutableData() const override; + + protected: + std::shared_ptr tensor_impl_; +}; +} // namespace inference } // namespace mindspore #endif // MINDSPORE_CCSRC_IR_TENSOR_H_ diff --git a/mindspore/ccsrc/ir/visitor.cc b/mindspore/ccsrc/ir/visitor.cc index efebe3124a..9e63f4f9c1 100644 --- 
a/mindspore/ccsrc/ir/visitor.cc +++ b/mindspore/ccsrc/ir/visitor.cc @@ -14,11 +14,10 @@ * limitations under the License. */ -#include "ir/visitor.h" #include "ir/func_graph.h" +#include "ir/visitor.h" namespace mindspore { -AnfNodePtr AnfVisitor::operator()(const opt::OptimizerPtr &, const AnfNodePtr &) { return nullptr; } void AnfVisitor::Visit(const AnfNodePtr &node) { node->accept(this); } void AnfVisitor::Visit(const CNodePtr &cnode) { diff --git a/mindspore/ccsrc/ir/visitor.h b/mindspore/ccsrc/ir/visitor.h index e771f7ad28..6dcf28249a 100644 --- a/mindspore/ccsrc/ir/visitor.h +++ b/mindspore/ccsrc/ir/visitor.h @@ -18,14 +18,12 @@ #define MINDSPORE_CCSRC_IR_VISITOR_H_ #include -#include "ir/anf.h" -#include "optimizer/opt.h" +#include "ir/optimizer_caller.h" namespace mindspore { using VisitFuncType = std::function; -class AnfVisitor { +class AnfVisitor : public OptimizerCaller { public: - virtual AnfNodePtr operator()(const opt::OptimizerPtr &, const AnfNodePtr &); virtual void Visit(const AnfNodePtr &); virtual void Visit(const CNodePtr &); virtual void Visit(const ValueNodePtr &); diff --git a/mindspore/ccsrc/kernel/CMakeLists.txt b/mindspore/ccsrc/kernel/CMakeLists.txt index 76e1631d57..ceea6b1a99 100644 --- a/mindspore/ccsrc/kernel/CMakeLists.txt +++ b/mindspore/ccsrc/kernel/CMakeLists.txt @@ -9,6 +9,10 @@ if (ENABLE_D) file(GLOB_RECURSE D_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "kernel_query.cc" "kernel_fusion.cc" + "akg/ascend/*.cc" + "akg/akg_kernel_build.cc" + "akg/akg_kernel_attrs_process.cc" + "akg/akg_kernel_metadata.cc" "tbe/*.cc" "aicpu/*.cc" "rts/*.cc" @@ -21,13 +25,19 @@ if (ENABLE_CPU) file(GLOB_RECURSE CPU_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "cpu/*.cc" ) + + if (NOT ENABLE_MPI) + list(REMOVE_ITEM CPU_SRC_LIST "cpu/allgather_cpu_kernel.cc") + list(REMOVE_ITEM CPU_SRC_LIST "cpu/reduce_scatter_cpu_kernel.cc") + list(REMOVE_ITEM CPU_SRC_LIST "cpu/embedding_look_up_comm_grad_cpu_kernel.cc") + endif () endif () if (ENABLE_GPU) 
file(GLOB_RECURSE CUDA_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "gpu/*.cu" "akg/gpu/*.cc" - "akg/akgkernelbuild.cc" + "akg/akg_kernel_build.cc" "akg/akg_kernel_attrs_process.cc" ) diff --git a/mindspore/ccsrc/kernel/aicpu/aicpu_kernel_build.cc b/mindspore/ccsrc/kernel/aicpu/aicpu_kernel_build.cc index d6217ff1cc..c83994b5f2 100644 --- a/mindspore/ccsrc/kernel/aicpu/aicpu_kernel_build.cc +++ b/mindspore/ccsrc/kernel/aicpu/aicpu_kernel_build.cc @@ -24,7 +24,7 @@ #include #include "device/kernel_runtime.h" #include "kernel/aicpu/aicpu_kernel_mod.h" -#include "kernel/akg/akgkernelbuild.h" +#include "kernel/akg/akg_kernel_build.h" #include "proto/tensor.pb.h" #include "proto/tensor_shape.pb.h" #include "proto/attr.pb.h" @@ -50,7 +50,13 @@ bool SetIOIputSize(const std::shared_ptr &anf_node, const size_t &input MS_LOG(EXCEPTION) << "anf_node is not CNode."; } auto cnode = anf_node->cast(); + MS_EXCEPTION_IF_NULL(cnode); + if (cnode->inputs().size() < (i + 1)) { + MS_LOG(ERROR) << "cnode inputs size " << cnode->inputs().size() << " is smaller than " << i + 1; + return false; + } auto input_node = cnode->inputs()[i + 1]; + MS_EXCEPTION_IF_NULL(input_node); if (input_node->isa()) { auto value_ptr = GetValueNode(input_node); auto value = GetValue(value_ptr); @@ -103,13 +109,13 @@ bool SetIOSize(const std::shared_ptr &anf_node, const std::shared_ptrSetOutputSizeList(output_size_list); - return true; } void ParseAttrValue(const std::string &type, const std::string &attr_name, const mindspore::ValuePtr &value, ::google::protobuf::Map<::std::string, ::mindspore::AttrValue> *node_attr) { MS_EXCEPTION_IF_NULL(node_attr); + MS_EXCEPTION_IF_NULL(value); if (type == "int") { auto attr_value = GetValue(value); (*node_attr)[attr_name].set_i(attr_value); @@ -146,6 +152,8 @@ void ParseAttrValue(const std::string &type, const std::string &attr_name, const } void SetNodeAttr(const std::shared_ptr &anf_node, mindspore::NodeDef *proto) { + MS_EXCEPTION_IF_NULL(anf_node); + 
MS_EXCEPTION_IF_NULL(proto); std::string op_name = AnfAlgo::GetCNodeName(anf_node); if (op_name == kInitDataSetQueue) { op_name = kInitData; @@ -161,15 +169,16 @@ void SetNodeAttr(const std::shared_ptr &anf_node, mindspore::NodeDef *p MS_EXCEPTION_IF_NULL(primitive); ::google::protobuf::Map<::std::string, ::mindspore::AttrValue> *node_attr = proto->mutable_attrs(); for (const auto &attr_ptr : attrs_ptr) { + MS_EXCEPTION_IF_NULL(attr_ptr); std::string attr_name = attr_ptr->name(); auto value = primitive->GetAttr(attr_name); if (value != nullptr) { if (attr_name == kQueueName || attr_name == kSharedName) { attr_name = kChannelName; - } else if (attr_name == kSeed) { - attr_name = "seed"; - } else if (attr_name == kSeed2) { - attr_name = "seed2"; + } else if (attr_name == kSeed0) { + attr_name = kSeed; + } else if (attr_name == kSeed1) { + attr_name = kSeed2; } std::string type = attr_ptr->type(); ParseAttrValue(type, attr_name, value, node_attr); @@ -179,6 +188,8 @@ void SetNodeAttr(const std::shared_ptr &anf_node, mindspore::NodeDef *p } void SetNodeInputs(const std::shared_ptr &anf_node, mindspore::NodeDef *proto) { + MS_EXCEPTION_IF_NULL(proto); + MS_EXCEPTION_IF_NULL(anf_node); size_t input_num = AnfAlgo::GetInputTensorNum(anf_node); if (input_num == 0) { MS_LOG(INFO) << "Node [" << AnfAlgo::GetCNodeName(anf_node) << "] does not have input."; @@ -193,6 +204,7 @@ void SetNodeInputs(const std::shared_ptr &anf_node, mindspore::NodeDef int32_t input_data_type; if (input_type == kObjectTypeString) { auto cnode = anf_node->cast(); + MS_EXCEPTION_IF_NULL(cnode); auto input_node = cnode->inputs()[input_index + 1]; auto value_ptr = GetValueNode(input_node); auto value = GetValue(value_ptr); @@ -203,19 +215,20 @@ void SetNodeInputs(const std::shared_ptr &anf_node, mindspore::NodeDef input_shape = AnfAlgo::GetInputDeviceShape(anf_node, input_index); input_data_type = AicpuOpUtil::MsTypeToProtoType(input_type); } + mindspore::TensorShape *tensorShape = 
node_inputs->mutable_tensor_shape(); for (auto item : input_shape) { mindspore::TensorShape_Dim *dim = tensorShape->add_dim(); dim->set_size((::google::protobuf::int64)item); } - node_inputs->set_tensor_type((mindspore::DataType)input_data_type); - node_inputs->set_mem_device("HBM"); } } void SetNodeOutputs(const std::shared_ptr &anf_node, mindspore::NodeDef *proto) { + MS_EXCEPTION_IF_NULL(proto); + MS_EXCEPTION_IF_NULL(anf_node); size_t output_num = AnfAlgo::GetOutputTensorNum(anf_node); if (output_num == 0) { MS_LOG(INFO) << "Node [" << AnfAlgo::GetCNodeName(anf_node) << "] does not have output. "; @@ -224,63 +237,55 @@ void SetNodeOutputs(const std::shared_ptr &anf_node, mindspore::NodeDef for (size_t output_index = 0; output_index < output_num; output_index++) { ::mindspore::Tensor *node_outputs = proto->add_outputs(); + MS_EXCEPTION_IF_NULL(node_outputs); std::vector output_shape = AnfAlgo::GetOutputDeviceShape(anf_node, output_index); mindspore::TensorShape *tensorShape = node_outputs->mutable_tensor_shape(); + MS_EXCEPTION_IF_NULL(tensorShape); for (auto item : output_shape) { mindspore::TensorShape_Dim *dim = tensorShape->add_dim(); + MS_EXCEPTION_IF_NULL(dim); dim->set_size((::google::protobuf::int64)item); } - TypeId output_type = AnfAlgo::GetOutputDeviceDataType(anf_node, output_index); - int32_t output_data_type = AicpuOpUtil::MsTypeToProtoType(output_type); node_outputs->set_tensor_type((mindspore::DataType)output_data_type); - node_outputs->set_mem_device("HBM"); } } void SetNodedefProto(const std::shared_ptr &anf_node, mindspore::NodeDef *proto) { - MS_LOG(INFO) << "SetNodedefProto entry"; MS_EXCEPTION_IF_NULL(anf_node); MS_EXCEPTION_IF_NULL(proto); - + MS_LOG(INFO) << "SetNodedefProto entry"; std::string op_name = AnfAlgo::GetCNodeName(anf_node); - if (op_name == "InitDataSetQueue") { - op_name = "InitData"; + if (op_name == kInitDataSetQueue) { + op_name = kInitData; } // set op name proto->set_op(op_name); - // set inputs tensor 
SetNodeInputs(anf_node, proto); - // set outputs tensor SetNodeOutputs(anf_node, proto); - // set node attr SetNodeAttr(anf_node, proto); - MS_LOG(INFO) << "SetNodedefProto end!"; } bool CreateNodeDefBytes(const std::shared_ptr &anf_node, const std::shared_ptr &kernel_mod_ptr) { - MS_LOG(INFO) << "CreateNodeDefBytes entry"; - MS_EXCEPTION_IF_NULL(anf_node); MS_EXCEPTION_IF_NULL(kernel_mod_ptr); - mindspore::NodeDef proto; + MS_EXCEPTION_IF_NULL(anf_node); + MS_LOG(INFO) << "CreateNodeDefBytes entry"; + mindspore::NodeDef proto; SetNodedefProto(anf_node, &proto); - std::string nodeDefStr; if (!proto.SerializeToString(&nodeDefStr)) { MS_LOG(ERROR) << "Serialize nodeDef to string failed."; return false; } - kernel_mod_ptr->SetNodeDef(nodeDefStr); - MS_LOG(INFO) << "CreateNodeDefBytes end!"; return true; } @@ -288,8 +293,8 @@ bool CreateNodeDefBytes(const std::shared_ptr &anf_node, KernelModPtr AicpuOpBuild(const std::shared_ptr &anf_node) { MS_EXCEPTION_IF_NULL(anf_node); std::string op_name = AnfAlgo::GetCNodeName(anf_node); - if (op_name == "InitDataSetQueue") { - op_name = "InitData"; + if (op_name == kInitDataSetQueue) { + op_name = kInitData; } auto kernel_mod_ptr = std::make_shared(); MS_EXCEPTION_IF_NULL(kernel_mod_ptr); diff --git a/mindspore/ccsrc/kernel/aicpu/aicpu_kernel_mod.cc b/mindspore/ccsrc/kernel/aicpu/aicpu_kernel_mod.cc index 7875baaf0e..2213f176cc 100644 --- a/mindspore/ccsrc/kernel/aicpu/aicpu_kernel_mod.cc +++ b/mindspore/ccsrc/kernel/aicpu/aicpu_kernel_mod.cc @@ -110,8 +110,8 @@ bool AicpuOpKernelMod::Launch(const std::vector &inputs, const std:: } CreateCpuKernelInfo(inputs, outputs); - if (node_name_ == "TopK") { - node_name_ = "TopKV2"; + if (node_name_ == kTopK) { + node_name_ = kTopKV2; } MS_LOG(INFO) << "Aicpu launch, node_so_:" << node_so_ << ", node name:" << node_name_ << ", args_size:" << args_.length(); @@ -141,8 +141,8 @@ std::vector AicpuOpKernelMod::GenTask(const std::vector (void)std::transform(std::begin(outputs), 
std::end(outputs), std::back_inserter(output_data_addrs), [](const AddressPtr &output) -> void * { return output->addr; }); - if (node_name_ == "TopK") { - node_name_ = "TopKV2"; + if (node_name_ == kTopK) { + node_name_ = kTopKV2; } AicpuTaskInfoPtr task_info_ptr = make_shared( stream_id, node_so_, node_name_, node_def_str_, input_data_addrs, output_data_addrs); diff --git a/mindspore/ccsrc/kernel/aicpu/aicpu_util.h b/mindspore/ccsrc/kernel/aicpu/aicpu_util.h index 3938cfbdea..f2092abbe2 100644 --- a/mindspore/ccsrc/kernel/aicpu/aicpu_util.h +++ b/mindspore/ccsrc/kernel/aicpu/aicpu_util.h @@ -37,9 +37,12 @@ constexpr auto kSharedName = "shared_name"; constexpr auto kShapes = "shapes"; constexpr auto kTypes = "types"; constexpr auto kQueueName = "queue_name"; - -constexpr auto kSeed = "Seed0"; -constexpr auto kSeed2 = "Seed1"; +constexpr auto kSeed = "seed"; +constexpr auto kSeed0 = "Seed0"; +constexpr auto kSeed1 = "Seed1"; +constexpr auto kSeed2 = "seed2"; +constexpr auto kTopK = "TopK"; +constexpr auto kTopKV2 = "TopKV2"; struct AicpuParamHead { uint32_t length; // Total length: include cunstom message diff --git a/mindspore/ccsrc/kernel/akg/akg_kernel_attrs_process.cc b/mindspore/ccsrc/kernel/akg/akg_kernel_attrs_process.cc index c9ff41dc55..3a0cc3eb25 100644 --- a/mindspore/ccsrc/kernel/akg/akg_kernel_attrs_process.cc +++ b/mindspore/ccsrc/kernel/akg/akg_kernel_attrs_process.cc @@ -79,6 +79,10 @@ void SetAkgAttrsForCast(const AnfNodePtr &anf_node) { dst_type = "float32"; } else if (output_type == kFloat16->type_id()) { dst_type = "float16"; + } else if (output_type == kInt32->type_id()) { + dst_type = "int32"; + } else { + MS_LOG(WARNING) << "Unknown cast_to type: " << TypeIdToType(output_type)->ToString(); } AnfAlgo::SetNodeAttr("dst_type", MakeValue(dst_type), anf_node); } diff --git a/mindspore/ccsrc/kernel/akg/akgkernelbuild.cc b/mindspore/ccsrc/kernel/akg/akg_kernel_build.cc similarity index 78% rename from mindspore/ccsrc/kernel/akg/akgkernelbuild.cc 
rename to mindspore/ccsrc/kernel/akg/akg_kernel_build.cc index c0759172a5..1f88bbb89a 100644 --- a/mindspore/ccsrc/kernel/akg/akgkernelbuild.cc +++ b/mindspore/ccsrc/kernel/akg/akg_kernel_build.cc @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "kernel/akg/akgkernelbuild.h" +#include "kernel/akg/akg_kernel_build.h" #include #include #include @@ -43,7 +43,9 @@ namespace kernel { constexpr int ME_MAX_KERNEL_NAME_LENGTH = 200; constexpr int32_t ARGS_SIZE = 1; constexpr auto kCompileWithJsonFunc = "compilewithjson"; + // json key +constexpr auto kOpDesc = "op_desc"; constexpr auto kInputDesc = "input_desc"; constexpr auto kShape = "shape"; constexpr auto kDataType = "data_type"; @@ -51,13 +53,24 @@ constexpr auto kOutputDesc = "output_desc"; constexpr auto kName = "name"; constexpr auto kTensorName = "tensor_name"; constexpr auto kValue = "value"; -constexpr auto KInpputNames = "input_names"; +constexpr auto KDynInputSizes = "dyn_input_sizes"; +constexpr auto KInputNames = "input_names"; constexpr auto KInput = "input"; constexpr auto KDtype = "dtype"; -int AkgKernelBuild::op_cnt_ = 0; -std::mutex AkgKernelBuild::op_cnt_mtx_; +namespace { +template +std::string Vector2Str(const std::vector &inputs) { + if (!inputs.empty()) { + std::ostringstream oss; + (void)std::copy(inputs.begin(), inputs.end() - 1, std::ostream_iterator(oss, ", ")); + oss << inputs.back(); + return oss.str(); + } + return ""; +} +} // namespace -std::string PyObjectToStr(PyObject *const PyObj) { +std::string AkgKernelBuild::PyObjectToStr(PyObject *const PyObj) { char *pChar = nullptr; std::string str_res; if (PyObj == nullptr) { @@ -76,6 +89,72 @@ std::string PyObjectToStr(PyObject *const PyObj) { return str_res; } +std::string GetTensorName(const nlohmann::json &node_json, const std::string &tag, + const std::pair &position) { + if (node_json.count(tag) == 0) { + MS_LOG(ERROR) << "Node [" << node_json.dump() << "] has no key [" << tag << "]."; + return ""; + } + + auto const 
&tag_desc = node_json[tag]; + nlohmann::json first_index; + if (tag == kOutputDesc) { + first_index = tag_desc; + } else if (!tag_desc.is_array() || tag_desc.size() <= position.first) { + MS_LOG(ERROR) << "Node [" << tag_desc.dump() << "] has no enough value [" << position.first << "]."; + return ""; + } else { + first_index = tag_desc[position.first]; + } + + if (!first_index.is_array() || first_index.size() <= position.second) { + MS_LOG(ERROR) << "Node [" << first_index.dump() << "] has no enough value [" << position.second << "]."; + return ""; + } + auto const &second_index = first_index[position.second]; + if (second_index.count(kTensorName) == 0) { + MS_LOG(ERROR) << "Node [" << second_index.dump() << "] has no key [" << kTensorName << "]."; + return ""; + } + + return second_index[kTensorName]; +} + +void SetTensorName(const std::string &tag, const std::string &new_name, const std::pair &position, + nlohmann::json *const node_json) { + MS_EXCEPTION_IF_NULL(node_json); + if (node_json->count(tag) == 0) { + MS_LOG(ERROR) << "Node [" << node_json->dump() << "] has no key [" << tag << "]."; + return; + } + + nlohmann::json *tag_desc = &((*node_json)[tag]); + nlohmann::json *first_index; + if (tag == kOutputDesc) { + first_index = tag_desc; + } else if (!tag_desc->is_array() || tag_desc->size() <= position.first) { + MS_LOG(ERROR) << "Node [" << tag_desc->dump() << "] has no enough value [" << position.first << "]."; + return; + } else { + first_index = &((*tag_desc)[position.first]); + } + + if (!first_index->is_array() || first_index->size() <= position.second) { + MS_LOG(ERROR) << "Node [" << first_index->dump() << "] has no enough value [" << position.second << "]."; + return; + } + nlohmann::json *second_index = &((*first_index)[position.second]); + if (second_index->count(kTensorName) == 0) { + MS_LOG(ERROR) << "Node [" << second_index->dump() << "] has no key [" << kTensorName << "]."; + return; + } + (*second_index)[kTensorName] = new_name; + return; +} 
+ +int AkgKernelBuild::op_cnt_ = 0; +std::mutex AkgKernelBuild::op_cnt_mtx_; + std::string AkgKernelBuild::GetProcessor(const AnfNodePtr &anf_node) { MS_EXCEPTION_IF_NULL(anf_node); std::string device; @@ -187,10 +266,7 @@ bool AkgKernelBuild::CreateInputDescJson(const AnfNodePtr &anf_node, nlohmann::j for (size_t input_i = 0; input_i < input_tensor_num; input_i++) { // dtype : float16 auto type_id = AnfAlgo::GetInputDeviceDataType(anf_node, real_input_index); - TypePtr type_ptr = TypeIdToType(type_id); - MS_EXCEPTION_IF_NULL(type_ptr); - std::string dtype = type_ptr->ToString(); - dtype = Dtype2String(dtype); + std::string dtype = TypeId2String(type_id); if (dtype.empty()) { MS_LOG(ERROR) << "Op [" << op_name << "] input [" << input_i << "] data type is null. "; return false; @@ -198,13 +274,23 @@ bool AkgKernelBuild::CreateInputDescJson(const AnfNodePtr &anf_node, nlohmann::j nlohmann::json input_desc_json; input_desc_json[kDataType] = dtype; input_desc_json[kName] = op_input_name; - input_desc_json[kTensorName] = - op_input_name + "_" + std::to_string(real_input_index) + "_" + std::to_string(input_i); - input_desc_json[kShape] = AnfAlgo::GetInputDeviceShape(anf_node, real_input_index); + input_desc_json[kTensorName] = "input_" + std::to_string(GetInputTensorIdxInc(anf_node, real_input_index)); + auto input_shape = AnfAlgo::GetInputDeviceShape(anf_node, real_input_index); + if (GetInputTensorValue(anf_node, real_input_index, &input_desc_json)) { + MS_LOG(WARNING) << "we take input[" << real_input_index << "] of [" << anf_node->DebugString(2) + << "] as const tensor, shape: [" << Vector2Str(input_shape) + << "], value: " << input_desc_json[kValue]; + + input_shape.clear(); + } + if (input_shape.empty()) { + input_shape.push_back(1); + } + input_desc_json[kShape] = input_shape; input_list.emplace_back(input_desc_json); + real_input_index++; } inputs_json->emplace_back(input_list); - real_input_index++; } return true; } @@ -220,10 +306,7 @@ bool 
AkgKernelBuild::CreateOutputDescJson(const AnfNodePtr &anf_node, nlohmann:: for (size_t i = 0; i < output_tensor_num; i++) { nlohmann::json output_json; auto type_id = AnfAlgo::GetOutputDeviceDataType(anf_node, i); - TypePtr type_ptr = TypeIdToType(type_id); - MS_EXCEPTION_IF_NULL(type_ptr); - std::string dtype = type_ptr->ToString(); - dtype = Dtype2String(dtype); + std::string dtype = TypeId2String(type_id); if (dtype.empty()) { MS_LOG(ERROR) << "Op [" << op_name << "] output [" << i << "] data type is null. "; return false; @@ -232,7 +315,7 @@ bool AkgKernelBuild::CreateOutputDescJson(const AnfNodePtr &anf_node, nlohmann:: std::string output_name = outputs[i]->name(); output_json[kDataType] = dtype; output_json[kName] = output_name; - output_json[kTensorName] = output_name + "_" + std::to_string(i); + output_json[kTensorName] = "output_" + std::to_string(i) + "_" + std::to_string(GetOutputTensorIdxInc()); output_json[kShape] = AnfAlgo::GetOutputDeviceShape(anf_node, i); outputs_json->push_back(output_json); } @@ -358,15 +441,14 @@ bool AkgKernelBuild::GenerateSingleKernelJson(const AnfNodePtr &anf_node, const MS_EXCEPTION_IF_NULL(op_info_ptr); // get basic params from currentNodeOpDesc - (*node_json)["platform"] = "AKG"; (*node_json)[kName] = op_name; - (*node_json)["fusion_type"] = AnfAlgo::GetFusionType(anf_node); (*node_json)["impl_path"] = op_info_ptr->impl_path(); (*node_json)["process"] = AkgKernelBuild::GetProcessor(anf_node); + (*node_json)["composite"] = false; auto primitive = AnfAlgo::GetCNodePrimitive(anf_node); MS_EXCEPTION_IF_NULL(primitive); - ValuePtr input_names_v = primitive->GetAttr(KInpputNames); + ValuePtr input_names_v = primitive->GetAttr(KInputNames); if (input_names_v == nullptr) { MS_LOG(ERROR) << "ApplyKernel has no input_names, op[" << op_name << "]."; return false; @@ -465,12 +547,12 @@ KernelPackPtr AkgKernelBuild::OpBuild(const std::string &node_json, const AnfNod (void)alarm(0); if (pRes == nullptr) { MS_LOG(ERROR) << "No ret got, 
failed to call function [" << kCompileWithJsonFunc << "], args:\n(" - << PyObjectToStr(pArg) << ")."; + << AkgKernelBuild::PyObjectToStr(pArg) << ")."; return nullptr; } if (PyObject_IsTrue(pRes) != 1) { MS_LOG(ERROR) << "Illegal ret, failed to call function [" << kCompileWithJsonFunc << "], args:\n(" - << PyObjectToStr(pArg) << ")."; + << AkgKernelBuild::PyObjectToStr(pArg) << ")."; return nullptr; } @@ -513,5 +595,29 @@ KernelPackPtr AkgKernelBuild::BuildByJson(const AnfNodePtr &anf_node, std::vecto << "]"; return kernel_pack; } + +size_t AkgKernelBuild::GetInputTensorIdxInc(const AnfNodePtr &anf_node, size_t input_idx) { + MS_EXCEPTION_IF_NULL(anf_node); + auto cnode = anf_node->cast(); + MS_EXCEPTION_IF_NULL(cnode); + if (input_idx + 1 >= cnode->inputs().size()) { + MS_EXCEPTION(ArgumentError) << "input_idx [" << input_idx << "] is out of index of inputs of [" + << cnode->inputs().size() - 1 << "][" << cnode->DebugString() << "]"; + } + + auto input_node = cnode->input(input_idx + 1); + if (input_tensor_idx_.find(input_node) == input_tensor_idx_.end()) { + size_t index = input_tensor_idx_.size(); + input_tensor_idx_[input_node] = index; + } + + return input_tensor_idx_[input_node]; +} + +size_t AkgKernelBuild::GetOutputTensorIdxInc() { + size_t idx = output_tensor_idx_++; + return idx; +} + } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/kernel/akg/akgkernelbuild.h b/mindspore/ccsrc/kernel/akg/akg_kernel_build.h similarity index 70% rename from mindspore/ccsrc/kernel/akg/akgkernelbuild.h rename to mindspore/ccsrc/kernel/akg/akg_kernel_build.h index f8127843bd..d32bd48ce6 100644 --- a/mindspore/ccsrc/kernel/akg/akgkernelbuild.h +++ b/mindspore/ccsrc/kernel/akg/akg_kernel_build.h @@ -32,29 +32,45 @@ namespace mindspore { namespace kernel { class AkgKernelBuild { public: - AkgKernelBuild() = default; + AkgKernelBuild() { + input_tensor_idx_ = {}; + output_tensor_idx_ = 0; + } ~AkgKernelBuild() = default; KernelPackPtr BuildByJson(const 
AnfNodePtr &anf_node, std::vector *const input_size, std::vector *const output_size); + static std::string GetProcessor(const AnfNodePtr &anf_node); + static std::string PyObjectToStr(PyObject *const PyObj); - private: + protected: bool CreateInputDescJson(const AnfNodePtr &anf_node, nlohmann::json *const inputs_json); bool CreateOutputDescJson(const AnfNodePtr &anf_node, nlohmann::json *const outputs_json); bool CreateAttrDescJson(const AnfNodePtr &anf_node, const std::string &op_name, const std::shared_ptr &op_info, nlohmann::json *const attrs_json); + KernelPackPtr OpBuild(const std::string &node_json, const AnfNodePtr &anf_node); + int GetOpCntInc(); + size_t GetInputTensorIdxInc(const AnfNodePtr &anf_node, size_t input_idx); + size_t GetOutputTensorIdxInc(); bool GenerateSingleKernelJson(const AnfNodePtr &anf_node, const std::string &op_name, nlohmann::json *const node_json); - KernelPackPtr OpBuild(const std::string &node_json, const AnfNodePtr &anf_node); - int GetOpCntInc(); - std::string GetProcessor(const AnfNodePtr &anf_node); static int op_cnt_; // lock for variable fusionOpCnt in singleton mode static std::mutex op_cnt_mtx_; std::string json_name_; std::string json_info_; + std::unordered_map input_tensor_idx_; + size_t output_tensor_idx_; }; + +bool GetIOSize(const nlohmann::json &node_json, std::vector *const input_size, + std::vector *const output_size); +void SetTensorName(const std::string &tag, const std::string &new_name, const std::pair &position, + nlohmann::json *const node_json); +std::string GetTensorName(const nlohmann::json &node_json, const std::string &tag, + const std::pair &position); + } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/kernel/akg/akg_kernel_metadata.cc b/mindspore/ccsrc/kernel/akg/akg_kernel_metadata.cc new file mode 100644 index 0000000000..3515add1e0 --- /dev/null +++ b/mindspore/ccsrc/kernel/akg/akg_kernel_metadata.cc @@ -0,0 +1,50 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + 
* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel/akg/akg_kernel_metadata.h" +#include +#include "session/anf_runtime_algorithm.h" +#include "kernel/oplib/oplib.h" +#include "kernel/common_utils.h" + +namespace mindspore { +namespace kernel { +void AkgMetadataInfo(const CNodePtr &kernel_node, + std::vector> *const kernel_info_list) { + MS_EXCEPTION_IF_NULL(kernel_node); + MS_EXCEPTION_IF_NULL(kernel_info_list); + + std::string op_name = AnfAlgo::GetCNodeName(kernel_node); + for (size_t i = 0; i < support_devices.size(); i++) { + auto op_info_ptr = mindspore::kernel::OpLib::FindOp(op_name, OpImplyType::kAKG); + if (op_info_ptr == nullptr) { + continue; + } + + if (!ParseMetadata(kernel_node, op_info_ptr, Processor(i), kernel_info_list)) { + MS_LOG(WARNING) << "Akg parsed metadata of op[" << op_name << "], device[" << support_devices[i] << "] failed."; + } else { + MS_LOG(DEBUG) << "Akg parsed metadata of op[" << op_name << "], device[" << support_devices[i] << "]."; + break; + } + } + + if (kernel_info_list->empty()) { + MS_LOG(WARNING) << "Akg dose not has metadata of op[" << op_name << "]."; + } +} +} // namespace kernel +} // namespace mindspore diff --git a/mindspore/ccsrc/kernel/tbe/tbe_kernel_select.h b/mindspore/ccsrc/kernel/akg/akg_kernel_metadata.h similarity index 71% rename from mindspore/ccsrc/kernel/tbe/tbe_kernel_select.h rename to mindspore/ccsrc/kernel/akg/akg_kernel_metadata.h index 3ce66b5148..5e329f0080 100644 --- 
a/mindspore/ccsrc/kernel/tbe/tbe_kernel_select.h +++ b/mindspore/ccsrc/kernel/akg/akg_kernel_metadata.h @@ -1,5 +1,5 @@ /** - * Copyright 2019 Huawei Technologies Co., Ltd + * Copyright 2020 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,19 +14,18 @@ * limitations under the License. */ -#ifndef MINDSPORE_TBE_KERNEL_SELECT_H -#define MINDSPORE_TBE_KERNEL_SELECT_H +#ifndef MINDSPORE_CCSRC_KERNEL_AKG_AKG_KERNEL_METADATA_H_ +#define MINDSPORE_CCSRC_KERNEL_AKG_AKG_KERNEL_METADATA_H_ #include #include +#include #include -#include "kernel/oplib/opinfo.h" #include "kernel/kernel_build_info.h" namespace mindspore { namespace kernel { -void TbeMetadataInfo(const CNodePtr &kernel_node, std::vector> *kernel_info_list); +void AkgMetadataInfo(const CNodePtr &kernel_node, std::vector> *kernel_info_list); } // namespace kernel } // namespace mindspore - -#endif // MINDSPORE_TBE_KERNEL_SELECT_H +#endif // MINDSPORE_CCSRC_KERNEL_AKG_AKG_KERNEL_METADATA_H_ diff --git a/mindspore/ccsrc/kernel/akg/ascend/akg_ascend_kernel_build.cc b/mindspore/ccsrc/kernel/akg/ascend/akg_ascend_kernel_build.cc new file mode 100644 index 0000000000..454b8052ab --- /dev/null +++ b/mindspore/ccsrc/kernel/akg/ascend/akg_ascend_kernel_build.cc @@ -0,0 +1,385 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "kernel/akg/ascend/akg_ascend_kernel_build.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include "ir/dtype.h" +#include "ir/func_graph.h" +#include "kernel/kernel.h" +#include "kernel/common_utils.h" +#include "kernel/tbe/tbe_utils.h" +#include "kernel/akg/ascend/akg_ascend_kernel_mod.h" +#include "kernel/akg/akg_kernel_attrs_process.h" +#include "session/anf_runtime_algorithm.h" + +namespace mindspore { +namespace kernel { + +constexpr int32_t PARALLEL_ARGS_SIZE = 3; +constexpr int32_t PROCESS_NUM = 16; +constexpr int32_t TIME_OUT = 300; + +constexpr auto kOpDesc = "op_desc"; +constexpr auto kShape = "shape"; +constexpr auto kDataType = "data_type"; +constexpr auto kInputDesc = "input_desc"; +constexpr auto kOutputDesc = "output_desc"; +constexpr auto kTensorName = "tensor_name"; +constexpr auto kCompileAkgKernelParallelFunc = "compile_akg_kernel_parallel"; +constexpr auto kMultiProcModule = "mindspore._extends.parallel_compile.akg_compiler.multi_process_compiler"; + +bool AkgAscendKernelBuilder::CollectJson(const AnfNodePtr &anf_node) { + MS_EXCEPTION_IF_NULL(anf_node); + std::string op_name = AnfAlgo::GetCNodeName(anf_node); + MS_LOG(INFO) << "AKG start compile, op[" << op_name << "], device[" << AkgKernelBuild::GetProcessor(anf_node) << "]"; + auto it = kAkgKernelAttrsProcessMap.find(op_name); + if (it != kAkgKernelAttrsProcessMap.end()) { + it->second(anf_node); + } + MS_LOG(INFO) << "Akg start compile, op[" << op_name << "], device[" << AkgKernelBuild::GetProcessor(anf_node) << "]"; + nlohmann::json node_json; + if (!GenerateSingleKernelJson(anf_node, op_name, &node_json)) { + MS_LOG(ERROR) << "Op[" << op_name << "] create single kernel json failed."; + } + + kernel_json_ = node_json.dump(); + + if (!GetIOSize(node_json, &input_size_list_, &output_size_list_)) { + MS_LOG(ERROR) << "Cal mem size failed."; + return false; + } + + return true; +} + +bool AkgAscendKernelBuilder::CollectFusedJson(const 
std::vector &anf_nodes, + const std::vector &input_list, + const std::vector &output_list) { + if (anf_nodes.empty() || input_list.empty()) { + MS_LOG(ERROR) << "Invalid input size, anf_nodes [" << anf_nodes.size() << "], input_list [" << input_list.size() + << "]."; + return false; + } + MS_LOG(INFO) << "anf_nodes [" << output_list.size() << "], input_list [" << anf_nodes.size() << "], output_list [" + << input_list.size() << "]."; + + std::map node_json_map; + + for (auto const &anf_node : anf_nodes) { + MS_EXCEPTION_IF_NULL(anf_node); + std::string op_name = AnfAlgo::GetCNodeName(anf_node); + if (!AnfAlgo::IsRealKernel(anf_node)) { + MS_LOG(ERROR) << "Invalid anf node to build [" << anf_node->fullname_with_scope() << "]."; + return false; + } + auto it = kAkgKernelAttrsProcessMap.find(op_name); + if (it != kAkgKernelAttrsProcessMap.end()) { + it->second(anf_node); + } + + nlohmann::json node_json; + if (!GenerateSingleKernelJson(anf_node, op_name, &node_json)) { + MS_LOG(ERROR) << "Op [" << op_name << "] create single kernel json failed."; + return false; + } + // No need for composite op. + node_json.erase("id"); + node_json.erase("op"); + node_json.erase("composite"); + + auto primitive = AnfAlgo::GetCNodePrimitive(anf_node); + MS_EXCEPTION_IF_NULL(primitive); + + if (primitive->GetAttr("fusion") != nullptr) { + node_json["fusion"] = primitive->GetAttr("fusion")->ToString(); + } + + node_json_map[anf_node] = node_json; + } + + for (auto const &anf_node : anf_nodes) { + std::vector dyn_input_sizes; + auto primitive = AnfAlgo::GetCNodePrimitive(anf_node); + MS_EXCEPTION_IF_NULL(primitive); + + if (primitive->GetAttr(kAttrDynInputSizes) != nullptr) { + dyn_input_sizes = GetValue>(primitive->GetAttr(kAttrDynInputSizes)); + } + + bool is_dynamic_input = !dyn_input_sizes.empty(); + size_t input_num = is_dynamic_input ? 
dyn_input_sizes.size() : AnfAlgo::GetInputTensorNum(anf_node); + size_t real_input_index = 0; + for (size_t i = 0; i < input_num; ++i) { + size_t input_tensor_num = is_dynamic_input ? IntToSize(dyn_input_sizes[i]) : 1; + for (size_t j = 0; j < input_tensor_num; ++j) { + auto tmp_input = GetKernelInput(anf_node, real_input_index); + std::string tensor_name = GetTensorName(node_json_map[anf_node], kInputDesc, std::make_pair(i, j)); + if (node_json_map.find(tmp_input.first) != node_json_map.end()) { + std::string new_tensor_name = + GetTensorName(node_json_map[tmp_input.first], kOutputDesc, std::make_pair(0, tmp_input.second)); + SetTensorName(kInputDesc, new_tensor_name, std::make_pair(i, j), &(node_json_map[anf_node])); + MS_LOG(DEBUG) << "Update [" << real_input_index << "] input [" << tensor_name << "] of [" + << anf_node->fullname_with_scope() << "] to [" << tmp_input.second << "] output [" + << new_tensor_name << "] of [" << tmp_input.first->fullname_with_scope() << "]."; + } else { + MS_LOG(DEBUG) << "[" << real_input_index << "] input " << tensor_name << "] of [" + << anf_node->fullname_with_scope() << "] is out input."; + } + real_input_index++; + } + } + } + + nlohmann::json fused_node_json; + std::vector node_json_desc; + std::transform(anf_nodes.begin(), anf_nodes.end(), std::back_inserter(node_json_desc), + [&node_json_map](const AnfNodePtr &anf_node) { return node_json_map[anf_node]; }); + fused_node_json[kOpDesc] = node_json_desc; + + nlohmann::json inputs_json; + auto input_index = GetInputIndex(anf_nodes, input_list); + for (size_t i = 0; i < input_index.size(); ++i) { + auto tmp_input = input_index[i]; + auto type_id = AnfAlgo::GetInputDeviceDataType(tmp_input.first, tmp_input.second.first); + std::string dtype = TypeId2String(type_id); + nlohmann::json input_desc_json; + input_desc_json[kTensorName] = GetTensorName(node_json_map[tmp_input.first], kInputDesc, tmp_input.second); + input_desc_json[kDataType] = dtype; + input_desc_json[kShape] = 
AnfAlgo::GetInputDeviceShape(tmp_input.first, tmp_input.second.first); + inputs_json.emplace_back(std::vector{input_desc_json}); + } + fused_node_json[kInputDesc] = inputs_json; + + nlohmann::json outputs_json; + auto output_index = GetOutputIndex(anf_nodes, input_list, output_list); + for (size_t i = 0; i < output_index.size(); ++i) { + auto tmp_output = output_index[i]; + bool found = false; + nlohmann::json output_desc_json; + for (size_t input_i = 0; input_i < input_list.size(); ++input_i) { + if (tmp_output.first == input_list[input_i]) { + output_desc_json = inputs_json[input_i][0]; + found = true; + break; + } + } + if (!found) { + auto type_id = AnfAlgo::GetOutputDeviceDataType(tmp_output.first, tmp_output.second); + std::string dtype = TypeId2String(type_id); + output_desc_json[kTensorName] = + GetTensorName(node_json_map[tmp_output.first], kOutputDesc, std::make_pair(0, tmp_output.second)); + output_desc_json[kDataType] = dtype; + auto output_shape = AnfAlgo::GetOutputDeviceShape(tmp_output.first, tmp_output.second); + if (output_shape.empty()) { + output_shape.push_back(1); + } + output_desc_json[kShape] = output_shape; + } + outputs_json.emplace_back(output_desc_json); + } + fused_node_json[kOutputDesc] = outputs_json; + + size_t hash_id = std::hash()(fused_node_json.dump()); + json_name_ = "Fused_"; + auto fg = anf_nodes[0]->func_graph(); + MS_EXCEPTION_IF_NULL(fg); + auto attr_val = fg->get_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL); + if (attr_val != nullptr) { + auto fg_attr = GetValue(attr_val); + (void)json_name_.append(fg_attr).append("_"); + } + (void)json_name_.append(std::to_string(hash_id)); + fused_node_json["composite_graph"] = fg->ToString(); + fused_node_json["op"] = json_name_; + fused_node_json["platform"] = "AKG"; + fused_node_json["process"] = "aicore"; + fused_node_json["composite"] = true; + + kernel_json_ = fused_node_json.dump(); + + if (!GetIOSize(fused_node_json, &input_size_list_, &output_size_list_)) { + MS_LOG(ERROR) << "Cal mem size 
failed."; + return false; + } + + return true; +} + +void GenParallelCompileFuncArgs(const std::vector &kernel_jsons, PyObject **p_args) { + MS_EXCEPTION_IF_NULL(p_args); + *p_args = PyTuple_New(PARALLEL_ARGS_SIZE); + + PyObject *arg1 = PyList_New(kernel_jsons.size()); + for (int i = 0; i < PyList_Size(arg1); ++i) { + PyList_SetItem(arg1, i, Py_BuildValue("s", kernel_jsons[i].c_str())); + } + PyObject *arg2 = Py_BuildValue("i", PROCESS_NUM); + PyObject *arg3 = Py_BuildValue("i", TIME_OUT); + + (void)PyTuple_SetItem(*p_args, 0, arg1); + (void)PyTuple_SetItem(*p_args, 1, arg2); + (void)PyTuple_SetItem(*p_args, 2, arg3); +} + +bool AkgOpParallelBuild(const std::vector> &build_args) { + // Remove cached nodes, gether unique nodes, and collect repeated nodes which need postprecess. + std::vector jsons; + std::unordered_set json_name_set; + std::vector> repeat_nodes; + for (const auto &[builder, anf_node] : build_args) { + MS_EXCEPTION_IF_NULL(anf_node); + auto json_name = builder.json_name(); + MS_LOG(DEBUG) << "Akg start compile op: " << json_name; + auto cached_kernel_pack = tbe::TbeUtils::SearchCache(json_name, AkgKernelBuild::GetProcessor(anf_node)); + if (cached_kernel_pack != nullptr) { + MS_LOG(DEBUG) << "Use cached kernel, json_name_[" << json_name << "], fullname_with_scope[" + << anf_node->fullname_with_scope() << "]."; + auto kernel_mod_ptr = std::make_shared(cached_kernel_pack); + kernel_mod_ptr->SetInputSizeList(builder.input_size_list()); + kernel_mod_ptr->SetOutputSizeList(builder.output_size_list()); + AnfAlgo::SetKernelMod(kernel_mod_ptr, anf_node.get()); + continue; + } + + if (json_name_set.count(json_name) != 0) { + repeat_nodes.push_back({builder, anf_node}); + continue; + } + json_name_set.insert(json_name); + auto node_json = builder.kernel_json(); + kernel::SaveJsonInfo(json_name, node_json); + jsons.push_back(node_json); + } + + // No nodes need to be compiled! 
+ if (jsons.empty()) { + return true; + } + + // Try to call python method to compile nodes parallely. + PyObject *p_module = nullptr; + PyObject *p_func = nullptr; + PyObject *p_arg = nullptr; + PyObject *p_res = nullptr; + + p_module = PyImport_ImportModule(kMultiProcModule); + if (p_module == nullptr) { + MS_LOG(ERROR) << "Failed to import [" << kMultiProcModule << "]."; + return false; + } + + p_func = PyObject_GetAttrString(p_module, kCompileAkgKernelParallelFunc); + GenParallelCompileFuncArgs(jsons, &p_arg); + MS_LOG(DEBUG) << "Call function [" << kCompileAkgKernelParallelFunc << "], try to compile " << jsons.size() + << " Akg kernels parallelly."; + p_res = PyEval_CallObject(p_func, p_arg); + if (p_res == nullptr) { + PyErr_Print(); + MS_LOG(ERROR) << "No ret got, failed to call function [" << kCompileAkgKernelParallelFunc << "], args:\n(" + << AkgKernelBuild::PyObjectToStr(p_arg) << ")."; + return false; + } + if (PyObject_IsTrue(p_res) != 1) { + PyErr_Print(); + MS_LOG(ERROR) << "Illegal ret, failed to call function [" << kCompileAkgKernelParallelFunc << "], args:\n(" + << AkgKernelBuild::PyObjectToStr(p_arg) << ")."; + return false; + } + + // All unique done here, cache them and set kernel. + for (const auto &[builder, anf_node] : build_args) { + auto json_name = builder.json_name(); + auto new_kernel_pack = tbe::TbeUtils::InsertCache(json_name, AkgKernelBuild::GetProcessor(anf_node)); + if (new_kernel_pack == nullptr) { + MS_LOG(ERROR) << "Insert to cache failed, json_name_[" << json_name << "], fullname_with_scope[" + << anf_node->fullname_with_scope() << "]."; + return false; + } + auto kernel_mod_ptr = std::make_shared(new_kernel_pack); + kernel_mod_ptr->SetInputSizeList(builder.input_size_list()); + kernel_mod_ptr->SetOutputSizeList(builder.output_size_list()); + AnfAlgo::SetKernelMod(kernel_mod_ptr, anf_node.get()); + MS_LOG(DEBUG) << "Akg compile " << json_name << " kernel and insert cache successfully!"; + } + + // Handle repeated nodes. 
+ for (const auto &[builder, anf_node] : repeat_nodes) { + auto node_json = builder.kernel_json(); + auto json_name = builder.json_name(); + auto cached_kernel_pack = tbe::TbeUtils::SearchCache(json_name, AkgKernelBuild::GetProcessor(anf_node)); + if (cached_kernel_pack == nullptr) return false; + MS_LOG(INFO) << "Use just compiled kernel, json_name_[" << json_name << "], fullname_with_scope[" + << anf_node->fullname_with_scope() << "]."; + auto kernel_mod_ptr = std::make_shared(cached_kernel_pack); + kernel_mod_ptr->SetInputSizeList(builder.input_size_list()); + kernel_mod_ptr->SetOutputSizeList(builder.output_size_list()); + AnfAlgo::SetKernelMod(kernel_mod_ptr, anf_node.get()); + } + + return true; +} + +bool AkgAscendKernelParallelBuild(const std::vector &anf_nodes) { + std::vector> json_and_node; + for (const auto &anf_node : anf_nodes) { + MS_EXCEPTION_IF_NULL(anf_node); + AkgAscendKernelBuilder akg_cce_kernel_builder; + KernelPackPtr kernel_pack = nullptr; + auto cnode = anf_node->cast(); + MS_EXCEPTION_IF_NULL(cnode); + if (AnfAlgo::IsGraphKernel(cnode)) { + auto func_graph = AnfAlgo::GetCNodeFuncGraphPtr(cnode); + auto mng = func_graph->manager(); + if (mng == nullptr) { + mng = Manage(func_graph, true); + func_graph->set_manager(mng); + } + MS_EXCEPTION_IF_NULL(func_graph); + std::vector node_list; + std::vector input_list; + std::vector output_list; + std::string op_name = AnfAlgo::GetCNodeName(anf_node); + MS_LOG(INFO) << "Akg start compile composite op[" << op_name << "]"; + GetValidKernelNodes(func_graph, &node_list, &input_list, &output_list); + if (!akg_cce_kernel_builder.CollectFusedJson(node_list, input_list, output_list)) { + MS_EXCEPTION(UnknownError) << "Akg build failed composite op[" << op_name << "]."; + } + } else { + if (!akg_cce_kernel_builder.CollectJson(anf_node)) { + MS_EXCEPTION(UnknownError) << "Akg build failed op[" << AnfAlgo::GetCNodeName(anf_node) << "]."; + } + } + json_and_node.push_back({akg_cce_kernel_builder, anf_node}); + } 
+ + if (json_and_node.empty()) { + MS_LOG(DEBUG) << "There is no kernel needed to be compiled."; + return true; + } + + return AkgOpParallelBuild(json_and_node); +} + +} // namespace kernel +} // namespace mindspore diff --git a/mindspore/ccsrc/kernel/akg/ascend/akg_ascend_kernel_build.h b/mindspore/ccsrc/kernel/akg/ascend/akg_ascend_kernel_build.h new file mode 100644 index 0000000000..619b583fde --- /dev/null +++ b/mindspore/ccsrc/kernel/akg/ascend/akg_ascend_kernel_build.h @@ -0,0 +1,52 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_KERNEL_AKG_ASCEND_AKG_ASCEND_KERNEL_BUILD_H_ +#define MINDSPORE_CCSRC_KERNEL_AKG_ASCEND_AKG_ASCEND_KERNEL_BUILD_H_ + +#include +#include +#include +#include "ir/anf.h" +#include "kernel/kernel.h" +#include "kernel/akg/akg_kernel_build.h" + +namespace mindspore { +namespace kernel { +class AkgAscendKernelBuilder : public AkgKernelBuild { + public: + AkgAscendKernelBuilder() = default; + ~AkgAscendKernelBuilder() = default; + + bool CollectJson(const AnfNodePtr &anf_node); + bool CollectFusedJson(const std::vector &anf_nodes, const std::vector &input_list, + const std::vector &output_list); + std::string json_name() const { return json_name_; } + std::string kernel_json() const { return kernel_json_; } + const std::vector &input_size_list() const { return input_size_list_; } + const std::vector &output_size_list() const { return output_size_list_; } + + private: + std::string kernel_json_; + std::vector input_size_list_; + std::vector output_size_list_; +}; + +bool AkgAscendKernelParallelBuild(const std::vector &anf_nodes); +} // namespace kernel +} // namespace mindspore + +#endif // MINDSPORE_CCSRC_KERNEL_AKG_ASCEND_AKG_ASCEND_KERNEL_BUILD_H_ diff --git a/mindspore/ccsrc/kernel/akg/ascend/akg_ascend_kernel_mod.cc b/mindspore/ccsrc/kernel/akg/ascend/akg_ascend_kernel_mod.cc new file mode 100644 index 0000000000..24324f70e0 --- /dev/null +++ b/mindspore/ccsrc/kernel/akg/ascend/akg_ascend_kernel_mod.cc @@ -0,0 +1,181 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel/akg/ascend/akg_ascend_kernel_mod.h" +#include +#include +#include +#include +#include +#include +#include +#include "nlohmann/json.hpp" +#include "runtime/rt.h" +#include "utils/log_adapter.h" +#include "utils/convert_utils.h" + +namespace mindspore { +namespace kernel { +using std::fstream; +using std::map; +using std::mutex; +using std::string; +using TbeTaskInfoPtr = std::shared_ptr; +using tbe::KernelManager; +constexpr uint32_t DEFAULT_BLOCK_DIM = 1; +/** + * @brief infotable contain func_stub\blockdim\kernel file buffer + */ +AkgKernelMod::AkgKernelMod(const KernelPackPtr &kernel_pack) : kernel_pack_(kernel_pack) {} + +void AkgKernelMod::SetInputSizeList(const std::vector &size_list) { input_size_list_ = size_list; } + +void AkgKernelMod::SetOutputSizeList(const std::vector &size_list) { output_size_list_ = size_list; } + +void AkgKernelMod::SetWorkspaceSizeList(const std::vector &size_list) { workspace_size_list_ = size_list; } + +const std::vector &AkgKernelMod::GetInputSizeList() const { return input_size_list_; } + +const std::vector &AkgKernelMod::GetOutputSizeList() const { return output_size_list_; } + +const std::vector &AkgKernelMod::GetWorkspaceSizeList() const { return workspace_size_list_; } + +void DumpData(const std::vector &inputs, const std::vector &outputs) { + const char *dump_data = getenv("MS_KERNEL_DUMP_DATA"); + if (dump_data) { + int idx = 0; + for (const auto &x : inputs) { + std::vector buf(x->size); + if (RT_ERROR_NONE != rtMemcpy(buf.data(), buf.size(), reinterpret_cast(x->addr), x->size, + RT_MEMCPY_DEVICE_TO_HOST)) { + MS_LOG(WARNING) << "Call runtime rtMemcpy error."; + return; + } + + std::string file_name("input_"); + file_name += std::to_string(idx); + std::ofstream file(file_name, std::ios::binary); + if (file.is_open()) { + (void)file.write(buf.data(), SizeToLong(buf.size())); + file.close(); + 
idx++; + } else { + MS_LOG(ERROR) << "Open file failed."; + return; + } + } + idx = 0; + for (const auto &x : outputs) { + std::vector buf(x->size); + if (RT_ERROR_NONE != rtMemcpy(buf.data(), buf.size(), reinterpret_cast(x->addr), x->size, + RT_MEMCPY_DEVICE_TO_HOST)) { + MS_LOG(WARNING) << "Call runtime rtMemcpy error."; + return; + } + + std::string file_name("output_"); + file_name += std::to_string(idx); + std::ofstream file(file_name, std::ios::binary); + if (file.is_open()) { + (void)file.write(buf.data(), SizeToLong(buf.size())); + file.close(); + idx++; + } else { + MS_LOG(ERROR) << "Open file failed."; + return; + } + } + } +} + +bool AkgKernelMod::Launch(const std::vector &inputs, const std::vector &, + const std::vector &outputs, void *stream_ptr) { + if (stream_ptr == 0) { + MS_LOG(ERROR) << "stream_ptr should not be nullptr."; + return false; + } + + if (kernel_pack_ == nullptr) { + MS_LOG(ERROR) << "kernel pack should not be nullptr."; + return false; + } + + uint32_t block_dim = DEFAULT_BLOCK_DIM; // default blockdim equal to 1. + auto func_stub = KernelManager::GenFuncStub(*kernel_pack_, false, &block_dim); + if (func_stub == 0) { + MS_LOG(ERROR) << "GenFuncStub failed."; + return false; + } + + // pack all addresses into a vector. 
+ std::vector runtime_args; + (void)std::transform(std::begin(inputs), std::end(inputs), std::back_inserter(runtime_args), + [](const AddressPtr &input) -> void * { return input->addr; }); + (void)std::transform(std::begin(outputs), std::end(outputs), std::back_inserter(runtime_args), + [](const AddressPtr &output) -> void * { return output->addr; }); + + rtL2Ctrl_t *l2ctrl = nullptr; + auto stream = reinterpret_cast(stream_ptr); + if (RT_ERROR_NONE != rtKernelLaunch(reinterpret_cast(func_stub), block_dim, runtime_args.data(), + SizeToUint(sizeof(void *) * runtime_args.size()), l2ctrl, stream)) { + MS_LOG(ERROR) << "Call runtime rtKernelLaunch error."; + return false; + } + + DumpData(inputs, outputs); + + return true; +} + +std::vector AkgKernelMod::GenTask(const std::vector &inputs, const std::vector &, + const std::vector &outputs, uint32_t stream_id) { + if (kernel_pack_ == nullptr) { + MS_LOG(EXCEPTION) << "kernel pack should not be nullptr."; + } + + std::vector args; + uint32_t args_size = 0; + std::vector sm_desc; + void *binary = nullptr; + uint32_t binary_size = 0; + std::vector meta_data; + std::vector input_data_addrs; + std::vector output_data_addrs; + std::vector workspace_addrs; + + // pack all addresses into a vector. + (void)std::transform(std::begin(inputs), std::end(inputs), std::back_inserter(input_data_addrs), + [](const AddressPtr &input) -> void * { return input->addr; }); + (void)std::transform(std::begin(outputs), std::end(outputs), std::back_inserter(output_data_addrs), + [](const AddressPtr &output) -> void * { return output->addr; }); + + uint32_t block_dim = DEFAULT_BLOCK_DIM; // default blockdim equal to 1. 
+ auto func_stub = KernelManager::GenFuncStub(*kernel_pack_, false, &block_dim); + if (func_stub == 0) { + MS_LOG(EXCEPTION) << "GenFuncStub failed."; + } + + std::string stub_func = KernelManager::GetStubFuncName(kernel_pack_); + + MS_LOG(DEBUG) << "The block_dim is:" << block_dim; + + TbeTaskInfoPtr task_info_ptr = make_shared( + stream_id, stub_func, block_dim, args, args_size, sm_desc, binary, binary_size, meta_data, input_data_addrs, + output_data_addrs, workspace_addrs); + return {task_info_ptr}; +} +} // namespace kernel +} // namespace mindspore diff --git a/mindspore/ccsrc/kernel/akg/ascend/akg_ascend_kernel_mod.h b/mindspore/ccsrc/kernel/akg/ascend/akg_ascend_kernel_mod.h new file mode 100644 index 0000000000..18d342f629 --- /dev/null +++ b/mindspore/ccsrc/kernel/akg/ascend/akg_ascend_kernel_mod.h @@ -0,0 +1,54 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_KERNEL_AKG_ASCEND_AKG_ASCEND_KERNEL_MOD_H_ +#define MINDSPORE_CCSRC_KERNEL_AKG_ASCEND_AKG_ASCEND_KERNEL_MOD_H_ +#include +#include +#include +#include "kernel/ascend_kernel_mod.h" +#include "kernel/tbe/tbe_utils.h" + +namespace mindspore { +namespace kernel { +class AkgKernelMod : public AscendKernelMod { + public: + explicit AkgKernelMod(const KernelPackPtr &kernel_pack); + ~AkgKernelMod() final {} + + void SetInputSizeList(const std::vector &size_list); + void SetOutputSizeList(const std::vector &size_list); + void SetWorkspaceSizeList(const std::vector &size_list); + const std::vector &GetInputSizeList() const override; + const std::vector &GetOutputSizeList() const override; + const std::vector &GetWorkspaceSizeList() const override; + bool Launch(const std::vector &inputs, const std::vector &workspace, + const std::vector &outputs, void *stream_ptr) override; + std::vector GenTask(const std::vector &inputs, const std::vector &workspace, + const std::vector &outputs, uint32_t stream_id) override; + + private: + KernelPackPtr kernel_pack_; + std::vector input_size_list_; + std::vector output_size_list_; + std::vector workspace_size_list_; +}; + +using AkgKernelModPtr = std::shared_ptr; +} // namespace kernel +} // namespace mindspore + +#endif // MINDSPORE_CCSRC_KERNEL_AKG_ASCEND_AKG_ASCEND_KERNEL_MOD_H_ diff --git a/mindspore/ccsrc/kernel/akg/gpu/akg_gpu_kernel_build.cc b/mindspore/ccsrc/kernel/akg/gpu/akg_gpu_kernel_build.cc index 2bb2cfd267..534e355802 100644 --- a/mindspore/ccsrc/kernel/akg/gpu/akg_gpu_kernel_build.cc +++ b/mindspore/ccsrc/kernel/akg/gpu/akg_gpu_kernel_build.cc @@ -18,7 +18,7 @@ #include #include #include "kernel/kernel.h" -#include "kernel/akg/akgkernelbuild.h" +#include "kernel/akg/akg_kernel_build.h" #include "kernel/akg/gpu/akg_gpu_kernel_mod.h" #include "common/utils.h" diff --git a/mindspore/ccsrc/kernel/common_utils.cc b/mindspore/ccsrc/kernel/common_utils.cc index 2769e0c42a..868abeb1cc 100644 --- 
a/mindspore/ccsrc/kernel/common_utils.cc +++ b/mindspore/ccsrc/kernel/common_utils.cc @@ -18,10 +18,17 @@ #include #include #include +#include #include +#include #include "nlohmann/json.hpp" #include "session/anf_runtime_algorithm.h" #include "common/utils.h" +#include "ir/manager.h" +#include "ir/meta_tensor.h" +#include "ir/func_graph.h" +#include "operator/ops.h" +#include "utils/graph_utils.h" namespace mindspore { namespace kernel { @@ -47,12 +54,6 @@ const std::map type_id_str_map = { {TypeId::kNumberTypeBool, "bool"}, }; -const std::map DATATYPE_STRING_MAP{ - {"Float32", "float32"}, {"Float16", "float16"}, {"Int8", "int8"}, {"Int16", "int16"}, - {"UInt16", "uint16"}, {"UInt8", "uint8"}, {"Int32", "int32"}, {"UInt32", "uint32"}, - {"Int64", "int64"}, {"UInt64", "uint64"}, {"Bool_", "bool"}, {"Float64", "double"}, -}; - const std::unordered_map dtype_shortdtype_map_ = { {"float16", "f16"}, {"float32", "f32"}, {"float64", "f64"}, {"int8", "i8"}, {"int16", "i16"}, {"int32", "i32"}, {"int64", "i64"}, {"uint8", "u8"}, {"uint16", "u16"}, {"uint32", "u32"}, {"uint64", "u64"}, {"bool", "bool"}, @@ -70,50 +71,6 @@ const std::unordered_map fusion_type_maps = { {"SEGMENT", FusionType::SEGMENT}, {"OPAQUE", FusionType::OPAQUE}, }; -bool IsAtomicNode(const CNodePtr &kernel_node) { - MS_EXCEPTION_IF_NULL(kernel_node); - auto kernel_mod = AnfAlgo::GetKernelMod(kernel_node); - MS_EXCEPTION_IF_NULL(kernel_mod); - auto parameters_indexs = kernel_mod->GenParameters(); - if (parameters_indexs.empty()) { - return false; - } - auto atomic_flag = false; - size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); - size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); - auto workspace_size_list = kernel_mod->GetWorkspaceSizeList(); - size_t workspace_num = kernel_mod->GetWorkspaceSizeList().size(); - if (input_num + workspace_num + output_num > parameters_indexs.size()) { - size_t lossNum = (input_num + workspace_num + output_num) - parameters_indexs.size(); - for (size_t i 
= 0; i < lossNum; i++) { - parameters_indexs.push_back(0); - } - } - std::vector clean_output_indexs; - // in parameters data sort as input->workspace->output - size_t index = 0; - while (index < output_num) { - if (parameters_indexs[input_num + workspace_num + index] == 1) { - atomic_flag = true; - clean_output_indexs.push_back(SizeToInt(index)); - } - index++; - } - if (atomic_flag) { - AnfAlgo::SetNodeAttr(kAttrAutomicOutputIndexs, MakeValue(clean_output_indexs), kernel_node); - } - for (size_t i = 0; i < workspace_num; ++i) { - if (parameters_indexs[input_num + i] == 1) { - atomic_flag = true; - AnfAlgo::SetNodeAttr(kAttrAutomicWorkspaceSize, - MakeValue(std::accumulate(workspace_size_list.begin(), workspace_size_list.end(), 0)), - kernel_node); - break; - } - } - return atomic_flag; -} - void KernelMeta::Initialize() { kernel_meta_path_ = std::string(kGpuKernelMeta) + "_" + std::to_string(getpid()) + "/"; // remove old kernel cache @@ -242,14 +199,6 @@ TypeId DtypeToTypeId(const std::string &dtypes) { } } -std::string Dtype2String(const std::string &dtypes) { - auto iter = DATATYPE_STRING_MAP.find(dtypes); - if (iter == DATATYPE_STRING_MAP.end()) { - MS_EXCEPTION(ArgumentError) << "Illegal input dtype:" << dtypes; - } - return iter->second; -} - std::string TypeId2String(TypeId type_id) { auto iter = type_id_str_map.find(type_id); if (iter == type_id_str_map.end()) { @@ -360,7 +309,7 @@ bool SetOutputKernelBuilderInfo(const std::vector> &ou output_num = 1; } else { if (output_idx < real_output_num) { - MS_LOG(INFO) << "Set output kernel builder info, output type is optional, output index is :" << output_idx; + MS_LOG(DEBUG) << "Set output kernel builder info, output type is optional, output index is :" << output_idx; output_num = 1; } } @@ -402,7 +351,7 @@ void SetKernelBuildInfo(const std::shared_ptrSetKernelType(AUTO_DIFF_KERNEL); + builder->SetKernelType(AKG_KERNEL); } else if (imply_type == kAICPU) { builder->SetKernelType(AICPU_KERNEL); } else { @@ -525,5 
+474,429 @@ std::string GetProcessor(const AnfNodePtr &anf_node) { } return device; } + +bool IsSameShape(const std::vector &shape_a, const std::vector &shape_b) { + if (shape_a.size() != shape_b.size()) { + return false; + } + for (size_t i = 0; i < shape_a.size(); ++i) { + if (shape_a[i] != shape_b[i]) { + return false; + } + } + return true; +} + +int Sign(float x) { + if (x > 0) { + return 1; + } + if (x < 0) { + return -1; + } + return 0; +} + +void DeduplicateIndexedSlices(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad, size_t first_dim, + size_t outer_dim) { + MS_EXCEPTION_IF_NULL(origin_sparse_grad.value_); + MS_EXCEPTION_IF_NULL(origin_sparse_grad.indices_); + MS_EXCEPTION_IF_NULL(unique_grad); + MS_EXCEPTION_IF_NULL(unique_grad->value_); + MS_EXCEPTION_IF_NULL(unique_grad->indices_); + std::unordered_map index_map; + size_t unique_indices_size = 0; + for (size_t i = 0; i < origin_sparse_grad.indices_size_; ++i) { + int index = origin_sparse_grad.indices_[i]; + if (index < 0 || IntToSize(index) >= first_dim) { + continue; + } + auto iter = index_map.find(index); + if (iter == index_map.end()) { + index_map[index] = unique_indices_size; + unique_grad->indices_[unique_indices_size] = index; + size_t start_index = unique_indices_size * outer_dim; + size_t end_index = start_index + outer_dim; + for (size_t j = start_index, k = i * outer_dim; j < end_index; ++j, ++k) { + unique_grad->value_[j] = origin_sparse_grad.value_[k]; + } + unique_indices_size++; + } else { + size_t first_index = iter->second; + size_t start_index = first_index * outer_dim; + size_t end_index = start_index + outer_dim; + for (size_t j = start_index, k = i * outer_dim; j < end_index; ++j, ++k) { + unique_grad->value_[j] += origin_sparse_grad.value_[k]; + } + } + } + unique_grad->indices_size_ = unique_indices_size; +} + +struct WorkerParamsForReduceSparseGradient { + size_t slice_start_{0}; + size_t slice_end_{0}; + size_t max_length_{0}; + size_t outer_dim_{0}; + 
std::vector> *sorted_indices_{nullptr}; + std::vector *slice_positions_{nullptr}; + float *src_value_{nullptr}; + SparseGradient *unique_grad_{nullptr}; +}; + +void WorkerForReduceSparseGradient(WorkerParamsForReduceSparseGradient param) { + MS_EXCEPTION_IF_NULL(param.sorted_indices_); + MS_EXCEPTION_IF_NULL(param.slice_positions_); + MS_EXCEPTION_IF_NULL(param.src_value_); + MS_EXCEPTION_IF_NULL(param.unique_grad_); + auto outer_dim = param.outer_dim_; + auto &sorted_indices = *(param.sorted_indices_); + auto &slice_positions = *(param.slice_positions_); + auto unique_grad = param.unique_grad_; + for (size_t slice_id = param.slice_start_; slice_id < param.slice_end_; ++slice_id) { + size_t cur_pos = slice_positions[slice_id]; + int index = sorted_indices[cur_pos].first; + unique_grad->indices_[slice_id] = index; + size_t start_index = slice_id * outer_dim; + auto ret_code = memcpy_s(unique_grad->value_ + start_index, (param.max_length_ - start_index) * sizeof(float), + param.src_value_ + sorted_indices[cur_pos].second, outer_dim * sizeof(float)); + if (ret_code != EOK) { + MS_LOG(EXCEPTION) << "Failed to copy data!"; + } + cur_pos++; + size_t end_pos; + if (slice_id + 1 < slice_positions.size()) { + end_pos = slice_positions[slice_id + 1]; + } else { + end_pos = sorted_indices.size(); + } + while (cur_pos < end_pos) { + for (size_t i = 0; i < outer_dim; ++i) { + unique_grad->value_[start_index + i] += param.src_value_[sorted_indices[cur_pos].second + i]; + } + cur_pos++; + } + } +} + +void ReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad, size_t first_dim, + size_t outer_dim) { + MS_EXCEPTION_IF_NULL(origin_sparse_grad.value_); + MS_EXCEPTION_IF_NULL(origin_sparse_grad.indices_); + MS_EXCEPTION_IF_NULL(unique_grad); + MS_EXCEPTION_IF_NULL(unique_grad->value_); + MS_EXCEPTION_IF_NULL(unique_grad->indices_); + std::vector> sorted_indices; + sorted_indices.reserve(origin_sparse_grad.indices_size_); + for (size_t i = 0; i < 
origin_sparse_grad.indices_size_; ++i) { + int index = origin_sparse_grad.indices_[i]; + if (index >= 0 && IntToSize(index) < first_dim) { + sorted_indices.emplace_back(std::pair(index, i * outer_dim)); + } + } + std::sort( + sorted_indices.begin(), sorted_indices.end(), + [](const std::pair &left, const std::pair &right) { return left.first < right.first; }); + int last_index = 0; + std::vector slice_positions; + for (size_t i = 0; i < sorted_indices.size(); ++i) { + if (i == 0 || last_index != sorted_indices[i].first) { + slice_positions.emplace_back(i); + } + last_index = sorted_indices[i].first; + } + size_t thread_num = 8; + if (slice_positions.size() < thread_num) { + thread_num = slice_positions.size(); + } + size_t stride = (slice_positions.size() + thread_num - 1) / thread_num; + thread_num = (slice_positions.size() + stride - 1) / stride; + std::vector threads; + size_t max_length = sorted_indices.size() * outer_dim; + for (size_t i = 0; i < thread_num; ++i) { + size_t slice_start = i * stride; + size_t slice_end = 0; + if (i == thread_num - 1) { + slice_end = slice_positions.size(); + } else { + slice_end = slice_start + stride; + } + WorkerParamsForReduceSparseGradient params{ + slice_start, slice_end, max_length, outer_dim, &sorted_indices, &slice_positions, origin_sparse_grad.value_, + unique_grad}; + threads.emplace_back(std::thread(WorkerForReduceSparseGradient, params)); + } + for (size_t i = 0; i < thread_num; ++i) { + threads[i].join(); + } + unique_grad->indices_size_ = slice_positions.size(); +} + +std::pair GetKernelInput(const AnfNodePtr &anf_node, size_t index) { + MS_EXCEPTION_IF_NULL(anf_node); + + if (index >= AnfAlgo::GetInputTensorNum(anf_node)) { + MS_EXCEPTION(ArgumentError) << "Index is out of the size of anf_node inputs."; + } + + auto cnode = anf_node->cast(); + if (cnode == nullptr) { + return AnfAlgo::VisitKernel(anf_node, 0); + } else { + return AnfAlgo::VisitKernel(anf_node->cast()->input(index + 1), 0); + } +} + +std::vector>> 
GetInputIndex(const std::vector &node_list, + const std::vector &input_list) { + std::vector>> input_index; + for (size_t i = 0; i < input_list.size(); ++i) { + auto const &input = input_list[i]; + MS_EXCEPTION_IF_NULL(input); + bool found = false; + // using NodeUsersMap = std::unordered_map>>; + auto mng = input->func_graph()->manager(); + MS_EXCEPTION_IF_NULL(mng); + const NodeUsersMap &users = mng->node_users(); + auto input_users = users.find(input); + if (input_users == users.end() || input_users->second.empty()) { + MS_EXCEPTION(ArgumentError) << "Input [" << i << "][" << input->DebugString(2) << "] of [" + << input->func_graph()->ToString() << "] has no users."; + } + + for (auto const &input_user : input_users->second) { + for (auto const &anf_node : node_list) { + if (anf_node != input_user.first) { + continue; + } + + std::vector dyn_input_sizes; + auto prim = AnfAlgo::GetCNodePrimitive(anf_node); + MS_EXCEPTION_IF_NULL(prim); + if (prim->GetAttr(kAttrDynInputSizes) != nullptr) { + dyn_input_sizes = GetValue>(prim->GetAttr(kAttrDynInputSizes)); + } + + if (dyn_input_sizes.empty()) { + input_index.push_back(std::make_pair(anf_node, std::make_pair(IntToSize(input_user.second - 1), 0))); + found = true; + break; + } else { + int used_as_idx = input_user.second - 1; + int accum_idx = 0; + size_t dyn_i = 0; + for (; dyn_i < dyn_input_sizes.size(); ++dyn_i) { + accum_idx += dyn_input_sizes[dyn_i]; + if (used_as_idx < accum_idx) { + input_index.push_back(std::make_pair( + anf_node, std::make_pair(dyn_i, IntToSize(used_as_idx - (accum_idx - dyn_input_sizes[dyn_i]))))); + break; + } + } + if (dyn_i != dyn_input_sizes.size()) { + found = true; + break; + } + } + } + if (found) { + break; + } + } + + if (!found) { + MS_EXCEPTION(ArgumentError) << "Input [" << i << "][" << input->DebugString(2) << "] of [" + << input->func_graph()->ToString() << "] found no related kernel info."; + } + } + return input_index; +} + +std::vector> GetOutputIndex(const std::vector 
&node_list, + const std::vector &input_list, + const std::vector &output_list) { + std::vector> output_index; + for (size_t i = 0; i < output_list.size(); ++i) { + auto const &output = output_list[i]; + MS_EXCEPTION_IF_NULL(output); + bool found = false; + auto pree_node = AnfAlgo::VisitKernel(output, 0); + + auto pos = std::find(std::begin(node_list), std::end(node_list), pree_node.first); + if (pos != std::end(node_list)) { + output_index.push_back(pree_node); + continue; + } + + auto ret = std::find(std::begin(input_list), std::end(input_list), pree_node.first); + if (ret != std::end(input_list)) { + output_index.push_back(std::make_pair(pree_node.first, 0)); + found = true; + } + + if (!found) { + MS_EXCEPTION(ArgumentError) << "Output [" << i << "][" << output->DebugString(2) << "] of [" + << output->func_graph()->ToString() << "] found no related kernel info."; + } + } + return output_index; +} + +void GetValidKernelNodes(const FuncGraphPtr &func_graph, std::vector *node_list) { + MS_EXCEPTION_IF_NULL(node_list); + + MS_EXCEPTION_IF_NULL(func_graph); + + std::vector node_lists = TopoSort(func_graph->get_return()); + for (auto const &node : node_lists) { + if (!AnfAlgo::IsRealKernel(node) || !node->isa()) { + continue; + } + + auto cnode = node->cast(); + MS_EXCEPTION_IF_NULL(cnode); + + if (IsValueNode(cnode->input(kAnfPrimitiveIndex))) { + node_list->push_back(node); + } + } +} + +void GetValidKernelNodes(const FuncGraphPtr &func_graph, std::vector *node_list, + std::vector *input_list, std::vector *output_list) { + MS_EXCEPTION_IF_NULL(node_list); + MS_EXCEPTION_IF_NULL(input_list); + MS_EXCEPTION_IF_NULL(output_list); + MS_EXCEPTION_IF_NULL(func_graph); + + GetValidKernelNodes(func_graph, node_list); + + auto parameters = func_graph->parameters(); + input_list->insert(input_list->begin(), parameters.begin(), parameters.end()); + + auto func_output = func_graph->output(); + MS_EXCEPTION_IF_NULL(func_output); + if (func_output->isa()) { + // multi output. 
+ auto cnode = func_output->cast(); + MS_EXCEPTION_IF_NULL(cnode); + auto input0 = cnode->input(kAnfPrimitiveIndex); + MS_EXCEPTION_IF_NULL(input0); + if (IsPrimitive(input0, prim::kPrimMakeTuple)) { + for (size_t input_idx = 1; input_idx < cnode->inputs().size(); ++input_idx) { + auto input_node = cnode->input(input_idx); + MS_EXCEPTION_IF_NULL(input_node); + output_list->push_back(AnfAlgo::VisitKernel(input_node, 0).first); + } + } else { + // single output. + output_list->push_back(AnfAlgo::VisitKernel(func_output, 0).first); + } + } else { + // single output. + output_list->push_back(AnfAlgo::VisitKernel(func_output, 0).first); + } +} + +bool GetInputTensorValue(const AnfNodePtr &anf_node, size_t input_idx, nlohmann::json *const node_json) { + MS_EXCEPTION_IF_NULL(anf_node); + MS_EXCEPTION_IF_NULL(node_json); + auto cnode = anf_node->cast(); + MS_EXCEPTION_IF_NULL(cnode); + if (input_idx + 1 >= cnode->size()) { + MS_EXCEPTION(ArgumentError) << "input_idx [" << input_idx << "] is out of index of inputs of [" + << cnode->inputs().size() << "][" << cnode->DebugString() << "]"; + } + + auto input_node = cnode->input(input_idx + 1); + if (!IsValueNode(input_node)) { + return false; + } + + auto tensor = GetValueNode(input_node); + if (tensor == nullptr) { + return false; + } + + auto type_id = tensor->data_type(); + auto *data = tensor->data_c(); + MS_EXCEPTION_IF_NULL(data); + if (tensor->DataDim() > 1 || tensor->DataSize() != 1) { + // not const tensor. 
+ MS_LOG(WARNING) << "We take first value of tensor whose datasize != 1, [" << input_node->DebugString(2) << "]"; + } + + if (type_id == kFloat32->type_id()) { + float *val = static_cast(data); + MS_EXCEPTION_IF_NULL(val); + (*node_json)["value"] = val[0]; + MS_LOG(DEBUG) << "Value of tensor[" << cnode->DebugString() << "] is [float32][" << *val << "]."; + return true; + } else if (type_id == kFloat16->type_id()) { + float16 *val = static_cast(data); + MS_EXCEPTION_IF_NULL(val); + (*node_json)["value"] = static_cast(val[0]); + MS_LOG(INFO) << "Value of tensor[" << cnode->DebugString() << "] is [float16][" << *val << "]."; + return true; + } else if (type_id == kInt32->type_id()) { + int *val = static_cast(data); + MS_EXCEPTION_IF_NULL(val); + (*node_json)["value"] = val[0]; + MS_LOG(INFO) << "Value of tensor[" << cnode->DebugString() << "] is [int32][" << *val << "]."; + return true; + } + MS_LOG(ERROR) << "Unknown value type of tensor[" << cnode->DebugString() << "]"; + return false; +} + +void GetGraphRealOutput(const FuncGraphPtr &func_graph, std::vector> *node_list) { + MS_EXCEPTION_IF_NULL(func_graph); + MS_EXCEPTION_IF_NULL(node_list); + auto output = func_graph->output(); + MS_EXCEPTION_IF_NULL(output); + if (AnfAlgo::IsRealKernel(output)) { + // single output. + node_list->push_back(std::make_pair(output, 0)); + return; + } else if (IsPrimitiveCNode(output, prim::kPrimMakeTuple)) { + auto output_cnode = output->cast(); + MS_EXCEPTION_IF_NULL(output_cnode); + // multi output. 
+ auto &inputs = output_cnode->inputs(); + for (size_t i = 1; i < inputs.size(); ++i) { + auto in_with_idx = AnfAlgo::VisitKernel(inputs[i], 0); + node_list->push_back(in_with_idx); + } + return; + } + MS_EXCEPTION(ArgumentError) << "Unknown output type: " << output->DebugString(2) + << " of graph: " << func_graph->ToString(); +} + +bool IsWeightBoundary(const AnfNodePtr &node) { + if (node->isa()) { + return true; + } + if (node->isa() && AnfAlgo::IsParameterWeight(node->cast())) { + return true; + } + return false; +} + +void MultiThreadCompute(const MultiThreadComputeFunc &func, MultiThreadComputeParams *params, size_t thread_num, + size_t total_compute_size) { + std::vector threads; + threads.reserve(thread_num); + size_t start = 0; + size_t once_compute_size = (total_compute_size + thread_num - 1) / thread_num; + while (start < total_compute_size) { + size_t end = (start + once_compute_size) > total_compute_size ? total_compute_size : (start + once_compute_size); + threads.emplace_back(std::thread(func, params, start, end)); + start += once_compute_size; + } + for (size_t i = 0; i < threads.size(); ++i) { + threads[i].join(); + } +} } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/kernel/common_utils.h b/mindspore/ccsrc/kernel/common_utils.h index 47fe96c4c9..e25421c57d 100644 --- a/mindspore/ccsrc/kernel/common_utils.h +++ b/mindspore/ccsrc/kernel/common_utils.h @@ -20,9 +20,12 @@ #include #include #include +#include #include #include #include +#include +#include #include "kernel/kernel.h" #include "kernel/oplib/opinfo.h" #include "kernel/kernel_build_info.h" @@ -69,19 +72,64 @@ class KernelMeta { std::unordered_map kernel_meta_map_; }; +struct SparseGradient { + float *value_; + int *indices_; + size_t indices_size_; +}; + +struct MultiThreadComputeParams { + float *var_; + float *accum_; + float *linear_; + float *m_; + float *m_t_; + float *v_; + float lr_; + float l1_; + float l2_; + float lr_power_; + float beta1_; + float 
beta2_; + float epsilon_; + SparseGradient sparse_grad_; + size_t var_first_dim_size_; + size_t var_outer_dim_size_; + bool use_nesterov_; +}; +using MultiThreadComputeFunc = std::function; + bool CheckCache(const std::string &kernel_name); KernelPackPtr SearchCache(const std::string &kernel_name, const std::string &processor); KernelPackPtr InsertCache(const std::string &kernel_name, const std::string &processor); TypeId DtypeToTypeId(const std::string &dtypes); -std::string Dtype2String(const std::string &dtypes); std::string Dtype2ShortType(const std::string &dtypes); std::string TypeId2String(TypeId type_id); size_t GetDtypeNbyte(const std::string &dtypes); bool ParseMetadata(const CNodePtr &kernel_node, const std::shared_ptr &op_info_ptr, Processor processor, std::vector> *const kernel_info_list); -bool IsAtomicNode(const CNodePtr &kernel_node); void SaveJsonInfo(const std::string &json_name, const std::string &info); std::string GetProcessor(const AnfNodePtr &anf_node); +bool IsSameShape(const std::vector &shape_a, const std::vector &shape_b); +int Sign(float x); +void DeduplicateIndexedSlices(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad, size_t first_dim, + size_t outer_dim); +void ReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad, size_t first_dim, + size_t outer_dim); +std::pair GetKernelInput(const AnfNodePtr &anf_node, size_t index); +std::vector>> GetInputIndex(const std::vector &node_list, + const std::vector &input_list); +std::vector> GetOutputIndex(const std::vector &node_list, + const std::vector &input_list, + const std::vector &output_list); +void GetValidKernelNodes(const FuncGraphPtr &func_graph, std::vector *node_list, + std::vector *input_list, std::vector *output_list); +void GetValidKernelNodes(const FuncGraphPtr &func_graph, std::vector *node_list); +bool GetInputTensorValue(const AnfNodePtr &anf_node, size_t input_idx, nlohmann::json *const node_json); +void 
GetGraphRealOutput(const FuncGraphPtr &func_graph, std::vector> *node_list); +bool IsWeightBoundary(const AnfNodePtr &node); +void MultiThreadCompute(const MultiThreadComputeFunc &func, MultiThreadComputeParams *params, size_t thread_num, + size_t total_compute_size); } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/kernel/cpu/addn_cpu_kernel.cc b/mindspore/ccsrc/kernel/cpu/addn_cpu_kernel.cc index d0db0c7685..5b3194608e 100644 --- a/mindspore/ccsrc/kernel/cpu/addn_cpu_kernel.cc +++ b/mindspore/ccsrc/kernel/cpu/addn_cpu_kernel.cc @@ -32,17 +32,17 @@ bool AddNCPUKernel::Launch(const std::vector &inputs, const std::vector &outputs) { auto output_addr = reinterpret_cast(outputs[0]->addr); + size_t offset = 0; for (size_t i = 0; i < output_shape_[0]; ++i) { for (size_t j = 0; j < output_shape_[1]; ++j) { for (size_t k = 0; k < output_shape_[2]; ++k) { for (size_t m = 0; m < output_shape_[3]; ++m) { - auto offset = CPUKernelUtils::CalcOffset(output_shape_, i, j, k, m); float sum = 0; for (size_t index = 0; index < input_num_; ++index) { auto input_addr = reinterpret_cast(inputs[index]->addr); sum += input_addr[offset]; } - output_addr[offset] = sum; + output_addr[offset++] = sum; } } } diff --git a/mindspore/ccsrc/kernel/cpu/allgather_cpu_kernel.cc b/mindspore/ccsrc/kernel/cpu/allgather_cpu_kernel.cc new file mode 100644 index 0000000000..abb0c65d27 --- /dev/null +++ b/mindspore/ccsrc/kernel/cpu/allgather_cpu_kernel.cc @@ -0,0 +1,53 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "kernel/cpu/allgather_cpu_kernel.h" +#include "device/cpu/cpu_device_address.h" +#include "device/cpu/mpi/mpi_adapter.h" +#include "ir/primitive.h" +#include "utils/log_adapter.h" + +namespace mindspore { +namespace kernel { +namespace { +constexpr auto kRanksGroup = "group"; +constexpr auto kAllGatherInputNum = 1; +} // namespace + +void AllGatherCPUKernel::InitKernel(const CNodePtr &kernel_node) { + size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); + if (input_num != kAllGatherInputNum) { + MS_LOG(EXCEPTION) << "allgather input num:" << input_num; + } + + auto ranks_group = AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr(kRanksGroup); + if (ranks_group != nullptr) { + ranks_group_ = GetValue>(ranks_group); + } else { + MS_LOG(EXCEPTION) << "Miss attribute " << kRanksGroup; + } +} + +bool AllGatherCPUKernel::Launch(const std::vector &inputs, + const std::vector & /*workspace*/, + const std::vector &outputs) { + auto input_addr = reinterpret_cast(inputs[0]->addr); + auto output_addr = reinterpret_cast(outputs[0]->addr); + auto input_data_num = inputs[0]->size / sizeof(float); + + return device::cpu::MPIAdapter::Instance().AllGather(input_addr, output_addr, ranks_group_, input_data_num); +} +} // namespace kernel +} // namespace mindspore diff --git a/mindspore/ccsrc/kernel/cpu/allgather_cpu_kernel.h b/mindspore/ccsrc/kernel/cpu/allgather_cpu_kernel.h new file mode 100644 index 0000000000..94180fa89b --- /dev/null +++ b/mindspore/ccsrc/kernel/cpu/allgather_cpu_kernel.h @@ -0,0 +1,44 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_CCSRC_KERNEL_CPU_REDUCE_SCATTER_CPU_KERNEL_H_ +#define MINDSPORE_CCSRC_KERNEL_CPU_REDUCE_SCATTER_CPU_KERNEL_H_ +#include +#include +#include "kernel/cpu/cpu_kernel.h" +#include "kernel/cpu/cpu_kernel_factory.h" + +namespace mindspore { +namespace kernel { +class AllGatherCPUKernel : public CPUKernel { + public: + AllGatherCPUKernel() = default; + ~AllGatherCPUKernel() override = default; + + void InitKernel(const CNodePtr &kernel_node) override; + + bool Launch(const std::vector &inputs, const std::vector &workspace, + const std::vector &outputs) override; + + private: + std::vector ranks_group_; +}; + +MS_REG_CPU_KERNEL(HostAllGather, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), + AllGatherCPUKernel); +} // namespace kernel +} // namespace mindspore + +#endif // MINDSPORE_CCSRC_KERNEL_CPU_REDUCE_SCATTER_CPU_KERNEL_H_ diff --git a/mindspore/ccsrc/kernel/cpu/apply_momentum_cpu_kernel.h b/mindspore/ccsrc/kernel/cpu/apply_momentum_cpu_kernel.h index 0ce671f4f5..c0ca581974 100644 --- a/mindspore/ccsrc/kernel/cpu/apply_momentum_cpu_kernel.h +++ b/mindspore/ccsrc/kernel/cpu/apply_momentum_cpu_kernel.h @@ -42,6 +42,16 @@ MS_REG_CPU_KERNEL(ApplyMomentum, .AddInputAttr(kNumberTypeFloat32) .AddOutputAttr(kNumberTypeFloat32), ApplyMomentumCPUKernel); +MS_REG_CPU_KERNEL(ApplyMomentum, + KernelAttr() + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + 
.AddOutputAttr(kNumberTypeFloat32) + .AddOutputAttr(kNumberTypeFloat32), + ApplyMomentumCPUKernel); } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/kernel/cpu/argmax_cpu_kernel.h b/mindspore/ccsrc/kernel/cpu/argmax_cpu_kernel.h index 16344d6817..aae7435c5c 100644 --- a/mindspore/ccsrc/kernel/cpu/argmax_cpu_kernel.h +++ b/mindspore/ccsrc/kernel/cpu/argmax_cpu_kernel.h @@ -37,7 +37,7 @@ class ArgmaxCPUKernel : public CPUKernel { size_t batch_size_{0}; }; -MS_REG_CPU_KERNEL(Argmax, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), +MS_REG_CPU_KERNEL(Argmax, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeInt32), ArgmaxCPUKernel); } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/kernel/cpu/cpu_kernel.cc b/mindspore/ccsrc/kernel/cpu/cpu_kernel.cc index c9d3770c6e..2be05038d6 100644 --- a/mindspore/ccsrc/kernel/cpu/cpu_kernel.cc +++ b/mindspore/ccsrc/kernel/cpu/cpu_kernel.cc @@ -37,8 +37,8 @@ void CPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) { } void CPUKernel::Init(const CNodePtr &kernel_node) { - InitInputOutputSize(kernel_node); InitKernel(kernel_node); + InitInputOutputSize(kernel_node); } void CPUKernelUtils::ExpandDimsTo4(std::vector *shape) { @@ -66,5 +66,15 @@ size_t CPUKernelUtils::GetElementNumOnAxis(const std::vector &shape, int } return result; } + +void CPUKernelUtils::GetElementNumEveryDim(const std::vector &shape, std::vector *element_num) { + size_t accumulation = 1; + element_num->emplace_back(1); + for (size_t i = shape.size() - 1; i > 0; --i) { + accumulation *= shape[i]; + element_num->emplace_back(accumulation); + } + std::reverse(element_num->begin(), element_num->end()); +} } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/kernel/cpu/cpu_kernel.h b/mindspore/ccsrc/kernel/cpu/cpu_kernel.h index 2538459336..0836529840 100644 --- a/mindspore/ccsrc/kernel/cpu/cpu_kernel.h +++ 
b/mindspore/ccsrc/kernel/cpu/cpu_kernel.h @@ -49,6 +49,7 @@ const char AXIS[] = "axis"; const char BEGIN[] = "begin"; const char END[] = "end"; const char SIZE[] = "size"; +const char USE_NESTEROV[] = "use_nesterov"; class CPUKernel : public kernel::KernelMod { public: @@ -78,6 +79,7 @@ class CPUKernelUtils { static void ExpandDimsTo4(std::vector *shape); static size_t CalcOffset(const std::vector &shape, size_t dim0, size_t dim1, size_t dim2, size_t dim3); static size_t GetElementNumOnAxis(const std::vector &shape, int axis); + static void GetElementNumEveryDim(const std::vector &shape, std::vector *element_num); }; } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/kernel/cpu/embedding_look_up_comm_grad_cpu_kernel.cc b/mindspore/ccsrc/kernel/cpu/embedding_look_up_comm_grad_cpu_kernel.cc new file mode 100644 index 0000000000..837cb647e3 --- /dev/null +++ b/mindspore/ccsrc/kernel/cpu/embedding_look_up_comm_grad_cpu_kernel.cc @@ -0,0 +1,77 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include "kernel/cpu/embedding_look_up_comm_grad_cpu_kernel.h" +#include "device/cpu/cpu_device_address.h" +#include "device/cpu/mpi/mpi_adapter.h" +#include "ir/primitive.h" + +namespace mindspore { +namespace kernel { +void EmbeddingLookUpCommGradCPUKernel::InitKernel(const CNodePtr &kernel_node) { + CheckParam(kernel_node); + split_num_ = AnfAlgo::GetNodeAttr(kernel_node, "split_num"); + MS_LOG(INFO) << "split_num: " << split_num_; + auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); + if (input_shape[0] % split_num_ != 0) { + MS_LOG(EXCEPTION) << "Input shape[0] is " << input_shape[0] << ", but it must be multiple of split_num."; + } +} + +bool EmbeddingLookUpCommGradCPUKernel::Launch(const std::vector &inputs, + const std::vector & /*workspace*/, + const std::vector &outputs) { +#if defined(_WIN32) || defined(_WIN64) + auto start_time = std::chrono::steady_clock::now(); +#else + struct timeval start_time, end_time; + (void)gettimeofday(&start_time, nullptr); +#endif + auto input_addr = reinterpret_cast(inputs[0]->addr); + auto output_addr = reinterpret_cast(outputs[0]->addr); + size_t input_size = inputs[0]->size; + size_t output_size = outputs[0]->size; + MS_LOG(DEBUG) << "input addr: " << input_addr << "input size: " << input_size; + MS_LOG(DEBUG) << "output addr: " << output_addr << "output size: " << output_size; + memset_s(output_addr, output_size, 0, output_size); + const std::vector &rank_group = {0, 1, 2, 3, 4, 5, 6, 7}; + size_t input_split_lens = input_size / split_num_ / sizeof(float_t); + size_t output_split_lens = output_size / split_num_ / sizeof(float_t); + for (int i = 0; i < split_num_; i++) { + device::cpu::MPIAdapter::Instance().AllGather(input_addr + i * input_split_lens, + output_addr + i * output_split_lens, rank_group, input_split_lens); + } +#if defined(_WIN32) || defined(_WIN64) + auto end_time = std::chrono::steady_clock::now(); + std::chrono::duration> cost = end_time - start_time; + MS_LOG(INFO) 
<< "EmbeddingLookUpCommGradCPUKernel, used time: " << cost.count() << " us"; +#else + (void)gettimeofday(&end_time, nullptr); + uint64_t time = 1000000 * static_cast(end_time.tv_sec - start_time.tv_sec); + time += static_cast(end_time.tv_usec - start_time.tv_usec); + MS_LOG(INFO) << "EmbeddingLookUpCommGradCPUKernel, used time: " << time << " us"; +#endif + return true; +} + +void EmbeddingLookUpCommGradCPUKernel::CheckParam(const CNodePtr &kernel_node) { + size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); + if (input_num != 1) { + MS_LOG(EXCEPTION) << "Argument number is " << input_num << ", but EmbeddingLookUpCommGradCPUKernel needs 1."; + } +} +} // namespace kernel +} // namespace mindspore diff --git a/mindspore/ccsrc/kernel/cpu/embedding_look_up_comm_grad_cpu_kernel.h b/mindspore/ccsrc/kernel/cpu/embedding_look_up_comm_grad_cpu_kernel.h new file mode 100644 index 0000000000..7222bd9be1 --- /dev/null +++ b/mindspore/ccsrc/kernel/cpu/embedding_look_up_comm_grad_cpu_kernel.h @@ -0,0 +1,46 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_CCSRC_KERNEL_CPU_EMBEDDING_LOOK_UP_COMM_GRAD_CPU_KERNEL_H_ +#define MINDSPORE_CCSRC_KERNEL_CPU_EMBEDDING_LOOK_UP_COMM_GRAD_CPU_KERNEL_H_ +#include +#include +#include "kernel/cpu/cpu_kernel.h" +#include "kernel/cpu/cpu_kernel_factory.h" + +namespace mindspore { +namespace kernel { +class EmbeddingLookUpCommGradCPUKernel : public CPUKernel { + public: + EmbeddingLookUpCommGradCPUKernel() : split_num_(1) {} + ~EmbeddingLookUpCommGradCPUKernel() override{}; + + void InitKernel(const CNodePtr &kernel_node) override; + + bool Launch(const std::vector &inputs, const std::vector &workspace, + const std::vector &outputs) override; + + private: + void CheckParam(const CNodePtr &kernel_node); + int split_num_; +}; + +MS_REG_CPU_KERNEL(EmbeddingLookupCommGrad, + KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), + EmbeddingLookUpCommGradCPUKernel); +} // namespace kernel +} // namespace mindspore + +#endif // MINDSPORE_CCSRC_KERNEL_CPU_EMBEDDING_LOOK_UP_COMM_GRAD_CPU_KERNEL_H_ diff --git a/mindspore/ccsrc/kernel/cpu/embedding_look_up_cpu_kernel.cc b/mindspore/ccsrc/kernel/cpu/embedding_look_up_cpu_kernel.cc new file mode 100644 index 0000000000..e91b5d8109 --- /dev/null +++ b/mindspore/ccsrc/kernel/cpu/embedding_look_up_cpu_kernel.cc @@ -0,0 +1,208 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include "kernel/cpu/embedding_look_up_cpu_kernel.h" +#include "device/cpu/cpu_device_address.h" +#include "device/cpu/mpi/mpi_adapter.h" +#include "ir/primitive.h" + +namespace mindspore { +namespace kernel { +void EmbeddingLookUpCPUKernel::InitKernel(const CNodePtr &kernel_node) { + CheckParam(kernel_node); + input_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); + input_lens_ = 1; + for (auto shape : input_shape_) { + input_lens_ = input_lens_ * shape; + } + indices_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1); + indices_lens_ = 1; + for (auto shape : indices_shape_) { + indices_lens_ = indices_lens_ * shape; + } + output_shape_ = AnfAlgo::GetOutputInferShape(kernel_node, 0); + axis_ = 4 - input_shape_.size(); + reduce_scatter_flag_ = AnfAlgo::GetNodeAttr(kernel_node, "reduce_scatter_flag"); +#ifdef ENABLE_MPI + if (reduce_scatter_flag_) { + size_t gatherv2_out_lens = 1; + for (int i = 0; i < SizeToInt(input_shape_.size()); i++) { + if (i == 0) { + for (int j = 0; j < SizeToInt(indices_shape_.size()); j++) { + gatherv2_out_lens = gatherv2_out_lens * indices_shape_[j]; + } + } else { + gatherv2_out_lens = gatherv2_out_lens * input_shape_[i]; + } + } + gatherv2_out_lens_ = gatherv2_out_lens * sizeof(float); + gather_v2_out_ = malloc(gatherv2_out_lens_); + if (gather_v2_out_ == nullptr) { + MS_LOG(EXCEPTION) << "EmbeddingLookUpCPUKernel malloc failed, malloc lens: " << gatherv2_out_lens_; + } + auto ret = memset_s(gather_v2_out_, gatherv2_out_lens_, 0, gatherv2_out_lens_); + if (ret != 0) { + MS_LOG(EXCEPTION) << "EmbeddingLookUpCPUKernel memset gatherv2 out buff failed"; + } + split_num_ = AnfAlgo::GetNodeAttr(kernel_node, "split_num"); + } +#else + if (reduce_scatter_flag_) { + MS_LOG(EXCEPTION) << "Not Enable MPI, please build version with -M on when set reduce_scatter_flag true"; + } +#endif + offset_ = AnfAlgo::GetNodeAttr(kernel_node, "offset"); + CPUKernelUtils::ExpandDimsTo4(&input_shape_); + 
CPUKernelUtils::ExpandDimsTo4(&output_shape_); +} + +bool EmbeddingLookUpCPUKernel::Launch(const std::vector &inputs, + const std::vector & /*workspace*/, + const std::vector &outputs) { + auto output_addr = reinterpret_cast(outputs[0]->addr); + float *gather_out_addr = reduce_scatter_flag_ ? reinterpret_cast(gather_v2_out_) : output_addr; + size_t dim0 = input_shape_[0]; + size_t dim1 = input_shape_[1]; + size_t dim2 = input_shape_[2]; + if (axis_ == 3) { + for (size_t i = 0; i < dim0; ++i) { + for (size_t j = 0; j < dim1; ++j) { + for (size_t k = 0; k < dim2; ++k) { + LookUpTable(inputs, i, j, k, &gather_out_addr); + } + } + } + } else if (axis_ == 2) { + for (size_t i = 0; i < dim0; ++i) { + for (size_t j = 0; j < dim1; ++j) { + LookUpTable(inputs, i, j, 0, &gather_out_addr); + } + } + } else if (axis_ == 1) { + for (size_t i = 0; i < dim0; ++i) { + LookUpTable(inputs, i, 0, 0, &gather_out_addr); + } + } else if (axis_ == 0) { + LookUpTable(inputs, 0, 0, 0, &gather_out_addr); + } +#ifdef ENABLE_MPI + if (reduce_scatter_flag_) { + size_t one_split_lens = gatherv2_out_lens_ / split_num_ / sizeof(float); + size_t reduce_scatter_out_lens = one_split_lens / 8; + const std::vector &group = {0, 1, 2, 3, 4, 5, 6, 7}; + for (int i = 0; i < split_num_; i++) { + device::cpu::MPIAdapter::Instance().ReduceScatter(reinterpret_cast(gather_v2_out_) + i * one_split_lens, + output_addr + i * reduce_scatter_out_lens, group, + one_split_lens / 8, "sum"); + } + } +#endif + return true; +} + +void LookUpTable_task(const float *input_addr, float *output_addr, int *indices_addr, size_t indices_lens, size_t num, + size_t dim0, size_t dim1, size_t dim2, int offset, size_t axis, std::vector input_shape, + size_t input_lens) { + size_t lens = num * sizeof(float); + for (size_t i = 0; i < indices_lens; ++i) { + int indices = indices_addr[i] - offset; + if (indices >= 0) { + size_t index = IntToSize(indices); + if (index < input_shape[axis]) { + size_t pos = 0; + if (axis == 3) { + pos = 
CPUKernelUtils::CalcOffset(input_shape, dim0, dim1, dim2, index); + } else if (axis == 2) { + pos = CPUKernelUtils::CalcOffset(input_shape, dim0, dim1, index, 0); + } else if (axis == 1) { + pos = CPUKernelUtils::CalcOffset(input_shape, dim0, index, 0, 0); + } else if (axis == 0) { + pos = CPUKernelUtils::CalcOffset(input_shape, index, 0, 0, 0); + } + + if (pos + num <= input_lens) { + auto ret = memcpy_s(output_addr, lens, input_addr + pos, lens); + if (ret != EOK) { + MS_LOG(EXCEPTION) << "LookUpTable task memcpy failed."; + } + } else { + auto ret = memset_s(output_addr, lens, 0, lens); + if (ret != EOK) { + MS_LOG(EXCEPTION) << "LookUpTable task memset failed."; + } + } + } else { + auto ret = memset_s(output_addr, lens, 0, lens); + if (ret != EOK) { + MS_LOG(EXCEPTION) << "LookUpTable task memset failed."; + } + } + } else { + auto ret = memset_s(output_addr, lens, 0, lens); + if (ret != EOK) { + MS_LOG(EXCEPTION) << "LookUpTable task memset failed."; + } + } + output_addr += num; + } +} + +void EmbeddingLookUpCPUKernel::LookUpTable(const std::vector &inputs, size_t dim0, size_t dim1, + size_t dim2, float **output_addr) { + auto input_addr = reinterpret_cast(inputs[0]->addr); + auto indices_addr = reinterpret_cast(inputs[1]->addr); + size_t num = CPUKernelUtils::GetElementNumOnAxis(input_shape_, axis_); + float *task_out_addr = *output_addr; + const size_t thread_num = 8; + std::thread threads[8]; + size_t task_proc_lens = (indices_lens_ + thread_num - 1) / thread_num; + size_t i; + size_t task_offset = 0; + MS_LOG(DEBUG) << "indices_lens_: " << indices_lens_ << " one task proc lens:" << task_proc_lens; + for (i = 0; i < thread_num; i++) { + if (task_offset >= indices_lens_) { + break; + } + MS_LOG(DEBUG) << "task_offset: " << task_offset << " task_proc_lenss:" << task_proc_lens; + threads[i] = + std::thread(LookUpTable_task, input_addr, task_out_addr + task_offset * num, indices_addr + task_offset, + task_proc_lens, num, dim0, dim1, dim2, offset_, axis_, 
input_shape_, input_lens_); + task_offset += task_proc_lens; + if (task_offset + task_proc_lens > indices_lens_) { + task_proc_lens = indices_lens_ - task_offset; + } + } + for (size_t j = 0; j < i; j++) { + threads[j].join(); + } + *output_addr += num * indices_lens_; +} + +void EmbeddingLookUpCPUKernel::CheckParam(const CNodePtr &kernel_node) { + auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); + if (input_shape.size() > 4) { + MS_LOG(EXCEPTION) << "Input dims is " << input_shape.size() + << ", but EmbeddingLookUpCPUKernel olny support 4d or lower."; + } + + size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); + if (input_num != 2) { + MS_LOG(EXCEPTION) << "Argument number is " << input_num << ", but EmbeddingLookUpCPUKernel needs 2."; + } +} +} // namespace kernel +} // namespace mindspore diff --git a/mindspore/ccsrc/kernel/cpu/embedding_look_up_cpu_kernel.h b/mindspore/ccsrc/kernel/cpu/embedding_look_up_cpu_kernel.h new file mode 100644 index 0000000000..d839571caa --- /dev/null +++ b/mindspore/ccsrc/kernel/cpu/embedding_look_up_cpu_kernel.h @@ -0,0 +1,74 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_CCSRC_KERNEL_CPU_EMBEDDING_LOOK_UP_CPU_KERNEL_H_ +#define MINDSPORE_CCSRC_KERNEL_CPU_EMBEDDING_LOOK_UP_CPU_KERNEL_H_ +#include +#include +#include "kernel/cpu/cpu_kernel.h" +#include "kernel/cpu/cpu_kernel_factory.h" + +namespace mindspore { +namespace kernel { +class EmbeddingLookUpCPUKernel : public CPUKernel { + public: + EmbeddingLookUpCPUKernel() { + axis_ = 0; + offset_ = 0; + split_num_ = 0; + input_lens_ = 0; + indices_lens_ = 0; + gatherv2_out_lens_ = 0; + reduce_scatter_flag_ = false; + gather_v2_out_ = nullptr; + } + ~EmbeddingLookUpCPUKernel() override { + if (gather_v2_out_ != nullptr) { + free(gather_v2_out_); + gather_v2_out_ = nullptr; + } + } + + void InitKernel(const CNodePtr &kernel_node) override; + + bool Launch(const std::vector &inputs, const std::vector &workspace, + const std::vector &outputs) override; + + private: + void LookUpTable(const std::vector &inputs, size_t dim0, size_t dim1, size_t dim2, + float **output_addr); + void CheckParam(const CNodePtr &kernel_node); + std::vector input_shape_; + std::vector indices_shape_; + std::vector output_shape_; + int axis_; + int offset_; + int split_num_; + size_t input_lens_; + size_t indices_lens_; + size_t gatherv2_out_lens_; + bool reduce_scatter_flag_; + + void *gather_v2_out_; +}; + +MS_REG_CPU_KERNEL( + EmbeddingLookup, + KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat32), + EmbeddingLookUpCPUKernel); +} // namespace kernel +} // namespace mindspore + +#endif // MINDSPORE_CCSRC_KERNEL_CPU_EMBEDDING_LOOK_UP_CPU_KERNEL_H_ diff --git a/mindspore/ccsrc/kernel/cpu/gather_cpu_kernel.cc b/mindspore/ccsrc/kernel/cpu/gather_cpu_kernel.cc index cb311043ac..9117a533c8 100644 --- a/mindspore/ccsrc/kernel/cpu/gather_cpu_kernel.cc +++ b/mindspore/ccsrc/kernel/cpu/gather_cpu_kernel.cc @@ -74,8 +74,8 @@ void GatherV2CPUKernel::CopyDataToOutput(const std::vector & size_t dim2, float **output_addr, size_t *buff_size) { auto 
input_addr = reinterpret_cast(inputs[0]->addr); auto indices_addr = reinterpret_cast(inputs[1]->addr); - - for (size_t i = 0; i < output_shape_[axis_]; ++i) { + size_t elem_num = inputs[1]->size / 4; + for (size_t i = 0; i < elem_num; ++i) { size_t index = IntToSize(indices_addr[i]); size_t pos = 0; if (axis_ == 3) { diff --git a/mindspore/ccsrc/kernel/cpu/mkldnn/lstm_cpu_kernel.cc b/mindspore/ccsrc/kernel/cpu/mkldnn/lstm_cpu_kernel.cc index dab165e017..0a343785f7 100644 --- a/mindspore/ccsrc/kernel/cpu/mkldnn/lstm_cpu_kernel.cc +++ b/mindspore/ccsrc/kernel/cpu/mkldnn/lstm_cpu_kernel.cc @@ -22,99 +22,120 @@ namespace mindspore { namespace kernel { void LstmCPUKernel::InitKernel(const CNodePtr &kernel_node) { +#ifdef PLATFORM_86 + _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); + _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); +#endif MS_EXCEPTION_IF_NULL(kernel_node); + using tag = dnnl::memory::format_tag; + using dim = dnnl::memory::dims; std::vector src_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); + std::vector src_h_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1); + std::vector src_c_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 2); bidirectional_ = AnfAlgo::GetNodeAttr(kernel_node, "bidirectional"); input_size_ = AnfAlgo::GetNodeAttr(kernel_node, "input_size"); hidden_size_ = AnfAlgo::GetNodeAttr(kernel_node, "hidden_size"); num_layers_ = AnfAlgo::GetNodeAttr(kernel_node, "num_layers"); + has_bias_ = AnfAlgo::GetNodeAttr(kernel_node, "has_bias"); batch_size_ = SizeToInt(src_shape[1]); seq_len_ = SizeToInt(src_shape[0]); num_directions_ = 1; if (bidirectional_) { num_directions_ = 2; } - int gate_size = 4 * hidden_size_; + if (num_directions_ * num_layers_ != SizeToInt(src_h_shape[0])) { + MS_LOG(EXCEPTION) << "error iteration shape!"; + } + if (num_layers_ <= 0) { + MS_LOG(EXCEPTION) << "layers must be greater than zero!"; + } + if (src_shape.size() != 3 || src_h_shape.size() != 3 || src_c_shape.size() != 3) { + MS_LOG(EXCEPTION) << "conv2d 
only support 3-D input!"; + } + const int gate_size = 4 * hidden_size_; for (int i = 0; i < num_layers_; ++i) { weight_size_ += gate_size * (i == 0 ? input_size_ : hidden_size_ * num_directions_); weight_h_size_ += gate_size * hidden_size_; } weight_size_ = weight_size_ * num_directions_; weight_h_size_ = weight_h_size_ * num_directions_; -} - -bool LstmCPUKernel::Launch(const std::vector &inputs, - const std::vector & /*workspace*/, - const std::vector &outputs) { - using dt = dnnl::memory::data_type; - using tag = dnnl::memory::format_tag; - using dim = dnnl::memory::dims; auto eng = MKLKernelEngine::Get().engine(); dnnl::stream s(eng); - auto formatted_md = [](dim dimensions, tag layout) { return dnnl::memory::desc{{dimensions}, dt::f32, layout}; }; dnnl::rnn_direction direction = dnnl::rnn_direction::unidirectional; if (bidirectional_) { direction = dnnl::rnn_direction::bidirectional_concat; } - dim src_dims = {seq_len_, batch_size_, input_size_}; dim src_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_}; dim src_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_}; - dim weights_dims = {num_layers_, num_directions_, input_size_, 4, hidden_size_}; - dim weights_h_dims = {num_layers_, num_directions_, hidden_size_, 4, hidden_size_}; - dim bias_dims = {num_layers_, num_directions_, 4, hidden_size_}; + weights_dims_ = {num_layers_, num_directions_, input_size_, 4, hidden_size_}; + weights_h_dims_ = {num_layers_, num_directions_, hidden_size_, 4, hidden_size_}; + bias_dims_ = {num_layers_, num_directions_, 4, hidden_size_}; dim dst_dims = {seq_len_, batch_size_, hidden_size_ * num_directions_}; dim dst_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_}; dim dst_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_}; dnnl::memory::desc src_desc = formatted_md(src_dims, tag::tnc); dnnl::memory::desc src_h_desc = formatted_md(src_h_dims, tag::ldnc); dnnl::memory::desc src_c_desc = formatted_md(src_c_dims, 
tag::ldnc); - dnnl::memory::desc weights_desc = formatted_md(weights_dims, tag::ldigo); - dnnl::memory::desc weights_h_desc = formatted_md(weights_h_dims, tag::ldigo); - dnnl::memory::desc bias_desc = formatted_md(bias_dims, tag::ldgo); + dnnl::memory::desc bias_desc = formatted_md(bias_dims_, tag::ldgo); dnnl::memory::desc dst_desc = formatted_md(dst_dims, tag::tnc); dnnl::memory::desc dst_h_desc = formatted_md(dst_h_dims, tag::ldnc); dnnl::memory::desc dst_c_desc = formatted_md(dst_c_dims, tag::ldnc); - dnnl::lstm_forward::desc desc = - dnnl::lstm_forward::desc(dnnl::prop_kind::forward_training, direction, src_desc, src_h_desc, src_c_desc, - weights_desc, weights_h_desc, bias_desc, dst_desc, dst_h_desc, dst_c_desc); - auto prim_desc = dnnl::lstm_forward::primitive_desc(desc, MKLKernelEngine::Get().engine()); - auto workspace_memory = dnnl::memory(prim_desc.workspace_desc(), eng); - auto src_memory = dnnl::memory(formatted_md(src_dims, tag::tnc), eng); - write_to_dnnl_memory(inputs[0]->addr, src_memory); - - auto src_h_memory = dnnl::memory(prim_desc.src_iter_desc(), eng); - auto src_c_memory = dnnl::memory(prim_desc.src_iter_c_desc(), eng); - write_to_dnnl_memory(inputs[1]->addr, src_h_memory); - write_to_dnnl_memory(inputs[2]->addr, src_c_memory); - - auto weights_memory = dnnl::memory(formatted_md(weights_dims, tag::ldigo), eng); - auto weights_h_memory = dnnl::memory(formatted_md(weights_h_dims, tag::ldigo), eng); - auto bias_memory = dnnl::memory(formatted_md(bias_dims, tag::ldgo), eng); - write_to_dnnl_memory(inputs[3]->addr, weights_memory); - write_to_dnnl_memory(reinterpret_cast(inputs[3]->addr) + weight_size_, weights_h_memory); - write_to_dnnl_memory(reinterpret_cast(inputs[3]->addr) + weight_size_ + weight_h_size_, bias_memory); + auto desc = std::make_shared(dnnl::prop_kind::forward_training, direction, src_desc, + src_h_desc, src_c_desc, formatted_md(weights_dims_, tag::any), + formatted_md(weights_h_dims_, tag::any), bias_desc, dst_desc, + 
dst_h_desc, dst_c_desc); + prim_desc_ = dnnl::lstm_forward::primitive_desc(*desc, eng); + primitive_ = std::make_shared(prim_desc_); + AddArgument(DNNL_ARG_SRC_LAYER, src_desc); + AddArgument(DNNL_ARG_SRC_ITER, src_h_desc); + AddArgument(DNNL_ARG_SRC_ITER_C, src_c_desc); + AddArgument(DNNL_ARG_WEIGHTS_LAYER, prim_desc_.weights_layer_desc()); + AddArgument(DNNL_ARG_WEIGHTS_ITER, prim_desc_.weights_iter_desc()); + AddArgument(DNNL_ARG_BIAS, bias_desc); + AddArgument(DNNL_ARG_DST_LAYER, dst_desc); + AddArgument(DNNL_ARG_DST_ITER, dst_h_desc); + AddArgument(DNNL_ARG_DST_ITER_C, dst_c_desc); + AddArgument(DNNL_ARG_WORKSPACE, prim_desc_.workspace_desc()); +} - auto dst_memory = dnnl::memory(formatted_md(dst_dims, tag::tnc), eng); - auto dst_h_memory = dnnl::memory(prim_desc.dst_iter_desc(), eng); - auto dst_c_memory = dnnl::memory(prim_desc.dst_iter_c_desc(), eng); - dnnl::lstm_forward fw_layer(prim_desc); - workspace_memory.set_data_handle(outputs[3]->addr); - dst_memory.set_data_handle(outputs[0]->addr); - dst_h_memory.set_data_handle(outputs[1]->addr); - dst_c_memory.set_data_handle(outputs[2]->addr); - fw_layer.execute(s, {{DNNL_ARG_SRC_LAYER, src_memory}, - {DNNL_ARG_SRC_ITER, src_h_memory}, - {DNNL_ARG_SRC_ITER_C, src_c_memory}, - {DNNL_ARG_WEIGHTS_LAYER, weights_memory}, - {DNNL_ARG_WEIGHTS_ITER, weights_h_memory}, - {DNNL_ARG_BIAS, bias_memory}, - {DNNL_ARG_DST_LAYER, dst_memory}, - {DNNL_ARG_DST_ITER, dst_h_memory}, - {DNNL_ARG_DST_ITER_C, dst_c_memory}, - {DNNL_ARG_WORKSPACE, workspace_memory}}); +bool LstmCPUKernel::Launch(const std::vector &inputs, + const std::vector & /*workspace*/, + const std::vector &outputs) { + using dt = dnnl::memory::data_type; + using tag = dnnl::memory::format_tag; + auto eng = MKLKernelEngine::Get().engine(); + auto user_weights_memory = dnnl::memory(dnnl::memory::desc{{weights_dims_}, dt::f32, tag::ldgoi}, eng); + auto user_weights_h_memory = dnnl::memory(dnnl::memory::desc{{weights_h_dims_}, dt::f32, tag::ldgoi}, eng); + auto 
weights_memory = dnnl::memory(prim_desc_.weights_layer_desc(), eng); + auto weights_h_memory = dnnl::memory(prim_desc_.weights_iter_desc(), eng); + user_weights_memory.set_data_handle(inputs[3]->addr); + user_weights_h_memory.set_data_handle(reinterpret_cast(inputs[3]->addr) + weight_size_); + Reorder(&user_weights_memory, &weights_memory); + Reorder(&user_weights_h_memory, &weights_h_memory); + auto bias_memory = dnnl::memory(prim_desc_.bias_desc(), eng); + if (has_bias_) { + bias_memory.set_data_handle(reinterpret_cast(inputs[3]->addr) + weight_size_ + weight_h_size_); + } else { + auto ret = + memset_s(bias_memory.get_data_handle(), prim_desc_.bias_desc().get_size(), 0, prim_desc_.bias_desc().get_size()); + if (ret != 0) { + MS_LOG(EXCEPTION) << "bias memset error"; + } + } + // set handle + SetArgumentHandle(DNNL_ARG_SRC_LAYER, inputs[0]->addr); + SetArgumentHandle(DNNL_ARG_SRC_ITER, inputs[1]->addr); + SetArgumentHandle(DNNL_ARG_SRC_ITER_C, inputs[2]->addr); + SetArgumentHandle(DNNL_ARG_WEIGHTS_LAYER, weights_memory.get_data_handle()); + SetArgumentHandle(DNNL_ARG_WEIGHTS_ITER, weights_h_memory.get_data_handle()); + SetArgumentHandle(DNNL_ARG_BIAS, bias_memory.get_data_handle()); + SetArgumentHandle(DNNL_ARG_DST_LAYER, outputs[0]->addr); + SetArgumentHandle(DNNL_ARG_DST_ITER, outputs[1]->addr); + SetArgumentHandle(DNNL_ARG_DST_ITER_C, outputs[2]->addr); + SetArgumentHandle(DNNL_ARG_WORKSPACE, outputs[3]->addr); + ExecutePrimitive(); return true; } - } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/kernel/cpu/mkldnn/lstm_cpu_kernel.h b/mindspore/ccsrc/kernel/cpu/mkldnn/lstm_cpu_kernel.h index 6cb9a1ff74..d42ff803f0 100644 --- a/mindspore/ccsrc/kernel/cpu/mkldnn/lstm_cpu_kernel.h +++ b/mindspore/ccsrc/kernel/cpu/mkldnn/lstm_cpu_kernel.h @@ -14,8 +14,14 @@ * limitations under the License. 
*/ -#ifndef MINDSPORE_CCSRC_KERNEL_CPU_LSTM_CPU_KERNEL_H -#define MINDSPORE_CCSRC_KERNEL_CPU_LSTM_CPU_KERNEL_H +#ifndef MINDSPORE_CCSRC_KERNEL_CPU_LSTM_CPU_KERNEL_H_ +#define MINDSPORE_CCSRC_KERNEL_CPU_LSTM_CPU_KERNEL_H_ +#if defined(__x86_64__) || defined(__amd64__) || defined(_M_IX86) || defined(_M_X64) +#define PLATFORM_86 +#endif +#ifdef PLATFORM_86 +#include +#endif #include #include #include "kernel/cpu/mkldnn/mkl_cpu_kernel.h" @@ -40,6 +46,11 @@ class LstmCPUKernel : public MKLCPUKernel { int seq_len_; int num_directions_; bool bidirectional_; + bool has_bias_; + dnnl::memory::dims weights_dims_; + dnnl::memory::dims weights_h_dims_; + dnnl::memory::dims bias_dims_; + dnnl::lstm_forward::primitive_desc prim_desc_; }; MS_REG_CPU_KERNEL(LSTM, diff --git a/mindspore/ccsrc/kernel/cpu/mkldnn/lstm_grad_cpu_kernel.cc b/mindspore/ccsrc/kernel/cpu/mkldnn/lstm_grad_cpu_kernel.cc index df4744db6f..d7e7701d85 100644 --- a/mindspore/ccsrc/kernel/cpu/mkldnn/lstm_grad_cpu_kernel.cc +++ b/mindspore/ccsrc/kernel/cpu/mkldnn/lstm_grad_cpu_kernel.cc @@ -24,39 +24,41 @@ namespace mindspore { namespace kernel { - void LSTMGradCPUKernel::InitKernel(const CNodePtr &kernel_node) { MS_EXCEPTION_IF_NULL(kernel_node); + using tag = dnnl::memory::format_tag; + using dim = dnnl::memory::dims; + auto eng = MKLKernelEngine::Get().engine(); std::vector src_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); + std::vector src_h_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1); + std::vector src_c_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 2); bidirectional_ = AnfAlgo::GetNodeAttr(kernel_node, "bidirectional"); input_size_ = AnfAlgo::GetNodeAttr(kernel_node, "input_size"); hidden_size_ = AnfAlgo::GetNodeAttr(kernel_node, "hidden_size"); num_layers_ = AnfAlgo::GetNodeAttr(kernel_node, "num_layers"); + has_bias_ = AnfAlgo::GetNodeAttr(kernel_node, "has_bias"); batch_size_ = SizeToInt(src_shape[1]); seq_len_ = SizeToInt(src_shape[0]); num_directions_ = 1; if (bidirectional_) { 
num_directions_ = 2; } - int gate_size = 4 * hidden_size_; + if (num_directions_ * num_layers_ != SizeToInt(src_h_shape[0])) { + MS_LOG(EXCEPTION) << "error iteration shape!"; + } + if (num_layers_ <= 0) { + MS_LOG(EXCEPTION) << "layers must be greater than zero!"; + } + if (src_shape.size() != 3 || src_h_shape.size() != 3 || src_c_shape.size() != 3) { + MS_LOG(EXCEPTION) << "conv2d only support 3-D input!"; + } + const int gate_size = 4 * hidden_size_; for (int i = 0; i < num_layers_; ++i) { weight_size_ += gate_size * (i == 0 ? input_size_ : hidden_size_ * num_directions_); weight_h_size_ += gate_size * hidden_size_; } weight_size_ = weight_size_ * num_directions_; weight_h_size_ = weight_h_size_ * num_directions_; -} - -bool LSTMGradCPUKernel::Launch(const std::vector &inputs, - const std::vector &workspace /*workspace*/, - const std::vector &outputs) { - using tag = dnnl::memory::format_tag; - using dt = dnnl::memory::data_type; - using dim = dnnl::memory::dims; - auto eng = MKLKernelEngine::Get().engine(); - dnnl::stream s(eng); - auto formatted_md = [](dim dimensions, tag layout) { return dnnl::memory::desc{{dimensions}, dt::f32, layout}; }; - auto generic_md = [](dim dimensions) { return dnnl::memory::desc{{dimensions}, dt::f32, tag::any}; }; dnnl::rnn_direction direction = dnnl::rnn_direction::unidirectional; if (bidirectional_) { direction = dnnl::rnn_direction::bidirectional_concat; @@ -64,105 +66,130 @@ bool LSTMGradCPUKernel::Launch(const std::vector &inputs, dim src_dims = {seq_len_, batch_size_, input_size_}; dim src_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_}; dim src_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_}; - dim weights_dims = {num_layers_, num_directions_, input_size_, 4, hidden_size_}; - dim weights_h_dims = {num_layers_, num_directions_, hidden_size_, 4, hidden_size_}; - dim bias_dims = {num_layers_, num_directions_, 4, hidden_size_}; + weights_dims_ = {num_layers_, num_directions_, input_size_, 
4, hidden_size_}; + weights_h_dims_ = {num_layers_, num_directions_, hidden_size_, 4, hidden_size_}; + bias_dims_ = {num_layers_, num_directions_, 4, hidden_size_}; dim dst_dims = {seq_len_, batch_size_, hidden_size_ * num_directions_}; dim dst_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_}; dim dst_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_}; - dnnl::memory::desc src_desc = formatted_md(src_dims, tag::tnc); dnnl::memory::desc src_h_desc = formatted_md(src_h_dims, tag::ldnc); dnnl::memory::desc src_c_desc = formatted_md(src_c_dims, tag::ldnc); - dnnl::memory::desc weights_desc = formatted_md(weights_dims, tag::ldigo); - dnnl::memory::desc weights_h_desc = formatted_md(weights_h_dims, tag::ldigo); - dnnl::memory::desc bias_desc = formatted_md(bias_dims, tag::ldgo); + dnnl::memory::desc bias_desc = formatted_md(bias_dims_, tag::ldgo); dnnl::memory::desc dst_desc = formatted_md(dst_dims, tag::tnc); dnnl::memory::desc dst_h_desc = formatted_md(dst_h_dims, tag::ldnc); dnnl::memory::desc dst_c_desc = formatted_md(dst_c_dims, tag::ldnc); + auto forward_desc = std::make_shared( + dnnl::prop_kind::forward_training, direction, src_desc, src_h_desc, src_c_desc, + formatted_md(weights_dims_, tag::any), formatted_md(weights_h_dims_, tag::any), bias_desc, dst_desc, dst_h_desc, + dst_c_desc); + auto prim_forward_desc = dnnl::lstm_forward::primitive_desc(*forward_desc, eng); + auto backward_desc = std::make_shared( + dnnl::prop_kind::backward, direction, src_desc, src_h_desc, src_c_desc, formatted_md(weights_dims_, tag::any), + formatted_md(weights_h_dims_, tag::any), bias_desc, dst_desc, dst_h_desc, dst_c_desc, src_desc, src_h_desc, + src_c_desc, formatted_md(weights_dims_, tag::any), formatted_md(weights_h_dims_, tag::any), bias_desc, dst_desc, + dst_h_desc, dst_c_desc); + prim_backward_desc_ = dnnl::lstm_backward::primitive_desc(*backward_desc, eng, prim_forward_desc); + primitive_ = std::make_shared(prim_backward_desc_); - 
dnnl::lstm_forward::desc forward_desc = - dnnl::lstm_forward::desc(dnnl::prop_kind::forward_training, direction, src_desc, src_h_desc, src_c_desc, - weights_desc, weights_h_desc, bias_desc, dst_desc, dst_h_desc, dst_c_desc); - auto prim_forward_desc = dnnl::lstm_forward::primitive_desc(forward_desc, eng); + AddArgument(DNNL_ARG_SRC_LAYER, src_desc); + AddArgument(DNNL_ARG_SRC_ITER, src_h_desc); + AddArgument(DNNL_ARG_SRC_ITER_C, src_c_desc); + AddArgument(DNNL_ARG_WEIGHTS_LAYER, prim_backward_desc_.weights_layer_desc()); + AddArgument(DNNL_ARG_WEIGHTS_ITER, prim_backward_desc_.weights_iter_desc()); + AddArgument(DNNL_ARG_BIAS, bias_desc); + AddArgument(DNNL_ARG_DST_LAYER, dst_desc); + AddArgument(DNNL_ARG_DST_ITER, dst_h_desc); + AddArgument(DNNL_ARG_DST_ITER_C, dst_c_desc); + AddArgument(DNNL_ARG_WORKSPACE, prim_forward_desc.workspace_desc()); + AddArgument(DNNL_ARG_DIFF_SRC_LAYER, src_desc); + AddArgument(DNNL_ARG_DIFF_SRC_ITER, src_h_desc); + AddArgument(DNNL_ARG_DIFF_SRC_ITER_C, src_c_desc); + AddArgument(DNNL_ARG_DIFF_WEIGHTS_LAYER, prim_backward_desc_.diff_weights_layer_desc()); + AddArgument(DNNL_ARG_DIFF_WEIGHTS_ITER, prim_backward_desc_.diff_weights_iter_desc()); + AddArgument(DNNL_ARG_DIFF_BIAS, bias_desc); + AddArgument(DNNL_ARG_DIFF_DST_LAYER, dst_desc); + AddArgument(DNNL_ARG_DIFF_DST_ITER, dst_h_desc); + AddArgument(DNNL_ARG_DIFF_DST_ITER_C, dst_c_desc); +} - dnnl::lstm_backward::desc backward_desc = dnnl::lstm_backward::desc( - dnnl::prop_kind::backward, direction, src_desc, src_h_desc, src_c_desc, generic_md(weights_dims), - generic_md(weights_h_dims), generic_md(bias_dims), dst_desc, dst_h_desc, dst_c_desc, src_desc, src_h_desc, - src_c_desc, weights_desc, weights_h_desc, bias_desc, dst_desc, dst_h_desc, dst_c_desc); - auto prim_backward_desc = dnnl::lstm_backward::primitive_desc(backward_desc, eng, prim_forward_desc); +bool LSTMGradCPUKernel::Launch(const std::vector &inputs, + const std::vector &workspace /*workspace*/, + const std::vector 
&outputs) { + using dt = dnnl::memory::data_type; + using tag = dnnl::memory::format_tag; + auto eng = MKLKernelEngine::Get().engine(); // construct fw memory - auto src_memory = dnnl::memory(formatted_md(src_dims, tag::tnc), eng); - write_to_dnnl_memory(inputs[0]->addr, src_memory); - - auto src_h_memory = dnnl::memory(prim_forward_desc.src_iter_desc(), eng); - auto src_c_memory = dnnl::memory(prim_forward_desc.src_iter_c_desc(), eng); - write_to_dnnl_memory(inputs[1]->addr, src_h_memory); - write_to_dnnl_memory(inputs[2]->addr, src_c_memory); - - auto user_weights_memory = dnnl::memory(formatted_md(weights_dims, tag::ldigo), eng); - auto user_weights_h_memory = dnnl::memory(formatted_md(weights_h_dims, tag::ldigo), eng); - auto user_bias_memory = dnnl::memory(formatted_md(bias_dims, tag::ldgo), eng); - write_to_dnnl_memory(inputs[3]->addr, user_weights_memory); - write_to_dnnl_memory(reinterpret_cast(inputs[3]->addr) + weight_size_, user_weights_h_memory); - write_to_dnnl_memory(reinterpret_cast(inputs[3]->addr) + weight_size_ + weight_h_size_, user_bias_memory); - auto weights_memory = dnnl::memory(prim_backward_desc.weights_layer_desc(), eng); - auto weights_h_memory = dnnl::memory(prim_backward_desc.weights_iter_desc(), eng); - auto bias_memory = dnnl::memory(prim_forward_desc.bias_desc(), eng); - dnnl::reorder(user_weights_memory, weights_memory).execute(s, user_weights_memory, weights_memory); - dnnl::reorder(user_weights_h_memory, weights_h_memory).execute(s, user_weights_h_memory, weights_h_memory); - dnnl::reorder(user_bias_memory, bias_memory).execute(s, user_bias_memory, bias_memory); - - auto dst_memory = dnnl::memory(formatted_md(dst_dims, tag::tnc), eng); - write_to_dnnl_memory(reinterpret_cast(inputs[4]->addr), dst_memory); - auto dst_h_memory = dnnl::memory(prim_backward_desc.dst_iter_desc(), eng); - write_to_dnnl_memory(reinterpret_cast(inputs[5]->addr), dst_h_memory); - auto dst_c_memory = dnnl::memory(prim_backward_desc.dst_iter_c_desc(), eng); 
- write_to_dnnl_memory(reinterpret_cast(inputs[6]->addr), dst_c_memory); - auto workspace_memory = dnnl::memory(prim_forward_desc.workspace_desc(), eng); - write_to_dnnl_memory(inputs[10]->addr, workspace_memory); - - // construct diff memory - auto diff_src_memory = dnnl::memory(formatted_md(src_dims, tag::tnc), eng); - auto diff_src_h_memory = dnnl::memory(prim_backward_desc.diff_src_iter_desc(), eng); - auto diff_src_c_memory = dnnl::memory(prim_backward_desc.diff_src_iter_c_desc(), eng); - - auto diff_weights_memory = dnnl::memory(prim_backward_desc.diff_weights_layer_desc(), eng); - auto diff_weights_h_memory = dnnl::memory(prim_backward_desc.diff_weights_iter_desc(), eng); - auto diff_bias_memory = dnnl::memory(prim_backward_desc.diff_bias_desc(), eng); - auto diff_dst_memory = dnnl::memory(formatted_md(dst_dims, tag::tnc), eng); - write_to_dnnl_memory(reinterpret_cast(inputs[7]->addr), diff_dst_memory); - auto diff_dst_h_memory = dnnl::memory(prim_backward_desc.diff_dst_iter_desc(), eng); - write_to_dnnl_memory(reinterpret_cast(inputs[8]->addr), diff_dst_h_memory); - auto diff_dst_c_memory = dnnl::memory(prim_backward_desc.diff_dst_iter_c_desc(), eng); - write_to_dnnl_memory(reinterpret_cast(inputs[9]->addr), diff_dst_c_memory); - - diff_src_memory.set_data_handle(outputs[0]->addr); - diff_src_h_memory.set_data_handle(outputs[1]->addr); - diff_src_c_memory.set_data_handle(outputs[2]->addr); - diff_weights_memory.set_data_handle(outputs[3]->addr); - diff_weights_h_memory.set_data_handle(reinterpret_cast(outputs[3]->addr) + weight_size_); - diff_bias_memory.set_data_handle(reinterpret_cast(outputs[3]->addr) + weight_size_ + weight_h_size_); - dnnl::lstm_backward bwd_layer(prim_backward_desc); - bwd_layer.execute(s, {{DNNL_ARG_SRC_LAYER, src_memory}, - {DNNL_ARG_SRC_ITER, src_h_memory}, - {DNNL_ARG_SRC_ITER_C, src_c_memory}, - {DNNL_ARG_WEIGHTS_LAYER, weights_memory}, - {DNNL_ARG_WEIGHTS_ITER, weights_h_memory}, - {DNNL_ARG_BIAS, bias_memory}, - 
{DNNL_ARG_DST_LAYER, dst_memory}, - {DNNL_ARG_DST_ITER, dst_h_memory}, - {DNNL_ARG_DST_ITER_C, dst_c_memory}, - {DNNL_ARG_DIFF_SRC_LAYER, diff_src_memory}, - {DNNL_ARG_DIFF_SRC_ITER, diff_src_h_memory}, - {DNNL_ARG_DIFF_SRC_ITER_C, diff_src_c_memory}, - {DNNL_ARG_DIFF_WEIGHTS_LAYER, diff_weights_memory}, - {DNNL_ARG_DIFF_WEIGHTS_ITER, diff_weights_h_memory}, - {DNNL_ARG_DIFF_BIAS, diff_bias_memory}, - {DNNL_ARG_DIFF_DST_LAYER, diff_dst_memory}, - {DNNL_ARG_DIFF_DST_ITER, diff_dst_h_memory}, - {DNNL_ARG_DIFF_DST_ITER_C, diff_dst_c_memory}, - {DNNL_ARG_WORKSPACE, workspace_memory}}); + auto user_weights_memory = dnnl::memory(dnnl::memory::desc{{weights_dims_}, dt::f32, tag::ldgoi}, eng); + auto user_weights_h_memory = dnnl::memory(dnnl::memory::desc{{weights_h_dims_}, dt::f32, tag::ldgoi}, eng); + auto weights_memory = dnnl::memory(prim_backward_desc_.weights_layer_desc(), eng); + auto weights_h_memory = dnnl::memory(prim_backward_desc_.weights_iter_desc(), eng); + auto bias_memory = dnnl::memory(prim_backward_desc_.bias_desc(), eng); + user_weights_memory.set_data_handle(inputs[3]->addr); + user_weights_h_memory.set_data_handle(reinterpret_cast(inputs[3]->addr) + weight_size_); + Reorder(&user_weights_memory, &weights_memory); + Reorder(&user_weights_h_memory, &weights_h_memory); + if (has_bias_) { + bias_memory.set_data_handle(reinterpret_cast(inputs[3]->addr) + weight_size_ + weight_h_size_); + } else { + if (memset_s(bias_memory.get_data_handle(), prim_backward_desc_.bias_desc().get_size(), 0, + prim_backward_desc_.bias_desc().get_size())) { + MS_LOG(EXCEPTION) << "bias memset error"; + } + } + // construct bw memory + auto diff_weights_memory = dnnl::memory(prim_backward_desc_.diff_weights_layer_desc(), eng); + auto diff_weights_h_memory = dnnl::memory(prim_backward_desc_.diff_weights_iter_desc(), eng); + auto diff_bias_memory = dnnl::memory(prim_backward_desc_.diff_bias_desc(), eng); + auto user_diff_weights_memory = 
dnnl::memory(dnnl::memory::desc{{weights_dims_}, dt::f32, tag::ldgoi}, eng); + auto user_diff_weights_h_memory = dnnl::memory(dnnl::memory::desc{{weights_h_dims_}, dt::f32, tag::ldgoi}, eng); + user_diff_weights_memory.set_data_handle(outputs[3]->addr); + user_diff_weights_h_memory.set_data_handle(reinterpret_cast(outputs[3]->addr) + weight_size_); + if (memset_s(user_diff_weights_memory.get_data_handle(), user_diff_weights_memory.get_desc().get_size(), 0, + user_diff_weights_memory.get_desc().get_size())) { + MS_LOG(EXCEPTION) << "user weights grad memset error"; + } + if (memset_s(user_diff_weights_h_memory.get_data_handle(), user_diff_weights_h_memory.get_desc().get_size(), 0, + user_diff_weights_h_memory.get_desc().get_size())) { + MS_LOG(EXCEPTION) << "user weights iter grad memset error"; + } + if (has_bias_) { + diff_bias_memory.set_data_handle(reinterpret_cast(outputs[3]->addr) + weight_size_ + weight_h_size_); + } + if (memset_s(diff_bias_memory.get_data_handle(), prim_backward_desc_.diff_bias_desc().get_size(), 0, + prim_backward_desc_.diff_bias_desc().get_size())) { + MS_LOG(EXCEPTION) << "bias grad memset error"; + } + if (memset_s(diff_weights_memory.get_data_handle(), diff_weights_memory.get_desc().get_size(), 0, + diff_weights_memory.get_desc().get_size())) { + MS_LOG(EXCEPTION) << "weights grad memset error"; + } + if (memset_s(diff_weights_h_memory.get_data_handle(), diff_weights_h_memory.get_desc().get_size(), 0, + diff_weights_h_memory.get_desc().get_size())) { + MS_LOG(EXCEPTION) << "weights iter grad memset error"; + } + SetArgumentHandle(DNNL_ARG_SRC_LAYER, inputs[0]->addr); + SetArgumentHandle(DNNL_ARG_SRC_ITER, inputs[1]->addr); + SetArgumentHandle(DNNL_ARG_SRC_ITER_C, inputs[2]->addr); + SetArgumentHandle(DNNL_ARG_WEIGHTS_LAYER, weights_memory.get_data_handle()); + SetArgumentHandle(DNNL_ARG_WEIGHTS_ITER, weights_h_memory.get_data_handle()); + SetArgumentHandle(DNNL_ARG_BIAS, bias_memory.get_data_handle()); + 
SetArgumentHandle(DNNL_ARG_DST_LAYER, inputs[4]->addr); + SetArgumentHandle(DNNL_ARG_DST_ITER, inputs[5]->addr); + SetArgumentHandle(DNNL_ARG_DST_ITER_C, inputs[6]->addr); + SetArgumentHandle(DNNL_ARG_WORKSPACE, inputs[10]->addr); + SetArgumentHandle(DNNL_ARG_DIFF_SRC_LAYER, outputs[0]->addr); + SetArgumentHandle(DNNL_ARG_DIFF_SRC_ITER, outputs[1]->addr); + SetArgumentHandle(DNNL_ARG_DIFF_SRC_ITER_C, outputs[2]->addr); + SetArgumentHandle(DNNL_ARG_DIFF_WEIGHTS_LAYER, diff_weights_memory.get_data_handle()); + SetArgumentHandle(DNNL_ARG_DIFF_WEIGHTS_ITER, diff_weights_h_memory.get_data_handle()); + SetArgumentHandle(DNNL_ARG_DIFF_BIAS, diff_bias_memory.get_data_handle()); + SetArgumentHandle(DNNL_ARG_DIFF_DST_LAYER, inputs[7]->addr); + SetArgumentHandle(DNNL_ARG_DIFF_DST_ITER, inputs[8]->addr); + SetArgumentHandle(DNNL_ARG_DIFF_DST_ITER_C, inputs[9]->addr); + ExecutePrimitive(); + Reorder(&diff_weights_memory, &user_diff_weights_memory); + Reorder(&diff_weights_h_memory, &user_diff_weights_h_memory); return true; } } // namespace kernel diff --git a/mindspore/ccsrc/kernel/cpu/mkldnn/lstm_grad_cpu_kernel.h b/mindspore/ccsrc/kernel/cpu/mkldnn/lstm_grad_cpu_kernel.h index 22ec1f62db..1f3fb824c0 100644 --- a/mindspore/ccsrc/kernel/cpu/mkldnn/lstm_grad_cpu_kernel.h +++ b/mindspore/ccsrc/kernel/cpu/mkldnn/lstm_grad_cpu_kernel.h @@ -41,6 +41,11 @@ class LSTMGradCPUKernel : public MKLCPUKernel { int seq_len_; int num_directions_; bool bidirectional_; + bool has_bias_; + dnnl::memory::dims weights_dims_; + dnnl::memory::dims weights_h_dims_; + dnnl::memory::dims bias_dims_; + dnnl::lstm_backward::primitive_desc prim_backward_desc_; }; MS_REG_CPU_KERNEL(LSTMGrad, @@ -63,5 +68,4 @@ MS_REG_CPU_KERNEL(LSTMGrad, LSTMGradCPUKernel); } // namespace kernel } // namespace mindspore - #endif // MINDSPORE_CCSRC_KERNEL_CPU_LSTM_GRAD_CPU_KERNEL_H_ diff --git a/mindspore/ccsrc/kernel/cpu/mkldnn/mkl_cpu_kernel.cc b/mindspore/ccsrc/kernel/cpu/mkldnn/mkl_cpu_kernel.cc index 
17fca72698..a38470e3a3 100644 --- a/mindspore/ccsrc/kernel/cpu/mkldnn/mkl_cpu_kernel.cc +++ b/mindspore/ccsrc/kernel/cpu/mkldnn/mkl_cpu_kernel.cc @@ -98,11 +98,9 @@ void MKLCPUKernel::SetArgumentHandle(int arg_key, void *ptr) { } void MKLCPUKernel::ExecutePrimitive() { MKLKernelEngine::Get().Execute(primitive_, arguments_); } -void MKLCPUKernel::write_to_dnnl_memory(void *handle, const dnnl::memory &mem) { - MKLKernelEngine::Get().write_to_dnnl_memory(handle, mem); -} -void MKLCPUKernel::read_from_dnnl_memory(void *handle, const dnnl::memory &mem) { - MKLKernelEngine::Get().read_from_dnnl_memory(handle, mem); + +void MKLCPUKernel::Reorder(dnnl::memory *src_mem, dnnl::memory *dst_mem) { + MKLKernelEngine::Get().Reorder(src_mem, dst_mem); } } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/kernel/cpu/mkldnn/mkl_cpu_kernel.h b/mindspore/ccsrc/kernel/cpu/mkldnn/mkl_cpu_kernel.h index a6b8d68627..10a860afff 100644 --- a/mindspore/ccsrc/kernel/cpu/mkldnn/mkl_cpu_kernel.h +++ b/mindspore/ccsrc/kernel/cpu/mkldnn/mkl_cpu_kernel.h @@ -39,10 +39,12 @@ class MKLCPUKernel : public CPUKernel { dnnl::memory::format_tag GetDefaultFormatTag(const dnnl::memory::dims &dims) const; dnnl::memory::desc GetDefaultMemDesc(const std::vector &shape); void ExecutePrimitive(); - void write_to_dnnl_memory(void *handle, const dnnl::memory &mem); - void read_from_dnnl_memory(void *handle, const dnnl::memory &mem); std::unordered_map arguments_; std::shared_ptr primitive_{nullptr}; + inline dnnl::memory::desc formatted_md(const dnnl::memory::dims &dimensions, dnnl::memory::format_tag layout) { + return dnnl::memory::desc{{dimensions}, dnnl::memory::data_type::f32, layout}; + } + void Reorder(dnnl::memory *src_mem, dnnl::memory *dst_mem); }; } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/kernel/cpu/mkldnn/mkl_kernel_engine.cc b/mindspore/ccsrc/kernel/cpu/mkldnn/mkl_kernel_engine.cc index f5270a4e9a..5ae9791b12 100644 --- 
a/mindspore/ccsrc/kernel/cpu/mkldnn/mkl_kernel_engine.cc +++ b/mindspore/ccsrc/kernel/cpu/mkldnn/mkl_kernel_engine.cc @@ -33,5 +33,8 @@ dnnl::memory MKLKernelEngine::CreateMemory(const dnnl::memory::desc &mem_desc, b return dnnl::memory(mem_desc, engine_, nullptr); } } +void MKLKernelEngine::Reorder(dnnl::memory *src_mem, dnnl::memory *dst_mem) { + dnnl::reorder(*src_mem, *dst_mem).execute(stream_, *src_mem, *dst_mem); +} } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/kernel/cpu/mkldnn/mkl_kernel_engine.h b/mindspore/ccsrc/kernel/cpu/mkldnn/mkl_kernel_engine.h index b0eaaf405f..99e7ecdfe0 100644 --- a/mindspore/ccsrc/kernel/cpu/mkldnn/mkl_kernel_engine.h +++ b/mindspore/ccsrc/kernel/cpu/mkldnn/mkl_kernel_engine.h @@ -41,30 +41,7 @@ class MKLKernelEngine { void Execute(const std::shared_ptr &primitive, const std::unordered_map &arguments); - - inline void read_from_dnnl_memory(void *handle, const dnnl::memory &mem) { - dnnl::engine eng = mem.get_engine(); - size_t bytes = mem.get_desc().get_size(); - if (eng.get_kind() == dnnl::engine::kind::cpu) { - auto dst = reinterpret_cast(handle); - uint8_t *src = reinterpret_cast(mem.get_data_handle()); - for (size_t i = 0; i < bytes; ++i) { - dst[i] = src[i]; - } - } - } - // Read from handle, write to memory - inline void write_to_dnnl_memory(void *handle, const dnnl::memory &mem) { - dnnl::engine eng = mem.get_engine(); - size_t bytes = mem.get_desc().get_size(); - if (eng.get_kind() == dnnl::engine::kind::cpu) { - auto src = reinterpret_cast(handle); - uint8_t *dst = reinterpret_cast(mem.get_data_handle()); - for (size_t i = 0; i < bytes; ++i) { - dst[i] = src[i]; - } - } - } + void Reorder(dnnl::memory *src_mem, dnnl::memory *dst_mem); private: MKLKernelEngine() : engine_(dnnl::engine::kind::cpu, 0), stream_(engine_) {} diff --git a/mindspore/ccsrc/kernel/cpu/mkldnn/softmax_cross_entropy_with_logits_cpu_kernel.cc 
b/mindspore/ccsrc/kernel/cpu/mkldnn/softmax_cross_entropy_with_logits_cpu_kernel.cc new file mode 100644 index 0000000000..05b1a79924 --- /dev/null +++ b/mindspore/ccsrc/kernel/cpu/mkldnn/softmax_cross_entropy_with_logits_cpu_kernel.cc @@ -0,0 +1,99 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "kernel/cpu/mkldnn/softmax_cross_entropy_with_logits_cpu_kernel.h" +#include +#include +#include +#include "kernel/cpu/mkldnn/mkl_kernel_engine.h" +#include "device/cpu/cpu_device_address.h" +#include "common/utils.h" + +namespace mindspore { +namespace kernel { +void SoftmaxCrossEntropyWithLogitsCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) { + CPUKernel::InitInputOutputSize(kernel_node); + MS_EXCEPTION_IF_NULL(kernel_node); + size_t type_size = sizeof(float); + std::vector shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); + size_t tensor_size = std::accumulate(shape.begin(), shape.end(), type_size, std::multiplies()); + workspace_size_list_.emplace_back(tensor_size); +} + +void SoftmaxCrossEntropyWithLogitsCPUKernel::InitKernel(const CNodePtr &kernel_node) { + MS_EXCEPTION_IF_NULL(kernel_node); + std::vector shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); + dnnl::memory::dims mem_dims; + mem_dims.insert(mem_dims.end(), shape.begin(), shape.end()); + if (mem_dims.size() != 2) { + MS_LOG(EXCEPTION) << "SoftmaxCrossEntropyWithLogits kernel dims invalid " << 
mem_dims.size(); + } + batch_size_ = shape[0]; + class_num_ = shape[1]; + if (batch_size_ == 0 || class_num_ == 0) { + MS_LOG(EXCEPTION) << "invalid batch size or class num input!"; + } + dnnl::memory::desc mem_desc(mem_dims, dnnl::memory::data_type::f32, dnnl::memory::format_tag::nc); + + dnnl::softmax_forward::desc desc = dnnl::softmax_forward::desc(dnnl::prop_kind::forward_training, mem_desc, 1); + auto prim_desc = dnnl::softmax_forward::primitive_desc(desc, MKLKernelEngine::Get().engine()); + primitive_ = std::make_shared(prim_desc); + + AddArgument(DNNL_ARG_SRC, mem_desc); + AddArgument(DNNL_ARG_DST, mem_desc); +} + +void SoftmaxCrossEntropyWithLogitsCPUKernel::ForwardPostExecute(const float *logits, const float *labels, + float *output1, float *output2) const { + float epsilon = 1e-6; + for (size_t i = 0; i < batch_size_; ++i) { + output1[i] = 0; + float loss = 0.0; + for (size_t j = 0; j < class_num_; ++j) { + float logit = logf(logits[i * class_num_ + j] <= 0.0 ? epsilon : logits[i * class_num_ + j]); + output2[i * class_num_ + j] = logits[i * class_num_ + j] - labels[i * class_num_ + j]; + loss += labels[i * class_num_ + j] * logit; + } + output1[i] = -loss; + } +} + +bool SoftmaxCrossEntropyWithLogitsCPUKernel::Launch(const std::vector &inputs, + const std::vector &workspace, + const std::vector &outputs) { + if (inputs.empty() || workspace.empty() || outputs.empty()) { + MS_LOG(EXCEPTION) << "error input output size!"; + } + size_t batch_float_size = batch_size_ * sizeof(float); + size_t batch_class_float_size = class_num_ * batch_float_size; + if (inputs[0]->size != workspace[0]->size || inputs[0]->size != batch_class_float_size || + inputs[1]->size != batch_class_float_size) { + MS_LOG(EXCEPTION) << "error input data size!"; + } + if (outputs[1]->size != batch_class_float_size || outputs[0]->size != batch_float_size) { + MS_LOG(EXCEPTION) << "error output data size!"; + } + SetArgumentHandle(DNNL_ARG_SRC, inputs[0]->addr); + 
SetArgumentHandle(DNNL_ARG_DST, workspace[0]->addr); + ExecutePrimitive(); + auto labels = reinterpret_cast(inputs[1]->addr); + auto logits = reinterpret_cast(workspace[0]->addr); + auto output1 = reinterpret_cast(outputs[0]->addr); + auto output2 = reinterpret_cast(outputs[1]->addr); + ForwardPostExecute(logits, labels, output1, output2); + return true; +} +} // namespace kernel +} // namespace mindspore diff --git a/mindspore/ccsrc/kernel/cpu/mkldnn/softmax_cross_entropy_with_logits_cpu_kernel.h b/mindspore/ccsrc/kernel/cpu/mkldnn/softmax_cross_entropy_with_logits_cpu_kernel.h new file mode 100644 index 0000000000..f663508059 --- /dev/null +++ b/mindspore/ccsrc/kernel/cpu/mkldnn/softmax_cross_entropy_with_logits_cpu_kernel.h @@ -0,0 +1,53 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_CCSRC_KERNEL_CPU_SOFTMAX_CROSS_ENTROPY_WITH_LOGITS_CPU_KERNEL_H_ +#define MINDSPORE_CCSRC_KERNEL_CPU_SOFTMAX_CROSS_ENTROPY_WITH_LOGITS_CPU_KERNEL_H_ + +#include +#include +#include "kernel/cpu/mkldnn/mkl_cpu_kernel.h" + +namespace mindspore { +namespace kernel { +class SoftmaxCrossEntropyWithLogitsCPUKernel : public MKLCPUKernel { + public: + SoftmaxCrossEntropyWithLogitsCPUKernel() = default; + ~SoftmaxCrossEntropyWithLogitsCPUKernel() override = default; + + void InitKernel(const CNodePtr &kernel_node) override; + + bool Launch(const std::vector &inputs, const std::vector &workspace, + const std::vector &outputs) override; + + protected: + void InitInputOutputSize(const CNodePtr &kernel_node) override; + + private: + void ForwardPostExecute(const float *logits, const float *labels, float *output1, float *output2) const; + size_t class_num_{0}; + size_t batch_size_{0}; +}; +MS_REG_CPU_KERNEL(SoftmaxCrossEntropyWithLogits, + KernelAttr() + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddOutputAttr(kNumberTypeFloat32) + .AddOutputAttr(kNumberTypeFloat32), + SoftmaxCrossEntropyWithLogitsCPUKernel); +} // namespace kernel +} // namespace mindspore + +#endif // MINDSPORE_CCSRC_KERNEL_CPU_SOFTMAX_CROSS_ENTROPY_WITH_LOGITS_CPU_KERNEL_H_ diff --git a/mindspore/ccsrc/kernel/cpu/reduce_cpu_kernel.cc b/mindspore/ccsrc/kernel/cpu/reduce_cpu_kernel.cc new file mode 100644 index 0000000000..b12371c933 --- /dev/null +++ b/mindspore/ccsrc/kernel/cpu/reduce_cpu_kernel.cc @@ -0,0 +1,161 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include "kernel/cpu/reduce_cpu_kernel.h" +#include "device/cpu/cpu_device_address.h" + +namespace mindspore { +namespace kernel { +const size_t kReduceTypeMax = 0; +const size_t kReduceTypeMean = 1; +const size_t kReduceTypeSum = 2; +const size_t kMaxDim = 100; +void ReduceCPUKernel::InitKernel(const CNodePtr &kernel_node) { + MS_EXCEPTION_IF_NULL(kernel_node); + std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node); + if (kernel_name == "ReduceMax") { + reduce_type_ = kReduceTypeMax; + } else if (kernel_name == "ReduceMean") { + reduce_type_ = kReduceTypeMean; + } else if (kernel_name == "ReduceSum") { + reduce_type_ = kReduceTypeSum; + } else { + MS_LOG(EXCEPTION) << "Array reduce kernel type " << kernel_name << " is not supported."; + } + shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0); + auto axis_addr = AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr(AXIS); + if (axis_addr->isa()) { + auto attr_axis = AnfAlgo::GetNodeAttr>(kernel_node, AXIS); + if (attr_axis.size() > shape_.size()) { + MS_LOG(EXCEPTION) << "invalid axis size: " << axis_.size(); + } else if (attr_axis.empty()) { + axis_.push_back(shape_.size() - 1); + } else { + for (auto axis : attr_axis) { + if (IntToSize(axis) >= (shape_.size())) { + MS_LOG(EXCEPTION) << "axis value is oversize."; + } + axis < 0 ? 
axis_.push_back(axis + shape_.size()) : axis_.push_back(axis); + } + } + } else if (axis_addr->isa()) { + int axis = AnfAlgo::GetNodeAttr(kernel_node, AXIS); + + if (axis >= 0 && IntToSize(axis) >= shape_.size()) { + MS_LOG(EXCEPTION) << "axis value is oversize."; + } + axis < 0 ? axis_.push_back(axis + shape_.size()) : axis_.push_back(axis); + } else { + MS_LOG(EXCEPTION) << "Attribute axis type is invalid."; + } + for (size_t i = 0; i < shape_.size(); ++i) { + if (shape_[i] <= 0) { + MS_LOG(EXCEPTION) << "shape value is invalid."; + } + left_dims_ *= shape_[i]; + } + for (size_t i = 0; i < axis_.size(); ++i) { + stride_ *= shape_[axis_[i]]; + } + if (stride_ <= 0) { + MS_LOG(EXCEPTION) << "stride_ must greater than zero."; + } + left_dims_ = left_dims_ / stride_; +} +bool ReduceCPUKernel::Launch(const std::vector &inputs, + const std::vector & /*workspaces*/, + const std::vector &outputs) { + if (inputs.empty() || outputs.empty()) { + MS_LOG(EXCEPTION) << "input or output empty!"; + } + size_t out_float_size = left_dims_ * sizeof(float); + size_t in_float_size = stride_ * out_float_size; + if (inputs[0]->size != in_float_size || outputs[0]->size != out_float_size) { + MS_LOG(EXCEPTION) << "invalid input or output data size!"; + } + auto input = reinterpret_cast(inputs[0]->addr); + auto output = reinterpret_cast(outputs[0]->addr); + int size = inputs[0]->size / sizeof(float); + std::vector new_input(IntToSize(size), 0.0); + std::vector transpose_axis; + for (size_t i = 0; i < shape_.size(); ++i) { + bool insert = true; + for (size_t j = 0; j < axis_.size(); ++j) { + if (axis_[j] == i) { + insert = false; + break; + } + } + if (insert) { + transpose_axis.push_back(i); + } + } + (void)transpose_axis.insert(transpose_axis.end(), axis_.begin(), axis_.end()); + Transpose(size, input, shape_, transpose_axis, SizeToInt(shape_.size()), &new_input[0]); + if (reduce_type_ == kReduceTypeMax) { + for (size_t i = 0; i < left_dims_; ++i) { + float value = new_input[i * 
stride_]; + for (size_t k = 0; k < stride_; ++k) { + if (value < new_input[i * stride_ + k]) { + value = new_input[i * stride_ + k]; + } + } + output[i] = value; + } + } else { + for (size_t i = 0; i < left_dims_; ++i) { + float value = 0.0; + for (size_t k = 0; k < stride_; ++k) { + value += new_input[i * stride_ + k]; + } + if (reduce_type_ == kReduceTypeMean) { + output[i] = value / stride_; + } else { + output[i] = value; + } + } + } + return true; +} +void ReduceCPUKernel::Transpose(const int size, const float *input, const std::vector &input_shape, + const std::vector &input_axis, const int shape_size, float *output) { + int pos_array[kMaxDim]; + int size_offset[kMaxDim]; + size_offset[0] = size / SizeToInt(input_shape[0]); + for (int i = 1; i < shape_size; i++) { + size_offset[i] = size_offset[i - 1] / SizeToInt(input_shape[i]); + } + for (int position = 0; position < size; position += 1) { + int temp_position = position; + pos_array[0] = temp_position / size_offset[0]; + for (int i = 1; i < shape_size; i++) { + temp_position -= pos_array[i - 1] * size_offset[i - 1]; + pos_array[i] = temp_position / size_offset[i]; + } + int new_position = pos_array[SizeToInt(input_axis[shape_size - 1])]; + int new_position_size = 1; + for (int j = shape_size - 2; j >= 0; j--) { + new_position_size *= SizeToInt(input_shape[SizeToInt(input_axis[j + 1])]); + new_position += pos_array[SizeToInt(input_axis[j])] * new_position_size; + } + output[new_position] = input[position]; + } + return; +} +} // namespace kernel +} // namespace mindspore diff --git a/mindspore/ccsrc/kernel/cpu/reduce_cpu_kernel.h b/mindspore/ccsrc/kernel/cpu/reduce_cpu_kernel.h new file mode 100644 index 0000000000..27d28ba3bd --- /dev/null +++ b/mindspore/ccsrc/kernel/cpu/reduce_cpu_kernel.h @@ -0,0 +1,52 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_CCSRC_KERNEL_CPU_REDUCE_CPU_KERNEL_H_ +#define MINDSPORE_CCSRC_KERNEL_CPU_REDUCE_CPU_KERNEL_H_ +#include +#include +#include +#include "kernel/cpu/cpu_kernel.h" +#include "kernel/cpu/cpu_kernel_factory.h" + +namespace mindspore { +namespace kernel { +class ReduceCPUKernel : public CPUKernel { + public: + ReduceCPUKernel() = default; + ~ReduceCPUKernel() override = default; + void InitKernel(const CNodePtr &kernel_node) override; + bool Launch(const std::vector &inputs, const std::vector &workspace, + const std::vector &outputs) override; + + private: + void Transpose(const int size, const float *input, const std::vector &input_shape, + const std::vector &input_axis, const int shape_size, float *output); + size_t reduce_type_; + std::vector axis_; + std::vector shape_; + size_t left_dims_ = 1; + size_t stride_ = 1; +}; +MS_REG_CPU_KERNEL(ReduceMean, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), + ReduceCPUKernel); +MS_REG_CPU_KERNEL(ReduceMax, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), + ReduceCPUKernel); +MS_REG_CPU_KERNEL(ReduceSum, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), + ReduceCPUKernel); + +} // namespace kernel +} // namespace mindspore +#endif // MINDSPORE_CCSRC_KERNEL_CPU_REDUCE_CPU_KERNEL_H_ diff --git a/mindspore/ccsrc/kernel/cpu/reduce_scatter_cpu_kernel.cc b/mindspore/ccsrc/kernel/cpu/reduce_scatter_cpu_kernel.cc new file mode 100644 index 0000000000..fd8a74eb6b --- /dev/null +++ 
b/mindspore/ccsrc/kernel/cpu/reduce_scatter_cpu_kernel.cc @@ -0,0 +1,54 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "kernel/cpu/reduce_scatter_cpu_kernel.h" +#include "device/cpu/cpu_device_address.h" +#include "device/cpu/mpi/mpi_adapter.h" +#include "ir/primitive.h" + +namespace mindspore { +namespace kernel { +namespace { +constexpr auto kRanksGroup = "group"; +} // namespace + +ReduceScatterCPUKernel::ReduceScatterCPUKernel() : op_type_(device::cpu::kOpTypeSum) {} + +void ReduceScatterCPUKernel::InitKernel(const CNodePtr &kernel_node) { + auto op = AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("op"); + if (op != nullptr) { + op_type_ = GetValue(op); + } + + auto ranks_group = AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr(kRanksGroup); + if (ranks_group != nullptr) { + ranks_group_ = GetValue>(ranks_group); + } else { + MS_LOG(EXCEPTION) << "Miss attribute " << kRanksGroup; + } +} + +bool ReduceScatterCPUKernel::Launch(const std::vector &inputs, + const std::vector & /*workspace*/, + const std::vector &outputs) { + auto input_addr = reinterpret_cast(inputs[0]->addr); + auto output_addr = reinterpret_cast(outputs[0]->addr); + auto output_data_num = outputs[0]->size / sizeof(float); + + return device::cpu::MPIAdapter::Instance().ReduceScatter(input_addr, output_addr, ranks_group_, output_data_num, + op_type_); +} +} // namespace kernel +} // namespace mindspore diff --git 
a/mindspore/ccsrc/kernel/cpu/reduce_scatter_cpu_kernel.h b/mindspore/ccsrc/kernel/cpu/reduce_scatter_cpu_kernel.h new file mode 100644 index 0000000000..c3bfe571a4 --- /dev/null +++ b/mindspore/ccsrc/kernel/cpu/reduce_scatter_cpu_kernel.h @@ -0,0 +1,45 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_CCSRC_KERNEL_CPU_REDUCE_SCATTER_CPU_KERNEL_H_ +#define MINDSPORE_CCSRC_KERNEL_CPU_REDUCE_SCATTER_CPU_KERNEL_H_ +#include +#include +#include "kernel/cpu/cpu_kernel.h" +#include "kernel/cpu/cpu_kernel_factory.h" + +namespace mindspore { +namespace kernel { +class ReduceScatterCPUKernel : public CPUKernel { + public: + ReduceScatterCPUKernel(); + ~ReduceScatterCPUKernel() override = default; + + void InitKernel(const CNodePtr &kernel_node) override; + + bool Launch(const std::vector &inputs, const std::vector &workspace, + const std::vector &outputs) override; + + private: + std::string op_type_; + std::vector ranks_group_; +}; + +MS_REG_CPU_KERNEL(HostReduceScatter, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), + ReduceScatterCPUKernel); +} // namespace kernel +} // namespace mindspore + +#endif // MINDSPORE_CCSRC_KERNEL_CPU_REDUCE_SCATTER_CPU_KERNEL_H_ diff --git a/mindspore/ccsrc/kernel/cpu/slice_cpu_kernel.cc b/mindspore/ccsrc/kernel/cpu/slice_cpu_kernel.cc index b1565425e0..d2530430e9 100644 --- a/mindspore/ccsrc/kernel/cpu/slice_cpu_kernel.cc 
+++ b/mindspore/ccsrc/kernel/cpu/slice_cpu_kernel.cc @@ -23,7 +23,6 @@ void SliceCPUKernel::InitKernel(const CNodePtr &kernel_node) { CheckParam(kernel_node); input_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); output_shape_ = AnfAlgo::GetOutputInferShape(kernel_node, 0); - CPUKernelUtils::ExpandDimsTo4(&output_shape_); begin_ = AnfAlgo::GetNodeAttr>(kernel_node, BEGIN); for (size_t i = 0; i < begin_.size(); i++) { @@ -61,6 +60,15 @@ void SliceCPUKernel::InitKernel(const CNodePtr &kernel_node) { end_.emplace_back(begin_[i] + sizes[i]); } } + + ExpandAllMemberDims(); + CPUKernelUtils::GetElementNumEveryDim(input_shape_, &input_element_num_); + CPUKernelUtils::GetElementNumEveryDim(output_shape_, &output_element_num_); +} + +void SliceCPUKernel::ExpandAllMemberDims() { + CPUKernelUtils::ExpandDimsTo4(&output_shape_); + auto input_len = input_shape_.size(); if (input_len < 4) { for (size_t i = 0; i < 4 - input_len; ++i) { @@ -78,12 +86,40 @@ bool SliceCPUKernel::Launch(const std::vector &inputs, auto input_addr = reinterpret_cast(inputs[0]->addr); auto output_addr = reinterpret_cast(outputs[0]->addr); - for (int i = begin_[0]; i < end_[0]; i += strides_[0]) { - for (int j = begin_[1]; j < end_[1]; j += strides_[1]) { - for (int k = begin_[2]; k < end_[2]; k += strides_[2]) { + bool can_copy_memory[3] = {CanCopyMemoryOnAxis(0), CanCopyMemoryOnAxis(1), CanCopyMemoryOnAxis(2)}; + size_t in_start_offset[3] = {begin_[0] * input_element_num_[0], begin_[1] * input_element_num_[1], + begin_[2] * input_element_num_[2]}; + size_t in_step_size[3] = {strides_[0] * input_element_num_[0], strides_[1] * input_element_num_[1], + strides_[2] * input_element_num_[2]}; + + auto in_n_offset = in_start_offset[0]; + auto out_n_offset = 0; + for (int i = begin_[0]; i < end_[0]; + i += strides_[0], in_n_offset += in_step_size[0], out_n_offset += output_element_num_[0]) { + if (can_copy_memory[0]) { + CopyDataToOutput(inputs, in_n_offset, outputs, out_n_offset, 
input_element_num_[0]); + continue; + } + auto in_c_offset = in_start_offset[1]; + auto out_c_offset = 0; + for (int j = begin_[1]; j < end_[1]; + j += strides_[1], in_c_offset += in_step_size[1], out_c_offset += output_element_num_[1]) { + if (can_copy_memory[1]) { + CopyDataToOutput(inputs, in_n_offset + in_c_offset, outputs, out_n_offset + out_c_offset, + input_element_num_[1]); + continue; + } + auto in_h_offset = in_start_offset[2]; + auto out_h_offset = 0; + for (int k = begin_[2]; k < end_[2]; + k += strides_[2], in_h_offset += in_step_size[2], out_h_offset += output_element_num_[2]) { + if (can_copy_memory[2]) { + CopyDataToOutput(inputs, in_n_offset + in_c_offset + in_h_offset, outputs, + out_n_offset + out_c_offset + out_h_offset, input_element_num_[2]); + continue; + } for (int m = begin_[3]; m < end_[3]; m += strides_[3]) { - auto offset = CPUKernelUtils::CalcOffset(input_shape_, i, j, k, m); - *output_addr++ = input_addr[offset]; + *output_addr++ = input_addr[in_n_offset + in_c_offset + in_h_offset + m]; } } } @@ -92,7 +128,38 @@ bool SliceCPUKernel::Launch(const std::vector &inputs, return true; } -void SliceCPUKernel::CheckParam(const CNodePtr &kernel_node) { +bool SliceCPUKernel::CanCopyMemoryOnAxis(size_t dim) const { + for (size_t i = dim + 1; i < 4; ++i) { + if (begin_[i] != 0 || end_[i] != SizeToInt(input_shape_[i]) || strides_[i] != 1) { + return false; + } + } + return true; +} + +void SliceCPUKernel::CopyDataToOutput(const std::vector &inputs, size_t in_offset, + const std::vector &outputs, size_t out_offset, + size_t copy_num) const { + auto input_addr = reinterpret_cast(inputs[0]->addr); + auto in_buff_size = inputs[0]->size; + auto output_addr = reinterpret_cast(outputs[0]->addr); + auto out_buff_size = outputs[0]->size; + + if ((in_offset + copy_num) * sizeof(float) > in_buff_size) { + MS_LOG(EXCEPTION) << "input memory out of bounds."; + } + if ((out_offset + copy_num) * sizeof(float) > out_buff_size) { + MS_LOG(EXCEPTION) << "output 
memory out of bounds."; + } + + auto ret = memcpy_s(output_addr + out_offset, out_buff_size - out_offset * sizeof(float), input_addr + in_offset, + copy_num * sizeof(float)); + if (ret != EOK) { + MS_LOG(EXCEPTION) << "memcpy failed. ret:" << ret; + } +} + +void SliceCPUKernel::CheckParam(const CNodePtr &kernel_node) const { size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); if (input_num != 1) { MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but SliceCPUKernel needs 1 inputs."; diff --git a/mindspore/ccsrc/kernel/cpu/slice_cpu_kernel.h b/mindspore/ccsrc/kernel/cpu/slice_cpu_kernel.h index 788c4f39ad..913c993d7a 100644 --- a/mindspore/ccsrc/kernel/cpu/slice_cpu_kernel.h +++ b/mindspore/ccsrc/kernel/cpu/slice_cpu_kernel.h @@ -33,12 +33,18 @@ class SliceCPUKernel : public CPUKernel { const std::vector &outputs) override; private: - void CheckParam(const CNodePtr &kernel_node); + void ExpandAllMemberDims(); + bool CanCopyMemoryOnAxis(size_t dim) const; + void CopyDataToOutput(const std::vector &inputs, size_t in_offset, + const std::vector &outputs, size_t out_offset, size_t copy_num) const; + void CheckParam(const CNodePtr &kernel_node) const; std::vector begin_; std::vector end_; std::vector strides_; std::vector input_shape_; + std::vector input_element_num_; std::vector output_shape_; + std::vector output_element_num_; }; MS_REG_CPU_KERNEL(Slice, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), diff --git a/mindspore/ccsrc/kernel/cpu/slice_grad_cpu_kernel.cc b/mindspore/ccsrc/kernel/cpu/slice_grad_cpu_kernel.cc index 2a61a0259a..92eaffe8c6 100644 --- a/mindspore/ccsrc/kernel/cpu/slice_grad_cpu_kernel.cc +++ b/mindspore/ccsrc/kernel/cpu/slice_grad_cpu_kernel.cc @@ -21,13 +21,13 @@ namespace mindspore { namespace kernel { void SliceGradCPUKernel::InitKernel(const CNodePtr &kernel_node) { CheckParam(kernel_node); - output_dx_shape_ = AnfAlgo::GetOutputInferShape(kernel_node, 0); - input_dy_shape_ = 
AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); + output_shape_ = AnfAlgo::GetOutputInferShape(kernel_node, 0); + input_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); begin_ = AnfAlgo::GetNodeAttr>(kernel_node, BEGIN); for (size_t i = 0; i < begin_.size(); i++) { if (begin_[i] < 0) { - begin_[i] = begin_[i] + output_dx_shape_[i]; + begin_[i] = begin_[i] + output_shape_[i]; } } @@ -37,35 +37,43 @@ void SliceGradCPUKernel::InitKernel(const CNodePtr &kernel_node) { if (strides != nullptr) { strides_ = AnfAlgo::GetNodeAttr>(kernel_node, STRIDES); end_ = AnfAlgo::GetNodeAttr>(kernel_node, END); - if (strides_.size() != end_.size() || strides_.size() != output_dx_shape_.size()) { + if (strides_.size() != end_.size() || strides_.size() != output_shape_.size()) { MS_LOG(EXCEPTION) << "stride|end|input size must be equal"; } for (size_t i = 0; i < strides_.size(); ++i) { if (strides_[i] < 0) { - strides_[i] = (strides_[i] + output_dx_shape_[i]) > 0 ? (strides_[i] + output_dx_shape_[i]) : 0; + strides_[i] = (strides_[i] + output_shape_[i]) > 0 ? (strides_[i] + output_shape_[i]) : 0; } if (end_[i] < 0) { - end_[i] = (end_[i] + output_dx_shape_[i]) > 0 ? (end_[i] + output_dx_shape_[i]) : 0; + end_[i] = (end_[i] + output_shape_[i]) > 0 ? (end_[i] + output_shape_[i]) : 0; } } } else { auto sizes = AnfAlgo::GetNodeAttr>(kernel_node, SIZE); - if (sizes.size() != output_dx_shape_.size() || begin_.size() != output_dx_shape_.size()) { + if (sizes.size() != output_shape_.size() || begin_.size() != output_shape_.size()) { MS_LOG(EXCEPTION) << "begin|size|input size must be equal"; } for (size_t i = 0; i < sizes.size(); ++i) { if (sizes[i] < 0) { - sizes[i] = (sizes[i] + output_dx_shape_[i]) > 0 ? (sizes[i] + output_dx_shape_[i]) : 0; + sizes[i] = (sizes[i] + output_shape_[i]) > 0 ? 
(sizes[i] + output_shape_[i]) : 0; } strides_.emplace_back(1); end_.emplace_back(begin_[i] + sizes[i]); } } - CPUKernelUtils::ExpandDimsTo4(&output_dx_shape_); - auto input_len = input_dy_shape_.size(); - if (input_len < 4) { - for (size_t i = 0; i < 4 - input_len; ++i) { - input_dy_shape_.insert(input_dy_shape_.begin(), 1); + + ExpandAllMemberDims(); + CPUKernelUtils::GetElementNumEveryDim(input_shape_, &input_element_num_); + CPUKernelUtils::GetElementNumEveryDim(output_shape_, &output_element_num_); +} + +void SliceGradCPUKernel::ExpandAllMemberDims() { + CPUKernelUtils::ExpandDimsTo4(&input_shape_); + + auto output_len = output_shape_.size(); + if (output_len < 4) { + for (size_t i = 0; i < 4 - output_len; ++i) { + output_shape_.insert(output_shape_.begin(), 1); begin_.insert(begin_.begin(), 0); strides_.insert(strides_.begin(), 1); end_.insert(end_.begin(), 1); @@ -76,22 +84,49 @@ void SliceGradCPUKernel::InitKernel(const CNodePtr &kernel_node) { bool SliceGradCPUKernel::Launch(const std::vector &inputs, const std::vector & /*workspace*/, const std::vector &outputs) { - auto input_dy_addr = reinterpret_cast(inputs[0]->addr); - auto output_dx_addr = reinterpret_cast(outputs[0]->addr); + auto input_addr = reinterpret_cast(inputs[0]->addr); + auto output_addr = reinterpret_cast(outputs[0]->addr); - auto out_size = sizeof(float) * output_dx_shape_[0] * output_dx_shape_[1] * output_dx_shape_[2] * output_dx_shape_[3]; - auto ret = memset_s(output_dx_addr, out_size, 0, out_size); + auto ret = memset_s(output_addr, outputs[0]->size, 0, outputs[0]->size); if (ret != EOK) { - MS_LOG(ERROR) << "output buff memset fail."; + MS_LOG(ERROR) << "output buff memset fail. 
ret:" << ret; return false; } - for (int i = begin_[0]; i < end_[0]; i += strides_[0]) { - for (int j = begin_[1]; j < end_[1]; j += strides_[1]) { - for (int k = begin_[2]; k < end_[2]; k += strides_[2]) { + bool can_copy_memory[3] = {CanCopyMemoryOnAxis(0), CanCopyMemoryOnAxis(1), CanCopyMemoryOnAxis(2)}; + size_t out_start_offset[3] = {begin_[0] * output_element_num_[0], begin_[1] * output_element_num_[1], + begin_[2] * output_element_num_[2]}; + size_t out_step_size[3] = {strides_[0] * output_element_num_[0], strides_[1] * output_element_num_[1], + strides_[2] * output_element_num_[2]}; + + auto in_n_offset = 0; + auto out_n_offset = out_start_offset[0]; + for (int i = begin_[0]; i < end_[0]; + i += strides_[0], in_n_offset += input_element_num_[0], out_n_offset += out_step_size[0]) { + if (can_copy_memory[0]) { + CopyDataToOutput(inputs, in_n_offset, outputs, out_n_offset, input_element_num_[0]); + continue; + } + auto in_c_offset = 0; + auto out_c_offset = out_start_offset[1]; + for (int j = begin_[1]; j < end_[1]; + j += strides_[1], in_c_offset += input_element_num_[1], out_c_offset += out_step_size[1]) { + if (can_copy_memory[1]) { + CopyDataToOutput(inputs, in_n_offset + in_c_offset, outputs, out_n_offset + out_c_offset, + input_element_num_[1]); + continue; + } + auto in_h_offset = 0; + auto out_h_offset = out_start_offset[2]; + for (int k = begin_[2]; k < end_[2]; + k += strides_[2], in_h_offset += input_element_num_[2], out_h_offset += out_step_size[2]) { + if (can_copy_memory[2]) { + CopyDataToOutput(inputs, in_n_offset + in_c_offset + in_h_offset, outputs, + out_n_offset + out_c_offset + out_h_offset, input_element_num_[2]); + continue; + } for (int m = begin_[3]; m < end_[3]; m += strides_[3]) { - auto offset = CPUKernelUtils::CalcOffset(output_dx_shape_, i, j, k, m); - output_dx_addr[offset] = *input_dy_addr++; + output_addr[out_n_offset + out_c_offset + out_h_offset + m] = *input_addr++; } } } @@ -99,7 +134,38 @@ bool 
SliceGradCPUKernel::Launch(const std::vector &inputs, return true; } -void SliceGradCPUKernel::CheckParam(const CNodePtr &kernel_node) { +bool SliceGradCPUKernel::CanCopyMemoryOnAxis(size_t dim) const { + for (size_t i = dim + 1; i < 4; ++i) { + if (begin_[i] != 0 || end_[i] != SizeToInt(output_shape_[i]) || strides_[i] != 1) { + return false; + } + } + return true; +} + +void SliceGradCPUKernel::CopyDataToOutput(const std::vector &inputs, size_t in_offset, + const std::vector &outputs, size_t out_offset, + size_t copy_num) const { + auto input_addr = reinterpret_cast(inputs[0]->addr); + auto in_buff_size = inputs[0]->size; + auto output_addr = reinterpret_cast(outputs[0]->addr); + auto out_buff_size = outputs[0]->size; + + if ((in_offset + copy_num) * sizeof(float) > in_buff_size) { + MS_LOG(EXCEPTION) << "input memory out of bounds."; + } + if ((out_offset + copy_num) * sizeof(float) > out_buff_size) { + MS_LOG(EXCEPTION) << "output memory out of bounds."; + } + + auto ret = memcpy_s(output_addr + out_offset, out_buff_size - out_offset * sizeof(float), input_addr + in_offset, + copy_num * sizeof(float)); + if (ret != EOK) { + MS_LOG(EXCEPTION) << "memcpy failed. 
ret:" << ret; + } +} + +void SliceGradCPUKernel::CheckParam(const CNodePtr &kernel_node) const { size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); if (output_num != 1) { MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but SliceGradGpuKernel needs 1 output."; diff --git a/mindspore/ccsrc/kernel/cpu/slice_grad_cpu_kernel.h b/mindspore/ccsrc/kernel/cpu/slice_grad_cpu_kernel.h index 847208e4bb..1e42c8ac68 100644 --- a/mindspore/ccsrc/kernel/cpu/slice_grad_cpu_kernel.h +++ b/mindspore/ccsrc/kernel/cpu/slice_grad_cpu_kernel.h @@ -33,12 +33,18 @@ class SliceGradCPUKernel : public CPUKernel { const std::vector &outputs) override; private: - void CheckParam(const CNodePtr &kernel_node); + void ExpandAllMemberDims(); + bool CanCopyMemoryOnAxis(size_t dim) const; + void CopyDataToOutput(const std::vector &inputs, size_t in_offset, + const std::vector &outputs, size_t out_offset, size_t copy_num) const; + void CheckParam(const CNodePtr &kernel_node) const; std::vector begin_; std::vector end_; std::vector strides_; - std::vector input_dy_shape_; - std::vector output_dx_shape_; + std::vector input_shape_; + std::vector input_element_num_; + std::vector output_shape_; + std::vector output_element_num_; }; MS_REG_CPU_KERNEL( diff --git a/mindspore/ccsrc/kernel/cpu/sparse_apply_adam_cpu_kernel.cc b/mindspore/ccsrc/kernel/cpu/sparse_apply_adam_cpu_kernel.cc new file mode 100644 index 0000000000..5e2fc79576 --- /dev/null +++ b/mindspore/ccsrc/kernel/cpu/sparse_apply_adam_cpu_kernel.cc @@ -0,0 +1,177 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "kernel/cpu/sparse_apply_adam_cpu_kernel.h" +#include "kernel/common_utils.h" +#include "device/cpu/cpu_device_address.h" + +namespace mindspore { +namespace kernel { +namespace { +constexpr size_t kSparseApplyAdamInputSize = 11; + +void ComputeAdam(MultiThreadComputeParams *input_params, size_t start, size_t end) { + MS_EXCEPTION_IF_NULL(input_params); + auto m = input_params->m_; + auto m_t = input_params->m_t_; + auto v = input_params->v_; + auto beta1 = input_params->beta1_; + auto beta2 = input_params->beta2_; + auto use_nesterov = input_params->use_nesterov_; + auto unique_sparse_grad = input_params->sparse_grad_; + auto var_first_dim_size = input_params->var_first_dim_size_; + auto var_outer_dim_size = input_params->var_outer_dim_size_; + for (size_t i = start; i < end; ++i) { + int index = unique_sparse_grad.indices_[i]; + if (index < 0 || IntToSize(index) >= var_first_dim_size) { + MS_LOG(EXCEPTION) << "Index " << index << " in indices is out of range after unique process"; + } + size_t start_index = var_outer_dim_size * index; + size_t end_index = start_index + var_outer_dim_size; + for (size_t j = start_index, k = var_outer_dim_size * i; j < end_index; ++j, ++k) { + auto summed_grad = unique_sparse_grad.value_[k]; + m[j] += (1 - beta1) * summed_grad; + v[j] += (1 - beta2) * summed_grad * summed_grad; + if (use_nesterov) { + m_t[j] = m[j] * beta1 + (1 - beta1) * summed_grad; + } + } + } +} + +void ComputeMomentum(MultiThreadComputeParams *input_params, size_t start, size_t end) { + MS_EXCEPTION_IF_NULL(input_params); + auto 
m = input_params->m_; + auto v = input_params->v_; + auto beta1 = input_params->beta1_; + auto beta2 = input_params->beta2_; + for (size_t i = start; i < end; ++i) { + m[i] *= beta1; + v[i] *= beta2; + } +} + +void ComputeWeight(MultiThreadComputeParams *input_params, size_t start, size_t end) { + MS_EXCEPTION_IF_NULL(input_params); + auto var = input_params->var_; + auto m = input_params->m_; + auto v = input_params->v_; + auto lr = input_params->lr_; + auto epsilon = input_params->epsilon_; + for (size_t i = start; i < end; ++i) { + var[i] -= lr * m[i] / (std::sqrt(v[i]) + epsilon); + } +} +} // namespace + +void SparseApplyAdamCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) { + CPUKernel::InitInputOutputSize(kernel_node); + MS_EXCEPTION_IF_NULL(kernel_node); + workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float)); + workspace_size_list_.emplace_back(indices_size_ * sizeof(int)); +} + +void SparseApplyAdamCPUKernel::InitKernel(const CNodePtr &kernel_node) { + MS_EXCEPTION_IF_NULL(kernel_node); + std::vector var_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); + std::vector m_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1); + std::vector v_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2); + std::vector grad_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 9); + std::vector indices_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 10); + if (!IsSameShape(var_shape, m_shape)) { + MS_LOG(EXCEPTION) << "var and m should have the same shape"; + } + if (!IsSameShape(var_shape, v_shape)) { + MS_LOG(EXCEPTION) << "var and v should have the same shape"; + } + if (var_shape.empty()) { + MS_LOG(EXCEPTION) << "var must be at least 1D"; + } + var_first_dim_size_ = var_shape[0]; + for (size_t i = 1; i < var_shape.size(); ++i) { + if (var_shape[i] != grad_shape[i]) { + MS_LOG(EXCEPTION) << "The shape of var and grad must equal in dimension " << i; + } + var_outer_dim_size_ *= 
var_shape[i]; + } + if (indices_shape.size() != 1) { + MS_LOG(EXCEPTION) << "indices must be 1D"; + } + indices_size_ = indices_shape[0]; + if (grad_shape[0] != indices_size_) { + MS_LOG(EXCEPTION) << "The first dimension of grad shape must be equal to indices"; + } + if (AnfAlgo::HasNodeAttr(USE_NESTEROV, kernel_node)) { + use_nesterov_ = AnfAlgo::GetNodeAttr(kernel_node, "use_nesterov"); + } +} + +bool SparseApplyAdamCPUKernel::Launch(const std::vector &inputs, + const std::vector &workspace, + const std::vector & /*outputs*/) { + if (inputs.size() < kSparseApplyAdamInputSize) { + MS_LOG(EXCEPTION) << "Error input size!"; + } + + auto var = reinterpret_cast(inputs[0]->addr); + auto m = reinterpret_cast(inputs[1]->addr); + auto v = reinterpret_cast(inputs[2]->addr); + auto beta1_power = reinterpret_cast(inputs[3]->addr)[0]; + if (beta1_power == 1) { + MS_LOG(EXCEPTION) << "The beta1_power should not be 1"; + } + auto beta2_power = reinterpret_cast(inputs[4]->addr)[0]; + auto lr = reinterpret_cast(inputs[5]->addr)[0]; + auto beta1 = reinterpret_cast(inputs[6]->addr)[0]; + auto beta2 = reinterpret_cast(inputs[7]->addr)[0]; + auto epsilon = reinterpret_cast(inputs[8]->addr)[0]; + auto grad = reinterpret_cast(inputs[9]->addr); + auto indices = reinterpret_cast(inputs[10]->addr); + auto new_grad = reinterpret_cast(workspace[0]->addr); + auto new_indices = reinterpret_cast(workspace[1]->addr); + + SparseGradient unique_sparse_grad({new_grad, new_indices, indices_size_}); + ReduceSparseGradient(SparseGradient({grad, indices, indices_size_}), &unique_sparse_grad, var_first_dim_size_, + var_outer_dim_size_); + size_t total_dim_size = var_first_dim_size_ * var_outer_dim_size_; + lr = lr * std::sqrt(1 - beta2_power) / (1 - beta1_power); + + MultiThreadComputeParams input_params; + input_params.m_ = m; + input_params.v_ = v; + input_params.beta1_ = beta1; + input_params.beta2_ = beta2; + const size_t kThreadNum = 16; + MultiThreadCompute(ComputeMomentum, &input_params, 
kThreadNum, total_dim_size); + + std::vector m_t(m, m + total_dim_size); + input_params.m_t_ = m_t.data(); + input_params.use_nesterov_ = use_nesterov_; + input_params.sparse_grad_ = unique_sparse_grad; + input_params.var_first_dim_size_ = var_first_dim_size_; + input_params.var_outer_dim_size_ = var_outer_dim_size_; + MultiThreadCompute(ComputeAdam, &input_params, kThreadNum, unique_sparse_grad.indices_size_); + + if (use_nesterov_) { + input_params.m_ = input_params.m_t_; + } + input_params.var_ = var; + input_params.lr_ = lr; + input_params.epsilon_ = epsilon; + MultiThreadCompute(ComputeWeight, &input_params, kThreadNum, total_dim_size); + return true; +} +} // namespace kernel +} // namespace mindspore diff --git a/mindspore/ccsrc/kernel/cpu/sparse_apply_adam_cpu_kernel.h b/mindspore/ccsrc/kernel/cpu/sparse_apply_adam_cpu_kernel.h new file mode 100644 index 0000000000..c2770d0ebd --- /dev/null +++ b/mindspore/ccsrc/kernel/cpu/sparse_apply_adam_cpu_kernel.h @@ -0,0 +1,63 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_CCSRC_KERNEL_CPU_SPARSE_APPLY_ADAM_CPU_KERNEL_H_ +#define MINDSPORE_CCSRC_KERNEL_CPU_SPARSE_APPLY_ADAM_CPU_KERNEL_H_ + +#include +#include +#include "kernel/cpu/cpu_kernel.h" +#include "kernel/cpu/cpu_kernel_factory.h" + +namespace mindspore { +namespace kernel { +class SparseApplyAdamCPUKernel : public CPUKernel { + public: + SparseApplyAdamCPUKernel() = default; + ~SparseApplyAdamCPUKernel() override = default; + + void InitKernel(const CNodePtr &kernel_node) override; + void InitInputOutputSize(const CNodePtr &kernel_node) override; + bool Launch(const std::vector &inputs, const std::vector &workspace, + const std::vector &outputs) override; + + private: + size_t indices_size_{0}; + size_t var_first_dim_size_{0}; + size_t var_outer_dim_size_{1}; + bool use_nesterov_{false}; +}; + +MS_REG_CPU_KERNEL(SparseApplyAdam, + KernelAttr() + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeInt32) + .AddOutputAttr(kNumberTypeFloat32) + .AddOutputAttr(kNumberTypeFloat32) + .AddOutputAttr(kNumberTypeFloat32), + SparseApplyAdamCPUKernel); +} // namespace kernel +} // namespace mindspore + +#endif // MINDSPORE_CCSRC_KERNEL_CPU_SPARSE_APPLY_ADAM_CPU_KERNEL_H_ diff --git a/mindspore/ccsrc/kernel/cpu/sparse_apply_ftrl_cpu_kernel.cc b/mindspore/ccsrc/kernel/cpu/sparse_apply_ftrl_cpu_kernel.cc new file mode 100644 index 0000000000..005195ea33 --- /dev/null +++ b/mindspore/ccsrc/kernel/cpu/sparse_apply_ftrl_cpu_kernel.cc @@ -0,0 +1,156 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "kernel/cpu/sparse_apply_ftrl_cpu_kernel.h" +#include "kernel/common_utils.h" +#include "device/cpu/cpu_device_address.h" + +namespace mindspore { +namespace kernel { +namespace { +constexpr size_t kSparseApplyFtrlInputSize = 5; + +void ComputeFtrl(MultiThreadComputeParams *input_params, size_t start, size_t end) { + MS_EXCEPTION_IF_NULL(input_params); + auto var = input_params->var_; + auto accum = input_params->accum_; + auto linear = input_params->linear_; + auto lr = input_params->lr_; + auto l1 = input_params->l1_; + auto l2 = input_params->l2_; + auto lr_power = input_params->lr_power_; + auto unique_sparse_grad = input_params->sparse_grad_; + auto var_first_dim_size = input_params->var_first_dim_size_; + auto var_outer_dim_size = input_params->var_outer_dim_size_; + for (size_t i = start; i < end; ++i) { + int index = unique_sparse_grad.indices_[i]; + if (index < 0 || IntToSize(index) >= var_first_dim_size) { + MS_LOG(EXCEPTION) << "Index " << index << " in indices is out of range after unique process"; + } + size_t start_index = var_outer_dim_size * index; + size_t end_index = start_index + var_outer_dim_size; + for (size_t j = start_index, k = var_outer_dim_size * i; j < end_index; ++j, ++k) { + auto summed_grad = unique_sparse_grad.value_[k]; + auto accum_new = accum[j] + summed_grad * summed_grad; + if (lr_power == -0.5) { + linear[j] += summed_grad - (std::sqrt(accum_new) - std::sqrt(accum[j])) / lr * var[j]; + } else { + linear[j] += summed_grad - (std::pow(accum_new, -lr_power) - std::pow(accum[j], -lr_power)) / lr * 
var[j]; + } + auto x = Sign(linear[j]) * l1 - linear[j]; + float y; + if (lr_power == -0.5) { + y = std::sqrt(accum_new) / lr + 2 * l2; + } else { + y = std::pow(accum_new, -lr_power) / lr + 2 * l2; + } + auto pre_shrink = x / y; + var[j] = std::fabs(linear[j]) > l1 ? pre_shrink : 0; + accum[j] = accum_new; + } + } +} +} // namespace + +void SparseApplyFtrlCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) { + CPUKernel::InitInputOutputSize(kernel_node); + MS_EXCEPTION_IF_NULL(kernel_node); + workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float)); + workspace_size_list_.emplace_back(indices_size_ * sizeof(int)); +} + +void SparseApplyFtrlCPUKernel::InitKernel(const CNodePtr &kernel_node) { + MS_EXCEPTION_IF_NULL(kernel_node); + std::vector var_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); + std::vector accum_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1); + std::vector linear_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2); + std::vector grad_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 3); + std::vector indices_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 4); + if (!IsSameShape(var_shape, accum_shape)) { + MS_LOG(EXCEPTION) << "var and accum should have the same shape"; + } + if (!IsSameShape(var_shape, linear_shape)) { + MS_LOG(EXCEPTION) << "var and linear should have the same shape"; + } + if (var_shape.empty()) { + MS_LOG(EXCEPTION) << "var must be at least 1D"; + } + var_first_dim_size_ = var_shape[0]; + for (size_t i = 1; i < var_shape.size(); ++i) { + if (var_shape[i] != grad_shape[i]) { + MS_LOG(EXCEPTION) << "The shape of var and grad must equal in dimension " << i; + } + var_outer_dim_size_ *= var_shape[i]; + } + if (indices_shape.size() != 1) { + MS_LOG(EXCEPTION) << "indices must be a 1D vector"; + } + indices_size_ = indices_shape[0]; + if (grad_shape[0] != indices_size_) { + MS_LOG(EXCEPTION) << "The first dimension of grad shape must be 
equal to indices"; + } + lr_ = AnfAlgo::GetNodeAttr(kernel_node, "lr"); + if (lr_ <= 0) { + MS_LOG(EXCEPTION) << "lr should be a positive scalar"; + } + l1_ = AnfAlgo::GetNodeAttr(kernel_node, "l1"); + if (l1_ < 0) { + MS_LOG(EXCEPTION) << "l1 should be a non-negative scalar"; + } + l2_ = AnfAlgo::GetNodeAttr(kernel_node, "l2"); + if (l2_ < 0) { + MS_LOG(EXCEPTION) << "l2 should be a non-negative scalar"; + } + lr_power_ = AnfAlgo::GetNodeAttr(kernel_node, "lr_power"); + if (lr_power_ > 0) { + MS_LOG(EXCEPTION) << "lr_power should be a non-positive scalar"; + } +} + +bool SparseApplyFtrlCPUKernel::Launch(const std::vector &inputs, + const std::vector &workspace, + const std::vector & /*outputs*/) { + if (inputs.size() < kSparseApplyFtrlInputSize) { + MS_LOG(EXCEPTION) << "error input output size!"; + } + + auto var = reinterpret_cast(inputs[0]->addr); + auto accum = reinterpret_cast(inputs[1]->addr); + auto linear = reinterpret_cast(inputs[2]->addr); + auto grad = reinterpret_cast(inputs[3]->addr); + auto indices = reinterpret_cast(inputs[4]->addr); + auto new_grad = reinterpret_cast(workspace[0]->addr); + auto new_indices = reinterpret_cast(workspace[1]->addr); + SparseGradient unique_sparse_grad({new_grad, new_indices, indices_size_}); + ReduceSparseGradient(SparseGradient({grad, indices, indices_size_}), &unique_sparse_grad, var_first_dim_size_, + var_outer_dim_size_); + + MultiThreadComputeParams input_params; + input_params.var_ = var; + input_params.accum_ = accum; + input_params.linear_ = linear; + input_params.lr_ = lr_; + input_params.l1_ = l1_; + input_params.l2_ = l2_; + input_params.lr_power_ = lr_power_; + input_params.sparse_grad_ = unique_sparse_grad; + input_params.var_first_dim_size_ = var_first_dim_size_; + input_params.var_outer_dim_size_ = var_outer_dim_size_; + const size_t kThreadNum = 16; + MultiThreadCompute(ComputeFtrl, &input_params, kThreadNum, unique_sparse_grad.indices_size_); + return true; +} +} // namespace kernel +} // namespace 
mindspore diff --git a/mindspore/ccsrc/kernel/cpu/sparse_apply_ftrl_cpu_kernel.h b/mindspore/ccsrc/kernel/cpu/sparse_apply_ftrl_cpu_kernel.h new file mode 100644 index 0000000000..b4e5a48109 --- /dev/null +++ b/mindspore/ccsrc/kernel/cpu/sparse_apply_ftrl_cpu_kernel.h @@ -0,0 +1,59 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_CCSRC_KERNEL_CPU_SPARSE_APPLY_FTRL_CPU_KERNEL_H_ +#define MINDSPORE_CCSRC_KERNEL_CPU_SPARSE_APPLY_FTRL_CPU_KERNEL_H_ + +#include +#include "kernel/cpu/cpu_kernel.h" +#include "kernel/cpu/cpu_kernel_factory.h" + +namespace mindspore { +namespace kernel { +class SparseApplyFtrlCPUKernel : public CPUKernel { + public: + SparseApplyFtrlCPUKernel() = default; + ~SparseApplyFtrlCPUKernel() override = default; + + void InitKernel(const CNodePtr &kernel_node) override; + void InitInputOutputSize(const CNodePtr &kernel_node) override; + bool Launch(const std::vector &inputs, const std::vector &workspace, + const std::vector &outputs) override; + + private: + size_t indices_size_{0}; + size_t var_first_dim_size_{0}; + size_t var_outer_dim_size_{1}; + float lr_{0}; + float l1_{0}; + float l2_{0}; + float lr_power_{0}; +}; + +MS_REG_CPU_KERNEL(SparseApplyFtrl, + KernelAttr() + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeInt32) + 
.AddOutputAttr(kNumberTypeFloat32) + .AddOutputAttr(kNumberTypeFloat32) + .AddOutputAttr(kNumberTypeFloat32), + SparseApplyFtrlCPUKernel); +} // namespace kernel +} // namespace mindspore + +#endif // MINDSPORE_CCSRC_KERNEL_CPU_SPARSE_APPLY_FTRL_CPU_KERNEL_H_ diff --git a/mindspore/ccsrc/kernel/cpu/sparse_apply_lazy_adam_cpu_kernel.cc b/mindspore/ccsrc/kernel/cpu/sparse_apply_lazy_adam_cpu_kernel.cc new file mode 100644 index 0000000000..2460dc0f27 --- /dev/null +++ b/mindspore/ccsrc/kernel/cpu/sparse_apply_lazy_adam_cpu_kernel.cc @@ -0,0 +1,147 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "kernel/cpu/sparse_apply_lazy_adam_cpu_kernel.h" +#include "kernel/common_utils.h" +#include "device/cpu/cpu_device_address.h" + +namespace mindspore { +namespace kernel { +namespace { +constexpr size_t kSparseApplyLazyAdamInputSize = 11; + +void ComputeLazyAdam(MultiThreadComputeParams *input_params, size_t start, size_t end) { + MS_EXCEPTION_IF_NULL(input_params); + auto var = input_params->var_; + auto m = input_params->m_; + auto v = input_params->v_; + auto lr = input_params->lr_; + auto beta1 = input_params->beta1_; + auto beta2 = input_params->beta2_; + auto epsilon = input_params->epsilon_; + auto use_nesterov = input_params->use_nesterov_; + auto unique_sparse_grad = input_params->sparse_grad_; + auto var_first_dim_size = input_params->var_first_dim_size_; + auto var_outer_dim_size = input_params->var_outer_dim_size_; + for (size_t i = start; i < end; ++i) { + int index = unique_sparse_grad.indices_[i]; + if (index < 0 || IntToSize(index) >= var_first_dim_size) { + MS_LOG(EXCEPTION) << "Index " << index << " in indices is out of range"; + } + size_t start_index = var_outer_dim_size * index; + size_t end_index = start_index + var_outer_dim_size; + for (size_t j = start_index, k = var_outer_dim_size * i; j < end_index; ++j, ++k) { + auto summed_grad = unique_sparse_grad.value_[k]; + m[j] = beta1 * m[j] + (1 - beta1) * summed_grad; + v[j] = beta2 * v[j] + (1 - beta2) * summed_grad * summed_grad; + if (use_nesterov) { + var[j] -= lr * (m[j] * beta1 + (1 - beta1) * summed_grad) / (std::sqrt(v[j]) + epsilon); + } else { + var[j] -= lr * m[j] / (std::sqrt(v[j]) + epsilon); + } + } + } +} +} // namespace + +void SparseApplyLazyAdamCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) { + CPUKernel::InitInputOutputSize(kernel_node); + MS_EXCEPTION_IF_NULL(kernel_node); + workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float)); + workspace_size_list_.emplace_back(indices_size_ * sizeof(int)); +} + +void 
SparseApplyLazyAdamCPUKernel::InitKernel(const CNodePtr &kernel_node) { + MS_EXCEPTION_IF_NULL(kernel_node); + std::vector var_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); + std::vector m_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1); + std::vector v_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2); + std::vector grad_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 9); + std::vector indices_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 10); + if (!IsSameShape(var_shape, m_shape)) { + MS_LOG(EXCEPTION) << "var and m should have the same shape"; + } + if (!IsSameShape(var_shape, v_shape)) { + MS_LOG(EXCEPTION) << "var and v should have the same shape"; + } + if (var_shape.empty()) { + MS_LOG(EXCEPTION) << "var must be at least 1D"; + } + var_first_dim_size_ = var_shape[0]; + for (size_t i = 1; i < var_shape.size(); ++i) { + if (var_shape[i] != grad_shape[i]) { + MS_LOG(EXCEPTION) << "The shape of var and grad must equal in dimension " << i; + } + var_outer_dim_size_ *= var_shape[i]; + } + if (indices_shape.size() != 1) { + MS_LOG(EXCEPTION) << "indices must be 1D"; + } + indices_size_ = indices_shape[0]; + if (grad_shape[0] != indices_size_) { + MS_LOG(EXCEPTION) << "The first dimension of grad shape must be equal to indices"; + } + if (AnfAlgo::HasNodeAttr(USE_NESTEROV, kernel_node)) { + use_nesterov_ = AnfAlgo::GetNodeAttr(kernel_node, "use_nesterov"); + } +} + +bool SparseApplyLazyAdamCPUKernel::Launch(const std::vector &inputs, + const std::vector &workspace, + const std::vector & /*outputs*/) { + if (inputs.size() < kSparseApplyLazyAdamInputSize) { + MS_LOG(EXCEPTION) << "Error input size!"; + } + + auto var = reinterpret_cast(inputs[0]->addr); + auto m = reinterpret_cast(inputs[1]->addr); + auto v = reinterpret_cast(inputs[2]->addr); + auto beta1_power = reinterpret_cast(inputs[3]->addr)[0]; + if (beta1_power == 1) { + MS_LOG(EXCEPTION) << "The beta1_power should not be 1"; + } + auto beta2_power 
= reinterpret_cast(inputs[4]->addr)[0]; + auto lr = reinterpret_cast(inputs[5]->addr)[0]; + auto beta1 = reinterpret_cast(inputs[6]->addr)[0]; + auto beta2 = reinterpret_cast(inputs[7]->addr)[0]; + auto epsilon = reinterpret_cast(inputs[8]->addr)[0]; + auto grad = reinterpret_cast(inputs[9]->addr); + auto indices = reinterpret_cast(inputs[10]->addr); + auto new_grad = reinterpret_cast(workspace[0]->addr); + auto new_indices = reinterpret_cast(workspace[1]->addr); + + SparseGradient unique_sparse_grad({new_grad, new_indices, indices_size_}); + ReduceSparseGradient(SparseGradient({grad, indices, indices_size_}), &unique_sparse_grad, var_first_dim_size_, + var_outer_dim_size_); + + lr = lr * std::sqrt(1 - beta2_power) / (1 - beta1_power); + MultiThreadComputeParams input_params; + input_params.var_ = var; + input_params.m_ = m; + input_params.v_ = v; + input_params.lr_ = lr; + input_params.beta1_ = beta1; + input_params.beta2_ = beta2; + input_params.epsilon_ = epsilon; + input_params.use_nesterov_ = use_nesterov_; + input_params.sparse_grad_ = unique_sparse_grad; + input_params.var_first_dim_size_ = var_first_dim_size_; + input_params.var_outer_dim_size_ = var_outer_dim_size_; + const size_t kThreadNum = 16; + MultiThreadCompute(ComputeLazyAdam, &input_params, kThreadNum, unique_sparse_grad.indices_size_); + return true; +} +} // namespace kernel +} // namespace mindspore diff --git a/mindspore/ccsrc/kernel/cpu/sparse_apply_lazy_adam_cpu_kernel.h b/mindspore/ccsrc/kernel/cpu/sparse_apply_lazy_adam_cpu_kernel.h new file mode 100644 index 0000000000..795568a64d --- /dev/null +++ b/mindspore/ccsrc/kernel/cpu/sparse_apply_lazy_adam_cpu_kernel.h @@ -0,0 +1,63 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_CCSRC_KERNEL_CPU_SPARSE_APPLY_LAZY_ADAM_CPU_KERNEL_H_ +#define MINDSPORE_CCSRC_KERNEL_CPU_SPARSE_APPLY_LAZY_ADAM_CPU_KERNEL_H_ + +#include +#include +#include "kernel/cpu/cpu_kernel.h" +#include "kernel/cpu/cpu_kernel_factory.h" + +namespace mindspore { +namespace kernel { +class SparseApplyLazyAdamCPUKernel : public CPUKernel { + public: + SparseApplyLazyAdamCPUKernel() = default; + ~SparseApplyLazyAdamCPUKernel() override = default; + + void InitKernel(const CNodePtr &kernel_node) override; + void InitInputOutputSize(const CNodePtr &kernel_node) override; + bool Launch(const std::vector &inputs, const std::vector &workspace, + const std::vector &outputs) override; + + private: + size_t indices_size_{0}; + size_t var_first_dim_size_{0}; + size_t var_outer_dim_size_{1}; + bool use_nesterov_{false}; +}; + +MS_REG_CPU_KERNEL(SparseApplyLazyAdam, + KernelAttr() + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeInt32) + .AddOutputAttr(kNumberTypeFloat32) + .AddOutputAttr(kNumberTypeFloat32) + .AddOutputAttr(kNumberTypeFloat32), + SparseApplyLazyAdamCPUKernel); +} // namespace kernel +} // namespace mindspore + +#endif // MINDSPORE_CCSRC_KERNEL_CPU_SPARSE_APPLY_LAZY_ADAM_CPU_KERNEL_H_ diff --git 
a/mindspore/ccsrc/kernel/cpu/sparse_apply_proximal_adagrad_cpu_kernel.cc b/mindspore/ccsrc/kernel/cpu/sparse_apply_proximal_adagrad_cpu_kernel.cc new file mode 100644 index 0000000000..64cb65764f --- /dev/null +++ b/mindspore/ccsrc/kernel/cpu/sparse_apply_proximal_adagrad_cpu_kernel.cc @@ -0,0 +1,140 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "kernel/cpu/sparse_apply_proximal_adagrad_cpu_kernel.h" +#include "kernel/common_utils.h" +#include "device/cpu/cpu_device_address.h" + +namespace mindspore { +namespace kernel { +namespace { +constexpr size_t kSparseApplyProximalAdagradInputSize = 7; + +void ComputeProximalAdagrad(MultiThreadComputeParams *input_params, size_t start, size_t end) { + MS_EXCEPTION_IF_NULL(input_params); + auto var = input_params->var_; + auto accum = input_params->accum_; + auto lr = input_params->lr_; + auto l1 = input_params->l1_; + auto l2 = input_params->l2_; + auto unique_sparse_grad = input_params->sparse_grad_; + auto var_first_dim_size = input_params->var_first_dim_size_; + auto var_outer_dim_size = input_params->var_outer_dim_size_; + for (size_t i = start; i < end; ++i) { + int index = unique_sparse_grad.indices_[i]; + if (index < 0 || IntToSize(index) >= var_first_dim_size) { + MS_LOG(EXCEPTION) << "Index " << index << " in indices is out of range after unique process"; + } + size_t start_index = var_outer_dim_size * index; + size_t end_index = 
start_index + var_outer_dim_size; + for (size_t j = start_index, k = var_outer_dim_size * i; j < end_index; ++j, ++k) { + auto summed_grad = unique_sparse_grad.value_[k]; + accum[j] += summed_grad * summed_grad; + auto learning_rate = lr * (1 / std::sqrt(accum[j])); + auto prox_v = var[j]; + prox_v -= summed_grad * learning_rate; + if (l1 > 0) { + var[j] = Sign(prox_v) * std::fmax(std::fabs(prox_v) - learning_rate * l1, static_cast(0.0)) / + (1 + l2 * learning_rate); + } else { + var[j] = prox_v / (1 + l2 * learning_rate); + } + } + } +} +} // namespace + +void SparseApplyProximalAdagradCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) { + CPUKernel::InitInputOutputSize(kernel_node); + MS_EXCEPTION_IF_NULL(kernel_node); + workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float)); + workspace_size_list_.emplace_back(indices_size_ * sizeof(int)); +} + +void SparseApplyProximalAdagradCPUKernel::InitKernel(const CNodePtr &kernel_node) { + MS_EXCEPTION_IF_NULL(kernel_node); + std::vector var_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); + std::vector accum_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1); + std::vector lr_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2); + std::vector l1_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 3); + std::vector l2_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 4); + std::vector grad_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 5); + std::vector indices_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 6); + if (!IsSameShape(var_shape, accum_shape)) { + MS_LOG(EXCEPTION) << "var and accum should have the same shape"; + } + if (var_shape.empty()) { + MS_LOG(EXCEPTION) << "var must be at least 1D"; + } + var_first_dim_size_ = var_shape[0]; + for (size_t i = 1; i < var_shape.size(); ++i) { + if (var_shape[i] != grad_shape[i]) { + MS_LOG(EXCEPTION) << "The shape of var and grad must equal in dimension " << i; + } + 
var_outer_dim_size_ *= var_shape[i]; + } + if (indices_shape.size() != 1) { + MS_LOG(EXCEPTION) << "indices must be a 1D vector"; + } + indices_size_ = indices_shape[0]; + if (grad_shape[0] != indices_size_) { + MS_LOG(EXCEPTION) << "The first dimension of grad shape must be equal to indices"; + } + if (!lr_shape.empty()) { + MS_LOG(EXCEPTION) << "lr is not a scalar"; + } + if (!l1_shape.empty()) { + MS_LOG(EXCEPTION) << "l1 is not a scalar"; + } + if (!l2_shape.empty()) { + MS_LOG(EXCEPTION) << "l2 is not a scalar"; + } +} + +bool SparseApplyProximalAdagradCPUKernel::Launch(const std::vector &inputs, + const std::vector &workspace, + const std::vector & /*outputs*/) { + if (inputs.size() < kSparseApplyProximalAdagradInputSize) { + MS_LOG(EXCEPTION) << "Wrong input size!"; + } + + auto var = reinterpret_cast(inputs[0]->addr); + auto accum = reinterpret_cast(inputs[1]->addr); + auto lr = reinterpret_cast(inputs[2]->addr)[0]; + auto l1 = reinterpret_cast(inputs[3]->addr)[0]; + auto l2 = reinterpret_cast(inputs[4]->addr)[0]; + auto grad = reinterpret_cast(inputs[5]->addr); + auto indices = reinterpret_cast(inputs[6]->addr); + auto new_grad = reinterpret_cast(workspace[0]->addr); + auto new_indices = reinterpret_cast(workspace[1]->addr); + SparseGradient unique_sparse_grad({new_grad, new_indices, indices_size_}); + ReduceSparseGradient(SparseGradient({grad, indices, indices_size_}), &unique_sparse_grad, var_first_dim_size_, + var_outer_dim_size_); + + MultiThreadComputeParams input_params; + input_params.var_ = var; + input_params.accum_ = accum; + input_params.lr_ = lr; + input_params.l1_ = l1; + input_params.l2_ = l2; + input_params.sparse_grad_ = unique_sparse_grad; + input_params.var_first_dim_size_ = var_first_dim_size_; + input_params.var_outer_dim_size_ = var_outer_dim_size_; + const size_t kThreadNum = 16; + MultiThreadCompute(ComputeProximalAdagrad, &input_params, kThreadNum, unique_sparse_grad.indices_size_); + return true; +} +} // namespace kernel +} // 
namespace mindspore diff --git a/mindspore/ccsrc/kernel/cpu/sparse_apply_proximal_adagrad_cpu_kernel.h b/mindspore/ccsrc/kernel/cpu/sparse_apply_proximal_adagrad_cpu_kernel.h new file mode 100644 index 0000000000..082809a9c2 --- /dev/null +++ b/mindspore/ccsrc/kernel/cpu/sparse_apply_proximal_adagrad_cpu_kernel.h @@ -0,0 +1,56 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_CCSRC_KERNEL_CPU_SPARSE_APPLY_PROXIMAL_ADAGRAD_CPU_KERNEL_H_ +#define MINDSPORE_CCSRC_KERNEL_CPU_SPARSE_APPLY_PROXIMAL_ADAGRAD_CPU_KERNEL_H_ + +#include +#include +#include "kernel/cpu/cpu_kernel.h" +#include "kernel/cpu/cpu_kernel_factory.h" + +namespace mindspore { +namespace kernel { +class SparseApplyProximalAdagradCPUKernel : public CPUKernel { + public: + SparseApplyProximalAdagradCPUKernel() = default; + ~SparseApplyProximalAdagradCPUKernel() override = default; + + void InitKernel(const CNodePtr &kernel_node) override; + void InitInputOutputSize(const CNodePtr &kernel_node) override; + bool Launch(const std::vector &inputs, const std::vector &workspace, + const std::vector &outputs) override; + + private: + size_t indices_size_{0}; + size_t var_first_dim_size_{0}; + size_t var_outer_dim_size_{1}; +}; + +MS_REG_CPU_KERNEL(SparseApplyProximalAdagrad, + KernelAttr() + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + 
.AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeInt32) + .AddOutputAttr(kNumberTypeFloat32), + SparseApplyProximalAdagradCPUKernel); +} // namespace kernel +} // namespace mindspore + +#endif // MINDSPORE_CCSRC_KERNEL_CPU_SPARSE_APPLY_PROXIMAL_ADAGRAD_CPU_KERNEL_H_ diff --git a/mindspore/ccsrc/kernel/cpu/sub_cpu_kernel.cc b/mindspore/ccsrc/kernel/cpu/sub_cpu_kernel.cc new file mode 100644 index 0000000000..543f0e5cdd --- /dev/null +++ b/mindspore/ccsrc/kernel/cpu/sub_cpu_kernel.cc @@ -0,0 +1,89 @@ +/** + * Copyright 2019 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include "kernel/cpu/sub_cpu_kernel.h" +#include "device/cpu/cpu_device_address.h" + +namespace mindspore { +namespace kernel { +void SubCPUKernel::InitKernel(const CNodePtr &kernel_node) { + auto shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1); + if (shape.size() == 1) { + if (shape[0] != 1) { + MS_LOG(EXCEPTION) << "input 1 only support scalar"; + } + } else { + MS_LOG(EXCEPTION) << "input 1 only support scalar"; + } +} + +void sub_task(const int *in_addr, int *out_addr, size_t lens, int offset) { + for (size_t i = 0; i < lens; i++) { + out_addr[i] = in_addr[i] - offset; + } +} + +bool SubCPUKernel::Launch(const std::vector &inputs, + const std::vector & /*workspace*/, + const std::vector &outputs) { +#if defined(_WIN32) || defined(_WIN64) + auto start_time = std::chrono::steady_clock::now(); +#else + struct timeval start_time, end_time; + (void)gettimeofday(&start_time, nullptr); +#endif + auto input_addr = reinterpret_cast(inputs[0]->addr); + auto output_addr = reinterpret_cast(outputs[0]->addr); + offset_ = *reinterpret_cast(inputs[1]->addr); + MS_LOG(INFO) << "offset: " << offset_; + auto lens = inputs[0]->size / sizeof(int); + if (lens < 10000) { + for (size_t i = 0; i < lens; i++) { + output_addr[i] = input_addr[i] - offset_; + } + } else { + const size_t thread_num = 4; + std::thread threads[4]; + size_t process_lens = (lens + thread_num - 1) / thread_num; + size_t process_offset = 0; + for (size_t i = 0; i < thread_num; i++) { + threads[i] = + std::thread(sub_task, input_addr + process_offset, output_addr + process_offset, process_lens, offset_); + if (process_offset + process_lens > lens) { + process_lens = lens - process_offset; + process_offset = lens; + } else { + process_offset += process_lens; + } + } + for (size_t i = 0; i < thread_num; i++) { + threads[i].join(); + } + } +#if defined(_WIN32) || defined(_WIN64) + auto end_time = std::chrono::steady_clock::now(); + std::chrono::duration> cost = end_time - start_time; + 
MS_LOG(INFO) << "SubscaleCPUKernel, used time: " << cost.count() << " us"; +#else + (void)gettimeofday(&end_time, nullptr); + uint64_t time = 1000000 * static_cast(end_time.tv_sec - start_time.tv_sec); + time += static_cast(end_time.tv_usec - start_time.tv_usec); + MS_LOG(INFO) << "SubCPUKernel, used time: " << time << " us"; +#endif + return true; +} +} // namespace kernel +} // namespace mindspore diff --git a/mindspore/ccsrc/kernel/cpu/sub_cpu_kernel.h b/mindspore/ccsrc/kernel/cpu/sub_cpu_kernel.h new file mode 100644 index 0000000000..54b2c8951a --- /dev/null +++ b/mindspore/ccsrc/kernel/cpu/sub_cpu_kernel.h @@ -0,0 +1,45 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_CCSRC_KERNEL_CPU_SUB_CPU_KERNEL_H_ +#define MINDSPORE_CCSRC_KERNEL_CPU_SUB_CPU_KERNEL_H_ +#include +#include +#include "kernel/cpu/cpu_kernel.h" +#include "kernel/cpu/cpu_kernel_factory.h" + +namespace mindspore { +namespace kernel { +class SubCPUKernel : public CPUKernel { + public: + SubCPUKernel() : offset_(0) {} + ~SubCPUKernel() override = default; + + void InitKernel(const CNodePtr &kernel_node) override; + + bool Launch(const std::vector &inputs, const std::vector &workspace, + const std::vector &outputs) override; + + private: + int offset_; +}; + +MS_REG_CPU_KERNEL( + Sub, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32), + SubCPUKernel); +} // namespace kernel +} // namespace mindspore + +#endif // MINDSPORE_CCSRC_KERNEL_CPU_SUB_CPU_KERNEL_H_ diff --git a/mindspore/ccsrc/kernel/gpu/nn/relu_grad_kernel.cc b/mindspore/ccsrc/kernel/gpu/arrays/argmaxwithvalue_gpu_kernel.cc similarity index 56% rename from mindspore/ccsrc/kernel/gpu/nn/relu_grad_kernel.cc rename to mindspore/ccsrc/kernel/gpu/arrays/argmaxwithvalue_gpu_kernel.cc index 848007320f..24c8a9a730 100644 --- a/mindspore/ccsrc/kernel/gpu/nn/relu_grad_kernel.cc +++ b/mindspore/ccsrc/kernel/gpu/arrays/argmaxwithvalue_gpu_kernel.cc @@ -1,5 +1,5 @@ /** - * Copyright 2019 Huawei Technologies Co., Ltd + * Copyright 2020 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,17 +14,17 @@ * limitations under the License. 
*/ -#include "kernel/gpu/nn/relu_grad_kernel.h" +#include "kernel/gpu/arrays/argmaxwithvalue_gpu_kernel.h" namespace mindspore { namespace kernel { -MS_REG_GPU_KERNEL_ONE( - ReluGrad, - KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), - ReluGradGpuFwdKernel, float) -MS_REG_GPU_KERNEL_ONE( - ReluGrad, - KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16), - ReluGradGpuFwdKernel, half) +MS_REG_GPU_KERNEL_TWO( + ArgMaxWithValue, + KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat32), + ArgmaxWithValueGpuKernel, float, int) +MS_REG_GPU_KERNEL_TWO( + ArgMaxWithValue, + KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat16), + ArgmaxWithValueGpuKernel, half, int) } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/kernel/gpu/nn/tanh_grad_kernel.h b/mindspore/ccsrc/kernel/gpu/arrays/argmaxwithvalue_gpu_kernel.h similarity index 50% rename from mindspore/ccsrc/kernel/gpu/nn/tanh_grad_kernel.h rename to mindspore/ccsrc/kernel/gpu/arrays/argmaxwithvalue_gpu_kernel.h index b5b52d0acf..fb7796b022 100644 --- a/mindspore/ccsrc/kernel/gpu/nn/tanh_grad_kernel.h +++ b/mindspore/ccsrc/kernel/gpu/arrays/argmaxwithvalue_gpu_kernel.h @@ -14,23 +14,20 @@ * limitations under the License. 
*/ -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_NN_TANH_GRAD_KERNEL_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_NN_TANH_GRAD_KERNEL_H_ +#ifndef MINDSPORE_CCSRC_KERNEL_GPU_ARGMAXWITHVALUEGPUKERNEL_H_ +#define MINDSPORE_CCSRC_KERNEL_GPU_ARGMAXWITHVALUEGPUKERNEL_H_ -#include #include -#include #include "kernel/gpu/gpu_kernel.h" #include "kernel/gpu/gpu_kernel_factory.h" -#include "kernel/gpu/cuda_impl/tanh_impl.cuh" - +#include "kernel/gpu/cuda_impl/argmaxwithvalue_impl.cuh" namespace mindspore { namespace kernel { -template -class TanhGradKernel : public GpuKernel { +template +class ArgmaxWithValueGpuKernel : public GpuKernel { public: - TanhGradKernel() : input_size_(0) {} - ~TanhGradKernel() override = default; + ArgmaxWithValueGpuKernel() : input_size_(0), output_size_(0), bound_(0), outerSize_(0), innerSize_(0) {} + ~ArgmaxWithValueGpuKernel() override = default; const std::vector &GetInputSizeList() const override { return input_size_list_; } const std::vector &GetOutputSizeList() const override { return output_size_list_; } @@ -38,21 +35,40 @@ class TanhGradKernel : public GpuKernel { bool Launch(const std::vector &inputs, const std::vector &, const std::vector &outputs, void *stream_ptr) override { - auto y_addr = GetDeviceAddress(inputs, 0); - auto dy_addr = GetDeviceAddress(inputs, 1); - auto dx_addr = GetDeviceAddress(outputs, 0); - - TanhGrad(input_size_ / sizeof(T), y_addr, dy_addr, dx_addr, reinterpret_cast(stream_ptr)); + T *input = GetDeviceAddress(inputs, 0); + T *output = GetDeviceAddress(outputs, 1); + S *index = GetDeviceAddress(outputs, 0); + CalArgmaxWithValue(input_size_ / sizeof(T), input, bound_, outerSize_, innerSize_, index, output, + reinterpret_cast(stream_ptr)); return true; } - bool Init(const CNodePtr &kernel_node) override { - auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); + bool Init(const CNodePtr &kernel_node) override { + std::vector shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); + auto output_shape = 
AnfAlgo::GetOutputInferShape(kernel_node, 1); + int dims = shape.size(); + int axis = GetAttr(kernel_node, "axis"); + if (axis < 0) { + axis += dims; + } input_size_ = sizeof(T); - for (auto dim : input_shape) { - input_size_ *= dim; + for (auto x : shape) { + input_size_ *= x; + } + output_size_ = sizeof(S); + for (auto x : output_shape) { + output_size_ *= x; + } + bound_ = shape[axis]; + outerSize_ = 1; + for (int i = axis - 1; i >= 0; i--) { + outerSize_ *= shape[i]; } + innerSize_ = 1; + for (int i = axis + 1; i < dims; i++) { + innerSize_ *= shape[i]; + } InitSizeLists(); return true; } @@ -60,17 +76,21 @@ class TanhGradKernel : public GpuKernel { protected: void InitSizeLists() override { input_size_list_.push_back(input_size_); - input_size_list_.push_back(input_size_); - output_size_list_.push_back(input_size_); + output_size_list_.push_back(output_size_); + output_size_list_.push_back(output_size_ / sizeof(S) * sizeof(T)); } private: + size_t input_size_; + size_t output_size_; std::vector input_size_list_; std::vector output_size_list_; std::vector workspace_size_list_; - size_t input_size_; + int bound_; + int outerSize_; + int innerSize_; }; } // namespace kernel } // namespace mindspore -#endif // MINDSPORE_CCSRC_KERNEL_GPU_NN_TANH_GRAD_KERNEL_H_ +#endif // MINDSPORE_CCSRC_KERNEL_GPU_ARGMAXWITHVALUEGPUKERNEL_H_ diff --git a/mindspore/ccsrc/kernel/gpu/arrays/array_reduce_gpu_kernel.h b/mindspore/ccsrc/kernel/gpu/arrays/array_reduce_gpu_kernel.h index 224a3da8ad..e1f995d648 100644 --- a/mindspore/ccsrc/kernel/gpu/arrays/array_reduce_gpu_kernel.h +++ b/mindspore/ccsrc/kernel/gpu/arrays/array_reduce_gpu_kernel.h @@ -81,7 +81,7 @@ class ArrayReduceGpuKernel : public GpuKernel { } bool Init(const CNodePtr &kernel_node) override { InitResource(); - data_type_ = kCudnnDtypeMap[TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))]; + data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))); size_t input_num = 
AnfAlgo::GetInputTensorNum(kernel_node); if (input_num != 1) { MS_LOG(ERROR) << "Input number is " << input_num << ", but reduce op needs 1 inputs."; diff --git a/mindspore/ccsrc/kernel/gpu/arrays/slice_gpu_kernel.h b/mindspore/ccsrc/kernel/gpu/arrays/slice_gpu_kernel.h index 81910b5091..7f71e548ad 100644 --- a/mindspore/ccsrc/kernel/gpu/arrays/slice_gpu_kernel.h +++ b/mindspore/ccsrc/kernel/gpu/arrays/slice_gpu_kernel.h @@ -27,7 +27,8 @@ namespace kernel { template class SliceGpuFwdKernel : public GpuKernel { public: - SliceGpuFwdKernel() : is_strided_slice_(false), input_size_(0), output_size_(0), workspace_size_(0) {} + SliceGpuFwdKernel() + : is_strided_slice_(false), is_null_input_(false), input_size_(0), output_size_(0), workspace_size_(0) {} ~SliceGpuFwdKernel() override = default; const std::vector &GetInputSizeList() const override { return input_size_list_; } const std::vector &GetOutputSizeList() const override { return output_size_list_; } @@ -35,6 +36,9 @@ class SliceGpuFwdKernel : public GpuKernel { bool Launch(const std::vector &inputs, const std::vector &, const std::vector &outputs, void *stream_ptr) override { + if (is_null_input_) { + return true; + } T *input = GetDeviceAddress(inputs, 0); T *output = GetDeviceAddress(outputs, 0); if (is_strided_slice_) { @@ -79,7 +83,11 @@ class SliceGpuFwdKernel : public GpuKernel { if (size_[i] < 0) { size_[i] = (size_[i] + input_shape_[i]) > 0 ? 
(size_[i] + input_shape_[i]) : 0; } - if (size_[i] == 0) { + if (begin_[i] == size_[i] && is_strided_slice_) { + MS_LOG(WARNING) << "Output is null."; + is_null_input_ = true; + } + if (size_[i] == 0 && strides_[i] > 0) { size_[i] = begin_[i] + 1; } } @@ -143,6 +151,7 @@ class SliceGpuFwdKernel : public GpuKernel { std::vector workspace_size_list_; bool is_strided_slice_; + bool is_null_input_; size_t input_size_; size_t output_size_; size_t workspace_size_; diff --git a/mindspore/ccsrc/kernel/gpu/cuda_impl/adam_impl.cu b/mindspore/ccsrc/kernel/gpu/cuda_impl/adam_impl.cu new file mode 100644 index 0000000000..3ec63ee03a --- /dev/null +++ b/mindspore/ccsrc/kernel/gpu/cuda_impl/adam_impl.cu @@ -0,0 +1,56 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "kernel/gpu/cuda_impl/adam_impl.cuh" + +template +__device__ __forceinline__ T SqrtFunc(T input) { + return sqrt(input); +} + +template <> +__device__ __forceinline__ half SqrtFunc(half input) { + return hsqrt(input); +} + +template +__global__ void ApplyAdamKernel(const size_t size, const T *gradient, const T *beta1_power, const T *beta2_power, + const T *learning_rate, const T *beta1, const T *beta2, const T *epsilon, T *variable, + T *m, T *v) { + const T one = static_cast(1.0); + const T new_learning_rate = learning_rate[0] * SqrtFunc(one - beta2_power[0]) / (one - beta1_power[0]); + + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x) { + m[i] += (gradient[i] - m[i]) * (one - beta1[0]); + v[i] += (gradient[i] * gradient[i] - v[i]) * (one - beta2[0]); + variable[i] -= new_learning_rate * m[i] / (SqrtFunc(v[i]) + epsilon[0]); + } +} + +template +void ApplyAdam(const size_t size, const T *gradient, const T *beta1_power, const T *beta2_power, const T *learning_rate, + const T *beta1, const T *beta2, const T *epsilon, T *variable, T *m, T *v, cudaStream_t cuda_stream) { + ApplyAdamKernel<<>>( + size, gradient, beta1_power, beta2_power, learning_rate, beta1, beta2, epsilon, variable, m, v); +} + +template void ApplyAdam(const size_t size, const float *gradient, const float *beta1_power, + const float *beta2_power, const float *learning_rate, const float *beta1, + const float *beta2, const float *epsilon, float *variable, float *m, float *v, + cudaStream_t cuda_stream); +template void ApplyAdam(const size_t size, const half *gradient, const half *beta1_power, const half *beta2_power, + const half *learning_rate, const half *beta1, const half *beta2, const half *epsilon, + half *variable, half *m, half *v, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/kernel/gpu/cuda_impl/tanh_impl.cuh b/mindspore/ccsrc/kernel/gpu/cuda_impl/adam_impl.cuh similarity index 59% rename from 
mindspore/ccsrc/kernel/gpu/cuda_impl/tanh_impl.cuh rename to mindspore/ccsrc/kernel/gpu/cuda_impl/adam_impl.cuh index 71fc4be4dd..f48a113c26 100644 --- a/mindspore/ccsrc/kernel/gpu/cuda_impl/tanh_impl.cuh +++ b/mindspore/ccsrc/kernel/gpu/cuda_impl/adam_impl.cuh @@ -14,15 +14,12 @@ * limitations under the License. */ -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_TAN_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_TAN_H_ +#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADAM_IMPL_H_ +#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADAM_IMPL_H_ #include "device/gpu/cuda_common.h" +template +void ApplyAdam(const size_t size, const T *gradient, const T *beta1_power, const T *beta2_power, const T *learning_rate, + const T *beta1, const T *beta2, const T *epsilon, T *variable, T *m, T *v, cudaStream_t cuda_stream); -template -void Tanh(const size_t size, const T* x_addr, T* y_addr, cudaStream_t cuda_stream); - -template -void TanhGrad(const size_t size, const T* y_addr, const T* dy_addr, T* dx_addr, cudaStream_t cuda_stream); - -#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_TAN_H_ +#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADAM_IMPL_H_ diff --git a/mindspore/ccsrc/kernel/gpu/cuda_impl/argmaxwithvalue_impl.cu b/mindspore/ccsrc/kernel/gpu/cuda_impl/argmaxwithvalue_impl.cu new file mode 100644 index 0000000000..a0687a2768 --- /dev/null +++ b/mindspore/ccsrc/kernel/gpu/cuda_impl/argmaxwithvalue_impl.cu @@ -0,0 +1,58 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "argmaxwithvalue_impl.cuh" +#include "device/gpu/cuda_common.h" +#include "include/cuda_fp16.h" +template +__global__ void ArgmaxWithValue(size_t size, const T* input, const int bound, int outerSize, int innerSize, + S* index, T* output) { + for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) { + for (int i = 0; i < outerSize; i++) { + int inputOutterOffset = i * innerSize * bound; + int outputOutterOffset = i * innerSize; + for (int j = 0; j < innerSize; j++) { + auto outputInnerOffset = outputOutterOffset + j; + S idx = 0; + T maxData = input[j + inputOutterOffset]; + for (S c = 0; c < bound; c++) { + int offset = j + c * innerSize; + auto inputData = input[inputOutterOffset + offset]; + idx = inputData > maxData ? c : idx; + maxData = inputData > maxData ? inputData : maxData; + } + output[outputInnerOffset] = maxData; + index[outputInnerOffset] = idx; + } + } + } + return; +} + +template +void CalArgmaxWithValue(size_t size, const T* input, const int bound_, const int outerSize_, const int innerSize_, + S* index, T* output, cudaStream_t cuda_stream) { + ArgmaxWithValue<<>>(size, input, bound_, outerSize_, innerSize_, + index, output); + return; +} + +template void CalArgmaxWithValue(size_t size, const float* input, const int bound_, const int outerSize_, + const int innerSize_, int* index, float* output, + cudaStream_t cuda_stream); +template void CalArgmaxWithValue(size_t size, const half* input, const int bound_, const int outerSize_, + const int innerSize_, int* index, half* output, + cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/kernel/gpu/cuda_impl/argmaxwithvalue_impl.cuh b/mindspore/ccsrc/kernel/gpu/cuda_impl/argmaxwithvalue_impl.cuh new file mode 100644 index 0000000000..0d4f4b62a3 --- /dev/null +++ b/mindspore/ccsrc/kernel/gpu/cuda_impl/argmaxwithvalue_impl.cuh @@ -0,0 
+1,22 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ARGMAXWITHVALUE_H_ +#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ARGMAXWITHVALUE_H_ +template +void CalArgmaxWithValue(size_t size, const T* input, const int bound_, const int outerSize_, const int innerSize_, + S* index, T* output, cudaStream_t cuda_stream); +#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ARGMAXWITHVALUE_H_ diff --git a/mindspore/ccsrc/kernel/gpu/cuda_impl/broadcast_grad_impl.cu b/mindspore/ccsrc/kernel/gpu/cuda_impl/broadcast_grad_impl.cu index ce8617283c..5aa087e7f5 100644 --- a/mindspore/ccsrc/kernel/gpu/cuda_impl/broadcast_grad_impl.cu +++ b/mindspore/ccsrc/kernel/gpu/cuda_impl/broadcast_grad_impl.cu @@ -110,7 +110,13 @@ void NoBroadcastGrad(const int &nums, enum BroadcastGradOpType op, const T *x1, template void NoBroadcastGrad(const int &nums, enum BroadcastGradOpType op, const float *x1, const float *x2, const float *dy, float *dx1, float *dx2, cudaStream_t stream); +template void NoBroadcastGrad(const int &nums, enum BroadcastGradOpType op, const int *x1, const int *x2, + const int *dy, int *dx1, int *dx2, cudaStream_t stream); template void BroadcastGrad(const int &l0, const int &l1, const int &l2, const int &l3, const int &r0, const int &r1, const int &r2, const int &r3, const int &d0, const int &d1, const int &d2, const int &d3, enum BroadcastGradOpType op, const float 
*x1, const float *x2, const float *dy, float *dx1, float *dx2, cudaStream_t stream); +template void BroadcastGrad(const int &l0, const int &l1, const int &l2, const int &l3, const int &r0, const int &r1, + const int &r2, const int &r3, const int &d0, const int &d1, const int &d2, const int &d3, + enum BroadcastGradOpType op, const int *x1, const int *x2, const int *dy, int *dx1, + int *dx2, cudaStream_t stream); diff --git a/mindspore/ccsrc/kernel/gpu/cuda_impl/broadcast_impl.cu b/mindspore/ccsrc/kernel/gpu/cuda_impl/broadcast_impl.cu index 4953d45ff5..afa94fc56c 100644 --- a/mindspore/ccsrc/kernel/gpu/cuda_impl/broadcast_impl.cu +++ b/mindspore/ccsrc/kernel/gpu/cuda_impl/broadcast_impl.cu @@ -64,6 +64,11 @@ struct SubFunc { __device__ __forceinline__ S operator()(const T &lhs, const T &rhs) { return (lhs - rhs); } }; +template +struct AddFunc { + __device__ __forceinline__ S operator()(const T &lhs, const T &rhs) { return (lhs + rhs); } +}; + template <> struct PowerFunc { // invalid branch @@ -118,6 +123,9 @@ __global__ void BroadcastKernel(const int l0, const int l1, const int l2, const case BROADCAST_TYPE_SUB: return BroadcastOperator>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0, input1, output); + case BROADCAST_TYPE_ADD: + return BroadcastOperator>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0, input1, + output); } } @@ -157,6 +165,8 @@ __global__ void NoBroadcastKernel(const int nums, enum BroadcastOpType op, const return NoBroadcastOperator>(nums, input0, input1, output); case BROADCAST_TYPE_SUB: return NoBroadcastOperator>(nums, input0, input1, output); + case BROADCAST_TYPE_ADD: + return NoBroadcastOperator>(nums, input0, input1, output); } } @@ -182,7 +192,10 @@ template void Broadcast(const int &l0, const int &l1, const int &l2, const int & const int &r2, const int &r3, const int &d0, const int &d1, const int &d2, const int &d3, enum BroadcastOpType op, const half *input0, const half *input1, half *output, cudaStream_t stream); - 
+template void Broadcast(const int &l0, const int &l1, const int &l2, const int &l3, const int &r0, const int &r1, + const int &r2, const int &r3, const int &d0, const int &d1, const int &d2, const int &d3, + enum BroadcastOpType op, const int *input0, const int *input1, int *output, + cudaStream_t stream); template void NoBroadcast(const int &nums, enum BroadcastOpType op, const float *input0, const float *input1, bool *output, cudaStream_t stream); template void NoBroadcast(const int &nums, enum BroadcastOpType op, const float *input0, const float *input1, @@ -191,3 +204,5 @@ template void NoBroadcast(const int &nums, enum BroadcastOpType op, const half * bool *output, cudaStream_t stream); template void NoBroadcast(const int &nums, enum BroadcastOpType op, const half *input0, const half *input1, half *output, cudaStream_t stream); +template void NoBroadcast(const int &nums, enum BroadcastOpType op, const int *input0, const int *input1, + int *output, cudaStream_t stream); diff --git a/mindspore/ccsrc/kernel/gpu/cuda_impl/broadcast_impl.cuh b/mindspore/ccsrc/kernel/gpu/cuda_impl/broadcast_impl.cuh index 621e14401c..5f6992511d 100644 --- a/mindspore/ccsrc/kernel/gpu/cuda_impl/broadcast_impl.cuh +++ b/mindspore/ccsrc/kernel/gpu/cuda_impl/broadcast_impl.cuh @@ -28,6 +28,7 @@ enum BroadcastOpType { BROADCAST_TYPE_REALDIV = 5, BROADCAST_TYPE_MUL = 6, BROADCAST_TYPE_SUB = 7, + BROADCAST_TYPE_ADD = 8, BROADCAST_TYPE_INVALID = 0xffffffff, }; diff --git a/mindspore/ccsrc/kernel/gpu/cuda_impl/dropout_impl.cu b/mindspore/ccsrc/kernel/gpu/cuda_impl/dropout_impl.cu index 940c64ea53..019d71d740 100644 --- a/mindspore/ccsrc/kernel/gpu/cuda_impl/dropout_impl.cu +++ b/mindspore/ccsrc/kernel/gpu/cuda_impl/dropout_impl.cu @@ -19,10 +19,10 @@ #include "include/cuda_runtime.h" __global__ void DropoutForwardKernel(const float *input, float *mask, float *output, size_t num_count, - float drop_prob) { - float scale = 1.f / drop_prob; + float keep_prob) { + float scale = 1.f / keep_prob; 
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num_count; i += blockDim.x * gridDim.x) { - mask[i] = mask[i] > drop_prob; + mask[i] = mask[i] <= keep_prob; output[i] = scale * input[i] * mask[i]; } } @@ -34,8 +34,8 @@ void DropoutForward(const float *input, float *mask, float *output, size_t num_c } __global__ void DropoutBackwardKernel(const float *dy, const float *mask, float *dx, size_t num_count, - float drop_prob) { - float scale = 1.f / (1.f - drop_prob); + float keep_prob) { + float scale = 1.f / keep_prob; for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num_count; i += blockDim.x * gridDim.x) { dx[i] = scale * dy[i] * mask[i]; } diff --git a/mindspore/ccsrc/kernel/gpu/cuda_impl/dropout_impl.cuh b/mindspore/ccsrc/kernel/gpu/cuda_impl/dropout_impl.cuh index 9aa05d6a08..bd3de6524d 100644 --- a/mindspore/ccsrc/kernel/gpu/cuda_impl/dropout_impl.cuh +++ b/mindspore/ccsrc/kernel/gpu/cuda_impl/dropout_impl.cuh @@ -18,9 +18,9 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_DROPOUT_H_ #include "device/gpu/cuda_common.h" -void DropoutForward(const float *input, float *mask, float *output, size_t num_count, float drop_prob, +void DropoutForward(const float *input, float *mask, float *output, size_t num_count, float keep_prob, cudaStream_t cuda_stream); -void DropoutBackward(const float *dy, const float *mask, float *dx, size_t num_count, float drop_prob, +void DropoutBackward(const float *dy, const float *mask, float *dx, size_t num_count, float keep_prob, cudaStream_t cuda_stream); #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_DROPOUT_H_ diff --git a/mindspore/ccsrc/kernel/gpu/cuda_impl/fake_quant_per_channel_impl.cu b/mindspore/ccsrc/kernel/gpu/cuda_impl/fake_quant_perchannel_impl.cu similarity index 73% rename from mindspore/ccsrc/kernel/gpu/cuda_impl/fake_quant_per_channel_impl.cu rename to mindspore/ccsrc/kernel/gpu/cuda_impl/fake_quant_perchannel_impl.cu index b9aac9bdc3..75c5eacb25 100644 --- 
a/mindspore/ccsrc/kernel/gpu/cuda_impl/fake_quant_per_channel_impl.cu +++ b/mindspore/ccsrc/kernel/gpu/cuda_impl/fake_quant_perchannel_impl.cu @@ -19,7 +19,7 @@ #include #include #include -#include "fake_quant_per_channel_impl.cuh" +#include "fake_quant_perchannel_impl.cuh" #include "device/gpu/cuda_common.h" /** @@ -113,44 +113,6 @@ void CalFakeQuantizePerChannel(const float *input, float *output, const int tota input, output, total_size, channel_size, nudge_min, nudge_max, scale, symmetric); } -/** - * UpdateInputMinMaxPerChannel or UpdateInputMinMaxPerChannel With EMA. - * @param input_min - * @param input_max - * @param min - * @param max - * @return - */ -__global__ void UpdateInputMinMaxPerChannel(float *input_min, float *input_max, float *input, int channels, - int per_channel_nums, bool ema, float ema_decay) { - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < channels; i += blockDim.x * gridDim.x) { - thrust::pair sum = - thrust::minmax_element(thrust::device, input + i * per_channel_nums, input + per_channel_nums * (i + 1)); - if (ema) { - input_min[i] = ema_decay * sum.first[0] + (1 - ema_decay) * input_min[i]; - input_max[i] = ema_decay * sum.second[0] + (1 - ema_decay) * input_max[i]; - } else { - input_min[i] = sum.first[0]; - input_max[i] = sum.second[0]; - } - input_min[i] = input_min[i] > 0 ? 0 : input_min[i]; - input_max[i] = input_max[i] < 0 ? 
0 : input_max[i]; - } -} - -__global__ void UpdateInputMinMaxPerChannelWithEMA(float *input_min, float *input_max, float min, float max, - const float decay) { - *input_min = decay * (min) + (1 - decay) * (*input_min); - *input_max = decay * (max) + (1 - decay) * (*input_max); -} - -void CalMinMaxPerChannel(float *input, float *input_min, float *input_max, const int total_size, const int channel_size, - const float ema_decay, const bool ema, cudaStream_t cuda_stream) { - int per_channel_num = total_size / channel_size; - UpdateInputMinMaxPerChannel<<>>( - input_min, input_max, input, channel_size, per_channel_num, ema, ema_decay); -} - __global__ void FakeQuantizePerChannelGrad(const float *input, const float *gradient, float *output, const int total_size, const int channel_size, const float *nudge_min, const float *nudge_max) { diff --git a/mindspore/ccsrc/kernel/gpu/cuda_impl/fake_quant_per_channel_impl.cuh b/mindspore/ccsrc/kernel/gpu/cuda_impl/fake_quant_perchannel_impl.cuh similarity index 100% rename from mindspore/ccsrc/kernel/gpu/cuda_impl/fake_quant_per_channel_impl.cuh rename to mindspore/ccsrc/kernel/gpu/cuda_impl/fake_quant_perchannel_impl.cuh diff --git a/mindspore/ccsrc/kernel/gpu/cuda_impl/fake_quant_impl.cu b/mindspore/ccsrc/kernel/gpu/cuda_impl/fake_quant_perlayer_impl.cu similarity index 92% rename from mindspore/ccsrc/kernel/gpu/cuda_impl/fake_quant_impl.cu rename to mindspore/ccsrc/kernel/gpu/cuda_impl/fake_quant_perlayer_impl.cu index f25727f2c3..11a25ba294 100644 --- a/mindspore/ccsrc/kernel/gpu/cuda_impl/fake_quant_impl.cu +++ b/mindspore/ccsrc/kernel/gpu/cuda_impl/fake_quant_perlayer_impl.cu @@ -18,10 +18,10 @@ #include #include #include "device/gpu/cuda_common.h" -#include "fake_quant_impl.cuh" +#include "fake_quant_perlayer_impl.cuh" __global__ void FakeQuantize(const float *input, float *output, const int size, const float *nudge_min, - const float *nudge_max, const float *scale, bool symmetric) { + const float *nudge_max, const float 
*scale) { float input_x = 0.f; int nudge_input = 0; @@ -35,7 +35,7 @@ __global__ void FakeQuantize(const float *input, float *output, const int size, input_x = nudge_max[0]; } // clamp shift - nudge_input = floor((input_x - nudge_min[0]) / scale[0] + 0.5f); + nudge_input = round((input_x - nudge_min[0]) / scale[0]); // quantize output[i] = nudge_input * scale[0] + nudge_min[0]; @@ -99,8 +99,7 @@ __global__ void UpdateInputMinMax(float *input_min, float *input_max, const floa void CalFakeQuantize(const float *input, float *output, const int size, const float *nudge_min, const float *nudge_max, const float *scale, bool symmetric, cudaStream_t cuda_stream) { - FakeQuantize<<>>(input, output, size, nudge_min, nudge_max, scale, - symmetric); + FakeQuantize<<>>(input, output, size, nudge_min, nudge_max, scale); return; } diff --git a/mindspore/ccsrc/kernel/gpu/cuda_impl/fake_quant_impl.cuh b/mindspore/ccsrc/kernel/gpu/cuda_impl/fake_quant_perlayer_impl.cuh similarity index 100% rename from mindspore/ccsrc/kernel/gpu/cuda_impl/fake_quant_impl.cuh rename to mindspore/ccsrc/kernel/gpu/cuda_impl/fake_quant_perlayer_impl.cuh diff --git a/mindspore/ccsrc/kernel/gpu/cuda_impl/ftrl_impl.cu b/mindspore/ccsrc/kernel/gpu/cuda_impl/ftrl_impl.cu new file mode 100644 index 0000000000..ea6ffdbbdc --- /dev/null +++ b/mindspore/ccsrc/kernel/gpu/cuda_impl/ftrl_impl.cu @@ -0,0 +1,87 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel/gpu/cuda_impl/ftrl_impl.cuh" + +template +__device__ __forceinline__ T PowFunc(T x, T y) { + return pow(x, y); +} + +template <> +__device__ __forceinline__ half PowFunc(half x, half y) { + return __float2half(pow(__half2float(x), __half2float(y))); +} + +template +__device__ __forceinline__ bool CompareFunc(T x, T y) { + return abs(x) > y; +} + +template <> +__device__ __forceinline__ bool CompareFunc(half x, half y) { + return abs(__half2float(x)) > __half2float(y); +} + +template +__device__ __forceinline__ T Sgn(T x) { + return static_cast(x != 0 ? (x > 0 ? 1 : -1) : 0); +} + +template <> +__device__ __forceinline__ half Sgn(half x) { + return __float2half(__half2float(x) != 0 ? (__half2float(x) > 0 ? 1 : -1) : 0); +} + +template +__global__ void ApplyFtrlKernel(const size_t size, const T *gradient, const T *learning_rate, + const T *l1_regularization, const T *l2_regularization, const T *learning_rate_power, + T *variable, T *accumulation, T *linear) { + const T two = static_cast(2.0); + const T learning_rate_power_val = -learning_rate_power[0]; + + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x) { + const T cur_accumulation = accumulation[i] + gradient[i] * gradient[i]; + const T accumulation_power = PowFunc(accumulation[i], learning_rate_power_val); + const T cur_accumulation_power = PowFunc(cur_accumulation, learning_rate_power_val); + const T sigma = (cur_accumulation_power - accumulation_power) / learning_rate[0]; + + linear[i] += gradient[i] - sigma * variable[i]; + variable[i] = CompareFunc(linear[i], l1_regularization[0]) + ? 
((l1_regularization[0] * Sgn(linear[i]) - linear[i]) / + (cur_accumulation_power / learning_rate[0] + two * l2_regularization[0])) + : static_cast(0); + accumulation[i] = cur_accumulation; + } +} + +template +void ApplyFtrl(const size_t size, const T *gradient, const T *learning_rate, const T *l1_regularization, + const T *l2_regularization, const T *learning_rate_power, T *variable, T *accumulation, T *linear, + cudaStream_t cuda_stream) { + ApplyFtrlKernel<<>>(size, gradient, learning_rate, l1_regularization, + l2_regularization, learning_rate_power, variable, + accumulation, linear); +} + +template void ApplyFtrl(const size_t size, const float *gradient, const float *learning_rate, + const float *l1_regularization, const float *l2_regularization, + const float *learning_rate_power, float *variable, float *accumulation, float *linear, + cudaStream_t cuda_stream); +template void ApplyFtrl(const size_t size, const half *gradient, const half *learning_rate, + const half *l1_regularization, const half *l2_regularization, + const half *learning_rate_power, half *variable, half *accumulation, half *linear, + cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/kernel/gpu/cuda_impl/ftrl_impl.cuh b/mindspore/ccsrc/kernel/gpu/cuda_impl/ftrl_impl.cuh new file mode 100644 index 0000000000..ba4a8fa816 --- /dev/null +++ b/mindspore/ccsrc/kernel/gpu/cuda_impl/ftrl_impl.cuh @@ -0,0 +1,26 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_FTRL_IMPL_H_ +#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_FTRL_IMPL_H_ + +#include "device/gpu/cuda_common.h" +template +void ApplyFtrl(const size_t size, const T *gradient, const T *learning_rate, const T *l1_regularization, + const T *l2_regularization, const T *learning_rate_power, T *variable, T *accumulation, T *linear, + cudaStream_t cuda_stream); + +#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_FTRL_IMPL_H_ diff --git a/mindspore/ccsrc/kernel/gpu/cuda_impl/gelu_impl.cu b/mindspore/ccsrc/kernel/gpu/cuda_impl/gelu_impl.cu index bb476179d5..e460caec9e 100644 --- a/mindspore/ccsrc/kernel/gpu/cuda_impl/gelu_impl.cu +++ b/mindspore/ccsrc/kernel/gpu/cuda_impl/gelu_impl.cu @@ -14,32 +14,62 @@ * limitations under the License. */ - #include "kernel/gpu/cuda_impl/gelu_impl.cuh" #include "device/gpu/cuda_common.h" -template -__global__ void GeluKernel(size_t size, T* input_addr, T* output_addr) { +template +__global__ void GeluKernel(size_t size, T *input_addr, T *output_addr) { // formula: // gelu(x) = 0.5 * x * (1.0 + tanh(y)) // tanh(y) = 2 / (1 + exp(-2y)) - 1) // y = sqrt(2/pi) * (x + 0.044715 * x^3) - for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) { + for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) { float x = input_addr[pos]; float tanh_res = tanh(0.7978845608 * (x + 0.044715 * x * x * x)); output_addr[pos] = 0.5 * x * (1.0 + tanh_res); } } -template -void Gelu(size_t size, T* input_addr, T* output_addr, cudaStream_t cuda_stream) { +template <> +__global__ void GeluKernel(size_t size, half *input_addr, half *output_addr) { + for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) { + half x = input_addr[pos]; + float tanh_res = 
tanh(__half2float(half(0.7978845608) * (x + half(0.044715) * x * x * x))); + output_addr[pos] = half(0.5) * x * (half(1.0) + __float2half(tanh_res)); + } +} + +template <> +__global__ void GeluKernel(size_t size, half2 *input_addr, half2 *output_addr) { + for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) { + half2 x = input_addr[pos]; + float2 tanh_param = __half22float2(half2(0.7978845608, 0.7978845608) * (x + half2(0.044715, 0.044715) * x * x * x)); + float2 tanh_res; + tanh_res.x = tanh(tanh_param.x); + tanh_res.y = tanh(tanh_param.y); + output_addr[pos] = half2(0.5, 0.5) * x * (half2(1.0, 1.0) + __float22half2_rn(tanh_res)); + } +} + +template +void Gelu(size_t size, T *input_addr, T *output_addr, cudaStream_t cuda_stream) { GeluKernel<<>>(size, input_addr, output_addr); return; } +template <> +void Gelu(size_t size, half *input_addr, half *output_addr, cudaStream_t cuda_stream) { + if (size % 2 == 0) { + GeluKernel<<>>( + size / 2, reinterpret_cast(input_addr), reinterpret_cast(output_addr)); + } else { + GeluKernel<<>>(size, input_addr, output_addr); + } + return; +} -template -__global__ void GeluGradKernel(size_t size, T* dy_addr, T* x_addr, T* dx_addr) { +template +__global__ void GeluGradKernel(size_t size, T *dy_addr, T *x_addr, T *dx_addr) { // formula: // dx = dy * y' // y' = 0.5 * (1 + tanh(tanh_para)) + @@ -48,18 +78,59 @@ __global__ void GeluGradKernel(size_t size, T* dy_addr, T* x_addr, T* dx_addr) { // mul_right = sqrt(2/pi) * (1 + 3 * 0.044715 * x^2)) for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) { T x = x_addr[pos]; - T tanh_res = tanh(0.7978845608 * (x + 0.044715 * x * x * x)); - T mul_right = 0.7978845608 + 0.1070322244 * x * x; - T y_res = 0.5 * (1 + tanh_res) + 0.5 * x * (1 - tanh_res * tanh_res) * mul_right; + T tanh_res = tanh(0.7978845608 * (x + 0.044715 * x * x * x)); + T mul_right = 0.7978845608 + 0.1070322244 * x * x; + T y_res = 
0.5 * (1.0 + tanh_res) + 0.5 * x * (1.0 - tanh_res * tanh_res) * mul_right; + dx_addr[pos] = dy_addr[pos] * y_res; + } +} + +template +__global__ void GeluGradKernel(size_t size, half2 *dy_addr, half2 *x_addr, half2 *dx_addr) { + for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) { + half2 x = x_addr[pos]; + float2 tanh_param = __half22float2(half2(0.7978845608, 0.7978845608) * (x + half2(0.044715, 0.044715) * x * x * x)); + float2 tanh_res; + tanh_res.x = tanh(tanh_param.x); + tanh_res.y = tanh(tanh_param.y); + half2 tanh_res_half = __float22half2_rn(tanh_res); + half2 mul_right = half2(0.7978845608, 0.7978845608) + half2(0.1070322244, 0.1070322244) * x * x; + half2 y_res = half2(0.5, 0.5) * (half2(1.0, 1.0) + tanh_res_half) + + half2(0.5, 0.5) * x * (half2(1.0, 1.0) - tanh_res_half * tanh_res_half) * mul_right; + dx_addr[pos] = dy_addr[pos] * y_res; + } +} + +template +__global__ void GeluGradKernel(size_t size, half *dy_addr, half *x_addr, half *dx_addr) { + for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) { + half x = x_addr[pos]; + half tanh_param = half(0.7978845608) * (x + half(0.044715) * x * x * x); + half tanh_res = __float2half_rn(tanh(__half2float(tanh_param))); + half mul_right = half(0.7978845608) + half(0.1070322244) * x * x; + half y_res = half(0.5) * (half(1.0) + tanh_res) + half(0.5) * x * (half(1.0) - tanh_res * tanh_res) * mul_right; dx_addr[pos] = dy_addr[pos] * y_res; } } -template -void GeluGradKernel(size_t size, T* dy_addr, T* x_addr, T* dx_addr, cudaStream_t cuda_stream) { +template +void GeluGradKernel(size_t size, T *dy_addr, T *x_addr, T *dx_addr, cudaStream_t cuda_stream) { GeluGradKernel<<>>(size, dy_addr, x_addr, dx_addr); } +template <> +void GeluGradKernel(size_t size, half *dy_addr, half *x_addr, half *dx_addr, cudaStream_t cuda_stream) { + if (size % 2 == 0) { + GeluGradKernel<<>>( + size / 2, reinterpret_cast(dy_addr), 
reinterpret_cast(x_addr), + reinterpret_cast(dx_addr)); + } else { + GeluGradKernel<<>>(size, dy_addr, x_addr, dx_addr); + } + return; +} -template void Gelu(size_t size, float* input_addr, float* output_addr, cudaStream_t cuda_stream); -template void GeluGradKernel(size_t size, float* dy_addr, float* x_addr, float* dx_addr, cudaStream_t cuda_stream); +template void Gelu(size_t size, float *input_addr, float *output_addr, cudaStream_t cuda_stream); +template void Gelu(size_t size, half *input_addr, half *output_addr, cudaStream_t cuda_stream); +template void GeluGradKernel(size_t size, float *dy_addr, float *x_addr, float *dx_addr, cudaStream_t cuda_stream); +template void GeluGradKernel(size_t size, half *dy_addr, half *x_addr, half *dx_addr, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/kernel/gpu/cuda_impl/layer_norm_grad_impl.cu b/mindspore/ccsrc/kernel/gpu/cuda_impl/layer_norm_grad_impl.cu index f8377fd721..e887b98eca 100644 --- a/mindspore/ccsrc/kernel/gpu/cuda_impl/layer_norm_grad_impl.cu +++ b/mindspore/ccsrc/kernel/gpu/cuda_impl/layer_norm_grad_impl.cu @@ -18,10 +18,21 @@ #include #include #include "kernel/gpu/cuda_impl/layer_norm_grad_impl.cuh" +#include "kernel/gpu/cuda_impl/layer_norm_impl.cuh" constexpr int NUM_PER_THREAD_REDUCE = 4; constexpr int WARP_SIZE = 32; +template +inline __device__ T my_pow(T a, double b) { + return pow(a, static_cast(b)); +} + +template <> +inline __device__ half my_pow(half a, double b) { + return __float2half(pow(__half2float(a), static_cast(b))); +} + template inline __device__ void GammaAndBetaThreadReduce(const int& col, const int& row_dim, const int& col_dim, const T& epsilon, const T* dy, const T* x, const T* mean, const T* var, @@ -35,7 +46,7 @@ inline __device__ void GammaAndBetaThreadReduce(const int& col, const int& row_d } int pos = row * col_dim + col; - dg[0] += dy[pos] * pow(var[row] + epsilon, -0.5) * (x[pos] - mean[row]); + dg[0] += dy[pos] * my_pow(var[row] + epsilon, -0.5) * (x[pos] - mean[row]); 
db[0] += dy[pos]; } } @@ -58,26 +69,26 @@ inline __device__ void GammaAndBetaBlockReduce(const int& col, const int& row_di // load data to share memory // thread(0, 32, 64, 96, ...) keep the data - extern __shared__ T share_mem[]; + DynamicSharedMem share_mem; if (threadIdx.x % WARP_SIZE == 0) { int offset = threadIdx.x / WARP_SIZE * 2; - share_mem[offset] = dg[0]; - share_mem[offset + 1] = db[0]; + share_mem.addr()[offset] = dg[0]; + share_mem.addr()[offset + 1] = db[0]; } __syncthreads(); for (int stride = blockDim.x / WARP_SIZE / 2; stride > 0; stride >>= 1) { if (threadIdx.x < stride) { int offset = (threadIdx.x + stride) * 2; - share_mem[threadIdx.x * 2] += share_mem[offset]; - share_mem[threadIdx.x * 2 + 1] += share_mem[offset + 1]; + share_mem.addr()[threadIdx.x * 2] += share_mem.addr()[offset]; + share_mem.addr()[threadIdx.x * 2 + 1] += share_mem.addr()[offset + 1]; } } __syncthreads(); if (threadIdx.x == 0) { - dg_addr[col] = share_mem[0]; - db_addr[col] = share_mem[1]; + dg_addr[col] = share_mem.addr()[0]; + db_addr[col] = share_mem.addr()[1]; } } @@ -114,13 +125,37 @@ inline __device__ void InputThreadReduce(const int& row, const int& col_dim, con T v1 = dy[pos] * gamma[gamma_offset]; T v2 = x[pos] - mean[row]; - sum1[0] += -0.5 * v1 * v2 * pow(var[row] + epsilon, -1.5); + sum1[0] += -0.5 * v1 * v2 * my_pow(var[row] + epsilon, -1.5); sum2[0] += v1; sum3[0] += -2.0 * v2; } } } +template <> +inline __device__ void InputThreadReduce(const int& row, const int& col_dim, const int& param_dim, const half& epsilon, + half* sum1, half* sum2, half* sum3, const half* dy, const half* x, + const half* mean, const half* var, const half* gamma) { + int loop_num = (col_dim + NUM_PER_THREAD_REDUCE - 1) / NUM_PER_THREAD_REDUCE; + for (int i = threadIdx.x; i < loop_num; i += blockDim.x) { + for (int j = 0; j < NUM_PER_THREAD_REDUCE; j++) { + int col = NUM_PER_THREAD_REDUCE * i + j; + if (col >= col_dim) { + return; + } + + int pos = row * col_dim + col; + int gamma_offset 
= pos % param_dim; + half v1 = dy[pos] * gamma[gamma_offset]; + half v2 = x[pos] - mean[row]; + + sum1[0] += __float2half(-0.5) * v1 * v2 * my_pow(var[row] + epsilon, -1.5); + sum2[0] += v1; + sum3[0] += __float2half(-2.0) * v2; + } + } +} + template inline __device__ void InputWarpReduce(T* sum1, T* sum2, T* sum3) { for (int delta = (WARP_SIZE >> 1); delta > 0; delta >>= 1) { @@ -166,12 +201,28 @@ inline __device__ void InputProp(const int& row, const int& col_dim, const int& int gamma_offset = pos % param_dim; T v1 = dy[pos] * gamma[gamma_offset]; T v2 = x[pos] - mean[row]; - T v3 = pow(var[row] + epsilon, -0.5); + T v3 = my_pow(var[row] + epsilon, -0.5); dx[pos] = v1 * v3 + share_mem[0] * (2.0 / col_dim) * v2 + (-1.0 * v3 * share_mem[1] + (1.0 / col_dim) * share_mem[0] * share_mem[2]) * (1.0 / col_dim); } } +template <> +inline __device__ void InputProp(const int& row, const int& col_dim, const int& param_dim, const half& epsilon, + const half* dy, const half* x, const half* mean, const half* var, const half* gamma, + half* dx, const half* share_mem) { + for (int col = threadIdx.x; col < col_dim; col += blockDim.x) { + int pos = (row * col_dim + col); + int gamma_offset = pos % param_dim; + half v1 = dy[pos] * gamma[gamma_offset]; + half v2 = x[pos] - mean[row]; + half v3 = my_pow(var[row] + epsilon, -0.5); + dx[pos] = v1 * v3 + share_mem[0] * __float2half(2.0 / col_dim) * v2 + + (__float2half(-1.0) * v3 * share_mem[1] + __float2half(1.0 / col_dim) * share_mem[0] * share_mem[2])\ + * __float2half(1.0 / col_dim); + } +} + template __global__ void InputPropKernel(const int row_dim, const int col_dim, const int param_dim, const T epsilon, const T* dy, const T* x, const T* mean, const T* var, const T* gamma, T* dx) { @@ -179,27 +230,30 @@ __global__ void InputPropKernel(const int row_dim, const int col_dim, const int T sum1 = 0; T sum2 = 0; T sum3 = 0; - extern __shared__ T share_mem[]; + DynamicSharedMem share_mem; InputThreadReduce(row, col_dim, param_dim, 
epsilon, &sum1, &sum2, &sum3, dy, x, mean, var, gamma); InputWarpReduce(&sum1, &sum2, &sum3); - InputBlockReduce(col_dim, &sum1, &sum2, &sum3, share_mem); - InputProp(row, col_dim, param_dim, epsilon, dy, x, mean, var, gamma, dx, share_mem); + InputBlockReduce(col_dim, &sum1, &sum2, &sum3, share_mem.addr()); + InputProp(row, col_dim, param_dim, epsilon, dy, x, mean, var, gamma, dx, share_mem.addr()); } } template void LayerNormGrad(const int& row_dim, const int& col_dim, const int& param_dim, const T& epsilon, const T* dy, const T* x, const T* mean, const T* var, const T* gamma, T* dx, T* dg, T* db, cudaStream_t stream) { - int share_mem = + int share_mem_size = ((col_dim + NUM_PER_THREAD_REDUCE - 1) / NUM_PER_THREAD_REDUCE + WARP_SIZE - 1) / WARP_SIZE * 3 * sizeof(T); - InputPropKernel<<>>(row_dim, col_dim, param_dim, epsilon, dy, x, mean, var, gamma, - dx); + InputPropKernel<<>>(row_dim, col_dim, param_dim, epsilon, dy, x, mean, var, + gamma, dx); - share_mem = + share_mem_size = ((row_dim + NUM_PER_THREAD_REDUCE - 1) / NUM_PER_THREAD_REDUCE + WARP_SIZE - 1) / WARP_SIZE * 2 * sizeof(T); - GammaAndBetaPropKernel<<>>(row_dim, col_dim, epsilon, dy, x, mean, var, dg, db); + GammaAndBetaPropKernel<<>>(row_dim, col_dim, epsilon, dy, x, mean, var, dg, db); } template void LayerNormGrad(const int& row_dim, const int& col_dim, const int& param_dim, const float& epsilon, const float* dy, const float* x, const float* mean, const float* var, const float* gamma, float* dx, float* dg, float* db, cudaStream_t stream); +template void LayerNormGrad(const int& row_dim, const int& col_dim, const int& param_dim, const half& epsilon, + const half* dy, const half* x, const half* mean, const half* var, const half* gamma, + half* dx, half* dg, half* db, cudaStream_t stream); diff --git a/mindspore/ccsrc/kernel/gpu/cuda_impl/layer_norm_impl.cu b/mindspore/ccsrc/kernel/gpu/cuda_impl/layer_norm_impl.cu index db33673744..cfb60f0ba6 100644 --- 
a/mindspore/ccsrc/kernel/gpu/cuda_impl/layer_norm_impl.cu +++ b/mindspore/ccsrc/kernel/gpu/cuda_impl/layer_norm_impl.cu @@ -23,7 +23,7 @@ constexpr int NUM_PER_THREAD_REDUCE = 4; constexpr int WARP_SIZE = 32; template -inline __device__ void MeanAndVarAccumulation(T* mean, T* var, T* num, const T& val) { +inline __device__ void MeanAndVarAccumulation(T *mean, T *var, T *num, const T &val) { // Welford Algorithm: // \mu_k = \mu_{k-1} + (x_k - \mu_{k-1})/k // \sigma_k^2 = \sigma_{k-1}^2 + (x_k - \mu_{k-1}) * (x_k - \mu_k) @@ -34,8 +34,9 @@ inline __device__ void MeanAndVarAccumulation(T* mean, T* var, T* num, const T& } template -inline __device__ void MeanAndVarMerge(T* m1, T* v1, T* n1, const T& m2, const T& v2, const T& n2) { - if (n2 == 0) { +inline __device__ void MeanAndVarMerge(T *m1, T *v1, T *n1, const T &m2, const T &v2, const T &n2) { + T zero = 0; + if (n2 == zero) { return; } @@ -46,7 +47,7 @@ inline __device__ void MeanAndVarMerge(T* m1, T* v1, T* n1, const T& m2, const T } template -inline __device__ void ThreadReduce(const int& col_dim, const T* block_addr, T* mean, T* var, T* num) { +inline __device__ void ThreadReduce(const int &col_dim, const T *block_addr, T *mean, T *var, T *num) { int loop_num = (col_dim + NUM_PER_THREAD_REDUCE - 1) / NUM_PER_THREAD_REDUCE; for (int i = threadIdx.x; i < loop_num; i += blockDim.x) { for (int j = 0; j < NUM_PER_THREAD_REDUCE; j++) { @@ -60,7 +61,7 @@ inline __device__ void ThreadReduce(const int& col_dim, const T* block_addr, T* } template -inline __device__ void WarpReduce(T* mean, T* var, T* num) { +inline __device__ void WarpReduce(T *mean, T *var, T *num) { for (int delta = (WARP_SIZE >> 1); delta > 0; delta >>= 1) { T mean_other = __shfl_down_sync(0xffffffff, mean[0], delta); T var_other = __shfl_down_sync(0xffffffff, var[0], delta); @@ -70,8 +71,8 @@ inline __device__ void WarpReduce(T* mean, T* var, T* num) { } template -inline __device__ void BlockReduce(const int& col_dim, T* mean, T* var, T* num, T* 
mean_addr, T* var_addr, - T* share_mem) { +inline __device__ void BlockReduce(const int &col_dim, T *mean, T *var, T *num, T *mean_addr, T *var_addr, + T *share_mem) { if (threadIdx.x >= col_dim) { return; } @@ -96,15 +97,15 @@ inline __device__ void BlockReduce(const int& col_dim, T* mean, T* var, T* num, __syncthreads(); if (threadIdx.x == 0) { - mean_addr[blockIdx.x] = share_mem[0]; // todo: blockDim.x < row + mean_addr[blockIdx.x] = share_mem[0]; share_mem[1] /= col_dim; var_addr[blockIdx.x] = share_mem[1]; } } template -inline __device__ void LayerNorm(const int& row, const int& col_dim, const int& param_dim, const T* x, - const T* share_mem, const T* gamma, const T* beta, const T epsilon, T* y) { +inline __device__ void LayerNorm(const int &row, const int &col_dim, const int ¶m_dim, const T *x, + const T *share_mem, const T *gamma, const T *beta, const T epsilon, T *y) { for (int col = threadIdx.x; col < col_dim; col += blockDim.x) { int pos = row * col_dim + col; int i = pos % param_dim; @@ -112,37 +113,51 @@ inline __device__ void LayerNorm(const int& row, const int& col_dim, const int& } } +template <> +inline __device__ void LayerNorm(const int &row, const int &col_dim, const int ¶m_dim, const half *x, + const half *share_mem, const half *gamma, const half *beta, const half epsilon, + half *y) { + for (int col = threadIdx.x; col < col_dim; col += blockDim.x) { + int pos = row * col_dim + col; + int i = pos % param_dim; + y[pos] = (x[pos] - share_mem[0]) / hsqrt(share_mem[1] + epsilon) * gamma[i] + beta[i]; + } +} + template -__global__ void LayerNormKernel(const int row_dim, const int col_dim, const int param_dim, const T epsilon, const T* x, - const T* gamma, const T* beta, T* y, T* mean_addr, T* var_addr) { +__global__ void LayerNormKernel(const int row_dim, const int col_dim, const int param_dim, const T epsilon, const T *x, + const T *gamma, const T *beta, T *y, T *mean_addr, T *var_addr) { for (auto row = blockIdx.x; row < row_dim; row += gridDim.x) 
{ T mean = 0; T var = 0; T num = 0; - const T* block_addr = x + row * col_dim; - extern __shared__ T share_mem[]; + const T *block_addr = x + row * col_dim; + DynamicSharedMem share_mem; ThreadReduce(col_dim, block_addr, &mean, &var, &num); WarpReduce(&mean, &var, &num); - BlockReduce(col_dim, &mean, &var, &num, mean_addr, var_addr, share_mem); + BlockReduce(col_dim, &mean, &var, &num, mean_addr, var_addr, share_mem.addr()); __syncthreads(); - LayerNorm(row, col_dim, param_dim, x, share_mem, gamma, beta, epsilon, y); + LayerNorm(row, col_dim, param_dim, x, share_mem.addr(), gamma, beta, epsilon, y); } } template -void LayerNorm(const int& row_dim, const int& col_dim, const int& param_dim, const T& epsilon, const T* x, - const T* gamma, const T* beta, T* y, T* mean, T* var, cudaStream_t stream) { +void LayerNorm(const int &row_dim, const int &col_dim, const int ¶m_dim, const T &epsilon, const T *x, + const T *gamma, const T *beta, T *y, T *mean, T *var, cudaStream_t stream) { const dim3 block(row_dim); const dim3 thread(256); // keep the mean/var/num after warp reduce - int share_mem = + int share_mem_size = ((col_dim + NUM_PER_THREAD_REDUCE - 1) / NUM_PER_THREAD_REDUCE + WARP_SIZE - 1) / WARP_SIZE * 3 * sizeof(T); - LayerNormKernel<<>>(row_dim, col_dim, param_dim, epsilon, x, gamma, beta, y, mean, - var); + LayerNormKernel<<>>(row_dim, col_dim, param_dim, epsilon, x, gamma, beta, y, + mean, var); } -template void LayerNorm(const int& row_dim, const int& col_dim, const int& param_dim, const float& epsilon, - const float* x, const float* gamma, const float* beta, float* y, float* mean, float* var, +template void LayerNorm(const int &row_dim, const int &col_dim, const int ¶m_dim, const float &epsilon, + const float *x, const float *gamma, const float *beta, float *y, float *mean, float *var, + cudaStream_t stream); +template void LayerNorm(const int &row_dim, const int &col_dim, const int ¶m_dim, const half &epsilon, + const half *x, const half *gamma, const half 
*beta, half *y, half *mean, half *var, cudaStream_t stream); diff --git a/mindspore/ccsrc/kernel/gpu/cuda_impl/layer_norm_impl.cuh b/mindspore/ccsrc/kernel/gpu/cuda_impl/layer_norm_impl.cuh index 4832b08746..c06a698384 100644 --- a/mindspore/ccsrc/kernel/gpu/cuda_impl/layer_norm_impl.cuh +++ b/mindspore/ccsrc/kernel/gpu/cuda_impl/layer_norm_impl.cuh @@ -19,6 +19,23 @@ #include "device/gpu/cuda_common.h" +template +struct DynamicSharedMem; +template<> +struct DynamicSharedMem { + __device__ float *addr() { + extern __shared__ float addr_float[]; + return addr_float; + } +}; +template<> +struct DynamicSharedMem { + __device__ half *addr() { + extern __shared__ half addr_half[]; + return addr_half; + } +}; + template void LayerNorm(const int& outer, const int& inner, const int& param_dim, const T& epsilon, const T* x, const T* gamma, const T* beta, T* y, T* mean, T* var, cudaStream_t stream); diff --git a/mindspore/ccsrc/kernel/gpu/cuda_impl/minmax_update_impl.cu b/mindspore/ccsrc/kernel/gpu/cuda_impl/minmax_update_impl.cu new file mode 100644 index 0000000000..27b2cb0232 --- /dev/null +++ b/mindspore/ccsrc/kernel/gpu/cuda_impl/minmax_update_impl.cu @@ -0,0 +1,87 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include "minmax_update_impl.cuh" +#include "device/gpu/cuda_common.h" + +__global__ void UpdateInputMinMaxPerLayerWithEMA(const float *input_min, const float *input_max, float *output_min, + float *output_max, const float min, const float max, + const float decay) { + output_min[0] = decay * (min) + (1 - decay) * (input_min[0]); + output_min[0] = input_min[0] > 0 ? 0 : input_min[0]; + output_max[0] = decay * (max) + (1 - decay) * (input_max[0]); + output_max[0] = input_max[0] < 0 ? 0 : input_max[0]; + return; +} + +__global__ void UpdateInputMinMaxPerLayer(float *output_min, float *output_max, const float min, const float max) { + output_min[0] = min > 0 ? 0 : min; + output_max[0] = max < 0 ? 0 : max; + return; +} + +__global__ void UpdateInputMinMaxPerChannel(float *input, float *input_min, float *input_max, float *output_min, + float *output_max, int channels, int per_channel_nums, bool ema, + float ema_decay) { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < channels; i += blockDim.x * gridDim.x) { + thrust::pair sum = + thrust::minmax_element(thrust::device, input + i * per_channel_nums, input + per_channel_nums * (i + 1)); + if (ema) { + output_min[i] = ema_decay * sum.first[0] + (1 - ema_decay) * input_min[i]; + output_max[i] = ema_decay * sum.second[0] + (1 - ema_decay) * input_max[i]; + } else { + output_min[i] = sum.first[0]; + output_max[i] = sum.second[0]; + } + output_min[i] = input_min[i] > 0 ? 0 : input_min[i]; + output_max[i] = input_max[i] < 0 ? 
0 : input_max[i]; + } + return; +} + +void CalMinMaxPerChannel(float *input, float *input_min, float *input_max, float *output_min, float *output_max, + const int total_num, const int channel_num, const float ema_decay, const bool ema, + cudaStream_t cuda_stream) { + int per_channel_num = total_num / channel_num; + UpdateInputMinMaxPerChannel<<>>( + input, input_min, input_max, output_min, output_max, channel_num, per_channel_num, ema, ema_decay); + return; +} + +void CalMinMaxPerLayer(float *input, float *input_min, float *input_max, float *output_min, float *output_max, + const int total_num, const float ema_decay, const bool ema, cudaStream_t cuda_stream) { + float minel = 0.f; + float maxel = 0.f; + auto policy = thrust::cuda::par.on(cuda_stream); + thrust::pair, thrust::device_ptr> tuple; + tuple = + thrust::minmax_element(policy, thrust::device_pointer_cast(input), thrust::device_pointer_cast(input) + total_num); + minel = tuple.first[0]; + maxel = tuple.second[0]; + + if (ema) { + UpdateInputMinMaxPerLayerWithEMA<<<1, 1, 0, cuda_stream>>>(input_min, input_max, output_min, output_max, minel, + maxel, ema_decay); + } else { + UpdateInputMinMaxPerLayer<<<1, 1, 0, cuda_stream>>>(output_min, output_max, minel, maxel); + } + return; +} diff --git a/mindspore/ccsrc/kernel/gpu/cuda_impl/minmax_update_impl.cuh b/mindspore/ccsrc/kernel/gpu/cuda_impl/minmax_update_impl.cuh new file mode 100644 index 0000000000..5e9becab38 --- /dev/null +++ b/mindspore/ccsrc/kernel/gpu/cuda_impl/minmax_update_impl.cuh @@ -0,0 +1,29 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_MIN_MAX_UPDATE_IMPL_H_ +#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_MIN_MAX_UPDATE_IMPL_H_ + +#include "device/gpu/cuda_common.h" + +void CalMinMaxPerChannel(float *input, float *input_min, float *input_max, float *output_min, float *output_max, + const int total_num, const int channel_num, const float ema_decay, const bool ema, + cudaStream_t cuda_stream); + +void CalMinMaxPerLayer(float *input, float *input_min, float *input_max, float *output_min, float *output_max, + const int size, const float ema_decay, const bool ema, cudaStream_t cuda_stream); + +#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_MIN_MAX_UPDATE_IMPL_H_ diff --git a/mindspore/ccsrc/kernel/gpu/cuda_impl/momentum_impl.cu b/mindspore/ccsrc/kernel/gpu/cuda_impl/momentum_impl.cu index ae24a8dec9..5a1c9eb687 100755 --- a/mindspore/ccsrc/kernel/gpu/cuda_impl/momentum_impl.cu +++ b/mindspore/ccsrc/kernel/gpu/cuda_impl/momentum_impl.cu @@ -15,25 +15,38 @@ */ #include "momentum_impl.cuh" -template -__global__ void MomentumUpdateVariableKernel(const size_t size, T *variable, T *accumulation, const T *learning_rate, - const T *gradient, const T *momentum) { +template +__global__ void MomentumUpdateVariableKernel(const size_t size, T *variable, T *accumulation, const S *learning_rate, + const T *gradient, const S *momentum) { for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (size); i += blockDim.x * gridDim.x) { accumulation[i] = momentum[0] * accumulation[i] + gradient[i]; variable[i] -= learning_rate[0] * accumulation[i]; } return; } 
-template -void MomentumUpdateVariable(const size_t size, T *variable, T *accumulation, const T *learning_rate, const T *gradient, - const T *momentum, cudaStream_t cuda_stream) { +template <> +__global__ void MomentumUpdateVariableKernel(const size_t size, half *variable, half *accumulation, + const float *learning_rate, const half *gradient, + const float *momentum) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (size); i += blockDim.x * gridDim.x) { + accumulation[i] = __float2half(momentum[0]) * accumulation[i] + gradient[i]; + variable[i] -= __float2half(learning_rate[0]) * accumulation[i]; + } + return; +} +template +void MomentumUpdateVariable(const size_t size, T *variable, T *accumulation, const S *learning_rate, const T *gradient, + const S *momentum, cudaStream_t cuda_stream) { MomentumUpdateVariableKernel<<>>(size, variable, accumulation, learning_rate, gradient, momentum); return; } -template void MomentumUpdateVariable(const size_t size, float *variable, float *accumulation, - const float *learning_rate, const float *gradient, const float *momentum, - cudaStream_t cuda_stream); -template void MomentumUpdateVariable(const size_t size, half *variable, half *accumulation, - const half *learning_rate, const half *gradient, const half *momentum, - cudaStream_t cuda_stream); +template void MomentumUpdateVariable(const size_t size, float *variable, float *accumulation, + const float *learning_rate, const float *gradient, + const float *momentum, cudaStream_t cuda_stream); +template void MomentumUpdateVariable(const size_t size, half *variable, half *accumulation, + const half *learning_rate, const half *gradient, + const half *momentum, cudaStream_t cuda_stream); +template void MomentumUpdateVariable(const size_t size, half *variable, half *accumulation, + const float *learning_rate, const half *gradient, + const float *momentum, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/kernel/gpu/cuda_impl/momentum_impl.cuh 
b/mindspore/ccsrc/kernel/gpu/cuda_impl/momentum_impl.cuh index 2993e04ff3..5405f5ef1d 100755 --- a/mindspore/ccsrc/kernel/gpu/cuda_impl/momentum_impl.cuh +++ b/mindspore/ccsrc/kernel/gpu/cuda_impl/momentum_impl.cuh @@ -18,8 +18,8 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_MOMENTUMIMPL_H_ #include "device/gpu/cuda_common.h" -template -void MomentumUpdateVariable(const size_t size, T *variable, T *accumulation, const T *learning_rate, const T *gradient, - const T *momentum, cudaStream_t cuda_stream); +template +void MomentumUpdateVariable(const size_t size, T *variable, T *accumulation, const S *learning_rate, const T *gradient, + const S *momentum, cudaStream_t cuda_stream); #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_MOMENTUMIMPL_H_ diff --git a/mindspore/ccsrc/kernel/gpu/cuda_impl/rmsprop_impl.cu b/mindspore/ccsrc/kernel/gpu/cuda_impl/rmsprop_impl.cu index 31a4d97dff..913aaa3b8d 100644 --- a/mindspore/ccsrc/kernel/gpu/cuda_impl/rmsprop_impl.cu +++ b/mindspore/ccsrc/kernel/gpu/cuda_impl/rmsprop_impl.cu @@ -19,17 +19,17 @@ #include "device/gpu/cuda_common.h" template -__global__ void RmsPropKernel(const T* learning_rate, const T* decay, const T* momentum, const T* epsilon, T* variable, +__global__ void RmsPropKernel(const T* learning_rate, const T decay, const T momentum, const T epsilon, T* variable, T* mean_square, T*moment, T* gradients, const size_t size) { for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (size); i += blockDim.x * gridDim.x) { - mean_square[i] = decay[0] * mean_square[i] + (1.0 - decay[0]) * gradients[i] * gradients[i]; - moment[i] = momentum[0] * moment[i] + learning_rate[0] * rsqrt(mean_square[i] + epsilon[0]) * gradients[i]; + mean_square[i] = decay * mean_square[i] + (1.0 - decay) * gradients[i] * gradients[i]; + moment[i] = momentum * moment[i] + learning_rate[0] * rsqrt(mean_square[i] + epsilon) * gradients[i]; variable[i] -= moment[i]; } } template -void RmsProp(const T* learning_rate, const T* decay, const T* momentum, 
const T* epsilon, +void RmsProp(const T* learning_rate, const T decay, const T momentum, const T epsilon, T* variable, T* mean_square, T* moment, T* gradients, const size_t size, cudaStream_t cuda_stream) { RmsPropKernel<<>>(learning_rate, decay, momentum, epsilon, variable, mean_square, moment, gradients, size); @@ -58,7 +58,7 @@ void RmsPropCenter(const T* learning_rate, const T* decay, const T* momentum, co } template -void RmsProp(const float* learning_rate, const float* decay, const float* momentum, const float* epsilon, +void RmsProp(const float* learning_rate, const float decay, const float momentum, const float epsilon, float* variable, float* mean_square, float* moment, float* gradients, const size_t size, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/kernel/gpu/cuda_impl/rmsprop_impl.cuh b/mindspore/ccsrc/kernel/gpu/cuda_impl/rmsprop_impl.cuh index 62d7e19ba2..b5802dbb67 100644 --- a/mindspore/ccsrc/kernel/gpu/cuda_impl/rmsprop_impl.cuh +++ b/mindspore/ccsrc/kernel/gpu/cuda_impl/rmsprop_impl.cuh @@ -19,7 +19,7 @@ #include "device/gpu/cuda_common.h" template -void RmsProp(const T* learning_rate, const T* decay, const T* momentum, const T* epsilon, T* variable, T* mean_square, +void RmsProp(const T* learning_rate, const T decay, const T momentum, const T epsilon, T* variable, T* mean_square, T* moment, T* gradients, const size_t size, cudaStream_t cuda_stream); template diff --git a/mindspore/ccsrc/kernel/gpu/cuda_impl/sigmoid_cross_entropy_with_logits_grad_impl.cu b/mindspore/ccsrc/kernel/gpu/cuda_impl/sigmoid_cross_entropy_with_logits_grad_impl.cu new file mode 100644 index 0000000000..a0082b84c8 --- /dev/null +++ b/mindspore/ccsrc/kernel/gpu/cuda_impl/sigmoid_cross_entropy_with_logits_grad_impl.cu @@ -0,0 +1,41 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel/gpu/cuda_impl/sigmoid_cross_entropy_with_logits_grad_impl.cuh" + +template +__global__ void SigmoidCrossEntropyWithLogitsGradKernel(const size_t size, const T *logits, const S *labels, + T *outputs) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x) { + if (logits[i] >= 0) { + outputs[i] = 1. / (1. + exp(-logits[i])) - labels[i]; + } else { + const T exp_val = exp(logits[i]); + outputs[i] = exp_val / (1. + exp_val) - labels[i]; + } + } +} + +template +void SigmoidCrossEntropyWithLogitsGrad(const size_t size, const T *logits, const S *labels, T *outputs, + cudaStream_t cuda_stream) { + SigmoidCrossEntropyWithLogitsGradKernel<<>>(size, logits, labels, + outputs); +} + +template void SigmoidCrossEntropyWithLogitsGrad(const size_t size, const float *logits, + const float *labels, float *outputs, + cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/kernel/gpu/cuda_impl/sigmoid_cross_entropy_with_logits_grad_impl.cuh b/mindspore/ccsrc/kernel/gpu/cuda_impl/sigmoid_cross_entropy_with_logits_grad_impl.cuh new file mode 100644 index 0000000000..2cd4922d25 --- /dev/null +++ b/mindspore/ccsrc/kernel/gpu/cuda_impl/sigmoid_cross_entropy_with_logits_grad_impl.cuh @@ -0,0 +1,25 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_SIGMOID_CROSS_ENTROPY_WITH_LOGITS_GRAD_IMPL_H_ +#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_SIGMOID_CROSS_ENTROPY_WITH_LOGITS_GRAD_IMPL_H_ + +#include "device/gpu/cuda_common.h" +template +void SigmoidCrossEntropyWithLogitsGrad(const size_t size, const T *logits, const S *labels, T *outputs, + cudaStream_t cuda_stream); + +#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_SIGMOID_CROSS_ENTROPY_WITH_LOGITS_GRAD_IMPL_H_ diff --git a/mindspore/ccsrc/kernel/gpu/cuda_impl/sigmoid_cross_entropy_with_logits_impl.cu b/mindspore/ccsrc/kernel/gpu/cuda_impl/sigmoid_cross_entropy_with_logits_impl.cu new file mode 100644 index 0000000000..3766f367db --- /dev/null +++ b/mindspore/ccsrc/kernel/gpu/cuda_impl/sigmoid_cross_entropy_with_logits_impl.cu @@ -0,0 +1,34 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "kernel/gpu/cuda_impl/sigmoid_cross_entropy_with_logits_impl.cuh" + +template +__global__ void SigmoidCrossEntropyWithLogitsKernel(const size_t size, const T *logits, const S *labels, T *outputs) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x) { + const T reverse_factor = static_cast(logits[i] >= 0); + outputs[i] = log1p(exp(logits[i] - 2 * reverse_factor * logits[i])) - logits[i] * (labels[i] - reverse_factor); + } +} + +template +void SigmoidCrossEntropyWithLogits(const size_t size, const T *logits, const S *labels, T *outputs, + cudaStream_t cuda_stream) { + SigmoidCrossEntropyWithLogitsKernel<<>>(size, logits, labels, outputs); +} + +template void SigmoidCrossEntropyWithLogits(const size_t size, const float *logits, const float *labels, + float *outputs, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/kernel/gpu/cuda_impl/sigmoid_cross_entropy_with_logits_impl.cuh b/mindspore/ccsrc/kernel/gpu/cuda_impl/sigmoid_cross_entropy_with_logits_impl.cuh new file mode 100644 index 0000000000..575605bde0 --- /dev/null +++ b/mindspore/ccsrc/kernel/gpu/cuda_impl/sigmoid_cross_entropy_with_logits_impl.cuh @@ -0,0 +1,25 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_SIGMOID_CROSS_ENTROPY_WITH_LOGITS_IMPL_H_ +#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_SIGMOID_CROSS_ENTROPY_WITH_LOGITS_IMPL_H_ + +#include "device/gpu/cuda_common.h" +template +void SigmoidCrossEntropyWithLogits(const size_t size, const T *logits, const S *labels, T *outputs, + cudaStream_t cuda_stream); + +#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_SIGMOID_CROSS_ENTROPY_WITH_LOGITS_IMPL_H_ diff --git a/mindspore/ccsrc/kernel/gpu/cuda_impl/tanh_impl.cu b/mindspore/ccsrc/kernel/gpu/cuda_impl/tanh_impl.cu deleted file mode 100644 index 5471ffb5d9..0000000000 --- a/mindspore/ccsrc/kernel/gpu/cuda_impl/tanh_impl.cu +++ /dev/null @@ -1,46 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "kernel/gpu/cuda_impl/tanh_impl.cuh" -#include - -template -__global__ void TanhKernel(const size_t size, const T* x_addr, T* y_addr) { - for (int pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) { - y_addr[pos] = tanh(x_addr[pos]); - } -} - -template -__global__ void TanhGradKernel(const size_t size, const T* y_addr, const T* dy_addr, T* dx_addr) { - for (int pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) { - dx_addr[pos] = dy_addr[pos] * (1 - y_addr[pos] * y_addr[pos]); - } -} - -template -void Tanh(const size_t size, const T* x_addr, T* y_addr, cudaStream_t cuda_stream) { - TanhKernel<<>>(size, x_addr, y_addr); -} - -template -void TanhGrad(const size_t size, const T* y_addr, const T* dy_addr, T* dx_addr, cudaStream_t cuda_stream) { - TanhGradKernel<<>>(size, y_addr, dy_addr, dx_addr); -} - -template void Tanh(const size_t size, const float* x_addr, float* y_addr, cudaStream_t cuda_stream); -template void TanhGrad(const size_t size, const float* y_addr, const float* dy_addr, - float* dx_addr, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/kernel/gpu/data/dataset_iterator_kernel.cc b/mindspore/ccsrc/kernel/gpu/data/dataset_iterator_kernel.cc index d416d7df67..13ca191b0b 100644 --- a/mindspore/ccsrc/kernel/gpu/data/dataset_iterator_kernel.cc +++ b/mindspore/ccsrc/kernel/gpu/data/dataset_iterator_kernel.cc @@ -96,7 +96,8 @@ bool DatasetIteratorKernel::Launch(const std::vector &, const std::v } for (size_t i = 0; i < output_size_list_.size(); i++) { - CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(outputs[i]->addr, addr, output_size_list_[i], cudaMemcpyDeviceToDevice, + void *output_addr = GetDeviceAddress(outputs, i); + CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(output_addr, addr, output_size_list_[i], cudaMemcpyDeviceToDevice, reinterpret_cast(stream)), "Cuda Memcpy Failed"); addr = reinterpret_cast(addr) + output_size_list_[i]; diff --git 
a/mindspore/ccsrc/kernel/gpu/gpu_kernel.h b/mindspore/ccsrc/kernel/gpu/gpu_kernel.h index 9f8090451f..c935798f06 100644 --- a/mindspore/ccsrc/kernel/gpu/gpu_kernel.h +++ b/mindspore/ccsrc/kernel/gpu/gpu_kernel.h @@ -22,6 +22,7 @@ #include #include #include "kernel/kernel.h" +#include "kernel/gpu/kernel_constants.h" #include "device/gpu/gpu_device_manager.h" #include "device/gpu/gpu_common.h" #include "session/anf_runtime_algorithm.h" @@ -63,6 +64,9 @@ class GpuKernel : public KernelMod { } // expand Nd Shape to 4d (N in [0,4]) void ShapeNdTo4d(const std::vector &src, std::vector *dst) { + if (src.size() > 4) { + MS_EXCEPTION(ValueError) << src.size() << "-D data is not supported!"; + } dst->push_back(src.size() < 4 ? 1 : SizeToInt(src[src.size() - 4])); dst->push_back(src.size() < 3 ? 1 : SizeToInt(src[src.size() - 3])); dst->push_back(src.size() < 2 ? 1 : SizeToInt(src[src.size() - 2])); @@ -79,6 +83,22 @@ class GpuKernel : public KernelMod { "must match the corresponding dimension of outC or must be equal to 1."; } } + + // choose the suitable datatype for cudnn/cublas + inline cudnnDataType_t GetCudnnDataType(const std::string &Type) { + auto type = kCudnnDtypeMap.find(Type); + if (type == kCudnnDtypeMap.end()) { + MS_EXCEPTION(TypeError) << Type << " is not supported."; + } + return type->second; + } + inline cudaDataType_t GetCudaDataType(const std::string &Type) { + auto type = kCudaDtypeMap.find(Type); + if (type == kCudaDtypeMap.end()) { + MS_EXCEPTION(TypeError) << Type << " is not supported."; + } + return type->second; + } }; } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/kernel/gpu/math/addn_gpu_kernel.h b/mindspore/ccsrc/kernel/gpu/math/addn_gpu_kernel.h index 0b27602761..1498da777f 100644 --- a/mindspore/ccsrc/kernel/gpu/math/addn_gpu_kernel.h +++ b/mindspore/ccsrc/kernel/gpu/math/addn_gpu_kernel.h @@ -60,7 +60,7 @@ class AddNGpuFwdKernel : public GpuKernel { } bool Init(const CNodePtr &kernel_node) override { 
InitResource(); - cudnn_data_type_ = kCudnnDtypeMap[TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))]; + cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))); size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); num_input_ = GetAttr(kernel_node, "n"); if (IntToSize(num_input_) != input_num) { diff --git a/mindspore/ccsrc/kernel/gpu/math/bias_add_gpu_kernel.h b/mindspore/ccsrc/kernel/gpu/math/bias_add_gpu_kernel.h index 5d197e3cde..5a664db2e1 100644 --- a/mindspore/ccsrc/kernel/gpu/math/bias_add_gpu_kernel.h +++ b/mindspore/ccsrc/kernel/gpu/math/bias_add_gpu_kernel.h @@ -67,7 +67,7 @@ class BiasAddGpuKernel : public GpuKernel { } bool Init(const CNodePtr &kernel_node) override { InitResource(); - cudnn_data_type_ = kCudnnDtypeMap[TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))]; + cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))); auto x_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); auto num_dims = x_shape.size(); is_null_input_ = CHECK_NULL_INPUT(x_shape); diff --git a/mindspore/ccsrc/kernel/gpu/math/broadcast_gpu_kernel.cc b/mindspore/ccsrc/kernel/gpu/math/broadcast_gpu_kernel.cc index 15beef39d0..e299946780 100644 --- a/mindspore/ccsrc/kernel/gpu/math/broadcast_gpu_kernel.cc +++ b/mindspore/ccsrc/kernel/gpu/math/broadcast_gpu_kernel.cc @@ -47,6 +47,10 @@ MS_REG_GPU_KERNEL_TWO( MS_REG_GPU_KERNEL_TWO( Sub, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), BroadcastOpGpuKernel, float, float) +MS_REG_GPU_KERNEL_TWO( + TensorAdd, + KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), + BroadcastOpGpuKernel, float, float) // fp16 MS_REG_GPU_KERNEL_TWO( @@ -77,5 +81,20 @@ MS_REG_GPU_KERNEL_TWO( MS_REG_GPU_KERNEL_TWO( Sub, 
KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16), BroadcastOpGpuKernel, half, half) +MS_REG_GPU_KERNEL_TWO( + TensorAdd, + KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16), + BroadcastOpGpuKernel, half, half) + +// int32 +MS_REG_GPU_KERNEL_TWO( + TensorAdd, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32), + BroadcastOpGpuKernel, int, int) +MS_REG_GPU_KERNEL_TWO( + Minimum, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32), + BroadcastOpGpuKernel, int, int) +MS_REG_GPU_KERNEL_TWO( + Maximum, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32), + BroadcastOpGpuKernel, int, int) } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/kernel/gpu/math/broadcast_gpu_kernel.h b/mindspore/ccsrc/kernel/gpu/math/broadcast_gpu_kernel.h index c652d9aae4..be7d3a19d4 100644 --- a/mindspore/ccsrc/kernel/gpu/math/broadcast_gpu_kernel.h +++ b/mindspore/ccsrc/kernel/gpu/math/broadcast_gpu_kernel.h @@ -68,14 +68,14 @@ class BroadcastOpGpuKernel : public GpuKernel { output_shape_[i] = shape3[i]; output_num_ *= shape3[i]; } - int offset = shape3.size() - shape1.size(); + int lhs_offset = shape3.size() - shape1.size(); for (size_t j = 0; j < shape1.size(); j++) { - lhs_shape_[j + offset] = shape1[j]; + lhs_shape_[j + lhs_offset] = shape1[j]; input1_num_ *= shape1[j]; } - offset = shape3.size() - shape2.size(); + int rhs_offset = shape3.size() - shape2.size(); for (size_t k = 0; k < shape2.size(); k++) { - rhs_shape_[k + offset] = shape2[k]; + rhs_shape_[k + rhs_offset] = shape2[k]; input2_num_ *= shape2[k]; } @@ -98,7 +98,7 @@ class BroadcastOpGpuKernel : public GpuKernel { static std::map kBroadcastTypeMap = { {"Greater", BROADCAST_TYPE_GREATER}, {"Less", 
BROADCAST_TYPE_LESS}, {"Maximum", BROADCAST_TYPE_MAXIMUM}, {"Minimum", BROADCAST_TYPE_MINIMUM}, {"Pow", BROADCAST_TYPE_POWER}, {"RealDiv", BROADCAST_TYPE_REALDIV}, - {"Mul", BROADCAST_TYPE_MUL}, {"Sub", BROADCAST_TYPE_SUB}, + {"Mul", BROADCAST_TYPE_MUL}, {"Sub", BROADCAST_TYPE_SUB}, {"TensorAdd", BROADCAST_TYPE_ADD}, }; auto iter = kBroadcastTypeMap.find(kernel_name); diff --git a/mindspore/ccsrc/kernel/gpu/math/broadcast_grad_gpu_kernel.cc b/mindspore/ccsrc/kernel/gpu/math/broadcast_grad_gpu_kernel.cc index edc51d4ffd..85598cf940 100644 --- a/mindspore/ccsrc/kernel/gpu/math/broadcast_grad_gpu_kernel.cc +++ b/mindspore/ccsrc/kernel/gpu/math/broadcast_grad_gpu_kernel.cc @@ -34,5 +34,21 @@ MS_REG_GPU_KERNEL_ONE(MaximumGrad, .AddOutputAttr(kNumberTypeFloat32) .AddOutputAttr(kNumberTypeFloat32), BroadcastOpGradGpuKernel, float) +MS_REG_GPU_KERNEL_ONE(MinimumGrad, + KernelAttr() + .AddInputAttr(kNumberTypeInt32) + .AddInputAttr(kNumberTypeInt32) + .AddInputAttr(kNumberTypeInt32) + .AddOutputAttr(kNumberTypeInt32) + .AddOutputAttr(kNumberTypeInt32), + BroadcastOpGradGpuKernel, int) +MS_REG_GPU_KERNEL_ONE(MaximumGrad, + KernelAttr() + .AddInputAttr(kNumberTypeInt32) + .AddInputAttr(kNumberTypeInt32) + .AddInputAttr(kNumberTypeInt32) + .AddOutputAttr(kNumberTypeInt32) + .AddOutputAttr(kNumberTypeInt32), + BroadcastOpGradGpuKernel, int) } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/kernel/gpu/math/broadcast_grad_gpu_kernel.h b/mindspore/ccsrc/kernel/gpu/math/broadcast_grad_gpu_kernel.h index 3e1f91b5b7..f1eb5fecf9 100644 --- a/mindspore/ccsrc/kernel/gpu/math/broadcast_grad_gpu_kernel.h +++ b/mindspore/ccsrc/kernel/gpu/math/broadcast_grad_gpu_kernel.h @@ -74,14 +74,14 @@ class BroadcastOpGradGpuKernel : public GpuKernel { dy_shape_[i] = shape3[i]; output_num_ *= shape3[i]; } - int offset = shape3.size() - shape1.size(); + int x1_offset = shape3.size() - shape1.size(); for (size_t i = 0; i < shape1.size(); i++) { - x1_shape_[i + offset] = 
shape1[i]; + x1_shape_[i + x1_offset] = shape1[i]; input1_num_ *= shape1[i]; } - offset = shape3.size() - shape2.size(); + int x2_offset = shape3.size() - shape2.size(); for (size_t i = 0; i < shape2.size(); i++) { - x2_shape_[i + offset] = shape2[i]; + x2_shape_[i + x2_offset] = shape2[i]; input2_num_ *= shape2[i]; } diff --git a/mindspore/ccsrc/kernel/gpu/math/matmul_gpu_kernel.h b/mindspore/ccsrc/kernel/gpu/math/matmul_gpu_kernel.h index 59153c7041..3ee3493ed6 100644 --- a/mindspore/ccsrc/kernel/gpu/math/matmul_gpu_kernel.h +++ b/mindspore/ccsrc/kernel/gpu/math/matmul_gpu_kernel.h @@ -82,9 +82,9 @@ class MatMulGpuKernel : public GpuKernel { } bool Init(const CNodePtr &kernel_node) override { handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCublasHandle(); - dtype_a_ = kCudaDtypeMap[TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))]; - dtype_b_ = kCudaDtypeMap[TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 1))]; - dtype_c_ = kCudaDtypeMap[TypeIdLabel(AnfAlgo::GetOutputDeviceDataType(kernel_node, 0))]; + dtype_a_ = GetCudaDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))); + dtype_b_ = GetCudaDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 1))); + dtype_c_ = GetCudaDataType(TypeIdLabel(AnfAlgo::GetOutputDeviceDataType(kernel_node, 0))); auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0); is_null_input_ = CHECK_NULL_INPUT(output_shape); if (is_null_input_) { diff --git a/mindspore/ccsrc/kernel/gpu/math/tensoradd_gpu_kernel.h b/mindspore/ccsrc/kernel/gpu/math/tensoradd_gpu_kernel.h deleted file mode 100644 index 67c6a34f3f..0000000000 --- a/mindspore/ccsrc/kernel/gpu/math/tensoradd_gpu_kernel.h +++ /dev/null @@ -1,171 +0,0 @@ -/** - * Copyright 2019 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_TENSORADD_GPU_KERNEL_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_TENSORADD_GPU_KERNEL_H_ - -#include -#include "kernel/gpu/gpu_kernel.h" -#include "kernel/gpu/gpu_kernel_factory.h" -#include "kernel/gpu/kernel_constants.h" -namespace mindspore { -namespace kernel { -template -class TensorAddGpuFwdKernel : public GpuKernel { - public: - TensorAddGpuFwdKernel() - : cudnn_handle_(nullptr), - inputA_descriptor_(nullptr), - inputB_descriptor_(nullptr), - opTensor_descriptor_(nullptr), - cudnn_data_type_(CUDNN_DATA_FLOAT), - input_size_(0), - output_size_(0), - workspace_size_(0), - is_null_input_(false) {} - ~TensorAddGpuFwdKernel() override { DestroyResource(); } - - const std::vector &GetInputSizeList() const override { return input_size_list_; } - const std::vector &GetOutputSizeList() const override { return output_size_list_; } - const std::vector &GetWorkspaceSizeList() const override { return workspace_size_list_; } - - bool Launch(const std::vector &inputs, const std::vector &, - const std::vector &outputs, void *) { - if (is_null_input_) { - return true; - } - T *input_addr = GetDeviceAddress(inputs, 0); - T *input_addr2 = GetDeviceAddress(inputs, 1); - T *output_addr = GetDeviceAddress(outputs, 0); - const float alpha = 1; - const float beta = 0; - // A + B = C. [ C = op(alpha1[0] * A, alpha2[0] * B) + beta[0] * C ] - // InputA must match the corresponding dimension of the destination tensor outC, and each dimension of the inputB - // must match the corresponding dimension of outC or must be equal to 1. 
- if (inputs[0]->size > inputs[1]->size) { - CHECK_CUDNN_RET_WITH_EXCEPT( - cudnnOpTensor(cudnn_handle_, opTensor_descriptor_, &alpha, inputA_descriptor_, input_addr, &alpha, - inputB_descriptor_, input_addr2, &beta, inputA_descriptor_, output_addr), - "cudnnOpTensor Add failed"); - } else { - CHECK_CUDNN_RET_WITH_EXCEPT( - cudnnOpTensor(cudnn_handle_, opTensor_descriptor_, &alpha, inputB_descriptor_, input_addr2, &alpha, - inputA_descriptor_, input_addr, &beta, inputB_descriptor_, output_addr), - "cudnnOpTensor Add failed"); - } - return true; - } - bool Init(const CNodePtr &kernel_node) { - InitResource(); - cudnn_data_type_ = kCudnnDtypeMap[TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))]; - if (cudnn_data_type_ == CUDNN_DATA_INT32) { - cudnn_data_type_ = CUDNN_DATA_FLOAT; - } - size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); - if (input_num != 2) { - MS_LOG(ERROR) << "Input number is " << input_num << ", but cudnnAddTensor needs 2 inputs."; - return false; - } - size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); - if (output_num != 1) { - MS_LOG(ERROR) << "Output number is " << output_num << ", but cudnnAddTensor needs 1 output."; - return false; - } - auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); - auto input_shapeB = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1); - auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0); - is_null_input_ = CHECK_NULL_INPUT(input_shape) || CHECK_NULL_INPUT(input_shapeB); - if (is_null_input_) { - MS_LOG(WARNING) << "TensorAddGpuFwdKernel input is null"; - InitSizeLists(); - return true; - } - std::vector shapeA; - std::vector shapeB; - std::vector shapeOut; - ShapeNdTo4d(input_shape, &shapeA); - ShapeNdTo4d(input_shapeB, &shapeB); - ShapeNdTo4d(output_shape, &shapeOut); - CheckBroadcast4TensorOp(shapeA, shapeB, shapeOut); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensor4dDescriptor(inputA_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_, - shapeA[0], 
shapeA[1], shapeA[2], shapeA[3]), - "cudnnSetTensor4dDescriptor failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensor4dDescriptor(inputB_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_, - shapeB[0], shapeB[1], shapeB[2], shapeB[3]), - "cudnnSetTensor4dDescriptor failed"); - - CHECK_CUDNN_RET_WITH_EXCEPT( - cudnnSetOpTensorDescriptor(opTensor_descriptor_, CUDNN_OP_TENSOR_ADD, CUDNN_DATA_FLOAT, CUDNN_NOT_PROPAGATE_NAN), - "cudnnSetOpTensorDescriptor failed"); - - InitSizeLists(); - return true; - } - - protected: - void InitResource() { - cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&inputA_descriptor_), "cudnnCreateTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&inputB_descriptor_), "cudnnCreateTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateOpTensorDescriptor(&opTensor_descriptor_), - "cudnnCreateOpTensorDescriptor failed"); - } - void InitSizeLists() { - if (!is_null_input_) { - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(inputA_descriptor_, &input_size_), - "cudnnGetTensorSizeInBytes failed"); - input_size_list_.push_back(input_size_); - CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(inputB_descriptor_, &output_size_), - "cudnnGetTensorSizeInBytes failed"); - } - input_size_list_.push_back(output_size_); - - if (output_size_ > input_size_) { - output_size_list_.push_back(output_size_); - } else { - output_size_list_.push_back(input_size_); - } - workspace_size_list_.push_back(workspace_size_); - - return; - } - - private: - void DestroyResource() noexcept { - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(inputA_descriptor_), "cudnnDestroyTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(inputB_descriptor_), "cudnnDestroyTensorDescriptor failed"); - CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyOpTensorDescriptor(opTensor_descriptor_), - 
"cudnnDestroyOpTensorDescriptor failed"); - } - cudnnHandle_t cudnn_handle_; - cudnnTensorDescriptor_t inputA_descriptor_; - cudnnTensorDescriptor_t inputB_descriptor_; - cudnnOpTensorDescriptor_t opTensor_descriptor_; - cudnnDataType_t cudnn_data_type_; - - std::vector input_size_list_; - std::vector output_size_list_; - std::vector workspace_size_list_; - - size_t input_size_; - size_t output_size_; - size_t workspace_size_; - bool is_null_input_; -}; -} // namespace kernel -} // namespace mindspore - -#endif // MINDSPORE_CCSRC_KERNEL_GPU_TENSORADD_GPU_KERNEL_H_ diff --git a/mindspore/ccsrc/kernel/gpu/nn/activation_gpu_kernel.cc b/mindspore/ccsrc/kernel/gpu/nn/activation_gpu_kernel.cc new file mode 100644 index 0000000000..5e80cccd75 --- /dev/null +++ b/mindspore/ccsrc/kernel/gpu/nn/activation_gpu_kernel.cc @@ -0,0 +1,36 @@ +/** + * Copyright 2019 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "kernel/gpu/nn/activation_gpu_kernel.h" + +namespace mindspore { +namespace kernel { +MS_REG_GPU_KERNEL_ONE(ReLU, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), + ActivationGpuFwdKernel, float) +MS_REG_GPU_KERNEL_ONE(ReLU, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16), + ActivationGpuFwdKernel, half) + +MS_REG_GPU_KERNEL_ONE(Tanh, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), + ActivationGpuFwdKernel, float) +MS_REG_GPU_KERNEL_ONE(Tanh, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16), + ActivationGpuFwdKernel, half) + +MS_REG_GPU_KERNEL_ONE(Sigmoid, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), + ActivationGpuFwdKernel, float) +MS_REG_GPU_KERNEL_ONE(Sigmoid, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16), + ActivationGpuFwdKernel, half) +} // namespace kernel +} // namespace mindspore diff --git a/mindspore/ccsrc/kernel/gpu/nn/relu_gpu_kernel.h b/mindspore/ccsrc/kernel/gpu/nn/activation_gpu_kernel.h similarity index 79% rename from mindspore/ccsrc/kernel/gpu/nn/relu_gpu_kernel.h rename to mindspore/ccsrc/kernel/gpu/nn/activation_gpu_kernel.h index 4cebc45831..bf6cfa7b23 100644 --- a/mindspore/ccsrc/kernel/gpu/nn/relu_gpu_kernel.h +++ b/mindspore/ccsrc/kernel/gpu/nn/activation_gpu_kernel.h @@ -18,6 +18,8 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_NN_RELU_GPU_KERNEL_H_ #include +#include +#include #include "kernel/gpu/gpu_kernel.h" #include "kernel/gpu/gpu_kernel_factory.h" #include "kernel/gpu/kernel_constants.h" @@ -25,9 +27,9 @@ namespace mindspore { namespace kernel { template -class ReLUGpuFwdKernel : public GpuKernel { +class ActivationGpuFwdKernel : public GpuKernel { public: - ReLUGpuFwdKernel() + ActivationGpuFwdKernel() : cudnn_handle_(nullptr), activation_desc_(nullptr), mode_(CUDNN_ACTIVATION_RELU), @@ -37,7 +39,7 @@ class ReLUGpuFwdKernel : 
public GpuKernel { input_size_(0), output_size_(0), workspace_size_(0) {} - ~ReLUGpuFwdKernel() override { DestroyResource(); } + ~ActivationGpuFwdKernel() override { DestroyResource(); } const std::vector &GetInputSizeList() const override { return input_size_list_; } const std::vector &GetOutputSizeList() const override { return output_size_list_; } const std::vector &GetWorkspaceSizeList() const override { return workspace_size_list_; } @@ -54,33 +56,39 @@ class ReLUGpuFwdKernel : public GpuKernel { const float beta = 0; CHECK_CUDNN_RET_WITH_EXCEPT(cudnnActivationForward(cudnn_handle_, activation_desc_, &alpha, data_descriptor_, input, &beta, data_descriptor_, output), - "ReLUGpuFwdKernel failed"); + "cudnnActivationForward failed"); return true; } bool Init(const CNodePtr &kernel_node) override { + auto node_name = AnfAlgo::GetCNodeName(kernel_node); + auto iter = kernel_map.find(node_name); + if (iter == kernel_map.end()) { + MS_LOG(EXCEPTION) << "Kernel: " << node_name << " not support."; + } + mode_ = iter->second; + InitResource(); - cudnn_data_type_ = kCudnnDtypeMap[TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))]; + cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))); size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); if (input_num != 1) { - MS_LOG(ERROR) << "Argument number is " << input_num << ", but ReLUGpuFwdKernel needs 1."; + MS_LOG(ERROR) << "Argument number is " << input_num << ", but ActivationGpuFwdKernel needs 1."; return false; } auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); is_null_input_ = CHECK_NULL_INPUT(input_shape); if (is_null_input_) { - MS_LOG(WARNING) << "ReLUGpuFwdKernel input is null."; + MS_LOG(WARNING) << "ActivationGpuFwdKernel input is null."; InitSizeLists(); return true; } - mode_ = CUDNN_ACTIVATION_RELU; std::vector shape; ShapeNdTo4d(input_shape, &shape); CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetActivationDescriptor(activation_desc_, 
mode_, CUDNN_NOT_PROPAGATE_NAN, 0.0), - "SetActivationDescriptor failed"); + "cudnnSetActivationDescriptor failed"); CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensor4dDescriptor(data_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_, shape[0], shape[1], shape[2], shape[3]), - "SetTensor4dDescriptor failed"); + "cudnnSetTensor4dDescriptor failed"); InitSizeLists(); return true; } @@ -110,6 +118,11 @@ class ReLUGpuFwdKernel : public GpuKernel { CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(data_descriptor_), "cudnnDestroyTensorDescriptor failed"); } + std::map kernel_map = {{"ReLU", CUDNN_ACTIVATION_RELU}, + {"Tanh", CUDNN_ACTIVATION_TANH}, + {"ELU", CUDNN_ACTIVATION_ELU}, + {"Sigmoid", CUDNN_ACTIVATION_SIGMOID}}; + cudnnHandle_t cudnn_handle_; cudnnActivationDescriptor_t activation_desc_; cudnnActivationMode_t mode_; diff --git a/mindspore/ccsrc/kernel/gpu/math/tensoradd_gpu_kernel.cc b/mindspore/ccsrc/kernel/gpu/nn/activation_grad_kernel.cc similarity index 53% rename from mindspore/ccsrc/kernel/gpu/math/tensoradd_gpu_kernel.cc rename to mindspore/ccsrc/kernel/gpu/nn/activation_grad_kernel.cc index 69716e9165..35d11f8b47 100644 --- a/mindspore/ccsrc/kernel/gpu/math/tensoradd_gpu_kernel.cc +++ b/mindspore/ccsrc/kernel/gpu/nn/activation_grad_kernel.cc @@ -14,20 +14,35 @@ * limitations under the License. 
*/ -#include "kernel/gpu/math/tensoradd_gpu_kernel.h" +#include "kernel/gpu/nn/activation_grad_kernel.h" namespace mindspore { namespace kernel { MS_REG_GPU_KERNEL_ONE( - TensorAdd, + ReluGrad, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), - TensorAddGpuFwdKernel, float) + ActivationGradGpuKernel, float) MS_REG_GPU_KERNEL_ONE( - TensorAdd, + ReluGrad, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16), - TensorAddGpuFwdKernel, half) + ActivationGradGpuKernel, half) + +MS_REG_GPU_KERNEL_ONE( + TanhGrad, + KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), + ActivationGradGpuKernel, float) MS_REG_GPU_KERNEL_ONE( - TensorAdd, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32), - TensorAddGpuFwdKernel, int) + TanhGrad, + KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16), + ActivationGradGpuKernel, half) + +MS_REG_GPU_KERNEL_ONE( + SigmoidGrad, + KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), + ActivationGradGpuKernel, float) +MS_REG_GPU_KERNEL_ONE( + SigmoidGrad, + KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16), + ActivationGradGpuKernel, half) } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/kernel/gpu/nn/relu_grad_kernel.h b/mindspore/ccsrc/kernel/gpu/nn/activation_grad_kernel.h similarity index 77% rename from mindspore/ccsrc/kernel/gpu/nn/relu_grad_kernel.h rename to mindspore/ccsrc/kernel/gpu/nn/activation_grad_kernel.h index ccc037f6e7..38e34eb752 100644 --- a/mindspore/ccsrc/kernel/gpu/nn/relu_grad_kernel.h +++ b/mindspore/ccsrc/kernel/gpu/nn/activation_grad_kernel.h @@ -18,6 +18,8 @@ #define 
MINDSPORE_CCSRC_KERNEL_GPU_NN_RELU_GRAD_KERNEL_H_ #include +#include +#include #include "kernel/gpu/gpu_kernel.h" #include "kernel/gpu/gpu_kernel_factory.h" #include "kernel/gpu/kernel_constants.h" @@ -25,9 +27,9 @@ namespace mindspore { namespace kernel { template -class ReluGradGpuFwdKernel : public GpuKernel { +class ActivationGradGpuKernel : public GpuKernel { public: - ReluGradGpuFwdKernel() + ActivationGradGpuKernel() : cudnn_handle_(nullptr), activation_desc_(nullptr), mode_(CUDNN_ACTIVATION_RELU), @@ -35,7 +37,7 @@ class ReluGradGpuFwdKernel : public GpuKernel { is_null_input_(false), cudnn_data_type_(CUDNN_DATA_FLOAT), input_size_(0) {} - ~ReluGradGpuFwdKernel() override { DestroyResource(); } + ~ActivationGradGpuKernel() override { DestroyResource(); } const std::vector &GetInputSizeList() const override { return input_size_list_; } const std::vector &GetOutputSizeList() const override { return output_size_list_; } const std::vector &GetWorkspaceSizeList() const override { return workspace_size_list_; } @@ -45,8 +47,15 @@ class ReluGradGpuFwdKernel : public GpuKernel { if (is_null_input_) { return true; } - T *y = GetDeviceAddress(inputs, 1); - T *dy = GetDeviceAddress(inputs, 0); + T *dy = nullptr; + T *y = nullptr; + if (mode_ == CUDNN_ACTIVATION_RELU || mode_ == CUDNN_ACTIVATION_ELU) { + dy = GetDeviceAddress(inputs, 0); + y = GetDeviceAddress(inputs, 1); + } else { + y = GetDeviceAddress(inputs, 0); + dy = GetDeviceAddress(inputs, 1); + } T *dx = GetDeviceAddress(outputs, 0); const float alpha = 1; @@ -59,18 +68,24 @@ class ReluGradGpuFwdKernel : public GpuKernel { return true; } bool Init(const CNodePtr &kernel_node) override { + auto node_name = AnfAlgo::GetCNodeName(kernel_node); + auto iter = kernel_map.find(node_name); + if (iter == kernel_map.end()) { + MS_LOG(EXCEPTION) << "Kernel: " << node_name << " not support."; + } + mode_ = iter->second; + InitResource(); - cudnn_data_type_ = 
kCudnnDtypeMap[TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))]; + cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))); size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); if (input_num != 2) { - MS_LOG(ERROR) << "Argument number is " << input_num << ", but ReluGradGpuFwdKernel needs 2."; + MS_LOG(ERROR) << "Argument number is " << input_num << ", but ActivationGradGpuKernel needs 2."; return false; } auto input_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0); - mode_ = CUDNN_ACTIVATION_RELU; is_null_input_ = CHECK_NULL_INPUT(input_shape); if (is_null_input_) { - MS_LOG(WARNING) << "ReluGradGpuFwdKernel input is null."; + MS_LOG(WARNING) << "ActivationGradGpuKernel input is null."; InitSizeLists(); return true; } @@ -110,6 +125,10 @@ class ReluGradGpuFwdKernel : public GpuKernel { CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(data_descriptor_), "cudnnDestroyTensorDescriptor failed"); } + std::map kernel_map = {{"ReluGrad", CUDNN_ACTIVATION_RELU}, + {"TanhGrad", CUDNN_ACTIVATION_TANH}, + {"ELUGrad", CUDNN_ACTIVATION_ELU}, + {"SigmoidGrad", CUDNN_ACTIVATION_SIGMOID}}; cudnnHandle_t cudnn_handle_; cudnnActivationDescriptor_t activation_desc_; cudnnActivationMode_t mode_; diff --git a/mindspore/ccsrc/kernel/gpu/nn/adam_gpu_kernel.cc b/mindspore/ccsrc/kernel/gpu/nn/adam_gpu_kernel.cc new file mode 100644 index 0000000000..049a5cc280 --- /dev/null +++ b/mindspore/ccsrc/kernel/gpu/nn/adam_gpu_kernel.cc @@ -0,0 +1,54 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel/gpu/nn/adam_gpu_kernel.h" + +namespace mindspore { +namespace kernel { +MS_REG_GPU_KERNEL_ONE(Adam, + KernelAttr() + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddOutputAttr(kNumberTypeFloat32) + .AddOutputAttr(kNumberTypeFloat32) + .AddOutputAttr(kNumberTypeFloat32), + AdamGpuKernel, float) +MS_REG_GPU_KERNEL_ONE(Adam, + KernelAttr() + .AddInputAttr(kNumberTypeFloat16) + .AddInputAttr(kNumberTypeFloat16) + .AddInputAttr(kNumberTypeFloat16) + .AddInputAttr(kNumberTypeFloat16) + .AddInputAttr(kNumberTypeFloat16) + .AddInputAttr(kNumberTypeFloat16) + .AddInputAttr(kNumberTypeFloat16) + .AddInputAttr(kNumberTypeFloat16) + .AddInputAttr(kNumberTypeFloat16) + .AddInputAttr(kNumberTypeFloat16) + .AddOutputAttr(kNumberTypeFloat16) + .AddOutputAttr(kNumberTypeFloat16) + .AddOutputAttr(kNumberTypeFloat16), + AdamGpuKernel, half) +} // namespace kernel +} // namespace mindspore diff --git a/mindspore/ccsrc/kernel/gpu/nn/adam_gpu_kernel.h b/mindspore/ccsrc/kernel/gpu/nn/adam_gpu_kernel.h new file mode 100644 index 0000000000..93c6381ab3 --- /dev/null +++ b/mindspore/ccsrc/kernel/gpu/nn/adam_gpu_kernel.h @@ -0,0 +1,142 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 
2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_KERNEL_GPU_NN_ADAM_GPU_KERNEL_H_ +#define MINDSPORE_CCSRC_KERNEL_GPU_NN_ADAM_GPU_KERNEL_H_ + +#include +#include "kernel/gpu/gpu_kernel.h" +#include "kernel/gpu/gpu_kernel_factory.h" +#include "kernel/gpu/cuda_impl/adam_impl.cuh" +namespace mindspore { +namespace kernel { +template +class AdamGpuKernel : public GpuKernel { + public: + AdamGpuKernel() + : variable_size_(0), + m_size_(0), + v_size_(0), + beta1_power_size_(0), + beta2_power_size_(0), + learning_rate_size_(0), + beta1_size_(0), + beta2_size_(0), + epsilon_size_(0), + gradient_size_(0) {} + + ~AdamGpuKernel() override = default; + + const std::vector &GetInputSizeList() const override { return input_size_list_; } + const std::vector &GetOutputSizeList() const override { return output_size_list_; } + const std::vector &GetWorkspaceSizeList() const override { return workspace_size_list_; } + + bool Launch(const std::vector &inputs, const std::vector &, const std::vector &, + void *stream_ptr) override { + T *variable = GetDeviceAddress(inputs, 0); + T *m = GetDeviceAddress(inputs, 1); + T *v = GetDeviceAddress(inputs, 2); + T *beta1_power = GetDeviceAddress(inputs, 3); + T *beta2_power = GetDeviceAddress(inputs, 4); + T *learning_rate = GetDeviceAddress(inputs, 5); + T *beta1 = GetDeviceAddress(inputs, 6); + T *beta2 = GetDeviceAddress(inputs, 7); + T *epsilon = GetDeviceAddress(inputs, 8); + T *gradient = GetDeviceAddress(inputs, 9); + ApplyAdam(inputs[0]->size / 
sizeof(T), gradient, beta1_power, beta2_power, learning_rate, beta1, beta2, epsilon, + variable, m, v, reinterpret_cast(stream_ptr)); + return true; + } + + bool Init(const CNodePtr &kernel_node) override { + size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); + if (input_num != 10) { + MS_LOG(ERROR) << "Input number is " << input_num << ", but adam needs 10 inputs."; + return false; + } + + variable_size_ = sizeof(T); + m_size_ = sizeof(T); + v_size_ = sizeof(T); + beta1_power_size_ = sizeof(T); + beta2_power_size_ = sizeof(T); + learning_rate_size_ = sizeof(T); + beta1_size_ = sizeof(T); + beta2_size_ = sizeof(T); + epsilon_size_ = sizeof(T); + gradient_size_ = sizeof(T); + + auto variable_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); + for (size_t i = 0; i < variable_shape.size(); i++) { + variable_size_ *= variable_shape[i]; + } + + auto m_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1); + for (size_t i = 0; i < m_shape.size(); i++) { + m_size_ *= m_shape[i]; + } + + auto v_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2); + for (size_t i = 0; i < v_shape.size(); i++) { + v_size_ *= v_shape[i]; + } + + auto gradient_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 9); + for (size_t i = 0; i < gradient_shape.size(); i++) { + gradient_size_ *= gradient_shape[i]; + } + + InitSizeLists(); + return true; + } + + protected: + void InitSizeLists() override { + input_size_list_.push_back(variable_size_); + input_size_list_.push_back(m_size_); + input_size_list_.push_back(v_size_); + input_size_list_.push_back(beta1_power_size_); + input_size_list_.push_back(beta2_power_size_); + input_size_list_.push_back(learning_rate_size_); + input_size_list_.push_back(beta1_size_); + input_size_list_.push_back(beta2_size_); + input_size_list_.push_back(epsilon_size_); + input_size_list_.push_back(gradient_size_); + output_size_list_.push_back(0); + output_size_list_.push_back(0); + output_size_list_.push_back(0); + } + + 
private: + size_t variable_size_; + size_t m_size_; + size_t v_size_; + size_t beta1_power_size_; + size_t beta2_power_size_; + size_t learning_rate_size_; + size_t beta1_size_; + size_t beta2_size_; + size_t epsilon_size_; + size_t gradient_size_; + + std::vector input_size_list_; + std::vector output_size_list_; + std::vector workspace_size_list_; +}; +} // namespace kernel +} // namespace mindspore + +#endif // MINDSPORE_CCSRC_KERNEL_GPU_NN_ADAM_GPU_KERNEL_H_ diff --git a/mindspore/ccsrc/kernel/gpu/nn/bias_add_grad_gpu_kenel.h b/mindspore/ccsrc/kernel/gpu/nn/bias_add_grad_gpu_kenel.h index c93a050649..9b4f18d24c 100644 --- a/mindspore/ccsrc/kernel/gpu/nn/bias_add_grad_gpu_kenel.h +++ b/mindspore/ccsrc/kernel/gpu/nn/bias_add_grad_gpu_kenel.h @@ -68,7 +68,7 @@ class BiasAddGradGpuKernel : public GpuKernel { } bool Init(const CNodePtr &kernel_node) override { InitResource(); - cudnn_data_type_ = kCudnnDtypeMap[TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))]; + cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))); auto dy_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); auto num_dims = dy_shape.size(); if (num_dims < 2) { diff --git a/mindspore/ccsrc/kernel/gpu/nn/conv2d_gpu_kernel.h b/mindspore/ccsrc/kernel/gpu/nn/conv2d_gpu_kernel.h index 7bb6aa2a6d..f51cbfef33 100644 --- a/mindspore/ccsrc/kernel/gpu/nn/conv2d_gpu_kernel.h +++ b/mindspore/ccsrc/kernel/gpu/nn/conv2d_gpu_kernel.h @@ -191,7 +191,7 @@ class Conv2dGpuFwdKernel : public GpuKernel { CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(input_desc_), "cudnnDestroyTensorDescriptor failed"); } bool CheckParam(const CNodePtr &kernel_node) { - cudnn_data_type_ = kCudnnDtypeMap[TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))]; + cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))); size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); if (input_num != 2) { MS_LOG(ERROR) << 
"Input number is " << input_num << ", but conv2d needs 2 inputs."; diff --git a/mindspore/ccsrc/kernel/gpu/nn/conv2d_grad_filter_gpu_kernel.h b/mindspore/ccsrc/kernel/gpu/nn/conv2d_grad_filter_gpu_kernel.h index b126b542dd..0d7be25772 100644 --- a/mindspore/ccsrc/kernel/gpu/nn/conv2d_grad_filter_gpu_kernel.h +++ b/mindspore/ccsrc/kernel/gpu/nn/conv2d_grad_filter_gpu_kernel.h @@ -98,7 +98,7 @@ class ConvGradFilterGpuBkwKernel : public GpuKernel { if (!CheckParam(kernel_node)) { return false; } - cudnn_data_type_ = kCudnnDtypeMap[TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))]; + cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))); auto dy_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); auto in_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1); is_null_input_ = CHECK_NULL_INPUT(dy_shape) || CHECK_NULL_INPUT(in_shape); diff --git a/mindspore/ccsrc/kernel/gpu/nn/conv2d_grad_input_gpu_kernel.h b/mindspore/ccsrc/kernel/gpu/nn/conv2d_grad_input_gpu_kernel.h index f7f371067f..a33ea5b4da 100644 --- a/mindspore/ccsrc/kernel/gpu/nn/conv2d_grad_input_gpu_kernel.h +++ b/mindspore/ccsrc/kernel/gpu/nn/conv2d_grad_input_gpu_kernel.h @@ -98,7 +98,7 @@ class ConvGradInputGpuBkwKernel : public GpuKernel { if (!CheckParam(kernel_node)) { return false; } - cudnn_data_type_ = kCudnnDtypeMap[TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))]; + cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))); auto dy_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); auto filter_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1); is_null_input_ = CHECK_NULL_INPUT(dy_shape); diff --git a/mindspore/ccsrc/kernel/gpu/nn/dropout_gpu_kernel.cc b/mindspore/ccsrc/kernel/gpu/nn/dropout_gpu_kernel.cc index 0d2a6be9c8..b84dc628e0 100644 --- a/mindspore/ccsrc/kernel/gpu/nn/dropout_gpu_kernel.cc +++ b/mindspore/ccsrc/kernel/gpu/nn/dropout_gpu_kernel.cc 
@@ -23,7 +23,7 @@ DropoutGpuFwdKernel::DropoutGpuFwdKernel() : cudnn_handle_(nullptr), is_null_input_(false), num_count_(0), - drop_prob_(0.0), + keep_prob_(0.0), states_init_(false), mask_generator_(nullptr) {} @@ -54,7 +54,7 @@ bool DropoutGpuFwdKernel::Init(const CNodePtr &kernel_node) { for (size_t x : input_shape) { num_count_ *= x; } - drop_prob_ = GetValue(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("drop_prob")); + keep_prob_ = GetValue(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("keep_prob")); InitSizeLists(); return true; @@ -68,14 +68,12 @@ void DropoutGpuFwdKernel::DestroyResource() noexcept {} void DropoutGpuFwdKernel::InitSizeLists() { size_t input_size = num_count_ * sizeof(float); - size_t workspace_size = 0; input_size_list_.push_back(input_size); output_size_list_.push_back(input_size); // output size: the same with input size output_size_list_.push_back(input_size); // mask size: the same with input size - workspace_size_list_.push_back(workspace_size); } -bool DropoutGpuFwdKernel::Launch(const std::vector &inputs, const std::vector &workspace, +bool DropoutGpuFwdKernel::Launch(const std::vector &inputs, const std::vector &, const std::vector &outputs, void *stream_ptr) { if (is_null_input_) { return true; @@ -92,7 +90,7 @@ bool DropoutGpuFwdKernel::Launch(const std::vector &inputs, const st } curandGenerateUniform(mask_generator_, mask, num_count_); - DropoutForward(input, mask, output, num_count_, drop_prob_, reinterpret_cast(stream_ptr)); + DropoutForward(input, mask, output, num_count_, keep_prob_, reinterpret_cast(stream_ptr)); return true; } diff --git a/mindspore/ccsrc/kernel/gpu/nn/dropout_gpu_kernel.h b/mindspore/ccsrc/kernel/gpu/nn/dropout_gpu_kernel.h index accff17429..81eb78c880 100644 --- a/mindspore/ccsrc/kernel/gpu/nn/dropout_gpu_kernel.h +++ b/mindspore/ccsrc/kernel/gpu/nn/dropout_gpu_kernel.h @@ -52,7 +52,7 @@ class DropoutGpuFwdKernel : public GpuKernel { cudnnHandle_t cudnn_handle_; bool is_null_input_; size_t 
num_count_; - float drop_prob_; + float keep_prob_; bool states_init_; curandGenerator_t mask_generator_; std::vector input_size_list_; diff --git a/mindspore/ccsrc/kernel/gpu/nn/dropout_grad_kernel.cc b/mindspore/ccsrc/kernel/gpu/nn/dropout_grad_kernel.cc index 44f603f02d..2194805e92 100644 --- a/mindspore/ccsrc/kernel/gpu/nn/dropout_grad_kernel.cc +++ b/mindspore/ccsrc/kernel/gpu/nn/dropout_grad_kernel.cc @@ -20,7 +20,7 @@ namespace mindspore { namespace kernel { DropoutGradGpuFwdKernel::DropoutGradGpuFwdKernel() - : cudnn_handle_(nullptr), is_null_input_(false), num_count_(0), drop_prob_(0.0) {} + : cudnn_handle_(nullptr), is_null_input_(false), num_count_(0), keep_prob_(0.0) {} DropoutGradGpuFwdKernel::~DropoutGradGpuFwdKernel() { DestroyResource(); } @@ -50,7 +50,7 @@ bool DropoutGradGpuFwdKernel::Init(const CNodePtr &kernel_node) { for (size_t x : input_shape) { num_count_ *= x; } - drop_prob_ = GetValue(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("drop_prob")); + keep_prob_ = GetValue(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("keep_prob")); InitSizeLists(); return true; @@ -66,15 +66,13 @@ void DropoutGradGpuFwdKernel::InitSizeLists() { size_t dy_size = num_count_ * sizeof(float); size_t mask_size = dy_size; size_t dx_size = dy_size; - size_t workspace_size = 0; input_size_list_.push_back(dy_size); input_size_list_.push_back(mask_size); output_size_list_.push_back(dx_size); - workspace_size_list_.push_back(workspace_size); } -bool DropoutGradGpuFwdKernel::Launch(const std::vector &inputs, const std::vector &workspace, +bool DropoutGradGpuFwdKernel::Launch(const std::vector &inputs, const std::vector &, const std::vector &outputs, void *stream_ptr) { if (is_null_input_) { return true; @@ -84,7 +82,7 @@ bool DropoutGradGpuFwdKernel::Launch(const std::vector &inputs, cons auto *mask = reinterpret_cast(inputs[1]->addr); auto *dx = reinterpret_cast(outputs[0]->addr); - DropoutBackward(dy, mask, dx, num_count_, drop_prob_, 
reinterpret_cast(stream_ptr)); + DropoutBackward(dy, mask, dx, num_count_, keep_prob_, reinterpret_cast(stream_ptr)); return true; } diff --git a/mindspore/ccsrc/kernel/gpu/nn/dropout_grad_kernel.h b/mindspore/ccsrc/kernel/gpu/nn/dropout_grad_kernel.h index 79d4117b58..4991b9dad5 100644 --- a/mindspore/ccsrc/kernel/gpu/nn/dropout_grad_kernel.h +++ b/mindspore/ccsrc/kernel/gpu/nn/dropout_grad_kernel.h @@ -45,7 +45,7 @@ class DropoutGradGpuFwdKernel : public GpuKernel { cudnnHandle_t cudnn_handle_; bool is_null_input_; size_t num_count_; - float drop_prob_; + float keep_prob_; std::vector input_size_list_; std::vector output_size_list_; std::vector workspace_size_list_; diff --git a/mindspore/ccsrc/kernel/gpu/nn/ftrl_gpu_kernel.cc b/mindspore/ccsrc/kernel/gpu/nn/ftrl_gpu_kernel.cc new file mode 100644 index 0000000000..4d30130931 --- /dev/null +++ b/mindspore/ccsrc/kernel/gpu/nn/ftrl_gpu_kernel.cc @@ -0,0 +1,46 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "kernel/gpu/nn/ftrl_gpu_kernel.h" + +namespace mindspore { +namespace kernel { +MS_REG_GPU_KERNEL_ONE(ApplyFtrl, + KernelAttr() + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddOutputAttr(kNumberTypeFloat32), + FtrlGpuKernel, float) +MS_REG_GPU_KERNEL_ONE(ApplyFtrl, + KernelAttr() + .AddInputAttr(kNumberTypeFloat16) + .AddInputAttr(kNumberTypeFloat16) + .AddInputAttr(kNumberTypeFloat16) + .AddInputAttr(kNumberTypeFloat16) + .AddInputAttr(kNumberTypeFloat16) + .AddInputAttr(kNumberTypeFloat16) + .AddInputAttr(kNumberTypeFloat16) + .AddInputAttr(kNumberTypeFloat16) + .AddOutputAttr(kNumberTypeFloat16), + FtrlGpuKernel, half) +} // namespace kernel +} // namespace mindspore diff --git a/mindspore/ccsrc/kernel/gpu/nn/ftrl_gpu_kernel.h b/mindspore/ccsrc/kernel/gpu/nn/ftrl_gpu_kernel.h new file mode 100644 index 0000000000..9e2153965b --- /dev/null +++ b/mindspore/ccsrc/kernel/gpu/nn/ftrl_gpu_kernel.h @@ -0,0 +1,130 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_KERNEL_GPU_NN_FTRL_GPU_KERNEL_H_ +#define MINDSPORE_CCSRC_KERNEL_GPU_NN_FTRL_GPU_KERNEL_H_ + +#include +#include "kernel/gpu/gpu_kernel.h" +#include "kernel/gpu/gpu_kernel_factory.h" +#include "kernel/gpu/cuda_impl/ftrl_impl.cuh" +namespace mindspore { +namespace kernel { +template +class FtrlGpuKernel : public GpuKernel { + public: + FtrlGpuKernel() + : variable_size_(0), + accumulation_size_(0), + linear_size_(0), + gradient_size_(0), + learning_rate_size_(0), + l1_regularization_size_(0), + l2_regularization_size_(0), + learning_rate_power_size_(0) {} + + ~FtrlGpuKernel() override = default; + + const std::vector &GetInputSizeList() const override { return input_size_list_; } + const std::vector &GetOutputSizeList() const override { return output_size_list_; } + const std::vector &GetWorkspaceSizeList() const override { return workspace_size_list_; } + + bool Launch(const std::vector &inputs, const std::vector &, const std::vector &, + void *stream_ptr) override { + T *variable = GetDeviceAddress(inputs, 0); + T *accumulation = GetDeviceAddress(inputs, 1); + T *linear = GetDeviceAddress(inputs, 2); + T *gradient = GetDeviceAddress(inputs, 3); + T *learning_rate = GetDeviceAddress(inputs, 4); + T *l1_regularization = GetDeviceAddress(inputs, 5); + T *l2_regularization = GetDeviceAddress(inputs, 6); + T *learning_rate_power = GetDeviceAddress(inputs, 7); + ApplyFtrl(inputs[0]->size / sizeof(T), gradient, learning_rate, l1_regularization, l2_regularization, + learning_rate_power, variable, accumulation, linear, reinterpret_cast(stream_ptr)); + return true; + } + + bool Init(const CNodePtr &kernel_node) override { + size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); + if (input_num != 8) { + MS_LOG(ERROR) << "Input number is " << input_num << ", but ftrl needs 8 inputs."; + return false; + } + + variable_size_ = sizeof(T); + accumulation_size_ = sizeof(T); + linear_size_ = sizeof(T); + gradient_size_ = sizeof(T); + 
learning_rate_size_ = sizeof(T); + l1_regularization_size_ = sizeof(T); + l2_regularization_size_ = sizeof(T); + learning_rate_power_size_ = sizeof(T); + + auto variable_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); + for (size_t i = 0; i < variable_shape.size(); i++) { + variable_size_ *= variable_shape[i]; + } + + auto accumulation_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1); + for (size_t i = 0; i < accumulation_shape.size(); i++) { + accumulation_size_ *= accumulation_shape[i]; + } + + auto linear_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2); + for (size_t i = 0; i < linear_shape.size(); i++) { + linear_size_ *= linear_shape[i]; + } + + auto gradient_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 3); + for (size_t i = 0; i < gradient_shape.size(); i++) { + gradient_size_ *= gradient_shape[i]; + } + + InitSizeLists(); + return true; + } + + protected: + void InitSizeLists() override { + input_size_list_.push_back(variable_size_); + input_size_list_.push_back(accumulation_size_); + input_size_list_.push_back(linear_size_); + input_size_list_.push_back(gradient_size_); + input_size_list_.push_back(learning_rate_size_); + input_size_list_.push_back(l1_regularization_size_); + input_size_list_.push_back(l2_regularization_size_); + input_size_list_.push_back(learning_rate_power_size_); + output_size_list_.push_back(0); + } + + private: + size_t variable_size_; + size_t accumulation_size_; + size_t linear_size_; + size_t gradient_size_; + size_t learning_rate_size_; + size_t l1_regularization_size_; + size_t l2_regularization_size_; + size_t learning_rate_power_size_; + + std::vector input_size_list_; + std::vector output_size_list_; + std::vector workspace_size_list_; +}; +} // namespace kernel +} // namespace mindspore + +#endif // MINDSPORE_CCSRC_KERNEL_GPU_NN_FTRL_GPU_KERNEL_H_ diff --git a/mindspore/ccsrc/kernel/gpu/nn/fused_batch_norm_gpu_kernel.h 
b/mindspore/ccsrc/kernel/gpu/nn/fused_batch_norm_gpu_kernel.h index c08b341e78..b0a898209b 100644 --- a/mindspore/ccsrc/kernel/gpu/nn/fused_batch_norm_gpu_kernel.h +++ b/mindspore/ccsrc/kernel/gpu/nn/fused_batch_norm_gpu_kernel.h @@ -82,7 +82,7 @@ class FusedBatchNormGpuKernel : public GpuKernel { } bool Init(const CNodePtr &kernel_node) override { InitResource(); - cudnn_data_type_ = kCudnnDtypeMap[TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))]; + cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))); size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); if (input_num != 5) { MS_LOG(EXCEPTION) << "input tensor size is " << input_num << ", FusedBatchNormGpuKernel should be 5"; diff --git a/mindspore/ccsrc/kernel/gpu/nn/fused_batchnorm_grad_gpu_kernel.h b/mindspore/ccsrc/kernel/gpu/nn/fused_batchnorm_grad_gpu_kernel.h index 153b0286b3..712354b17c 100644 --- a/mindspore/ccsrc/kernel/gpu/nn/fused_batchnorm_grad_gpu_kernel.h +++ b/mindspore/ccsrc/kernel/gpu/nn/fused_batchnorm_grad_gpu_kernel.h @@ -75,7 +75,7 @@ class FusedBatchNormGradGpuKernel : public GpuKernel { } bool Init(const CNodePtr &kernel_node) override { InitResource(); - cudnn_data_type_ = kCudnnDtypeMap[TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))]; + cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))); size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); if (input_num != 5) { MS_LOG(EXCEPTION) << "input tensor size is " << input_num << ", FusedBatchNormGradGpuKernel should be 5"; diff --git a/mindspore/ccsrc/kernel/gpu/nn/gelu_grad_kernel.cc b/mindspore/ccsrc/kernel/gpu/nn/gelu_grad_kernel.cc index 2b6c53aa28..32d91be80a 100644 --- a/mindspore/ccsrc/kernel/gpu/nn/gelu_grad_kernel.cc +++ b/mindspore/ccsrc/kernel/gpu/nn/gelu_grad_kernel.cc @@ -25,5 +25,12 @@ MS_REG_GPU_KERNEL_ONE(GeluGrad, .AddInputAttr(kNumberTypeFloat32) .AddOutputAttr(kNumberTypeFloat32), 
GeLUGpuGradKernel, float) +MS_REG_GPU_KERNEL_ONE(GeluGrad, + KernelAttr() + .AddInputAttr(kNumberTypeFloat16) + .AddInputAttr(kNumberTypeFloat16) + .AddInputAttr(kNumberTypeFloat16) + .AddOutputAttr(kNumberTypeFloat16), + GeLUGpuGradKernel, half) } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/kernel/gpu/nn/gelu_kernel.cc b/mindspore/ccsrc/kernel/gpu/nn/gelu_kernel.cc index 604dee04c4..ca54ff68ad 100644 --- a/mindspore/ccsrc/kernel/gpu/nn/gelu_kernel.cc +++ b/mindspore/ccsrc/kernel/gpu/nn/gelu_kernel.cc @@ -20,5 +20,7 @@ namespace mindspore { namespace kernel { MS_REG_GPU_KERNEL_ONE(Gelu, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), GeluGpuKernel, float) +MS_REG_GPU_KERNEL_ONE(Gelu, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16), + GeluGpuKernel, half) } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/kernel/gpu/nn/layer_norm_gpu_kernel.cc b/mindspore/ccsrc/kernel/gpu/nn/layer_norm_gpu_kernel.cc index e67b745ab3..19e4dc17a6 100644 --- a/mindspore/ccsrc/kernel/gpu/nn/layer_norm_gpu_kernel.cc +++ b/mindspore/ccsrc/kernel/gpu/nn/layer_norm_gpu_kernel.cc @@ -27,5 +27,14 @@ MS_REG_GPU_KERNEL_ONE(LayerNorm, .AddOutputAttr(kNumberTypeFloat32) .AddOutputAttr(kNumberTypeFloat32), LayerNormGpuKernel, float) +MS_REG_GPU_KERNEL_ONE(LayerNorm, + KernelAttr() + .AddInputAttr(kNumberTypeFloat16) + .AddInputAttr(kNumberTypeFloat16) + .AddInputAttr(kNumberTypeFloat16) + .AddOutputAttr(kNumberTypeFloat16) + .AddOutputAttr(kNumberTypeFloat16) + .AddOutputAttr(kNumberTypeFloat16), + LayerNormGpuKernel, half) } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/kernel/gpu/nn/layer_norm_grad_gpu_kernel.cc b/mindspore/ccsrc/kernel/gpu/nn/layer_norm_grad_gpu_kernel.cc index e268161349..7991d42499 100644 --- a/mindspore/ccsrc/kernel/gpu/nn/layer_norm_grad_gpu_kernel.cc +++ b/mindspore/ccsrc/kernel/gpu/nn/layer_norm_grad_gpu_kernel.cc @@ -29,5 
+29,16 @@ MS_REG_GPU_KERNEL_ONE(LayerNormGrad, .AddOutputAttr(kNumberTypeFloat32) .AddOutputAttr(kNumberTypeFloat32), LayerNormGradGpuKernel, float) +MS_REG_GPU_KERNEL_ONE(LayerNormGrad, + KernelAttr() + .AddInputAttr(kNumberTypeFloat16) + .AddInputAttr(kNumberTypeFloat16) + .AddInputAttr(kNumberTypeFloat16) + .AddInputAttr(kNumberTypeFloat16) + .AddInputAttr(kNumberTypeFloat16) + .AddOutputAttr(kNumberTypeFloat16) + .AddOutputAttr(kNumberTypeFloat16) + .AddOutputAttr(kNumberTypeFloat16), + LayerNormGradGpuKernel, half) } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/kernel/gpu/nn/lstm_gpu_kernel.h b/mindspore/ccsrc/kernel/gpu/nn/lstm_gpu_kernel.h index 01247f0ed6..42eda96b02 100644 --- a/mindspore/ccsrc/kernel/gpu/nn/lstm_gpu_kernel.h +++ b/mindspore/ccsrc/kernel/gpu/nn/lstm_gpu_kernel.h @@ -89,7 +89,7 @@ class LstmGpuKernel : public GpuKernel { } bool Init(const CNodePtr &kernel_node) override { InitResource(); - cudnn_data_type_ = kCudnnDtypeMap[TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))]; + cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))); auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); seq_len_ = SizeToInt(input_shape[0]); batch_size_ = SizeToInt(input_shape[1]); diff --git a/mindspore/ccsrc/kernel/gpu/nn/lstm_grad_data_gpu_kernel.h b/mindspore/ccsrc/kernel/gpu/nn/lstm_grad_data_gpu_kernel.h index 5591b0c817..6eeefa262c 100644 --- a/mindspore/ccsrc/kernel/gpu/nn/lstm_grad_data_gpu_kernel.h +++ b/mindspore/ccsrc/kernel/gpu/nn/lstm_grad_data_gpu_kernel.h @@ -105,7 +105,7 @@ class LstmGradDataGpuKernel : public GpuKernel { } bool Init(const CNodePtr &kernel_node) override { InitResource(); - cudnn_data_type_ = kCudnnDtypeMap[TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))]; + cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))); auto input_shape = AnfAlgo::GetOutputInferShape(kernel_node, 
0); seq_len_ = SizeToInt(input_shape[0]); batch_size_ = SizeToInt(input_shape[1]); diff --git a/mindspore/ccsrc/kernel/gpu/nn/lstm_grad_weight_gpu_kernel.h b/mindspore/ccsrc/kernel/gpu/nn/lstm_grad_weight_gpu_kernel.h index dd6aae9a00..a1a4852c84 100644 --- a/mindspore/ccsrc/kernel/gpu/nn/lstm_grad_weight_gpu_kernel.h +++ b/mindspore/ccsrc/kernel/gpu/nn/lstm_grad_weight_gpu_kernel.h @@ -84,7 +84,7 @@ class LstmGradWeightGpuKernel : public GpuKernel { } bool Init(const CNodePtr &kernel_node) override { InitResource(); - cudnn_data_type_ = kCudnnDtypeMap[TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))]; + cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))); auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); seq_len_ = SizeToInt(input_shape[0]); batch_size_ = SizeToInt(input_shape[1]); diff --git a/mindspore/ccsrc/kernel/gpu/nn/momentum_gpu_kernel.cc b/mindspore/ccsrc/kernel/gpu/nn/momentum_gpu_kernel.cc index 4a77f7342b..e8b2b17706 100644 --- a/mindspore/ccsrc/kernel/gpu/nn/momentum_gpu_kernel.cc +++ b/mindspore/ccsrc/kernel/gpu/nn/momentum_gpu_kernel.cc @@ -18,7 +18,7 @@ namespace mindspore { namespace kernel { -MS_REG_GPU_KERNEL_ONE(ApplyMomentum, +MS_REG_GPU_KERNEL_TWO(ApplyMomentum, KernelAttr() .AddInputAttr(kNumberTypeFloat32) .AddInputAttr(kNumberTypeFloat32) @@ -26,8 +26,8 @@ MS_REG_GPU_KERNEL_ONE(ApplyMomentum, .AddInputAttr(kNumberTypeFloat32) .AddInputAttr(kNumberTypeFloat32) .AddOutputAttr(kNumberTypeFloat32), - MomentumGpuKernel, float) -MS_REG_GPU_KERNEL_ONE(ApplyMomentum, + MomentumGpuKernel, float, float) +MS_REG_GPU_KERNEL_TWO(ApplyMomentum, KernelAttr() .AddInputAttr(kNumberTypeFloat16) .AddInputAttr(kNumberTypeFloat16) @@ -35,6 +35,15 @@ MS_REG_GPU_KERNEL_ONE(ApplyMomentum, .AddInputAttr(kNumberTypeFloat16) .AddInputAttr(kNumberTypeFloat16) .AddOutputAttr(kNumberTypeFloat16), - MomentumGpuKernel, half) + MomentumGpuKernel, half, half) 
+MS_REG_GPU_KERNEL_TWO(ApplyMomentum, + KernelAttr() + .AddInputAttr(kNumberTypeFloat16) + .AddInputAttr(kNumberTypeFloat16) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat16) + .AddInputAttr(kNumberTypeFloat32) + .AddOutputAttr(kNumberTypeFloat16), + MomentumGpuKernel, half, float) } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/kernel/gpu/nn/momentum_gpu_kernel.h b/mindspore/ccsrc/kernel/gpu/nn/momentum_gpu_kernel.h index 8452c177db..5abfb9e97b 100644 --- a/mindspore/ccsrc/kernel/gpu/nn/momentum_gpu_kernel.h +++ b/mindspore/ccsrc/kernel/gpu/nn/momentum_gpu_kernel.h @@ -23,7 +23,7 @@ #include "kernel/gpu/cuda_impl/momentum_impl.cuh" namespace mindspore { namespace kernel { -template +template class MomentumGpuKernel : public GpuKernel { public: MomentumGpuKernel() @@ -37,9 +37,9 @@ class MomentumGpuKernel : public GpuKernel { void *stream_ptr) override { T *variable = GetDeviceAddress(inputs, 0); T *accumulation = GetDeviceAddress(inputs, 1); - T *learning_rate = GetDeviceAddress(inputs, 2); + S *learning_rate = GetDeviceAddress(inputs, 2); T *gradient = GetDeviceAddress(inputs, 3); - T *momentum = GetDeviceAddress(inputs, 4); + S *momentum = GetDeviceAddress(inputs, 4); MomentumUpdateVariable(inputs[0]->size / sizeof(T), variable, accumulation, learning_rate, gradient, momentum, reinterpret_cast(stream_ptr)); return true; @@ -53,9 +53,9 @@ class MomentumGpuKernel : public GpuKernel { variable_size_ = sizeof(T); accumulation_size_ = sizeof(T); - learning_rate_size_ = sizeof(T); + learning_rate_size_ = sizeof(S); gradient_size_ = sizeof(T); - momentum_size_ = sizeof(T); + momentum_size_ = sizeof(S); auto variable_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); for (size_t i = 0; i < variable_shape.size(); i++) { diff --git a/mindspore/ccsrc/kernel/gpu/nn/pooling_gpu_kernel.h b/mindspore/ccsrc/kernel/gpu/nn/pooling_gpu_kernel.h index faff453775..0dda1e8998 100644 --- 
a/mindspore/ccsrc/kernel/gpu/nn/pooling_gpu_kernel.h +++ b/mindspore/ccsrc/kernel/gpu/nn/pooling_gpu_kernel.h @@ -88,7 +88,7 @@ class PoolingGpuFwdKernel : public GpuKernel { if (!CheckParam(kernel_node)) { return false; } - cudnn_data_type_ = kCudnnDtypeMap[TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))]; + cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))); auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); is_null_input_ = CHECK_NULL_INPUT(input_shape); if (is_null_input_) { diff --git a/mindspore/ccsrc/kernel/gpu/nn/pooling_grad_gpu_kernel.cc b/mindspore/ccsrc/kernel/gpu/nn/pooling_grad_gpu_kernel.cc index 57bd231129..c3d4a44943 100644 --- a/mindspore/ccsrc/kernel/gpu/nn/pooling_grad_gpu_kernel.cc +++ b/mindspore/ccsrc/kernel/gpu/nn/pooling_grad_gpu_kernel.cc @@ -24,27 +24,27 @@ MS_REG_GPU_KERNEL_ONE(MaxPoolGrad, .AddInputAttr(kNumberTypeFloat32) .AddInputAttr(kNumberTypeFloat32) .AddOutputAttr(kNumberTypeFloat32), - PoolingGradGpuFwdKernel, float) + PoolingGradGpuKernel, float) MS_REG_GPU_KERNEL_ONE(MaxPoolGrad, KernelAttr() .AddInputAttr(kNumberTypeFloat16) .AddInputAttr(kNumberTypeFloat16) .AddInputAttr(kNumberTypeFloat16) .AddOutputAttr(kNumberTypeFloat16), - PoolingGradGpuFwdKernel, half) + PoolingGradGpuKernel, half) MS_REG_GPU_KERNEL_ONE(AvgPoolGradGpu, KernelAttr() .AddInputAttr(kNumberTypeFloat32) .AddInputAttr(kNumberTypeFloat32) .AddInputAttr(kNumberTypeFloat32) .AddOutputAttr(kNumberTypeFloat32), - PoolingGradGpuFwdKernel, float) + PoolingGradGpuKernel, float) MS_REG_GPU_KERNEL_ONE(AvgPoolGradGpu, KernelAttr() .AddInputAttr(kNumberTypeFloat16) .AddInputAttr(kNumberTypeFloat16) .AddInputAttr(kNumberTypeFloat16) .AddOutputAttr(kNumberTypeFloat16), - PoolingGradGpuFwdKernel, half) + PoolingGradGpuKernel, half) } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/kernel/gpu/nn/pooling_grad_gpu_kernel.h 
b/mindspore/ccsrc/kernel/gpu/nn/pooling_grad_gpu_kernel.h index df3454c581..e8f1ebc1af 100644 --- a/mindspore/ccsrc/kernel/gpu/nn/pooling_grad_gpu_kernel.h +++ b/mindspore/ccsrc/kernel/gpu/nn/pooling_grad_gpu_kernel.h @@ -28,9 +28,9 @@ namespace mindspore { namespace kernel { template -class PoolingGradGpuFwdKernel : public GpuKernel { +class PoolingGradGpuKernel : public GpuKernel { public: - PoolingGradGpuFwdKernel() + PoolingGradGpuKernel() : cudnn_handle_(nullptr), pooling_descriptor_(nullptr), y_descriptor_(nullptr), @@ -55,7 +55,7 @@ class PoolingGradGpuFwdKernel : public GpuKernel { padded_size_(0), workspace_size_(0), use_pad_(true) {} - ~PoolingGradGpuFwdKernel() override { DestroyResource(); } + ~PoolingGradGpuKernel() override { DestroyResource(); } const std::vector &GetInputSizeList() const override { return input_size_list_; } const std::vector &GetOutputSizeList() const override { return output_size_list_; } @@ -108,7 +108,7 @@ class PoolingGradGpuFwdKernel : public GpuKernel { auto input_mask = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1); is_null_input_ = CHECK_NULL_INPUT(input_shape) || CHECK_NULL_INPUT(input_mask); if (is_null_input_) { - MS_LOG(WARNING) << "PoolingGradGpuFwdKernel input is null."; + MS_LOG(WARNING) << "PoolingGradGpuKernel input is null."; InitSizeLists(); return true; } @@ -196,7 +196,7 @@ class PoolingGradGpuFwdKernel : public GpuKernel { bool CheckParam(const CNodePtr &kernel_node) { size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); if (input_num != 3) { - MS_LOG(ERROR) << "Input number is " << input_num << ", but PoolingGradGpuFwdKernel needs 3 inputs."; + MS_LOG(ERROR) << "Input number is " << input_num << ", but PoolingGradGpuKernel needs 3 inputs."; return false; } return true; @@ -239,7 +239,7 @@ class PoolingGradGpuFwdKernel : public GpuKernel { void SetPoolingMode(const CNodePtr &kernel_node) { pad_mode_ = GetAttr(kernel_node, "padding"); stride_ = GetAttr>(kernel_node, "strides"); - cudnn_data_type_ 
= kCudnnDtypeMap[TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))]; + cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))); mode_ = AnfAlgo::GetCNodeName(kernel_node); if (mode_ == "AvgPoolGradGpu") { pooling_mode_ = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING; diff --git a/mindspore/ccsrc/kernel/gpu/nn/rmsprop_gpu_kernel.cc b/mindspore/ccsrc/kernel/gpu/nn/rmsprop_gpu_kernel.cc index 707aa77647..032e8eeec4 100644 --- a/mindspore/ccsrc/kernel/gpu/nn/rmsprop_gpu_kernel.cc +++ b/mindspore/ccsrc/kernel/gpu/nn/rmsprop_gpu_kernel.cc @@ -25,9 +25,6 @@ MS_REG_GPU_KERNEL_ONE(ApplyRMSProp, .AddInputAttr(kNumberTypeFloat32) .AddInputAttr(kNumberTypeFloat32) .AddInputAttr(kNumberTypeFloat32) - .AddInputAttr(kNumberTypeFloat32) - .AddInputAttr(kNumberTypeFloat32) - .AddInputAttr(kNumberTypeFloat32) .AddOutputAttr(kNumberTypeFloat32), RMSPropGpuKernel, float) diff --git a/mindspore/ccsrc/kernel/gpu/nn/rmsprop_gpu_kernel.h b/mindspore/ccsrc/kernel/gpu/nn/rmsprop_gpu_kernel.h index 7eaedfba52..9e148b690d 100644 --- a/mindspore/ccsrc/kernel/gpu/nn/rmsprop_gpu_kernel.h +++ b/mindspore/ccsrc/kernel/gpu/nn/rmsprop_gpu_kernel.h @@ -27,7 +27,7 @@ namespace kernel { template class RMSPropGpuKernel : public GpuKernel { public: - RMSPropGpuKernel() : size_(1), use_center_(false) {} + RMSPropGpuKernel() : size_(1), use_center_(false), decay_(0.0), momentum_(0.9), epsilon_(1e-12) {} ~RMSPropGpuKernel() override = default; const std::vector &GetInputSizeList() const override { return input_size_list_; } @@ -40,13 +40,10 @@ class RMSPropGpuKernel : public GpuKernel { T *variable = GetDeviceAddress(inputs, 0); T *mean_square = GetDeviceAddress(inputs, 1); T *moment = GetDeviceAddress(inputs, 2); - T *gradients = GetDeviceAddress(inputs, 3); - T *learning_rate = GetDeviceAddress(inputs, 4); - T *decay = GetDeviceAddress(inputs, 5); - T *momentum = GetDeviceAddress(inputs, 6); - T *epsilon = GetDeviceAddress(inputs, 7); + T *learning_rate = 
GetDeviceAddress(inputs, 3); + T *gradients = GetDeviceAddress(inputs, 4); - RmsProp(learning_rate, decay, momentum, epsilon, variable, mean_square, moment, gradients, size_, + RmsProp(learning_rate, decay_, momentum_, epsilon_, variable, mean_square, moment, gradients, size_, reinterpret_cast(stream)); } else { T *variable = GetDeviceAddress(inputs, 0); @@ -70,6 +67,11 @@ class RMSPropGpuKernel : public GpuKernel { use_center_ = true; } + if (node_name == "ApplyRMSProp") { + decay_ = GetAttr(kernel_node, "rho"); + momentum_ = GetAttr(kernel_node, "momentum"); + epsilon_ = GetAttr(kernel_node, "epsilon"); + } auto input_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0); for (auto &dim : input_shape) { size_ *= dim; @@ -81,24 +83,33 @@ class RMSPropGpuKernel : public GpuKernel { protected: void InitSizeLists() override { size_t input_size = size_ * sizeof(T); - input_size_list_.push_back(input_size); - if (use_center_) { + if (!use_center_) { + input_size_list_.push_back(input_size); input_size_list_.push_back(input_size); + input_size_list_.push_back(input_size); + input_size_list_.push_back(sizeof(T)); + input_size_list_.push_back(input_size); + output_size_list_.push_back(input_size); + } else { + input_size_list_.push_back(input_size); + input_size_list_.push_back(input_size); + input_size_list_.push_back(input_size); + input_size_list_.push_back(input_size); + input_size_list_.push_back(input_size); + input_size_list_.push_back(sizeof(T)); + input_size_list_.push_back(sizeof(T)); + input_size_list_.push_back(sizeof(T)); + input_size_list_.push_back(sizeof(T)); + output_size_list_.push_back(input_size); } - - input_size_list_.push_back(input_size); - input_size_list_.push_back(input_size); - input_size_list_.push_back(input_size); - input_size_list_.push_back(sizeof(T)); - input_size_list_.push_back(sizeof(T)); - input_size_list_.push_back(sizeof(T)); - input_size_list_.push_back(sizeof(T)); - output_size_list_.push_back(0); } private: size_t size_; bool 
use_center_; + float decay_; + float momentum_; + float epsilon_; std::vector input_size_list_; std::vector output_size_list_; diff --git a/mindspore/ccsrc/kernel/gpu/nn/tanh_grad_kernel.cc b/mindspore/ccsrc/kernel/gpu/nn/sigmoid_cross_entropy_with_logits_gpu_kernel.cc similarity index 81% rename from mindspore/ccsrc/kernel/gpu/nn/tanh_grad_kernel.cc rename to mindspore/ccsrc/kernel/gpu/nn/sigmoid_cross_entropy_with_logits_gpu_kernel.cc index 97176680d0..1e650811fd 100644 --- a/mindspore/ccsrc/kernel/gpu/nn/tanh_grad_kernel.cc +++ b/mindspore/ccsrc/kernel/gpu/nn/sigmoid_cross_entropy_with_logits_gpu_kernel.cc @@ -14,13 +14,13 @@ * limitations under the License. */ -#include "kernel/gpu/nn/tanh_grad_kernel.h" +#include "kernel/gpu/nn/sigmoid_cross_entropy_with_logits_gpu_kernel.h" namespace mindspore { namespace kernel { -MS_REG_GPU_KERNEL_ONE( - TanhGrad, +MS_REG_GPU_KERNEL_TWO( + SigmoidCrossEntropyWithLogits, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), - TanhGradKernel, float) + SigmoidCrossEntropyWithLogitsGpuKernel, float, float) } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/kernel/gpu/nn/sigmoid_cross_entropy_with_logits_gpu_kernel.h b/mindspore/ccsrc/kernel/gpu/nn/sigmoid_cross_entropy_with_logits_gpu_kernel.h new file mode 100644 index 0000000000..8d0efe90b4 --- /dev/null +++ b/mindspore/ccsrc/kernel/gpu/nn/sigmoid_cross_entropy_with_logits_gpu_kernel.h @@ -0,0 +1,97 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_KERNEL_GPU_NN_SIGMOID_CROSS_ENTROPY_WITH_LOGITS_GPU_KERNEL_H_ +#define MINDSPORE_CCSRC_KERNEL_GPU_NN_SIGMOID_CROSS_ENTROPY_WITH_LOGITS_GPU_KERNEL_H_ + +#include +#include "kernel/gpu/gpu_kernel.h" +#include "kernel/gpu/gpu_kernel_factory.h" +#include "kernel/gpu/cuda_impl/sigmoid_cross_entropy_with_logits_impl.cuh" + +namespace mindspore { +namespace kernel { +template +class SigmoidCrossEntropyWithLogitsGpuKernel : public GpuKernel { + public: + SigmoidCrossEntropyWithLogitsGpuKernel() : logits_size_(0), labels_size_(0), outputs_size_(0) {} + + ~SigmoidCrossEntropyWithLogitsGpuKernel() override = default; + + const std::vector &GetInputSizeList() const override { return input_size_list_; } + const std::vector &GetOutputSizeList() const override { return output_size_list_; } + const std::vector &GetWorkspaceSizeList() const override { return workspace_size_list_; } + + bool Launch(const std::vector &inputs, const std::vector &, + const std::vector &outputs, void *stream_ptr) override { + T *logits_addr = GetDeviceAddress(inputs, 0); + S *labels_addr = GetDeviceAddress(inputs, 1); + T *outputs_addr = GetDeviceAddress(outputs, 0); + + SigmoidCrossEntropyWithLogits(inputs[0]->size / sizeof(T), logits_addr, labels_addr, outputs_addr, + reinterpret_cast(stream_ptr)); + return true; + } + + bool Init(const CNodePtr &kernel_node) override { + size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); + if (input_num != 2) { + MS_LOG(ERROR) << "Input number is " << input_num << ", but SigmoidCrossEntropyWithLogits needs 2 
inputs."; + return false; + } + logits_size_ = sizeof(T); + labels_size_ = sizeof(S); + outputs_size_ = sizeof(T); + + auto logits_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); + for (size_t i = 0; i < logits_shape.size(); i++) { + logits_size_ *= logits_shape[i]; + } + + auto labels_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1); + for (size_t i = 0; i < labels_shape.size(); i++) { + labels_size_ *= labels_shape[i]; + } + + auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0); + for (size_t i = 0; i < output_shape.size(); i++) { + outputs_size_ *= output_shape[i]; + } + + InitSizeLists(); + return true; + } + + protected: + void InitSizeLists() override { + input_size_list_.push_back(logits_size_); + input_size_list_.push_back(labels_size_); + output_size_list_.push_back(outputs_size_); + } + + private: + size_t logits_size_; + size_t labels_size_; + size_t outputs_size_; + + std::vector input_size_list_; + std::vector output_size_list_; + std::vector workspace_size_list_; +}; +} // namespace kernel +} // namespace mindspore + +#endif // MINDSPORE_CCSRC_KERNEL_GPU_NN_SIGMOID_CROSS_ENTROPY_WITH_LOGITS_GPU_KERNEL_H_ diff --git a/mindspore/ccsrc/kernel/gpu/nn/sigmoid_cross_entropy_with_logits_grad_gpu_kernel.cc b/mindspore/ccsrc/kernel/gpu/nn/sigmoid_cross_entropy_with_logits_grad_gpu_kernel.cc new file mode 100644 index 0000000000..dabc4df850 --- /dev/null +++ b/mindspore/ccsrc/kernel/gpu/nn/sigmoid_cross_entropy_with_logits_grad_gpu_kernel.cc @@ -0,0 +1,29 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel/gpu/nn/sigmoid_cross_entropy_with_logits_grad_gpu_kernel.h" + +namespace mindspore { +namespace kernel { +MS_REG_GPU_KERNEL_TWO(SigmoidCrossEntropyWithLogitsGrad, + KernelAttr() + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddOutputAttr(kNumberTypeFloat32), + SigmoidCrossEntropyWithLogitsGradGpuKernel, float, float) +} // namespace kernel +} // namespace mindspore diff --git a/mindspore/ccsrc/kernel/gpu/nn/sigmoid_cross_entropy_with_logits_grad_gpu_kernel.h b/mindspore/ccsrc/kernel/gpu/nn/sigmoid_cross_entropy_with_logits_grad_gpu_kernel.h new file mode 100644 index 0000000000..01f416f6b7 --- /dev/null +++ b/mindspore/ccsrc/kernel/gpu/nn/sigmoid_cross_entropy_with_logits_grad_gpu_kernel.h @@ -0,0 +1,96 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_KERNEL_GPU_NN_SIGMOID_CROSS_ENTROPY_WITH_LOGITS_GRAD_GPU_KERNEL_H_ +#define MINDSPORE_CCSRC_KERNEL_GPU_NN_SIGMOID_CROSS_ENTROPY_WITH_LOGITS_GRAD_GPU_KERNEL_H_ + +#include +#include "kernel/gpu/gpu_kernel.h" +#include "kernel/gpu/gpu_kernel_factory.h" +#include "kernel/gpu/cuda_impl/sigmoid_cross_entropy_with_logits_grad_impl.cuh" + +namespace mindspore { +namespace kernel { +template +class SigmoidCrossEntropyWithLogitsGradGpuKernel : public GpuKernel { + public: + SigmoidCrossEntropyWithLogitsGradGpuKernel() : logits_size_(0), labels_size_(0), outputs_size_(0) {} + ~SigmoidCrossEntropyWithLogitsGradGpuKernel() override = default; + + const std::vector &GetInputSizeList() const override { return input_size_list_; } + const std::vector &GetOutputSizeList() const override { return output_size_list_; } + const std::vector &GetWorkspaceSizeList() const override { return workspace_size_list_; } + + bool Launch(const std::vector &inputs, const std::vector &, + const std::vector &outputs, void *stream_ptr) override { + T *logits_addr = GetDeviceAddress(inputs, 0); + S *labels_addr = GetDeviceAddress(inputs, 1); + T *outputs_addr = GetDeviceAddress(outputs, 0); + + SigmoidCrossEntropyWithLogitsGrad(inputs[0]->size / sizeof(T), logits_addr, labels_addr, outputs_addr, + reinterpret_cast(stream_ptr)); + return true; + } + + bool Init(const CNodePtr &kernel_node) override { + size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); + if (input_num != 3) { + MS_LOG(ERROR) << "Input number is " << input_num << ", but SigmoidCrossEntropyWithLogitsGrad needs 3 inputs."; + return false; + } + logits_size_ = sizeof(T); + labels_size_ = sizeof(S); + outputs_size_ = sizeof(T); + + auto logits_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); + for (size_t i = 0; i < logits_shape.size(); i++) { + logits_size_ *= logits_shape[i]; + } + + auto labels_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1); + for (size_t i = 0; i < 
labels_shape.size(); i++) { + labels_size_ *= labels_shape[i]; + } + + auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0); + for (size_t i = 0; i < output_shape.size(); i++) { + outputs_size_ *= output_shape[i]; + } + + InitSizeLists(); + return true; + } + + protected: + void InitSizeLists() override { + input_size_list_.push_back(logits_size_); + input_size_list_.push_back(labels_size_); + output_size_list_.push_back(outputs_size_); + } + + private: + size_t logits_size_; + size_t labels_size_; + size_t outputs_size_; + + std::vector input_size_list_; + std::vector output_size_list_; + std::vector workspace_size_list_; +}; +} // namespace kernel +} // namespace mindspore + +#endif // MINDSPORE_CCSRC_KERNEL_GPU_NN_SIGMOID_CROSS_ENTROPY_WITH_LOGITS_GRAD_GPU_KERNEL_H_ diff --git a/mindspore/ccsrc/kernel/gpu/nn/softmax_cross_entropy_with_logits_gpu_kernel.h b/mindspore/ccsrc/kernel/gpu/nn/softmax_cross_entropy_with_logits_gpu_kernel.h index 6840f0a1eb..8256174bcb 100644 --- a/mindspore/ccsrc/kernel/gpu/nn/softmax_cross_entropy_with_logits_gpu_kernel.h +++ b/mindspore/ccsrc/kernel/gpu/nn/softmax_cross_entropy_with_logits_gpu_kernel.h @@ -87,7 +87,7 @@ class SoftmaxCrossEntropyWithLogitsGpuKernel : public GpuKernel { << ", but SoftmaxCrossEntropyWithLogitsGpuKernel needs 2 output."; return false; } - cudnn_data_type_ = kCudnnDtypeMap[TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))]; + cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))); InferInputOutputSize(kernel_node); CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensor4dDescriptor(logits_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_, diff --git a/mindspore/ccsrc/kernel/gpu/nn/softmax_gpu_kernel.h b/mindspore/ccsrc/kernel/gpu/nn/softmax_gpu_kernel.h index 060bc57d56..9d5a2a24e1 100644 --- a/mindspore/ccsrc/kernel/gpu/nn/softmax_gpu_kernel.h +++ b/mindspore/ccsrc/kernel/gpu/nn/softmax_gpu_kernel.h @@ -95,7 +95,7 @@ class SoftmaxGpuKernel : public 
GpuKernel { bool Init(const CNodePtr &kernel_node) override { InitResource(); - cudnn_data_type_ = kCudnnDtypeMap[TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))]; + cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))); size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); if (input_num != 1) { MS_LOG(ERROR) << "Input number is " << input_num << ", but softmax needs 1 input."; diff --git a/mindspore/ccsrc/kernel/gpu/nn/softmax_grad_gpu_kernel.h b/mindspore/ccsrc/kernel/gpu/nn/softmax_grad_gpu_kernel.h index 003b55c0ed..d73503d5a5 100644 --- a/mindspore/ccsrc/kernel/gpu/nn/softmax_grad_gpu_kernel.h +++ b/mindspore/ccsrc/kernel/gpu/nn/softmax_grad_gpu_kernel.h @@ -98,7 +98,7 @@ class SoftmaxGradGpuKernel : public GpuKernel { bool Init(const CNodePtr &kernel_node) override { InitResource(); - cudnn_data_type_ = kCudnnDtypeMap[TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))]; + cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))); size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); if (input_num != 2) { MS_LOG(ERROR) << "Input number is " << input_num << ", but softmax grad needs 2 input."; diff --git a/mindspore/ccsrc/kernel/gpu/nn/sparse_softmax_cross_entropy_with_logits_gpu_kernel.h b/mindspore/ccsrc/kernel/gpu/nn/sparse_softmax_cross_entropy_with_logits_gpu_kernel.h index 0749172cc6..6950f0e308 100644 --- a/mindspore/ccsrc/kernel/gpu/nn/sparse_softmax_cross_entropy_with_logits_gpu_kernel.h +++ b/mindspore/ccsrc/kernel/gpu/nn/sparse_softmax_cross_entropy_with_logits_gpu_kernel.h @@ -89,7 +89,7 @@ class SparseSoftmaxCrossEntropyWithLogitsGpuKernel : public GpuKernel { return false; } is_grad_ = GetAttr(kernel_node, "is_grad"); - cudnn_data_type_ = kCudnnDtypeMap[TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))]; + cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))); 
InferInputOutputSize(kernel_node); CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensor4dDescriptor(logits_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_, diff --git a/mindspore/ccsrc/kernel/gpu/nn/tanh_gpu_kernel.h b/mindspore/ccsrc/kernel/gpu/nn/tanh_gpu_kernel.h deleted file mode 100644 index 7060ad1792..0000000000 --- a/mindspore/ccsrc/kernel/gpu/nn/tanh_gpu_kernel.h +++ /dev/null @@ -1,75 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_NN_TANH_GPU_KERNEL_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_NN_TANH_GPU_KERNEL_H_ - -#include -#include -#include -#include "kernel/gpu/gpu_kernel.h" -#include "kernel/gpu/gpu_kernel_factory.h" -#include "kernel/gpu/cuda_impl/tanh_impl.cuh" - -namespace mindspore { -namespace kernel { -template -class TanhGpuKernel : public GpuKernel { - public: - TanhGpuKernel() : input_size_(0) {} - ~TanhGpuKernel() override = default; - - const std::vector &GetInputSizeList() const override { return input_size_list_; } - const std::vector &GetOutputSizeList() const override { return output_size_list_; } - const std::vector &GetWorkspaceSizeList() const override { return workspace_size_list_; } - - bool Launch(const std::vector &inputs, const std::vector &, - const std::vector &outputs, void *stream_ptr) override { - auto x_addr = GetDeviceAddress(inputs, 0); - auto y_addr = GetDeviceAddress(outputs, 0); - - Tanh(input_size_ / sizeof(T), x_addr, y_addr, reinterpret_cast(stream_ptr)); - return true; - } - bool Init(const CNodePtr &kernel_node) override { - auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); - - input_size_ = sizeof(T); - for (auto dim : input_shape) { - input_size_ *= dim; - } - - InitSizeLists(); - return true; - } - - protected: - void InitSizeLists() override { - input_size_list_.push_back(input_size_); - input_size_list_.push_back(input_size_); - output_size_list_.push_back(input_size_); - } - - private: - std::vector input_size_list_; - std::vector output_size_list_; - std::vector workspace_size_list_; - size_t input_size_; -}; -} // namespace kernel -} // namespace mindspore - -#endif // MINDSPORE_CCSRC_KERNEL_GPU_NN_LSTM_GPU_KERNEL_H_ diff --git a/mindspore/ccsrc/kernel/gpu/quant/batchnorm_fold2_gpu_kernel.h b/mindspore/ccsrc/kernel/gpu/quant/batchnorm_fold2_gpu_kernel.h index 5d2dee3ec7..b898f34689 100644 --- a/mindspore/ccsrc/kernel/gpu/quant/batchnorm_fold2_gpu_kernel.h +++ 
b/mindspore/ccsrc/kernel/gpu/quant/batchnorm_fold2_gpu_kernel.h @@ -39,12 +39,10 @@ class BatchNormFold2GpuKernel : public GpuKernel { ~BatchNormFold2GpuKernel() override { DestroyResource(); } const std::vector &GetInputSizeList() const override { return input_size_list_; } - const std::vector &GetOutputSizeList() const override { return output_size_list_; } - const std::vector &GetWorkspaceSizeList() const override { return workspace_size_list_; } - bool Launch(const std::vector &inputs, const std::vector &workspace, + bool Launch(const std::vector &inputs, const std::vector &, const std::vector &outputs, void *stream_ptr) override { if (is_null_input_) { return true; @@ -111,10 +109,7 @@ class BatchNormFold2GpuKernel : public GpuKernel { input_size_list_.push_back(weight_size); // running_std input_size_list_.push_back(weight_size); // running_mean input_size_list_.push_back(sizeof(int32_t)); // global_step - output_size_list_.push_back(input_size); - - workspace_size_list_.push_back(sizeof(int32_t)); } private: diff --git a/mindspore/ccsrc/kernel/gpu/quant/batchnorm_fold2_grad_gpu_kernel.h b/mindspore/ccsrc/kernel/gpu/quant/batchnorm_fold2_grad_gpu_kernel.h index 28a4cf6cd6..e0bafdb96a 100644 --- a/mindspore/ccsrc/kernel/gpu/quant/batchnorm_fold2_grad_gpu_kernel.h +++ b/mindspore/ccsrc/kernel/gpu/quant/batchnorm_fold2_grad_gpu_kernel.h @@ -39,9 +39,7 @@ class BatchNormFold2GradGpuKernel : public GpuKernel { ~BatchNormFold2GradGpuKernel() override { DestroyResource(); } const std::vector &GetInputSizeList() const override { return input_size_list_; } - const std::vector &GetOutputSizeList() const override { return output_size_list_; } - const std::vector &GetWorkspaceSizeList() const override { return workspace_size_list_; } bool Launch(const std::vector &inputs, const std::vector &workspace, diff --git a/mindspore/ccsrc/kernel/gpu/quant/batchnorm_fold_gpu_kernel.h b/mindspore/ccsrc/kernel/gpu/quant/batchnorm_fold_gpu_kernel.h index a90e9b47d7..6cd001fd2e 100644 
--- a/mindspore/ccsrc/kernel/gpu/quant/batchnorm_fold_gpu_kernel.h +++ b/mindspore/ccsrc/kernel/gpu/quant/batchnorm_fold_gpu_kernel.h @@ -47,9 +47,7 @@ class BatchNormFoldGpuKernel : public GpuKernel { ~BatchNormFoldGpuKernel() override { DestroyResource(); } const std::vector &GetInputSizeList() const override { return input_size_list_; } - const std::vector &GetOutputSizeList() const override { return output_size_list_; } - const std::vector &GetWorkspaceSizeList() const override { return workspace_size_list_; } bool Launch(const std::vector &inputs, const std::vector &workspace, @@ -141,7 +139,7 @@ class BatchNormFoldGpuKernel : public GpuKernel { input_size_ = sizeof(T) * batch_ * channel_ * height_ * width_; output_size_ = sizeof(T) * channel_; - cudnnDataType_t cudnnDataType = kCudnnDtypeMap[TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))]; + cudnnDataType_t cudnnDataType = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))); CHECK_CUDNN_RET_WITH_EXCEPT( cudnnSetTensor4dDescriptor(x_desc_, CUDNN_TENSOR_NCHW, cudnnDataType, batch_, channel_, height_, width_), "Set x desc failed"); diff --git a/mindspore/ccsrc/kernel/gpu/quant/batchnorm_fold_grad_gpu_kernel.h b/mindspore/ccsrc/kernel/gpu/quant/batchnorm_fold_grad_gpu_kernel.h index 8cbe5b6927..7a3ed7ef91 100644 --- a/mindspore/ccsrc/kernel/gpu/quant/batchnorm_fold_grad_gpu_kernel.h +++ b/mindspore/ccsrc/kernel/gpu/quant/batchnorm_fold_grad_gpu_kernel.h @@ -46,9 +46,8 @@ class BatchNormFoldGradGpuKernel : public GpuKernel { const std::vector &GetOutputSizeList() const override { return output_size_list_; } const std::vector &GetWorkspaceSizeList() const override { return workspace_size_list_; } - bool Launch(const std::vector &inputs, const std::vector &workspace, + bool Launch(const std::vector &inputs, const std::vector &, const std::vector &outputs, void *stream_ptr) override { - (void)workspace; // 'd_batch_mean', 'd_batch_std', 'x', 'batch_mean', 'batch_std', 
'current_step' T *d_batch_mean = GetDeviceAddress(inputs, 0); T *d_batch_std = GetDeviceAddress(inputs, 1); @@ -139,11 +138,8 @@ class BatchNormFoldGradGpuKernel : public GpuKernel { input_size_list_.push_back(channel_size_); input_size_list_.push_back(channel_size_); input_size_list_.push_back(sizeof(int)); - // 'dx' output_size_list_.push_back(input_size_); - - workspace_size_list_.push_back(workspace_size_); } private: diff --git a/mindspore/ccsrc/kernel/gpu/quant/correction_mul_gpu_kernel.h b/mindspore/ccsrc/kernel/gpu/quant/correction_mul_gpu_kernel.h index 38a9532ef5..29aeabb03a 100644 --- a/mindspore/ccsrc/kernel/gpu/quant/correction_mul_gpu_kernel.h +++ b/mindspore/ccsrc/kernel/gpu/quant/correction_mul_gpu_kernel.h @@ -33,7 +33,8 @@ class CorrectionMulGpuKernel : public GpuKernel { const std::vector &GetInputSizeList() const override { return input_size_list_; } const std::vector &GetOutputSizeList() const override { return output_size_list_; } const std::vector &GetWorkspaceSizeList() const override { return workspace_size_list_; } - bool Launch(const std::vector &inputs, const std::vector &workspace, + + bool Launch(const std::vector &inputs, const std::vector &, const std::vector &outputs, void *stream_ptr) override { auto *weight = GetDeviceAddress(inputs, 0); auto *gamma = GetDeviceAddress(inputs, 1); @@ -74,10 +75,9 @@ class CorrectionMulGpuKernel : public GpuKernel { input_size_list_.push_back(input_size); // weight input_size_list_.push_back(weight_size); // gamma input_size_list_.push_back(weight_size); // running_std - size_t workspace_size = 0; output_size_list_.push_back(input_size); - workspace_size_list_.push_back(workspace_size); } + void InitResource() override {} private: diff --git a/mindspore/ccsrc/kernel/gpu/quant/fake_quant_per_channel_gpu_kernel.cc b/mindspore/ccsrc/kernel/gpu/quant/fake_quant_perchannel_gpu_kernel.cc similarity index 52% rename from mindspore/ccsrc/kernel/gpu/quant/fake_quant_per_channel_gpu_kernel.cc rename to 
mindspore/ccsrc/kernel/gpu/quant/fake_quant_perchannel_gpu_kernel.cc index 083bf7f011..ffed550fbb 100644 --- a/mindspore/ccsrc/kernel/gpu/quant/fake_quant_per_channel_gpu_kernel.cc +++ b/mindspore/ccsrc/kernel/gpu/quant/fake_quant_perchannel_gpu_kernel.cc @@ -14,8 +14,8 @@ * limitations under the License. */ -#include "kernel/gpu/quant/fake_quant_per_channel_gpu_kernel.h" -#include "kernel/gpu/cuda_impl/fake_quant_per_channel_impl.cuh" +#include "kernel/gpu/quant/fake_quant_perchannel_gpu_kernel.h" +#include "kernel/gpu/cuda_impl/fake_quant_perchannel_impl.cuh" #include #include #include @@ -25,21 +25,15 @@ namespace mindspore { namespace kernel { FakeQuantPerChannelGpuKernel::FakeQuantPerChannelGpuKernel() : input_size_(0), - min_size_(0), - max_size_(0), - output_size_(0), - workspace_size_(0), + num_channels_(0), num_bits_(0), - quant_min_(0), - quant_max_(0), - quant_delay_(0), - ema_(false), - ema_decay_(0), - global_step_(0), training_(false), - channel_out_(0), + symmetric_(false), narrow_range_(false), - symmetric_(false) {} + quant_delay_(0), + quant_min_(0), + quant_max_(0), + global_step_(0) {} const std::vector &FakeQuantPerChannelGpuKernel::GetInputSizeList() const { return input_size_list_; } @@ -60,90 +54,56 @@ bool FakeQuantPerChannelGpuKernel::Init(const CNodePtr &kernel_node) { return false; } + // get attribute num_bits_ = GetValue(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("num_bits")); - ema_ = GetValue(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("ema")); - ema_decay_ = 1.0 - GetValue(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("ema_decay")); + training_ = GetValue(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("training")); + symmetric_ = GetValue(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("symmetric")); + narrow_range_ = GetValue(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("narrow_range")); + quant_delay_ = GetValue(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("quant_delay")); if (num_bits_ <= 2 || num_bits_ 
>= 16) { MS_LOG(EXCEPTION) << "Attr \'num_bits\' " << num_bits_ << "is out of range, expected between 2 and 16."; return false; } - quant_delay_ = GetValue(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("quant_delay")); if (quant_delay_ < 0) { MS_LOG(EXCEPTION) << "Attr \'quant_delay\' " << num_bits_ << " is less then 0, require larger than 0."; return false; } - training_ = GetValue(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("training")); - - symmetric_ = GetValue(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("symmetric")); - if (symmetric_) { - quant_min_ = 0 - (1 << (num_bits_ - 1)); - quant_max_ = (1 << (num_bits_ - 1)) - 1; - } else { - quant_min_ = 0; - quant_max_ = (1 << num_bits_) - 1; - } - - narrow_range_ = GetValue(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("narrow_range")); + // quant min and max value + quant_min_ = 0; + quant_max_ = (1 << num_bits_) - 1; if (narrow_range_) { quant_min_++; } // shape info for gpu auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); - channel_out_ = SizeToInt(input_shape[0]); - min_size_ = sizeof(float) * channel_out_; - max_size_ = sizeof(float) * channel_out_; + num_channels_ = SizeToInt(input_shape[0]); input_size_ = sizeof(float); for (size_t i = 0; i < input_shape.size(); i++) { input_size_ *= input_shape[i]; } - output_size_ = input_size_; - InitSizeLists(); return true; } void FakeQuantPerChannelGpuKernel::InitSizeLists() { - input_size_list_.push_back(input_size_); // input in tensor - input_size_list_.push_back(min_size_); // min one scalar - input_size_list_.push_back(max_size_); // max on scalar - output_size_list_.push_back(output_size_); // output in tensor - workspace_size_list_.push_back(sizeof(float) * channel_out_); // scale in channel - workspace_size_list_.push_back(sizeof(float) * channel_out_); // min in channel - workspace_size_list_.push_back(sizeof(float) * channel_out_); // max in channel -} - -void 
FakeQuantPerChannelGpuKernel::CalFakeQuantizeForTraining(float *input, float *output, float *input_min, - float *input_max, float *d_nudge_min, float *d_nudge_max, - float *d_scale, void *stream_ptr) { - // calculate the input min and max according by the parameter ema and ema_decay. - CalMinMaxPerChannel(input, input_min, input_max, input_size_ / sizeof(float), channel_out_, ema_decay_, ema_, - reinterpret_cast(stream_ptr)); - // control flow for quant_delay - if (global_step_ >= quant_delay_) { - // real launch - CalNudgePerChannel(input_min, input_max, quant_min_, quant_max_, d_nudge_min, d_nudge_max, d_scale, channel_out_, - reinterpret_cast(stream_ptr)); - CalFakeQuantizePerChannel(input, output, input_size_ / sizeof(float), channel_out_, d_nudge_min, d_nudge_max, - d_scale, symmetric_, reinterpret_cast(stream_ptr)); - } else { - CHECK_CUDA_RET_WITH_ERROR( - cudaMemcpyAsync(output, input, input_size_, cudaMemcpyDeviceToDevice, reinterpret_cast(stream_ptr)), - "Copy gpu memory failed."); - } - global_step_++; + input_size_list_.push_back(input_size_); // input in tensor + input_size_list_.push_back(sizeof(float) * num_channels_); // min one scalar + input_size_list_.push_back(sizeof(float) * num_channels_); // max on scalar + output_size_list_.push_back(input_size_); // output in tensor + workspace_size_list_.push_back(sizeof(float) * num_channels_); // scale in channel + workspace_size_list_.push_back(sizeof(float) * num_channels_); // min in channel + workspace_size_list_.push_back(sizeof(float) * num_channels_); // max in channel } -void FakeQuantPerChannelGpuKernel::CalFakeQuantizeForInfer(float *input, float *output, float *input_min, - float *input_max, float *d_nudge_min, float *d_nudge_max, - float *d_scale, void *stream_ptr) { - // real launch - CalNudgePerChannel(input_min, input_max, quant_min_, quant_max_, d_nudge_min, d_nudge_max, d_scale, channel_out_, +void FakeQuantPerChannelGpuKernel::CalFakeQuantize(float *input, float *output, float 
*input_min, float *input_max, + float *nudge_min, float *nudge_max, float *scale, void *stream_ptr) { + CalNudgePerChannel(input_min, input_max, quant_min_, quant_max_, nudge_min, nudge_max, scale, num_channels_, reinterpret_cast(stream_ptr)); - CalFakeQuantizePerChannel(input, output, input_size_ / sizeof(float), channel_out_, d_nudge_min, d_nudge_max, d_scale, + CalFakeQuantizePerChannel(input, output, input_size_ / sizeof(float), num_channels_, nudge_min, nudge_max, scale, symmetric_, reinterpret_cast(stream_ptr)); } @@ -155,9 +115,9 @@ bool FakeQuantPerChannelGpuKernel::Launch(const std::vector &inputs, float *input = GetDeviceAddress(inputs, 0); float *input_min = GetDeviceAddress(inputs, 1); float *input_max = GetDeviceAddress(inputs, 2); - float *d_scale = GetDeviceAddress(workspace, 0); - float *d_nudge_min = GetDeviceAddress(workspace, 1); - float *d_nudge_max = GetDeviceAddress(workspace, 2); + float *scale = GetDeviceAddress(workspace, 0); + float *nudge_min = GetDeviceAddress(workspace, 1); + float *nudge_max = GetDeviceAddress(workspace, 2); if (input == nullptr) { MS_LOG(EXCEPTION) << "FakeQuantPerChannelGpuKernel input is null."; @@ -167,14 +127,21 @@ bool FakeQuantPerChannelGpuKernel::Launch(const std::vector &inputs, } if (training_) { - CalFakeQuantizeForTraining(input, output, input_min, input_max, d_nudge_min, d_nudge_max, d_scale, stream_ptr); + if (global_step_ >= quant_delay_) { + CalFakeQuantize(input, output, input_min, input_max, nudge_min, nudge_max, scale, stream_ptr); + } else { + CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(output, input, input_size_, cudaMemcpyDeviceToDevice, + reinterpret_cast(stream_ptr)), + "Copy gpu memory failed."); + } + global_step_++; } else { - CalFakeQuantizeForInfer(input, output, input_min, input_max, d_nudge_min, d_nudge_max, d_scale, stream_ptr); + CalFakeQuantize(input, output, input_min, input_max, nudge_min, nudge_max, scale, stream_ptr); } return true; } -MS_REG_GPU_KERNEL(FakeQuantWithMinMaxPerChannel, 
FakeQuantPerChannelGpuKernel) +MS_REG_GPU_KERNEL(FakeQuantPerChannel, FakeQuantPerChannelGpuKernel) } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/kernel/gpu/quant/fake_quant_per_channel_gpu_kernel.h b/mindspore/ccsrc/kernel/gpu/quant/fake_quant_perchannel_gpu_kernel.h similarity index 75% rename from mindspore/ccsrc/kernel/gpu/quant/fake_quant_per_channel_gpu_kernel.h rename to mindspore/ccsrc/kernel/gpu/quant/fake_quant_perchannel_gpu_kernel.h index bea1a7421f..122fe96af3 100755 --- a/mindspore/ccsrc/kernel/gpu/quant/fake_quant_per_channel_gpu_kernel.h +++ b/mindspore/ccsrc/kernel/gpu/quant/fake_quant_perchannel_gpu_kernel.h @@ -39,31 +39,23 @@ class FakeQuantPerChannelGpuKernel : public GpuKernel { void InitSizeLists() override; private: - void CalFakeQuantizeForTraining(float *input, float *output, float *input_min, float *input_max, float *d_nudge_min, - float *d_nudge_max, float *d_scale, void *stream_ptr); - void CalFakeQuantizeForInfer(float *input, float *output, float *input_min, float *input_max, float *d_nudge_min, - float *d_nudge_max, float *d_scale, void *stream_ptr); + void CalFakeQuantize(float *input, float *output, float *input_min, float *input_max, float *nudge_min, + float *nudge_max, float *scale, void *stream_ptr); size_t input_size_; - size_t min_size_; - size_t max_size_; - size_t output_size_; - size_t workspace_size_; std::vector input_size_list_; std::vector output_size_list_; std::vector workspace_size_list_; + int num_channels_; int num_bits_; + bool training_; + bool symmetric_; + bool narrow_range_; + int quant_delay_; float quant_min_; float quant_max_; - int quant_delay_; - bool ema_; - float ema_decay_; int global_step_; - bool training_; - int channel_out_; - bool narrow_range_; - bool symmetric_; }; } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/kernel/gpu/quant/fake_quant_per_channel_grad_gpu_kernel.cc 
b/mindspore/ccsrc/kernel/gpu/quant/fake_quant_perchannel_grad_gpu_kernel.cc similarity index 73% rename from mindspore/ccsrc/kernel/gpu/quant/fake_quant_per_channel_grad_gpu_kernel.cc rename to mindspore/ccsrc/kernel/gpu/quant/fake_quant_perchannel_grad_gpu_kernel.cc index 88c976285c..a57516eb2c 100644 --- a/mindspore/ccsrc/kernel/gpu/quant/fake_quant_per_channel_grad_gpu_kernel.cc +++ b/mindspore/ccsrc/kernel/gpu/quant/fake_quant_perchannel_grad_gpu_kernel.cc @@ -14,21 +14,17 @@ * limitations under the License. */ -#include "kernel/gpu/quant/fake_quant_per_channel_grad_gpu_kernel.h" -#include "kernel/gpu/cuda_impl/fake_quant_per_channel_impl.cuh" +#include "kernel/gpu/quant/fake_quant_perchannel_grad_gpu_kernel.h" +#include "kernel/gpu/cuda_impl/fake_quant_perchannel_impl.cuh" namespace mindspore { namespace kernel { FakeQuantPerChannelGradGpuKernel::FakeQuantPerChannelGradGpuKernel() : input_size_(0), - min_size_(0), - max_size_(0), - output_size_(0), - workspace_size_(0), num_bits_(0), quant_min_(0), quant_max_(0), - channel_out_(0), + num_channels_(0), quant_delay_(0), global_step_(0), narrow_range_(false), @@ -64,42 +60,34 @@ bool FakeQuantPerChannelGradGpuKernel::Init(const CNodePtr &kernel_node) { } symmetric_ = GetValue(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("symmetric")); - if (symmetric_) { - quant_min_ = 0 - (1 << (num_bits_ - 1)); - quant_max_ = (1 << (num_bits_ - 1)) - 1; - } else { - quant_min_ = 0; - quant_max_ = (1 << num_bits_) - 1; - } - narrow_range_ = GetValue(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("narrow_range")); + + // quant min and max value + quant_min_ = 0; + quant_max_ = (1 << num_bits_) - 1; if (narrow_range_) { quant_min_++; } auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); - channel_out_ = SizeToInt(input_shape[0]); - min_size_ = sizeof(float) * channel_out_; - max_size_ = sizeof(float) * channel_out_; + num_channels_ = SizeToInt(input_shape[0]); input_size_ = sizeof(float); for (size_t i = 
0; i < input_shape.size(); i++) { input_size_ *= input_shape[i]; } - output_size_ = input_size_; - InitSizeLists(); return true; } void FakeQuantPerChannelGradGpuKernel::InitSizeLists() { - input_size_list_.push_back(input_size_); // gradient - input_size_list_.push_back(input_size_); // input - input_size_list_.push_back(min_size_); // min - input_size_list_.push_back(max_size_); // max - output_size_list_.push_back(output_size_); - workspace_size_list_.push_back(sizeof(float) * channel_out_); // scale in channel - workspace_size_list_.push_back(sizeof(float) * channel_out_); // min in channel - workspace_size_list_.push_back(sizeof(float) * channel_out_); // max in channel + input_size_list_.push_back(input_size_); // gradient + input_size_list_.push_back(input_size_); // input + input_size_list_.push_back(sizeof(float) * num_channels_); // min + input_size_list_.push_back(sizeof(float) * num_channels_); // max + output_size_list_.push_back(input_size_); // output + workspace_size_list_.push_back(sizeof(float) * num_channels_); // scale in channel + workspace_size_list_.push_back(sizeof(float) * num_channels_); // min in channel + workspace_size_list_.push_back(sizeof(float) * num_channels_); // max in channel } bool FakeQuantPerChannelGradGpuKernel::Launch(const std::vector &inputs, @@ -111,9 +99,9 @@ bool FakeQuantPerChannelGradGpuKernel::Launch(const std::vector &inp float *input = GetDeviceAddress(inputs, 1); float *input_min = GetDeviceAddress(inputs, 2); float *input_max = GetDeviceAddress(inputs, 3); - float *d_scale = GetDeviceAddress(workspace, 0); - float *d_nudge_min = GetDeviceAddress(workspace, 1); - float *d_nudge_max = GetDeviceAddress(workspace, 2); + float *scale = GetDeviceAddress(workspace, 0); + float *nudge_min = GetDeviceAddress(workspace, 1); + float *nudge_max = GetDeviceAddress(workspace, 2); if (gradient == nullptr) { MS_LOG(EXCEPTION) << "FakeQuantPerChannelGradGpuKernel gradient is null"; @@ -130,9 +118,9 @@ bool 
FakeQuantPerChannelGradGpuKernel::Launch(const std::vector &inp int total_size = input_size_ / sizeof(float); if (global_step_ >= quant_delay_) { - CalNudgePerChannel(input_min, input_max, quant_min_, quant_max_, d_nudge_min, d_nudge_max, d_scale, channel_out_, + CalNudgePerChannel(input_min, input_max, quant_min_, quant_max_, nudge_min, nudge_max, scale, num_channels_, reinterpret_cast(stream_ptr)); - CalFakeQuantizePerChannelGrad(input, gradient, output, total_size, channel_out_, d_nudge_min, d_nudge_max, + CalFakeQuantizePerChannelGrad(input, gradient, output, total_size, num_channels_, nudge_min, nudge_max, reinterpret_cast(stream_ptr)); } else { CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(output, gradient, input_size_, cudaMemcpyDeviceToDevice, @@ -143,6 +131,6 @@ bool FakeQuantPerChannelGradGpuKernel::Launch(const std::vector &inp return true; } -MS_REG_GPU_KERNEL(FakeQuantWithMinMaxPerChannelGrad, FakeQuantPerChannelGradGpuKernel) +MS_REG_GPU_KERNEL(FakeQuantPerChannelGrad, FakeQuantPerChannelGradGpuKernel) } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/kernel/gpu/quant/fake_quant_per_channel_grad_gpu_kernel.h b/mindspore/ccsrc/kernel/gpu/quant/fake_quant_perchannel_grad_gpu_kernel.h similarity index 91% rename from mindspore/ccsrc/kernel/gpu/quant/fake_quant_per_channel_grad_gpu_kernel.h rename to mindspore/ccsrc/kernel/gpu/quant/fake_quant_perchannel_grad_gpu_kernel.h index fe760d85d2..d863a2c99f 100644 --- a/mindspore/ccsrc/kernel/gpu/quant/fake_quant_per_channel_grad_gpu_kernel.h +++ b/mindspore/ccsrc/kernel/gpu/quant/fake_quant_perchannel_grad_gpu_kernel.h @@ -40,10 +40,6 @@ class FakeQuantPerChannelGradGpuKernel : public GpuKernel { private: size_t input_size_; - size_t min_size_; - size_t max_size_; - size_t output_size_; - size_t workspace_size_; std::vector input_size_list_; std::vector output_size_list_; std::vector workspace_size_list_; @@ -51,7 +47,7 @@ class FakeQuantPerChannelGradGpuKernel : public GpuKernel { int 
num_bits_; float quant_min_; float quant_max_; - int channel_out_; + int num_channels_; int quant_delay_; int global_step_; bool narrow_range_; diff --git a/mindspore/ccsrc/kernel/gpu/quant/fake_quant_gpu_kernel.cc b/mindspore/ccsrc/kernel/gpu/quant/fake_quant_perlayer_gpu_kernel.cc similarity index 50% rename from mindspore/ccsrc/kernel/gpu/quant/fake_quant_gpu_kernel.cc rename to mindspore/ccsrc/kernel/gpu/quant/fake_quant_perlayer_gpu_kernel.cc index ade7c32da0..845fb5b923 100644 --- a/mindspore/ccsrc/kernel/gpu/quant/fake_quant_gpu_kernel.cc +++ b/mindspore/ccsrc/kernel/gpu/quant/fake_quant_perlayer_gpu_kernel.cc @@ -14,8 +14,8 @@ * limitations under the License. */ -#include "kernel/gpu/quant/fake_quant_gpu_kernel.h" -#include "kernel/gpu/cuda_impl/fake_quant_impl.cuh" +#include "kernel/gpu/quant/fake_quant_perlayer_gpu_kernel.h" +#include "kernel/gpu/cuda_impl/fake_quant_perlayer_impl.cuh" #include #include #include @@ -23,31 +23,25 @@ namespace mindspore { namespace kernel { -FakeQuantGpuKernel::FakeQuantGpuKernel() +FakeQuantPerLayerGpuKernel::FakeQuantPerLayerGpuKernel() : input_size_(0), - min_size_(0), - max_size_(0), - output_size_(0), - workspace_size_(0), - num_bits_(0), quant_min_(0), quant_max_(0), - quant_num_(0), - quant_delay_(0), - ema_(false), - ema_decay_(0), + quant_num_(1), global_step_(0), + num_bits_(0), + quant_delay_(0), training_(false), narrow_range_(false), symmetric_(false) {} -const std::vector &FakeQuantGpuKernel::GetInputSizeList() const { return input_size_list_; } +const std::vector &FakeQuantPerLayerGpuKernel::GetInputSizeList() const { return input_size_list_; } -const std::vector &FakeQuantGpuKernel::GetOutputSizeList() const { return output_size_list_; } +const std::vector &FakeQuantPerLayerGpuKernel::GetOutputSizeList() const { return output_size_list_; } -const std::vector &FakeQuantGpuKernel::GetWorkspaceSizeList() const { return workspace_size_list_; } +const std::vector 
&FakeQuantPerLayerGpuKernel::GetWorkspaceSizeList() const { return workspace_size_list_; } -bool FakeQuantGpuKernel::Init(const CNodePtr &kernel_node) { +bool FakeQuantPerLayerGpuKernel::Init(const CNodePtr &kernel_node) { size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); if (input_num != 3) { MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but FakeQuant GpuKernel OP needs 3 output."; @@ -59,95 +53,73 @@ bool FakeQuantGpuKernel::Init(const CNodePtr &kernel_node) { } num_bits_ = GetValue(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("num_bits")); - ema_ = GetValue(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("ema")); - ema_decay_ = GetValue(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("ema_decay")); + quant_delay_ = GetValue(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("quant_delay")); training_ = GetValue(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("training")); + symmetric_ = GetValue(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("symmetric")); + narrow_range_ = GetValue(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("narrow_range")); if (num_bits_ <= 2 || num_bits_ >= 16) { MS_LOG(EXCEPTION) << "Attr \'num_bits\' " << num_bits_ << " is out of range, expected between 2 and 16."; } - quant_delay_ = GetValue(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("quant_delay")); if (quant_delay_ < 0) { MS_LOG(EXCEPTION) << "Attr \'quant_delay\' " << num_bits_ << "is less then 0, require larger than 0."; } - symmetric_ = GetValue(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("symmetric")); - if (symmetric_) { - quant_min_ = 0 - (1 << (num_bits_ - 1)); - quant_max_ = (1 << (num_bits_ - 1)) - 1; - } else { - quant_min_ = 0; - quant_max_ = (1 << num_bits_) - 1; - } - - narrow_range_ = GetValue(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("narrow_range")); + // quant min and max value + quant_min_ = 0; + quant_max_ = (1 << num_bits_) - 1; if (narrow_range_) { quant_min_++; } - if (quant_num_ == 0) { - quant_num_ = 1; - } + 
// init size auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); for (size_t i = 0; i < input_shape.size(); ++i) { quant_num_ *= SizeToInt(input_shape[i]); } - input_size_ = sizeof(float); - min_size_ = sizeof(float); - max_size_ = sizeof(float); for (size_t i = 0; i < input_shape.size(); i++) { input_size_ *= input_shape[i]; } - output_size_ = input_size_; InitSizeLists(); return true; } -void FakeQuantGpuKernel::InitSizeLists() { - input_size_list_.push_back(input_size_); // input - input_size_list_.push_back(min_size_); // min - input_size_list_.push_back(max_size_); // max - output_size_list_.push_back(output_size_); - workspace_size_list_.push_back(workspace_size_); +void FakeQuantPerLayerGpuKernel::InitSizeLists() { + input_size_list_.push_back(input_size_); // x + input_size_list_.push_back(sizeof(float)); // min + input_size_list_.push_back(sizeof(float)); // max + output_size_list_.push_back(input_size_); // y + workspace_size_list_.push_back(sizeof(float)); // scale + workspace_size_list_.push_back(sizeof(float)); // nudge_min + workspace_size_list_.push_back(sizeof(float)); // nudge_max } -bool FakeQuantGpuKernel::Launch(const std::vector &inputs, const std::vector &workspace, - const std::vector &outputs, void *stream_ptr) { +bool FakeQuantPerLayerGpuKernel::Launch(const std::vector &inputs, const std::vector &workspace, + const std::vector &outputs, void *stream_ptr) { float *output = GetDeviceAddress(outputs, 0); float *input = GetDeviceAddress(inputs, 0); float *input_min = GetDeviceAddress(inputs, 1); float *input_max = GetDeviceAddress(inputs, 2); + float *scale = GetDeviceAddress(workspace, 0); + float *nudge_min = GetDeviceAddress(workspace, 1); + float *nudge_max = GetDeviceAddress(workspace, 2); if (input == nullptr) { - MS_LOG(EXCEPTION) << "FakeQuantGpuKernel input x is null."; - } - if (input_min == nullptr) { - MS_LOG(EXCEPTION) << "FakeQuantGpuKernel input min is null."; + MS_LOG(EXCEPTION) << 
"FakeQuantPerLayerGpuKernel input x is null."; } - if (input_max == nullptr) { - MS_LOG(EXCEPTION) << "FakeQuantGpuKernel input max is null."; + if (input_min == nullptr || input_max == nullptr) { + MS_LOG(EXCEPTION) << "FakeQuantPerLayerGpuKernel input min or input max is null."; } - // Allocate space for device copies - int size = sizeof(float); - float *d_scale = nullptr; - float *d_nudge_min = nullptr; - float *d_nudge_max = nullptr; - CHECK_CUDA_RET_WITH_ERROR(cudaMalloc(reinterpret_cast(&d_scale), size), "Malloc gpu memory failed"); - CHECK_CUDA_RET_WITH_ERROR(cudaMalloc(reinterpret_cast(&d_nudge_min), size), "Malloc gpu memory failed"); - CHECK_CUDA_RET_WITH_ERROR(cudaMalloc(reinterpret_cast(&d_nudge_max), size), "Malloc gpu memory failed"); - if (training_) { - // calculate the input min and max according by the parameter ema and ema_decay. - CalMinMax(input, input_min, input_max, quant_num_, ema_decay_, ema_, reinterpret_cast(stream_ptr)); // control flow for quant_delay if (global_step_ >= quant_delay_) { // real launch - CalNudge(input_min, input_max, quant_min_, quant_max_, d_nudge_min, d_nudge_max, d_scale, + CalNudge(input_min, input_max, quant_min_, quant_max_, nudge_min, nudge_max, scale, reinterpret_cast(stream_ptr)); - CalFakeQuantize(input, output, quant_num_, d_nudge_min, d_nudge_max, d_scale, symmetric_, + CalFakeQuantize(input, output, quant_num_, nudge_min, nudge_max, scale, symmetric_, reinterpret_cast(stream_ptr)); } else { CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(output, input, input_size_, cudaMemcpyDeviceToDevice, @@ -157,20 +129,15 @@ bool FakeQuantGpuKernel::Launch(const std::vector &inputs, const std global_step_++; } else { // real launch - CalNudge(input_min, input_max, quant_min_, quant_max_, d_nudge_min, d_nudge_max, d_scale, + CalNudge(input_min, input_max, quant_min_, quant_max_, nudge_min, nudge_max, scale, reinterpret_cast(stream_ptr)); - CalFakeQuantize(input, output, quant_num_, d_nudge_min, d_nudge_max, d_scale, 
symmetric_, + CalFakeQuantize(input, output, quant_num_, nudge_min, nudge_max, scale, symmetric_, reinterpret_cast(stream_ptr)); } - // Cleanup - CHECK_CUDA_RET_WITH_ERROR(cudaFree(d_scale), "Free gpu memory failed"); - CHECK_CUDA_RET_WITH_ERROR(cudaFree(d_nudge_min), "Free gpu memory failed"); - CHECK_CUDA_RET_WITH_ERROR(cudaFree(d_nudge_max), "Free gpu memory failed"); - return true; } -MS_REG_GPU_KERNEL(FakeQuantWithMinMax, FakeQuantGpuKernel) +MS_REG_GPU_KERNEL(FakeQuantPerLayer, FakeQuantPerLayerGpuKernel) } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/kernel/gpu/quant/fake_quant_gpu_kernel.h b/mindspore/ccsrc/kernel/gpu/quant/fake_quant_perlayer_gpu_kernel.h similarity index 77% rename from mindspore/ccsrc/kernel/gpu/quant/fake_quant_gpu_kernel.h rename to mindspore/ccsrc/kernel/gpu/quant/fake_quant_perlayer_gpu_kernel.h index 5a594c615f..38810e06df 100755 --- a/mindspore/ccsrc/kernel/gpu/quant/fake_quant_gpu_kernel.h +++ b/mindspore/ccsrc/kernel/gpu/quant/fake_quant_perlayer_gpu_kernel.h @@ -14,8 +14,8 @@ * limitations under the License. 
*/ -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_FAKEQUANT_GPUKERNEL_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_FAKEQUANT_GPUKERNEL_H_ +#ifndef MINDSPORE_CCSRC_KERNEL_GPU_FAKEQUANT_PERLAYER_GPUKERNEL_H_ +#define MINDSPORE_CCSRC_KERNEL_GPU_FAKEQUANT_PERLAYER_GPUKERNEL_H_ #include #include "kernel/gpu/gpu_kernel.h" @@ -23,10 +23,10 @@ namespace mindspore { namespace kernel { -class FakeQuantGpuKernel : public GpuKernel { +class FakeQuantPerLayerGpuKernel : public GpuKernel { public: - FakeQuantGpuKernel(); - ~FakeQuantGpuKernel() = default; + FakeQuantPerLayerGpuKernel(); + ~FakeQuantPerLayerGpuKernel() = default; const std::vector &GetInputSizeList() const override; const std::vector &GetOutputSizeList() const override; @@ -40,22 +40,16 @@ class FakeQuantGpuKernel : public GpuKernel { private: size_t input_size_; - size_t min_size_; - size_t max_size_; - size_t output_size_; - size_t workspace_size_; std::vector input_size_list_; std::vector output_size_list_; std::vector workspace_size_list_; - int num_bits_; float quant_min_; float quant_max_; int quant_num_; - int quant_delay_; - bool ema_; - float ema_decay_; int global_step_; + int num_bits_; + int quant_delay_; bool training_; bool narrow_range_; bool symmetric_; @@ -63,4 +57,4 @@ class FakeQuantGpuKernel : public GpuKernel { } // namespace kernel } // namespace mindspore -#endif // MINDSPORE_CCSRC_KERNEL_GPU_FAKEQUANT_GPUKERNEL_H_ +#endif // MINDSPORE_CCSRC_KERNEL_GPU_FAKEQUANT_PERLAYER_GPUKERNEL_H_ diff --git a/mindspore/ccsrc/kernel/gpu/quant/fake_quant_grad_gpu_kernel.cc b/mindspore/ccsrc/kernel/gpu/quant/fake_quant_perlayer_grad_gpu_kernel.cc similarity index 51% rename from mindspore/ccsrc/kernel/gpu/quant/fake_quant_grad_gpu_kernel.cc rename to mindspore/ccsrc/kernel/gpu/quant/fake_quant_perlayer_grad_gpu_kernel.cc index 7b7e3f1737..9c6584e239 100644 --- a/mindspore/ccsrc/kernel/gpu/quant/fake_quant_grad_gpu_kernel.cc +++ b/mindspore/ccsrc/kernel/gpu/quant/fake_quant_perlayer_grad_gpu_kernel.cc @@ -14,31 +14,30 @@ * 
limitations under the License. */ -#include "kernel/gpu/quant/fake_quant_grad_gpu_kernel.h" -#include "kernel/gpu/cuda_impl/fake_quant_impl.cuh" +#include "kernel/gpu/quant/fake_quant_perlayer_grad_gpu_kernel.h" +#include "kernel/gpu/cuda_impl/fake_quant_perlayer_impl.cuh" namespace mindspore { namespace kernel { -FakeQuantGradGpuKernel::FakeQuantGradGpuKernel() +FakeQuantPerLayerGradGpuKernel::FakeQuantPerLayerGradGpuKernel() : input_size_(0), - min_size_(0), - max_size_(0), - output_size_(0), workspace_size_(0), num_bits_(0), quant_min_(0), quant_max_(0), - quant_size_(0), + quant_num_(1), quant_delay_(0), - global_step_(0) {} + global_step_(0), + narrow_range_(false), + symmetric_(false) {} -const std::vector &FakeQuantGradGpuKernel::GetInputSizeList() const { return input_size_list_; } +const std::vector &FakeQuantPerLayerGradGpuKernel::GetInputSizeList() const { return input_size_list_; } -const std::vector &FakeQuantGradGpuKernel::GetOutputSizeList() const { return output_size_list_; } +const std::vector &FakeQuantPerLayerGradGpuKernel::GetOutputSizeList() const { return output_size_list_; } -const std::vector &FakeQuantGradGpuKernel::GetWorkspaceSizeList() const { return workspace_size_list_; } +const std::vector &FakeQuantPerLayerGradGpuKernel::GetWorkspaceSizeList() const { return workspace_size_list_; } -bool FakeQuantGradGpuKernel::Init(const CNodePtr &kernel_node) { +bool FakeQuantPerLayerGradGpuKernel::Init(const CNodePtr &kernel_node) { size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); if (input_num != 4) { MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but FakeQuantGrad GpuKernel OP needs 4 output."; @@ -59,78 +58,67 @@ bool FakeQuantGradGpuKernel::Init(const CNodePtr &kernel_node) { MS_LOG(EXCEPTION) << "Attr \'quant_delay_\' " << quant_delay_ << " is less then 0, require larger than 0."; } + symmetric_ = GetValue(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("symmetric")); + narrow_range_ = 
GetValue(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("narrow_range")); + + // quant min and max value quant_min_ = 0; quant_max_ = (1 << num_bits_) - 1; - - if (quant_size_ == 0) { - quant_size_ = 1; + if (narrow_range_) { + quant_min_++; } + + // init size auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); for (size_t i = 0; i < input_shape.size(); ++i) { - quant_size_ *= SizeToInt(input_shape[i]); + quant_num_ *= SizeToInt(input_shape[i]); } - input_size_ = sizeof(float); - min_size_ = sizeof(float); - max_size_ = sizeof(float); for (size_t i = 0; i < input_shape.size(); i++) { input_size_ *= input_shape[i]; } - output_size_ = input_size_; - InitSizeLists(); return true; } -void FakeQuantGradGpuKernel::InitSizeLists() { - input_size_list_.push_back(input_size_); // gradient - input_size_list_.push_back(input_size_); // input - input_size_list_.push_back(min_size_); // min - input_size_list_.push_back(max_size_); // max - output_size_list_.push_back(output_size_); - workspace_size_list_.push_back(workspace_size_); +void FakeQuantPerLayerGradGpuKernel::InitSizeLists() { + input_size_list_.push_back(input_size_); // gradient + input_size_list_.push_back(input_size_); // input + input_size_list_.push_back(sizeof(float)); // min + input_size_list_.push_back(sizeof(float)); // max + output_size_list_.push_back(input_size_); // output + workspace_size_list_.push_back(sizeof(float)); // scale + workspace_size_list_.push_back(sizeof(float)); // nudge_min + workspace_size_list_.push_back(sizeof(float)); // nudge_max } -bool FakeQuantGradGpuKernel::Launch(const std::vector &inputs, const std::vector &workspace, - const std::vector &outputs, void *stream_ptr) { +bool FakeQuantPerLayerGradGpuKernel::Launch(const std::vector &inputs, + const std::vector &workspace, + const std::vector &outputs, void *stream_ptr) { float *output = GetDeviceAddress(outputs, 0); float *gradient = GetDeviceAddress(inputs, 0); float *input = GetDeviceAddress(inputs, 1); 
float *input_min = GetDeviceAddress(inputs, 2); float *input_max = GetDeviceAddress(inputs, 3); + float *scale = GetDeviceAddress(workspace, 0); + float *nudge_min = GetDeviceAddress(workspace, 1); + float *nudge_max = GetDeviceAddress(workspace, 2); if (gradient == nullptr) { - MS_LOG(EXCEPTION) << "FakeQuantGradGpuKernel gradient is null"; + MS_LOG(EXCEPTION) << "FakeQuantPerLayerGradGpuKernel gradient is null"; } if (input == nullptr) { - MS_LOG(EXCEPTION) << "FakeQuantGradGpuKernel input is null."; - } - if (input_min == nullptr) { - MS_LOG(EXCEPTION) << "FakeQuantGradGpuKernel input min is null."; + MS_LOG(EXCEPTION) << "FakeQuantPerLayerGradGpuKernel input is null."; } - if (input_max == nullptr) { - MS_LOG(EXCEPTION) << "FakeQuantGradGpuKernel input max is null."; + if (input_min == nullptr || input_max == nullptr) { + MS_LOG(EXCEPTION) << "FakeQuantPerLayerGradGpuKernel input min or max is null."; } if (global_step_ >= quant_delay_) { - float *d_scale = nullptr; - float *d_nudge_min = nullptr; - float *d_nudge_max = nullptr; - int size = sizeof(float); - // Allocate space for device copies - CHECK_CUDA_RET_WITH_ERROR(cudaMalloc(reinterpret_cast(&d_scale), size), "Malloc gpu memory failed"); - CHECK_CUDA_RET_WITH_ERROR(cudaMalloc(reinterpret_cast(&d_nudge_min), size), "Malloc gpu memory failed"); - CHECK_CUDA_RET_WITH_ERROR(cudaMalloc(reinterpret_cast(&d_nudge_max), size), "Malloc gpu memory failed"); - - CalNudge(input_min, input_max, quant_min_, quant_max_, d_nudge_min, d_nudge_max, d_scale, + CalNudge(input_min, input_max, quant_min_, quant_max_, nudge_min, nudge_max, scale, reinterpret_cast(stream_ptr)); - CalFakeQuantizeGrad(input, gradient, output, quant_size_, d_nudge_min, d_nudge_max, + CalFakeQuantizeGrad(input, gradient, output, quant_num_, nudge_min, nudge_max, reinterpret_cast(stream_ptr)); - - // Cleanup - CHECK_CUDA_RET_WITH_ERROR(cudaFree(d_scale), "Free gpu memory failed"); - CHECK_CUDA_RET_WITH_ERROR(cudaFree(d_nudge_min), "Free gpu memory 
failed"); - CHECK_CUDA_RET_WITH_ERROR(cudaFree(d_nudge_max), "Free gpu memory failed"); } else { CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(output, gradient, input_size_, cudaMemcpyDeviceToDevice, reinterpret_cast(stream_ptr)), @@ -140,6 +128,6 @@ bool FakeQuantGradGpuKernel::Launch(const std::vector &inputs, const return true; } -MS_REG_GPU_KERNEL(FakeQuantWithMinMaxGrad, FakeQuantGradGpuKernel) +MS_REG_GPU_KERNEL(FakeQuantPerLayerGrad, FakeQuantPerLayerGradGpuKernel) } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/kernel/gpu/quant/fake_quant_grad_gpu_kernel.h b/mindspore/ccsrc/kernel/gpu/quant/fake_quant_perlayer_grad_gpu_kernel.h similarity index 76% rename from mindspore/ccsrc/kernel/gpu/quant/fake_quant_grad_gpu_kernel.h rename to mindspore/ccsrc/kernel/gpu/quant/fake_quant_perlayer_grad_gpu_kernel.h index 04c505d2bd..ae2ea5bfac 100644 --- a/mindspore/ccsrc/kernel/gpu/quant/fake_quant_grad_gpu_kernel.h +++ b/mindspore/ccsrc/kernel/gpu/quant/fake_quant_perlayer_grad_gpu_kernel.h @@ -14,8 +14,8 @@ * limitations under the License. 
*/ -#ifndef MINDSPORE_CCSRC_KERNEL_GPU_FAKEQUANT_GRAD_GPUKERNEL_H_ -#define MINDSPORE_CCSRC_KERNEL_GPU_FAKEQUANT_GRAD_GPUKERNEL_H_ +#ifndef MINDSPORE_CCSRC_KERNEL_GPU_FAKEQUANT_PERLAYER_GRAD_GPUKERNEL_H_ +#define MINDSPORE_CCSRC_KERNEL_GPU_FAKEQUANT_PERLAYER_GRAD_GPUKERNEL_H_ #include #include "kernel/gpu/gpu_kernel.h" @@ -23,10 +23,10 @@ namespace mindspore { namespace kernel { -class FakeQuantGradGpuKernel : public GpuKernel { +class FakeQuantPerLayerGradGpuKernel : public GpuKernel { public: - FakeQuantGradGpuKernel(); - ~FakeQuantGradGpuKernel() = default; + FakeQuantPerLayerGradGpuKernel(); + ~FakeQuantPerLayerGradGpuKernel() = default; const std::vector &GetInputSizeList() const override; const std::vector &GetOutputSizeList() const override; @@ -40,9 +40,6 @@ class FakeQuantGradGpuKernel : public GpuKernel { private: size_t input_size_; - size_t min_size_; - size_t max_size_; - size_t output_size_; size_t workspace_size_; std::vector input_size_list_; std::vector output_size_list_; @@ -51,11 +48,13 @@ class FakeQuantGradGpuKernel : public GpuKernel { int num_bits_; float quant_min_; float quant_max_; - int quant_size_; + int quant_num_; int quant_delay_; int global_step_; + bool narrow_range_; + bool symmetric_; }; } // namespace kernel } // namespace mindspore -#endif // MINDSPORE_CCSRC_KERNEL_GPU_FAKEQUANT_GRAD_GPUKERNEL_H_ +#endif // MINDSPORE_CCSRC_KERNEL_GPU_FAKEQUANT_PERLAYER_GRAD_GPUKERNEL_H_ diff --git a/mindspore/ccsrc/kernel/gpu/quant/minmax_update_perchannel_gpu_kernel.cc b/mindspore/ccsrc/kernel/gpu/quant/minmax_update_perchannel_gpu_kernel.cc new file mode 100644 index 0000000000..a8ce72148b --- /dev/null +++ b/mindspore/ccsrc/kernel/gpu/quant/minmax_update_perchannel_gpu_kernel.cc @@ -0,0 +1,96 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel/gpu/quant/minmax_update_perchannel_gpu_kernel.h" +#include "kernel/gpu/cuda_impl/minmax_update_impl.cuh" +#include +#include +#include +#include + +namespace mindspore { +namespace kernel { +MinMaxUpdatePerChannelGpuKernel::MinMaxUpdatePerChannelGpuKernel() + : input_size_(0), quant_num_(1), ema_(false), ema_decay_(0), num_channels_(0) {} + +const std::vector &MinMaxUpdatePerChannelGpuKernel::GetInputSizeList() const { return input_size_list_; } + +const std::vector &MinMaxUpdatePerChannelGpuKernel::GetOutputSizeList() const { return output_size_list_; } + +const std::vector &MinMaxUpdatePerChannelGpuKernel::GetWorkspaceSizeList() const { + return workspace_size_list_; +} + +bool MinMaxUpdatePerChannelGpuKernel::Init(const CNodePtr &kernel_node) { + size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); + if (input_num != 3) { + MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but FakeQuant GpuKernel OP needs 3 output."; + } + + size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); + if (output_num != 2) { + MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but FakeQuant GpuKernel OP needs 1 output."; + } + + ema_ = GetValue(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("ema")); + ema_decay_ = GetValue(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("ema_decay")); + + // init size + auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); + num_channels_ = SizeToInt(input_shape[0]); + for (size_t i = 0; i < input_shape.size(); ++i) { + quant_num_ *= 
SizeToInt(input_shape[i]); + } + input_size_ = sizeof(float); + for (size_t i = 0; i < input_shape.size(); i++) { + input_size_ *= input_shape[i]; + } + InitSizeLists(); + return true; +} + +void MinMaxUpdatePerChannelGpuKernel::InitSizeLists() { + input_size_list_.push_back(input_size_); // input + input_size_list_.push_back(sizeof(float) * num_channels_); // min + input_size_list_.push_back(sizeof(float) * num_channels_); // max + output_size_list_.push_back(sizeof(float) * num_channels_); // output min + output_size_list_.push_back(sizeof(float) * num_channels_); // output max +} + +bool MinMaxUpdatePerChannelGpuKernel::Launch(const std::vector &inputs, const std::vector &, + const std::vector &outputs, void *stream_ptr) { + float *output_min = GetDeviceAddress(outputs, 0); + float *output_max = GetDeviceAddress(outputs, 1); + float *input = GetDeviceAddress(inputs, 0); + float *input_min = GetDeviceAddress(inputs, 1); + float *input_max = GetDeviceAddress(inputs, 2); + + if (input == nullptr) { + MS_LOG(EXCEPTION) << "MinMaxUpdatePerChannelGpuKernel input x is null."; + } + if (input_min == nullptr || input_max == nullptr) { + MS_LOG(EXCEPTION) << "MinMaxUpdatePerChannelGpuKernel input min or input max is null."; + } + + // calculate the input min and max according by the parameter ema and ema_decay. 
+ CalMinMaxPerChannel(input, input_min, input_max, output_min, output_max, input_size_ / sizeof(float), num_channels_, + ema_decay_, ema_, reinterpret_cast(stream_ptr)); + return true; +} + +MS_REG_GPU_KERNEL(MinMaxUpdatePerChannel, MinMaxUpdatePerChannelGpuKernel) +} // namespace kernel +} // namespace mindspore diff --git a/mindspore/ccsrc/kernel/gpu/quant/minmax_update_perchannel_gpu_kernel.h b/mindspore/ccsrc/kernel/gpu/quant/minmax_update_perchannel_gpu_kernel.h new file mode 100644 index 0000000000..563a583ca1 --- /dev/null +++ b/mindspore/ccsrc/kernel/gpu/quant/minmax_update_perchannel_gpu_kernel.h @@ -0,0 +1,55 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_KERNEL_GPU_MINMAX_UPDATE_PERCHANNEL_GPUKERNEL_H_ +#define MINDSPORE_CCSRC_KERNEL_GPU_MINMAX_UPDATE_PERCHANNEL_GPUKERNEL_H_ + +#include +#include "kernel/gpu/gpu_kernel.h" +#include "kernel/gpu/gpu_kernel_factory.h" + +namespace mindspore { +namespace kernel { +class MinMaxUpdatePerChannelGpuKernel : public GpuKernel { + public: + MinMaxUpdatePerChannelGpuKernel(); + ~MinMaxUpdatePerChannelGpuKernel() = default; + + const std::vector &GetInputSizeList() const override; + const std::vector &GetOutputSizeList() const override; + const std::vector &GetWorkspaceSizeList() const override; + bool Launch(const std::vector &inputs, const std::vector &workspace, + const std::vector &outputs, void *stream_ptr) override; + bool Init(const CNodePtr &kernel) override; + + protected: + void InitSizeLists() override; + + private: + size_t input_size_; + std::vector input_size_list_; + std::vector output_size_list_; + std::vector workspace_size_list_; + + int quant_num_; + bool ema_; + float ema_decay_; + int num_channels_; +}; +} // namespace kernel +} // namespace mindspore + +#endif // MINDSPORE_CCSRC_KERNEL_GPU_MINMAX_UPDATE_PERCHANNEL_GPUKERNEL_H_ diff --git a/mindspore/ccsrc/kernel/gpu/quant/minmax_update_perlayer_gpu_kernel.cc b/mindspore/ccsrc/kernel/gpu/quant/minmax_update_perlayer_gpu_kernel.cc new file mode 100644 index 0000000000..3659665b23 --- /dev/null +++ b/mindspore/ccsrc/kernel/gpu/quant/minmax_update_perlayer_gpu_kernel.cc @@ -0,0 +1,93 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel/gpu/quant/minmax_update_perlayer_gpu_kernel.h" +#include "kernel/gpu/cuda_impl/minmax_update_impl.cuh" +#include +#include +#include +#include + +namespace mindspore { +namespace kernel { +MinMaxUpdatePerLayerGpuKernel::MinMaxUpdatePerLayerGpuKernel() + : input_size_(0), quant_num_(1), ema_(false), ema_decay_(0) {} + +const std::vector &MinMaxUpdatePerLayerGpuKernel::GetInputSizeList() const { return input_size_list_; } + +const std::vector &MinMaxUpdatePerLayerGpuKernel::GetOutputSizeList() const { return output_size_list_; } + +const std::vector &MinMaxUpdatePerLayerGpuKernel::GetWorkspaceSizeList() const { return workspace_size_list_; } + +bool MinMaxUpdatePerLayerGpuKernel::Init(const CNodePtr &kernel_node) { + size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); + if (input_num != 3) { + MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but FakeQuant GpuKernel OP needs 3 output."; + } + + size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); + if (output_num != 2) { + MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but FakeQuant GpuKernel OP needs 1 output."; + } + + ema_ = GetValue(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("ema")); + ema_decay_ = GetValue(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("ema_decay")); + + // init size + auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); + for (size_t i = 0; i < input_shape.size(); ++i) { + quant_num_ *= SizeToInt(input_shape[i]); + } + input_size_ = sizeof(float); + for (size_t i = 0; i < 
input_shape.size(); i++) { + input_size_ *= input_shape[i]; + } + InitSizeLists(); + return true; +} + +void MinMaxUpdatePerLayerGpuKernel::InitSizeLists() { + input_size_list_.push_back(input_size_); // input + input_size_list_.push_back(sizeof(float)); // input min + input_size_list_.push_back(sizeof(float)); // input max + output_size_list_.push_back(sizeof(float)); // output min + output_size_list_.push_back(sizeof(float)); // output max +} + +bool MinMaxUpdatePerLayerGpuKernel::Launch(const std::vector &inputs, const std::vector &, + const std::vector &outputs, void *stream_ptr) { + float *output_min = GetDeviceAddress(outputs, 0); + float *output_max = GetDeviceAddress(outputs, 1); + float *input = GetDeviceAddress(inputs, 0); + float *input_min = GetDeviceAddress(inputs, 1); + float *input_max = GetDeviceAddress(inputs, 2); + + if (input == nullptr) { + MS_LOG(EXCEPTION) << "MinMaxUpdatePerLayerGpuKernel input x is null."; + } + if (input_min == nullptr || input_max == nullptr) { + MS_LOG(EXCEPTION) << "MinMaxUpdatePerLayerGpuKernel input min or input max is null."; + } + + CalMinMaxPerLayer(input, input_min, input_max, output_min, output_max, quant_num_, ema_decay_, ema_, + reinterpret_cast(stream_ptr)); + + return true; +} + +MS_REG_GPU_KERNEL(MinMaxUpdatePerLayer, MinMaxUpdatePerLayerGpuKernel) +} // namespace kernel +} // namespace mindspore diff --git a/mindspore/ccsrc/kernel/gpu/quant/minmax_update_perlayer_gpu_kernel.h b/mindspore/ccsrc/kernel/gpu/quant/minmax_update_perlayer_gpu_kernel.h new file mode 100644 index 0000000000..a237b6dc26 --- /dev/null +++ b/mindspore/ccsrc/kernel/gpu/quant/minmax_update_perlayer_gpu_kernel.h @@ -0,0 +1,54 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_KERNEL_GPU_MINMAX_UPDATE_PERLAYER_GPUKERNEL_H_ +#define MINDSPORE_CCSRC_KERNEL_GPU_MINMAX_UPDATE_PERLAYER_GPUKERNEL_H_ + +#include +#include "kernel/gpu/gpu_kernel.h" +#include "kernel/gpu/gpu_kernel_factory.h" + +namespace mindspore { +namespace kernel { +class MinMaxUpdatePerLayerGpuKernel : public GpuKernel { + public: + MinMaxUpdatePerLayerGpuKernel(); + ~MinMaxUpdatePerLayerGpuKernel() = default; + + const std::vector &GetInputSizeList() const override; + const std::vector &GetOutputSizeList() const override; + const std::vector &GetWorkspaceSizeList() const override; + bool Launch(const std::vector &inputs, const std::vector &workspace, + const std::vector &outputs, void *stream_ptr) override; + bool Init(const CNodePtr &kernel) override; + + protected: + void InitSizeLists() override; + + private: + size_t input_size_; + std::vector input_size_list_; + std::vector output_size_list_; + std::vector workspace_size_list_; + + int quant_num_; + bool ema_; + float ema_decay_; +}; +} // namespace kernel +} // namespace mindspore + +#endif // MINDSPORE_CCSRC_KERNEL_GPU_MINMAX_UPDATE_PERLAYER_GPUKERNEL_H_ diff --git a/mindspore/ccsrc/kernel/hccl/hccl_kernel.cc b/mindspore/ccsrc/kernel/hccl/hccl_kernel.cc index 493998c168..87fb8d743d 100644 --- a/mindspore/ccsrc/kernel/hccl/hccl_kernel.cc +++ b/mindspore/ccsrc/kernel/hccl/hccl_kernel.cc @@ -129,7 +129,7 @@ std::vector HcclKernel::GenTask(const std::vector &inpu const std::vector &workspace, const std::vector &outputs, uint32_t stream_id) { if (inputs.empty() || 
outputs.empty()) { - MS_LOG(EXCEPTION) << "inputs or outputs is empty"; + MS_LOG(EXCEPTION) << "Inputs or outputs is empty"; } stream_id_ = stream_id; std::string hccl_type = AnfAlgo::GetCNodeName(anf_node_); diff --git a/mindspore/ccsrc/kernel/hccl/hccl_kernel_metadata.cc b/mindspore/ccsrc/kernel/hccl/hccl_kernel_metadata.cc index f0a0dda258..601d5cf1ea 100755 --- a/mindspore/ccsrc/kernel/hccl/hccl_kernel_metadata.cc +++ b/mindspore/ccsrc/kernel/hccl/hccl_kernel_metadata.cc @@ -23,6 +23,8 @@ namespace mindspore { namespace kernel { void HcclMetadataInfo(const CNodePtr &kernel_node, std::vector> *kernel_info_list) { + const std::vector kHcclSupportTypes = {kNumberTypeInt8, kNumberTypeInt32, kNumberTypeFloat16, + kNumberTypeFloat32, kNumberTypeInt16}; MS_EXCEPTION_IF_NULL(kernel_info_list); MS_EXCEPTION_IF_NULL(kernel_node); std::string op_name = AnfAlgo::GetCNodeName(kernel_node); @@ -30,27 +32,27 @@ void HcclMetadataInfo(const CNodePtr &kernel_node, std::vector inputs_format{}; - std::vector inputs_type{}; - for (size_t input_index = 0; input_index < AnfAlgo::GetInputTensorNum(kernel_node); ++input_index) { - inputs_format.emplace_back(AnfAlgo::GetPrevNodeOutputFormat(kernel_node, input_index)); - inputs_type.push_back(AnfAlgo::GetPrevNodeOutputDeviceDataType(kernel_node, input_index)); - } - - std::vector outputs_format; - std::vector outputs_type; - for (size_t output_index = 0; output_index < AnfAlgo::GetOutputTensorNum(kernel_node); ++output_index) { - outputs_format.emplace_back(AnfAlgo::GetPrevNodeOutputFormat(kernel_node, output_index)); - outputs_type.push_back(AnfAlgo::GetOutputInferDataType(kernel_node, output_index)); + for (const auto &type : kHcclSupportTypes) { + std::vector inputs_format{}; + std::vector inputs_type{}; + for (size_t input_index = 0; input_index < AnfAlgo::GetInputTensorNum(kernel_node); ++input_index) { + inputs_format.emplace_back(AnfAlgo::GetPrevNodeOutputFormat(kernel_node, input_index)); + inputs_type.push_back(type); + } + 
std::vector outputs_format; + std::vector outputs_type; + for (size_t output_index = 0; output_index < AnfAlgo::GetOutputTensorNum(kernel_node); ++output_index) { + outputs_format.emplace_back(AnfAlgo::GetPrevNodeOutputFormat(kernel_node, output_index)); + outputs_type.push_back(type); + } + auto builder = KernelBuildInfo::KernelBuildInfoBuilder(); + builder.SetInputsFormat(inputs_format); + builder.SetInputsDeviceType(inputs_type); + builder.SetOutputsFormat(outputs_format); + builder.SetOutputsDeviceType(outputs_type); + builder.SetKernelType(HCCL_KERNEL); + kernel_info_list->push_back(builder.Build()); } - auto builder = KernelBuildInfo::KernelBuildInfoBuilder(); - builder.SetInputsFormat(inputs_format); - builder.SetInputsDeviceType(inputs_type); - builder.SetOutputsFormat(outputs_format); - builder.SetOutputsDeviceType(outputs_type); - builder.SetKernelType(HCCL_KERNEL); - kernel_info_list->push_back(builder.Build()); } } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/kernel/hccl/hcom_all_broadcast.cc b/mindspore/ccsrc/kernel/hccl/hcom_all_broadcast.cc index dba692606c..9dbe708ef9 100644 --- a/mindspore/ccsrc/kernel/hccl/hcom_all_broadcast.cc +++ b/mindspore/ccsrc/kernel/hccl/hcom_all_broadcast.cc @@ -32,7 +32,12 @@ bool HcomAllBroadCastKernel::Launch(const std::vector &inputs, if (context_ptr->enable_task_sink()) { return true; } + if (inputs.empty() || hccl_data_type_list_.empty()) { + MS_LOG(ERROR) << "BroadCast param is empty"; + return false; + } const char *tag = "Hccl-BroadCast"; + MS_EXCEPTION_IF_NULL(inputs[0]); hcclResult_t ret = hcom_broadcast(tag, inputs[0]->addr, hccl_count_, hccl_data_type_list_[0], root_id_, nullptr, stream_ptr); if (ret != HCCL_SUCCESS) { diff --git a/mindspore/ccsrc/kernel/hccl/hcom_all_gather.cc b/mindspore/ccsrc/kernel/hccl/hcom_all_gather.cc index 67cd1001e3..6494f7fd12 100644 --- a/mindspore/ccsrc/kernel/hccl/hcom_all_gather.cc +++ b/mindspore/ccsrc/kernel/hccl/hcom_all_gather.cc @@ -31,6 +31,10 
@@ bool HcomAllGatherKernel::Launch(const std::vector &inputs, const st if (context_ptr->enable_task_sink()) { return true; } + if (inputs.empty() || hccl_data_type_list_.empty()) { + MS_LOG(ERROR) << "AllGather param is empty"; + return false; + } const char *tag = "Hccl-AllGather"; hcclResult_t ret = hcom_all_gather(tag, inputs[0]->addr, outputs[0]->addr, hccl_count_, hccl_data_type_list_[0], nullptr, stream_ptr); diff --git a/mindspore/ccsrc/kernel/hccl/hcom_all_reduce.cc b/mindspore/ccsrc/kernel/hccl/hcom_all_reduce.cc index 2bf9823e5d..35a058e766 100644 --- a/mindspore/ccsrc/kernel/hccl/hcom_all_reduce.cc +++ b/mindspore/ccsrc/kernel/hccl/hcom_all_reduce.cc @@ -31,6 +31,10 @@ bool HcomAllReduceKernel::Launch(const std::vector &inputs, const st if (context_ptr->enable_task_sink()) { return true; } + if (inputs.empty() || outputs.empty() || hccl_data_type_list_.empty()) { + MS_LOG(ERROR) << "AllReduce param is empty"; + return false; + } const char *tag = "Hccl-AllReduce"; hcclResult_t ret = hcom_all_reduce(tag, inputs[0]->addr, outputs[0]->addr, hccl_count_, hccl_data_type_list_[0], op_type_, nullptr, stream_ptr); diff --git a/mindspore/ccsrc/kernel/hccl/hcom_all_reduce_scatter.cc b/mindspore/ccsrc/kernel/hccl/hcom_all_reduce_scatter.cc index 05217108d9..dea516885d 100644 --- a/mindspore/ccsrc/kernel/hccl/hcom_all_reduce_scatter.cc +++ b/mindspore/ccsrc/kernel/hccl/hcom_all_reduce_scatter.cc @@ -32,6 +32,10 @@ bool HcomAllReduceScatterKernel::Launch(const std::vector &inputs, if (context_ptr->enable_task_sink()) { return true; } + if (inputs.empty() || outputs.empty() || hccl_data_type_list_.empty()) { + MS_LOG(ERROR) << "ReduceScatter param is empty"; + return false; + } const char *tag = "Hccl-ReduceScatter"; hcclResult_t ret = hcom_reduce_scatter(tag, inputs[0]->addr, outputs[0]->addr, hccl_count_, hccl_data_type_list_[0], op_type_, nullptr, stream_ptr); diff --git a/mindspore/ccsrc/kernel/hccl/hcom_util.cc b/mindspore/ccsrc/kernel/hccl/hcom_util.cc index 
f2d35878d8..61a4d43eb5 100644 --- a/mindspore/ccsrc/kernel/hccl/hcom_util.cc +++ b/mindspore/ccsrc/kernel/hccl/hcom_util.cc @@ -66,6 +66,7 @@ bool HcomUtil::GetHcomDataType(const AnfNodePtr &anf_node, vector &shape, size_t *size) { + MS_EXCEPTION_IF_NULL(size); int tmp_size = 1; uint32_t type_size = 4; for (size_t i = 0; i < shape.size(); i++) { @@ -84,6 +85,7 @@ bool HcomUtil::GetHcclOpSize(const hcclDataType_t &data_type, const vector #include "mindspore/ccsrc/kernel/kernel.h" #include "kernel/kernel.h" -#include "kernel/akg/akgkernelbuild.h" +#include "kernel/akg/akg_kernel_build.h" #include "nlohmann/json.hpp" #include "securec/include/securec.h" #include "pipeline/parse/python_adapter.h" diff --git a/mindspore/ccsrc/kernel/kernel.h b/mindspore/ccsrc/kernel/kernel.h index 271f6f20fa..7bccce49c3 100644 --- a/mindspore/ccsrc/kernel/kernel.h +++ b/mindspore/ccsrc/kernel/kernel.h @@ -27,11 +27,11 @@ #include "utils/log_adapter.h" namespace mindspore { -enum KernelType : int { UNKNOWN_KERNEL_TYPE = 0, AUTO_DIFF_KERNEL, AICPU_KERNEL, RT_KERNEL, HCCL_KERNEL, TBE_KERNEL }; +enum KernelType : int { UNKNOWN_KERNEL_TYPE = 0, AKG_KERNEL, AICPU_KERNEL, RT_KERNEL, HCCL_KERNEL, TBE_KERNEL }; namespace kernel { -enum Axis { +enum Axis : int { N = 0, C, H, @@ -45,6 +45,7 @@ enum FusionType { COMMREDUCE, SEGMENT, OPAQUE, + DYNAMIC, UNKNOWN_FUSION_TYPE = -1, }; enum OpPattern { diff --git a/mindspore/ccsrc/kernel/kernel_build_info.cc b/mindspore/ccsrc/kernel/kernel_build_info.cc index ce7164a0d1..c912a0c199 100644 --- a/mindspore/ccsrc/kernel/kernel_build_info.cc +++ b/mindspore/ccsrc/kernel/kernel_build_info.cc @@ -105,7 +105,12 @@ bool KernelBuildInfo::operator==(const KernelBuildInfo &other) const { return false; } if (inputs_format_ != other.inputs_format_ || outputs_format_ != other.outputs_format_) { - return false; + if (op_pattern_ != kFormatAgnosticPattern) { + return false; + } else { + MS_LOG(INFO) << "this kernel build info:" << this->ToString() + << ", other kernel 
build info: " << other.ToString(); + } } return !(inputs_device_type_ != other.inputs_device_type_ || outputs_device_type_ != other.outputs_device_type_); } @@ -167,5 +172,20 @@ void KernelBuildInfo::KernelBuildInfoBuilder::SetOpPattern(OpPattern pattern) { MS_EXCEPTION_IF_NULL(kernel_build_info_); kernel_build_info_->op_pattern_ = pattern; } +void KernelBuildInfo::KernelBuildInfoBuilder::SetInputFormat(const std::string &format, size_t index) { + MS_EXCEPTION_IF_NULL(kernel_build_info_); + if (index >= kernel_build_info_->inputs_format_.size()) { + MS_LOG(EXCEPTION) << "index outof range!"; + } + kernel_build_info_->inputs_format_[index] = format; +} + +void KernelBuildInfo::KernelBuildInfoBuilder::SetOutputFormat(const std::string &format, size_t index) { + MS_EXCEPTION_IF_NULL(kernel_build_info_); + if (index >= kernel_build_info_->outputs_format_.size()) { + MS_LOG(EXCEPTION) << "index outof range!"; + } + kernel_build_info_->outputs_format_[index] = format; +} } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/kernel/kernel_build_info.h b/mindspore/ccsrc/kernel/kernel_build_info.h index d17b41a6fc..ca1083fd68 100644 --- a/mindspore/ccsrc/kernel/kernel_build_info.h +++ b/mindspore/ccsrc/kernel/kernel_build_info.h @@ -31,7 +31,7 @@ class KernelBuildInfo { class KernelBuildInfoBuilder; KernelBuildInfo() { - kernel_type_ = AUTO_DIFF_KERNEL; + kernel_type_ = TBE_KERNEL; fusion_type_ = OPAQUE; processor_ = AICORE; op_pattern_ = kCommonPattern; @@ -131,6 +131,10 @@ class KernelBuildInfo::KernelBuildInfoBuilder { void SetOpPattern(OpPattern pattern); + void SetInputFormat(const std::string &format, size_t index); + + void SetOutputFormat(const std::string &format, size_t index); + std::shared_ptr Build(); private: diff --git a/mindspore/ccsrc/kernel/kernel_fusion.cc b/mindspore/ccsrc/kernel/kernel_fusion.cc index 4e1ad97e23..be79eca15a 100644 --- a/mindspore/ccsrc/kernel/kernel_fusion.cc +++ b/mindspore/ccsrc/kernel/kernel_fusion.cc @@ -102,7 
+102,8 @@ std::map KernelFusion(const std::vector while (!build_manger->IsAllTaskFinish()) { int task_id = -1; char *task_result = nullptr; - auto ret = build_manger->WaitOne(&task_id, &task_result); + char *pre_build_result = nullptr; + auto ret = build_manger->WaitOne(&task_id, &task_result, &pre_build_result); if (!ret) { MS_EXCEPTION(ArgumentError) << "Build Failed. wait one ret:" << ret << ", task id:" << task_id; } diff --git a/mindspore/ccsrc/kernel/kernel_query.cc b/mindspore/ccsrc/kernel/kernel_query.cc index 8d3ee64591..5eda847917 100755 --- a/mindspore/ccsrc/kernel/kernel_query.cc +++ b/mindspore/ccsrc/kernel/kernel_query.cc @@ -20,7 +20,8 @@ #include "kernel/aicpu/aicpu_kernel_metadata.h" #include "kernel/rts/rt_kernel_info.h" #include "kernel/hccl/hccl_kernel_metadata.h" -#include "kernel/tbe/tbe_kernel_select.h" +#include "kernel/tbe/tbe_kernel_select/tbe_kernel_select.h" +#include "kernel/akg/akg_kernel_metadata.h" #include "session/anf_runtime_algorithm.h" namespace mindspore { @@ -31,7 +32,7 @@ void FilterInvalidKernelInfo(const CNodePtr &kernel_node, MS_EXCEPTION_IF_NULL(kernel_info_list); std::vector> filtered_list; (void)std::copy_if(kernel_info_list->begin(), kernel_info_list->end(), std::back_inserter(filtered_list), - [&](const std::shared_ptr &kernel_build_info) { + [&kernel_node](const std::shared_ptr &kernel_build_info) { return AnfAlgo::GetOutputTensorNum(kernel_node) == kernel_build_info->GetOutputNum() && AnfAlgo::GetInputTensorNum(kernel_node) == kernel_build_info->GetInputNum(); }); @@ -39,28 +40,40 @@ void FilterInvalidKernelInfo(const CNodePtr &kernel_node, kernel_info_list->clear(); (void)std::copy(filtered_list.begin(), filtered_list.end(), std::back_inserter(*kernel_info_list)); } else { - MS_LOG(WARNING) << "All kernel Info list does not match any kernel info "; + MS_LOG(INFO) << "All kernel Info list does not match any kernel info "; for (size_t index = 0; index < kernel_info_list->size(); ++index) { - 
MS_EXCEPTION_IF_NULL(kernel_info_list->at(index)); - MS_LOG(WARNING) << "kernel [ " << index << " ] :" << kernel_info_list->at(index)->ToString(); + std::ostringstream buffer; + auto kernel_info = kernel_info_list->at(index); + MS_EXCEPTION_IF_NULL(kernel_info); + if (AnfAlgo::GetOutputTensorNum(kernel_node) != kernel_info->GetOutputNum()) { + buffer << "Kernel node's output size [" << AnfAlgo::GetOutputTensorNum(kernel_node) << "]" + << " cannot match the kernel's output size [" << kernel_info->GetOutputNum() << "]"; + } else { + buffer << "Kernel node's output size [" << AnfAlgo::GetInputTensorNum(kernel_node) << "]" + << " cannot match the kernel's output size [" << kernel_info->GetInputNum() << "]"; + } + MS_LOG(INFO) << "kernel [ " << index << " ] :" << kernel_info->ToString() << buffer.str(); } kernel_info_list->clear(); - MS_LOG(WARNING) << "node" << kernel_node->DebugString() << "'s output size : [" - << AnfAlgo::GetOutputTensorNum(kernel_node) << "]" - << "input size : [" << AnfAlgo::GetInputTensorNum(kernel_node) << "] cannot match any kernelInfo !"; + MS_LOG(INFO) << "node" << kernel_node->DebugString() << "'s output size : [" + << AnfAlgo::GetOutputTensorNum(kernel_node) << "]" + << "input size : [" << AnfAlgo::GetInputTensorNum(kernel_node) << "] cannot match any kernelInfo !"; } } } // namespace -void KernelQuery(const CNodePtr &kernel_node, std::vector> *kernel_info_list) { + +void KernelQueryAll(const CNodePtr &kernel_node, + std::vector> *kernel_info_list) { MS_EXCEPTION_IF_NULL(kernel_node); MS_EXCEPTION_IF_NULL(kernel_info_list); + TbeMetadataInfo(kernel_node, kernel_info_list); - FilterInvalidKernelInfo(kernel_node, kernel_info_list); + if (kernel_info_list->empty()) { AicpuMetadataInfo(kernel_node, kernel_info_list); if (!kernel_info_list->empty()) { - MS_LOG(WARNING) << "The node [" << kernel_node->DebugString() - << "] cannot find valid TBE kernel info, try to get aicpu kernel info"; + MS_LOG(INFO) << "The node [" << 
kernel_node->DebugString() + << "] cannot find valid TBE kernel info, try to get aicpu kernel info"; AnfAlgo::SetNodeAttr(kAttrIsAICPUKernel, MakeValue(true), kernel_node); } } @@ -75,6 +88,28 @@ void KernelQuery(const CNodePtr &kernel_node, std::vectorempty()) { MS_LOG(EXCEPTION) << "Op " << kernel_node->DebugString() << "kernel query fail!"; } +} + +void KernelQuery(const CNodePtr &kernel_node, std::vector> *kernel_info_list, + KernelType kernel_type) { + MS_EXCEPTION_IF_NULL(kernel_node); + MS_EXCEPTION_IF_NULL(kernel_info_list); + + std::string op_name = AnfAlgo::GetCNodeName(kernel_node); + + switch (kernel_type) { + case KernelType::AKG_KERNEL: + AkgMetadataInfo(kernel_node, kernel_info_list); + break; + default: + KernelQueryAll(kernel_node, kernel_info_list); + break; + } + + if (kernel_info_list->empty()) { + MS_EXCEPTION(NotExistsError) << "Op[" << kernel_node->DebugString() << "] kernel query fail!"; + } + // check output FilterInvalidKernelInfo(kernel_node, kernel_info_list); } @@ -106,7 +141,6 @@ bool IsSupportedByAICore(const AnfNodePtr &kernel_node, const KernelBuildInfoPtr auto cnode = kernel_node->cast(); MS_EXCEPTION_IF_NULL(cnode); TbeMetadataInfo(cnode, &kernel_info_list); - FilterInvalidKernelInfo(cnode, &kernel_info_list); return std::any_of(kernel_info_list.begin(), kernel_info_list.end(), [&select_kernel_build_info](const kernel::KernelBuildInfoPtr item) { MS_EXCEPTION_IF_NULL(item); diff --git a/mindspore/ccsrc/kernel/kernel_query.h b/mindspore/ccsrc/kernel/kernel_query.h index fe8696a919..257b0cf073 100644 --- a/mindspore/ccsrc/kernel/kernel_query.h +++ b/mindspore/ccsrc/kernel/kernel_query.h @@ -25,7 +25,8 @@ namespace mindspore { namespace kernel { -void KernelQuery(const CNodePtr &kernel_node, std::vector> *kernel_info_list); +void KernelQuery(const CNodePtr &kernel_node, std::vector> *kernel_info_list, + KernelType kernel_type = KernelType::UNKNOWN_KERNEL_TYPE); void AICPUQuery(const CNodePtr &kernel_node, std::vector> 
*kernel_info_list); bool IsSupportedByAICPU(const AnfNodePtr &kernel_node, const KernelBuildInfoPtr &select_kernel_build_info); bool IsSupportedByAICore(const AnfNodePtr &kernel_node, const KernelBuildInfoPtr &select_kernel_build_info); diff --git a/mindspore/ccsrc/kernel/oplib/opinfo.h b/mindspore/ccsrc/kernel/oplib/opinfo.h index 8d7b543ea6..f224a97efc 100644 --- a/mindspore/ccsrc/kernel/oplib/opinfo.h +++ b/mindspore/ccsrc/kernel/oplib/opinfo.h @@ -90,17 +90,36 @@ class OpIOInfo { class OpInfo { public: OpInfo() = default; + OpInfo(const OpInfo &opinfo) { + op_name_ = opinfo.op_name(); + imply_type_ = opinfo.imply_type(); + + impl_path_ = opinfo.impl_path(); + fusion_type_ = opinfo.fusion_type(); + async_flag_ = opinfo.async_flag_; + binfile_name_ = opinfo.binfile_name_; + compute_cost_ = opinfo.compute_cost_; + kernel_name_ = opinfo.kernel_name(); + partial_flag_ = opinfo.partial_flag_; + dynamic_format_ = opinfo.dynamic_format_; + op_pattern_ = opinfo.op_pattern(); + for (auto attr : opinfo.attrs_ptr()) { + attrs_ptr_.push_back(std::make_shared(*attr)); + } + for (auto input : opinfo.inputs_ptr()) { + inputs_ptr_.push_back(std::make_shared(*input)); + } + for (auto output : opinfo.outputs_ptr()) { + outputs_ptr_.push_back(std::make_shared(*output)); + } + ref_infos_ = opinfo.ref_infos(); + } ~OpInfo() = default; std::string op_name() const { return op_name_; } OpImplyType imply_type() const { return imply_type_; } std::string impl_path() const { return impl_path_; } std::string fusion_type() const { return fusion_type_; } - bool async_flag() const { return async_flag_; } - std::string binfile_name() const { return binfile_name_; } - int compute_cost() const { return compute_cost_; } std::string kernel_name() const { return kernel_name_; } - bool partial_flag() const { return partial_flag_; } - bool dynamic_format() const { return dynamic_format_; } OpPattern op_pattern() const { return op_pattern_; } std::vector> attrs_ptr() const { return attrs_ptr_; } 
std::vector> inputs_ptr() const { return inputs_ptr_; } @@ -116,16 +135,15 @@ class OpInfo { void set_compute_cost(const int compute_cost) { compute_cost_ = compute_cost; } void set_kernel_name(const std::string &kernel_name) { kernel_name_ = kernel_name; } void set_partial_flag(const bool partial_flag) { partial_flag_ = partial_flag; } - void set_dynamic_format(const bool dynamic_format) { dynamic_format_ = dynamic_format; } void set_op_pattern(const OpPattern op_pattern) { op_pattern_ = op_pattern; } void add_attrs_ptr(const std::shared_ptr &attr) { attrs_ptr_.push_back(attr); } void add_inputs_ptr(const std::shared_ptr &input) { inputs_ptr_.push_back(input); } void add_outputs_ptr(const std::shared_ptr &output) { outputs_ptr_.push_back(output); } - void set_inputs_ptr(const std::vector> &inputs) { inputs_ptr_ = inputs; } - void set_outputs_ptr(const std::vector> &outputs) { outputs_ptr_ = outputs; } bool is_ref() const { return !ref_infos_.empty(); } bool has_ref_index(size_t out_index) const { return ref_infos_.find(out_index) != ref_infos_.end(); } void add_ref_pair(size_t out_index, size_t in_index) { (void)ref_infos_.emplace(out_index, in_index); } + void ClearInputs() { (void)inputs_ptr_.clear(); } + void ClearOutputs() { (void)outputs_ptr_.clear(); } private: std::string op_name_; diff --git a/mindspore/ccsrc/kernel/oplib/oplib.cc b/mindspore/ccsrc/kernel/oplib/oplib.cc index b1bff36518..35bc407026 100644 --- a/mindspore/ccsrc/kernel/oplib/oplib.cc +++ b/mindspore/ccsrc/kernel/oplib/oplib.cc @@ -35,7 +35,7 @@ constexpr auto kKernelName = "kernel_name"; constexpr auto kPartialFlag = "partial_flag"; constexpr auto kReshapeType = "reshape_type"; constexpr auto kOpPattern = "op_pattern"; -constexpr auto kDynamicFormat = "dynamic_format"; +constexpr auto kDynamicFormat = "dynamicFormat"; constexpr auto kFormatAgnostic = "formatAgnostic"; constexpr auto kBroadcast = "broadcast"; constexpr auto kReduce = "reduce"; @@ -100,22 +100,28 @@ bool OpLib::RegOp(const 
std::string &json_string, const std::string &impl_path) void OpLib::DecodeTBESpecificInfo(const nlohmann::json &obj, const std::shared_ptr &op_info) { const std::map kOpPatternMap = {{kFormatAgnostic, kFormatAgnosticPattern}, - {kFormatAgnostic, kBroadcastPattern}, + {kBroadcast, kBroadcastPattern}, {kReduce, kReducePattern}, {kDynamicFormat, kDynamicFormatPattern}}; + MS_EXCEPTION_IF_NULL(op_info); op_info->set_async_flag(obj.at(kAsyncFlag)); op_info->set_binfile_name(obj.at(kBinfileName)); op_info->set_compute_cost(obj.at(kComputeCost)); op_info->set_kernel_name(obj.at(kKernelName)); op_info->set_partial_flag(obj.at(kPartialFlag)); + if (obj.find(kOpPattern) != obj.end()) { - if (kOpPatternMap.find(obj.at(kOpPattern)) != kOpPatternMap.end()) { - op_info->set_op_pattern(obj.at(kOpPattern)); + std::string op_pattern = obj.at(kOpPattern); + auto find_iter = kOpPatternMap.find(op_pattern); + if (find_iter == kOpPatternMap.end()) { + if (!op_pattern.empty()) { + MS_LOG(WARNING) << "Op pattern set value error: " << op_pattern; + } + op_info->set_op_pattern(kCommonPattern); + } else { + op_info->set_op_pattern(find_iter->second); } } - if (obj.find(kDynamicFormat) != obj.end()) { - op_info->set_dynamic_format(obj.at(kDynamicFormat)); - } } bool OpLib::DecodeOpInfo(const nlohmann::json &obj, const mindspore::kernel::OpImplyType imply_type, @@ -194,6 +200,7 @@ bool OpLib::DecodeAttr(const nlohmann::json &obj, const OpImplyType imply_type, bool OpLib::DecodeDtypeFormat(const nlohmann::json &dtype_format, const std::shared_ptr &op_io, size_t index) { + MS_EXCEPTION_IF_NULL(op_io); bool ret = true; try { std::vector dtype; @@ -213,6 +220,7 @@ bool OpLib::DecodeDtypeFormat(const nlohmann::json &dtype_format, const std::sha bool OpLib::DecodeInputOutput(const nlohmann::json &obj, const OpImplyType imply_type, const OpIOType io_type, const std::shared_ptr &op_info, const nlohmann::json &dtype_format) { + MS_EXCEPTION_IF_NULL(op_info); bool ret = true; try { std::shared_ptr 
op_io = std::make_shared(); @@ -264,8 +272,7 @@ std::shared_ptr OpLib::FindOp(const std::string &op_name, OpImplyType im auto context = MsContext::GetInstance(); MS_EXCEPTION_IF_NULL(context); bool is_gpu = (context->device_target() == kGPUDevice); - if ((is_gpu && (imply_type == kTBE || imply_type == kAICPU)) || - (!is_gpu && (imply_type != kTBE && imply_type != kAICPU))) { + if (is_gpu && (imply_type == kTBE || imply_type == kAICPU)) { MS_LOG(ERROR) << "FindOp failed: opname: " << op_name << ", imply_type: " << ImplTypeToStr(imply_type) << ", current op num: " << op_info_.size(); return nullptr; diff --git a/mindspore/ccsrc/kernel/oplib/oplib.h b/mindspore/ccsrc/kernel/oplib/oplib.h index 3d4dcad908..47183455a2 100644 --- a/mindspore/ccsrc/kernel/oplib/oplib.h +++ b/mindspore/ccsrc/kernel/oplib/oplib.h @@ -29,7 +29,12 @@ class OpLib { OpLib() = default; virtual ~OpLib() = default; bool RegOp(const std::string &json_string, const std::string &impl_path); + static void RegOpInfo(std::shared_ptr opinfo) { + op_info_.emplace_back(opinfo); + return; + } static std::shared_ptr FindOp(const std::string &op_name, OpImplyType imply_type); + static const std::vector> &GetAllOpsInfo() { return op_info_; } protected: static std::vector> op_info_; diff --git a/mindspore/ccsrc/kernel/oplib/oploader.h b/mindspore/ccsrc/kernel/oplib/oploader.h new file mode 100644 index 0000000000..dd4c37e80b --- /dev/null +++ b/mindspore/ccsrc/kernel/oplib/oploader.h @@ -0,0 +1,43 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_OPLOADER_H +#define MINDSPORE_OPLOADER_H + +#include +#include "kernel/oplib/oplib.h" + +namespace mindspore { +namespace kernel { +class OpInfoLoaderPy { + public: + OpInfoLoaderPy() = default; + + ~OpInfoLoaderPy() = default; + + size_t GetAllOpsInfo() { + auto ops = OpLib::GetAllOpsInfo(); + auto op_infos = new std::vector(); + for (auto op_info : ops) { + auto new_op_info = new OpInfo(*op_info); + op_infos->emplace_back(new_op_info); + } + return (size_t)op_infos; + } +}; +} // namespace kernel +} // namespace mindspore +#endif // MINDSPORE_OPLOADER_H diff --git a/mindspore/ccsrc/kernel/rts/label_switch.cc b/mindspore/ccsrc/kernel/rts/label_switch.cc index 168e1f4844..d84407a930 100644 --- a/mindspore/ccsrc/kernel/rts/label_switch.cc +++ b/mindspore/ccsrc/kernel/rts/label_switch.cc @@ -67,9 +67,7 @@ std::vector LabelSwitchKernel::GenTask(const std::vector task_info_list; cond_ = inputs[0]->addr; - // todo: need update ge task info define - auto task_info_ptr = std::make_shared(stream_id, 0); - // auto task_info_ptr = std::make_shared(stream_id, label_size_, label_list_, cond_); + auto task_info_ptr = std::make_shared(stream_id, label_size_, label_list_, cond_); MS_EXCEPTION_IF_NULL(task_info_ptr); task_info_list.emplace_back(task_info_ptr); return task_info_list; @@ -77,7 +75,6 @@ std::vector LabelSwitchKernel::GenTask(const std::vector> LabelSwitchDesc::GetKernelInfo() { std::vector> label_switch_build_info{}; - vector input_format{kOpFormat_DEFAULT, kOpFormat_DEFAULT}; vector input_type{kNumberTypeUInt32, kNumberTypeBool}; if (input_format.size() != input_type.size()) { diff --git a/mindspore/ccsrc/kernel/rts/recv.cc b/mindspore/ccsrc/kernel/rts/recv.cc index b68380dac8..c195fd1c92 100644 --- a/mindspore/ccsrc/kernel/rts/recv.cc +++ b/mindspore/ccsrc/kernel/rts/recv.cc @@ -37,6 +37,9 @@ bool RecvKernel::Init(const AnfNodePtr 
&anf_node) { MS_EXCEPTION_IF_NULL(anf_node); auto primitive = AnfAlgo::GetCNodePrimitive(anf_node); MS_EXCEPTION_IF_NULL(primitive); + if (!AnfAlgo::HasNodeAttr(kAttrEventId, anf_node->cast())) { + MS_LOG(EXCEPTION) << "RecvKernel has no attr kAttrEventId"; + } event_id_ = GetValue(primitive->GetAttr(kAttrEventId)); MS_LOG(INFO) << "recv op event_id_:" << event_id_; return true; diff --git a/mindspore/ccsrc/kernel/rts/send.cc b/mindspore/ccsrc/kernel/rts/send.cc index ebcb53069e..ccdd43ebb6 100644 --- a/mindspore/ccsrc/kernel/rts/send.cc +++ b/mindspore/ccsrc/kernel/rts/send.cc @@ -34,6 +34,9 @@ bool SendKernel::Init(const AnfNodePtr &anf_node) { MS_EXCEPTION_IF_NULL(anf_node); auto primitive = AnfAlgo::GetCNodePrimitive(anf_node); MS_EXCEPTION_IF_NULL(primitive); + if (!AnfAlgo::HasNodeAttr(kAttrEventId, anf_node->cast())) { + MS_LOG(EXCEPTION) << "SendKernel has no attr kAttrEventId"; + } event_id_ = GetValue(primitive->GetAttr(kAttrEventId)); MS_LOG(INFO) << "send op event id:" << event_id_; return true; diff --git a/mindspore/ccsrc/kernel/rts/stream_active.cc b/mindspore/ccsrc/kernel/rts/stream_active.cc index 3666dd670f..4f0895a0be 100644 --- a/mindspore/ccsrc/kernel/rts/stream_active.cc +++ b/mindspore/ccsrc/kernel/rts/stream_active.cc @@ -36,6 +36,9 @@ bool StreamActiveKernel::Init(const AnfNodePtr &anf_node) { MS_LOG(INFO) << "stream active op init start"; auto primitive = AnfAlgo::GetCNodePrimitive(anf_node); MS_EXCEPTION_IF_NULL(primitive); + if (!AnfAlgo::HasNodeAttr(kAttrActiveStreamList, anf_node->cast())) { + MS_LOG(EXCEPTION) << "StreamActiveKernel has no attr kAttrActiveStreamList"; + } active_streams_index_ = GetValue>(primitive->GetAttr(kAttrActiveStreamList)); return true; } diff --git a/mindspore/ccsrc/kernel/rts/stream_switch.cc b/mindspore/ccsrc/kernel/rts/stream_switch.cc index 9dfb3e8de0..bab6b04366 100644 --- a/mindspore/ccsrc/kernel/rts/stream_switch.cc +++ b/mindspore/ccsrc/kernel/rts/stream_switch.cc @@ -42,8 +42,17 @@ bool 
StreamSwitchKernel::Init(const AnfNodePtr &anf_node) { MS_LOG(INFO) << "stream switch op init start"; auto primitive = AnfAlgo::GetCNodePrimitive(anf_node); MS_EXCEPTION_IF_NULL(primitive); + if (!AnfAlgo::HasNodeAttr(kAttrSwitchCondition, anf_node->cast())) { + MS_LOG(EXCEPTION) << "StreamSwitchKernel has no attr kAttrSwitchCondition"; + } cond_ = tagRtCondition(GetValue(primitive->GetAttr(kAttrSwitchCondition))); + if (!AnfAlgo::HasNodeAttr(kAttrTrueBranchStream, anf_node->cast())) { + MS_LOG(EXCEPTION) << "StreamSwitchKernel has no attr kAttrTrueBranchStream"; + } true_stream_index_ = GetValue(primitive->GetAttr(kAttrTrueBranchStream)); + if (!AnfAlgo::HasNodeAttr(kAttrDataType, anf_node->cast())) { + MS_LOG(EXCEPTION) << "StreamSwitchKernel has no attr kAttrDataType"; + } data_type_ = tagRtSwitchDataType(GetValue(primitive->GetAttr(kAttrDataType))); MS_LOG(INFO) << "cond_:" << static_cast(cond_) << ", true_stream_index_:" << true_stream_index_ << ", data_type_:" << static_cast(data_type_); @@ -54,7 +63,7 @@ bool StreamSwitchKernel::Launch(const std::vector &inputs, const std const std::vector &outputs, void *stream_ptr) { MS_LOG(INFO) << "stream switch op launch start"; if (inputs.size() != 2) { - MS_LOG(ERROR) << "Stream switch inputs size is " << inputs.size() << ", only support 2"; + MS_LOG(EXCEPTION) << "Stream switch inputs size is " << inputs.size() << ", only support 2"; } void *loop_cnt = inputs[0]->addr; @@ -73,7 +82,7 @@ std::vector StreamSwitchKernel::GenTask(const std::vector tbe_func_adapter_map = { {"softmax", "softmax_v2"}, {"log_softmax", "log_softmax_v2"}, + {"apply_momentum", "apply_momentum_d"}, + {"apply_ftrl", "apply_ftrl_d"}, {"re_lu6", "relu6"}, {"re_lu6_grad", "relu6_grad"}, {"re_lu", "relu"}, @@ -51,10 +53,12 @@ static std::map tbe_func_adapter_map = { {"scatter_nd", "scatter_nd_d"}, {"tile", "tile_d"}, {"gather_v2", "gather_v2_d"}, + {"sparse_gather_v2", "gather_v2_d"}, {"batch_mat_mul", "batch_matmul"}, {"b_n_training_reduce", 
"bn_training_reduce"}, {"b_n_training_update", "bn_training_update"}, {"b_n_training_update_v2", "bn_training_update_v2"}, + {"b_n_training_update_v3", "bn_training_update_v3"}, {"b_n_training_reduce_grad", "bn_training_reduce_grad"}, {"b_n_training_update_grad", "bn_training_update_grad"}, {"b_n_infer", "bn_infer"}, @@ -66,17 +70,27 @@ static std::map tbe_func_adapter_map = { {"strided_slice", "strided_slice_d"}, {"strided_slice_grad", "strided_slice_grad_d"}, {"sparse_apply_ftrl", "sparse_apply_ftrl_d"}, + {"apply_ada_max", "apply_ada_max_d"}, + {"apply_adadelta", "apply_adadelta_d"}, + {"apply_adagrad", "apply_adagrad_d"}, + {"apply_adagrad_v2", "apply_adagradv2_d"}, + {"sparse_apply_adagrad", "sparse_apply_adagrad_d"}, + {"apply_proximal_adagrad", "apply_proximal_adagrad_d"}, + {"sparse_apply_proximal_adagrad", "sparse_apply_proximal_adagrad_d"}, {"transpose", "transpose_d"}, {"fill", "fill_d"}, {"unsorted_segment_sum", "unsorted_segment_sum_d"}, {"concat", "concat_d"}, {"slice", "slice_d"}, {"reduce_sum", "reduce_sum_d"}, + {"inplace_add", "inplace_add_d"}, + {"inplace_sub", "inplace_sub_d"}, {"one_hot", "one_hot_d"}, {"sum", "reduce_sum_d"}, {"lamb_next_mv_with_decay", "lamb_next_m_v_with_decay"}, {"lamb_next_mv", "lamb_next_m_v"}, {"split", "split_d"}, + {"split_v", "split_v_d"}, {"resize_nearest_neighbor", "resize_nearest_neighbor_v2_d"}, {"resize_nearest_neighbor_grad", "resize_nearest_neighbor_v2_grad_d"}, {"pad", "pad_d"}, @@ -88,7 +102,7 @@ static std::map tbe_func_adapter_map = { {"batch_to_space_nd", "batch_to_space_nd_d"}, {"resize_bilinear", "resize_bilinear_v2_d"}, {"resize_bilinear_grad", "resize_bilinear_v2_grad"}, - {"adam", "apply_adam"}, + {"adam", "apply_adam_d"}, {"r_oi_align", "roi_align"}, {"r_oi_align_grad", "roi_align_grad"}, {"i_ou", "iou"}, @@ -97,6 +111,9 @@ static std::map tbe_func_adapter_map = { {"n_ms_with_mask", "nms_with_mask"}, {"square_sum_all", "square_sum_all"}, {"cum_sum", "cumsum_d"}, + {"range", "range_d"}, + 
{"lin_space", "lin_space_d"}, + {"inv_grad", "inv_grad"}, {"apply_rms_prop", "apply_rms_prop_d"}, {"cum_prod", "cumprod_d"}, {"reduce_all", "reduce_all_d"}, @@ -104,7 +121,13 @@ static std::map tbe_func_adapter_map = { {"unsorted_segment_min", "unsorted_segment_min_d"}, {"reduce_prod", "reduce_prod_d"}, {"a_cos", "acos"}, - {"a_cos_grad", "acos_grad"}}; + {"a_cos_grad", "acos_grad"}, + {"histogram_fixed_width", "histogram_fixed_width_d"}, + {"broadcast_to", "broadcast_to_d"}, + {"inplace_update", "inplace_update_d"}, + {"matrix_diag", "matrix_diag_d"}, + {"matrix_diag_part", "matrix_diag_part_d"}, + {"matrix_set_diag", "matrix_set_diag_d"}}; void TbeAdapter::NormalizeFuncName(std::string *func_name) { if (func_name == nullptr) { @@ -138,7 +161,7 @@ void TbeAdapter::NormalizeFuncName(std::string *func_name) { *func_name = name_tmp; auto iter = tbe_func_adapter_map.find(*func_name); if (iter != tbe_func_adapter_map.end()) { - MS_LOG(INFO) << "map actual op from me " << func_name << "to tbe op" << iter->second; + MS_LOG(INFO) << "map actual op from me " << *func_name << " to tbe op" << iter->second; *func_name = iter->second; } } @@ -176,6 +199,18 @@ void TbeAdapter::InputOrderPass(const std::string &op_name, std::vectorpush_back(inputs_list[i]); } + } else if (op_name == "ApplyCenteredRMSProp") { + // Parameter order of ApplyCenteredRMSProp's TBE implementation is different from python API, so map + // TBE parameter to correspond python API parameter by latter's index using hardcode + inputs_json->push_back(inputs_list[0]); + inputs_json->push_back(inputs_list[1]); + inputs_json->push_back(inputs_list[2]); + inputs_json->push_back(inputs_list[3]); + inputs_json->push_back(inputs_list[5]); + inputs_json->push_back(inputs_list[6]); + inputs_json->push_back(inputs_list[7]); + inputs_json->push_back(inputs_list[8]); + inputs_json->push_back(inputs_list[4]); } else { inputs_json->push_back(inputs_list[1]); inputs_json->push_back(inputs_list[0]); @@ -316,10 +351,10 @@ 
static int TypeStrToDstType(const std::string &type_str) { ret = 4; } else if (type_str == "UInt64") { ret = 10; - } else if (type_str == "Bool_") { + } else if (type_str == "Bool") { ret = 12; } else { - MS_EXCEPTION(ArgumentError) << "type str is invailed: " << type_str; + MS_LOG(INFO) << "Error type str is invailed: " << type_str; } return ret; } diff --git a/mindspore/ccsrc/kernel/tbe/tbe_adapter.h b/mindspore/ccsrc/kernel/tbe/tbe_adapter.h index 0208d6c6a6..51c4cfd777 100644 --- a/mindspore/ccsrc/kernel/tbe/tbe_adapter.h +++ b/mindspore/ccsrc/kernel/tbe/tbe_adapter.h @@ -27,7 +27,7 @@ // the TBE back-end operator implementation difference namespace mindspore { namespace kernel { -enum kCreaterType : int { SINGLE_BUILD = 0, PREBUILD, OP_SELECT_FORMAT, CHECK_SUPPORTED }; +enum kCreaterType : int { SINGLE_BUILD = 0, PREBUILD, OP_SELECT_FORMAT, CHECK_SUPPORTED, OP_PRE_COMPILE }; namespace tbe { using FAttrsPass = void (*)(const AnfNodePtr &anf_node, const std::vector> &op_info_attrs, nlohmann::json *attrs_json); diff --git a/mindspore/ccsrc/kernel/tbe/tbe_convert_utils.cc b/mindspore/ccsrc/kernel/tbe/tbe_convert_utils.cc index 1159bd888d..90c5557253 100644 --- a/mindspore/ccsrc/kernel/tbe/tbe_convert_utils.cc +++ b/mindspore/ccsrc/kernel/tbe/tbe_convert_utils.cc @@ -45,13 +45,13 @@ const std::map type_id_str_maps = { {TypeId::kNumberTypeInt64, "int64"}, {TypeId::kNumberTypeUInt, "uint"}, {TypeId::kNumberTypeUInt8, "uint8"}, {TypeId::kNumberTypeUInt16, "uint16"}, {TypeId::kNumberTypeUInt32, "uint32"}, {TypeId::kNumberTypeUInt64, "uint64"}, - {TypeId::kNumberTypeBool, "bool"}, + {TypeId::kNumberTypeBool, "int8"}, }; const std::map type_str_maps = { {"Float32", "float32"}, {"Float16", "float16"}, {"Int8", "int8"}, {"Int16", "int16"}, {"UInt16", "uint16"}, {"UInt8", "uint8"}, {"Int32", "int32"}, {"UInt32", "uint32"}, - {"Int64", "int64"}, {"UInt64", "uint64"}, {"Bool_", "int8"}, {"Float64", "float64"}, + {"Int64", "int64"}, {"UInt64", "uint64"}, {"Bool", "int8"}, 
{"Float64", "float64"}, }; const std::unordered_map type_nbyte_maps = { @@ -63,7 +63,7 @@ const std::unordered_map type_nbyte_maps = { const std::unordered_map fusion_type_maps = { {"CONVLUTION", FusionType::CONVLUTION}, {"ELEMWISE", FusionType::ELEMWISE}, {"COMMREDUCE", FusionType::COMMREDUCE}, - {"SEGMENT", FusionType::SEGMENT}, {"OPAQUE", FusionType::OPAQUE}, + {"SEGMENT", FusionType::SEGMENT}, {"DYNAMIC", FusionType::DYNAMIC}, {"OPAQUE", FusionType::OPAQUE}, }; TypeId DtypeToTypeId(const std::string &dtypes) { @@ -74,18 +74,10 @@ TypeId DtypeToTypeId(const std::string &dtypes) { return iter->second; } -std::string DtypeToString(const std::string &dtypes) { - auto iter = type_str_maps.find(dtypes); - if (iter == type_str_maps.end()) { - MS_LOG(EXCEPTION) << "Illegal input dtype: " << dtypes; - } - return iter->second; -} - std::string TypeIdToString(TypeId type_id) { auto iter = type_id_str_maps.find(type_id); if (iter == type_id_str_maps.end()) { - MS_LOG(EXCEPTION) << "Illegal input dtype." << TypeIdLabel(type_id); + MS_LOG(EXCEPTION) << "Illegal input dtype: " << TypeIdLabel(type_id); } return iter->second; } @@ -101,7 +93,7 @@ size_t GetDtypeNbyte(const std::string &dtypes) { FusionType GetFusionType(const std::string &pattern) { auto iter = fusion_type_maps.find(pattern); if (iter == fusion_type_maps.end()) { - MS_LOG(DEBUG) << "Illegal fusion pattern: " << pattern; + MS_LOG(INFO) << "Illegal fusion pattern: " << pattern; return UNKNOWN_FUSION_TYPE; } return iter->second; @@ -115,7 +107,7 @@ std::string GetProcessor(const AnfNodePtr &anf_node) { device = kProcessorAiCore; break; default: - MS_LOG(DEBUG) << "Unknown processor type." << anf_node->fullname_with_scope(); + MS_LOG(INFO) << "Unknown processor type." 
<< anf_node->fullname_with_scope(); break; } return device; diff --git a/mindspore/ccsrc/kernel/tbe/tbe_convert_utils.h b/mindspore/ccsrc/kernel/tbe/tbe_convert_utils.h index 9b9b3770df..2c8d3008b9 100644 --- a/mindspore/ccsrc/kernel/tbe/tbe_convert_utils.h +++ b/mindspore/ccsrc/kernel/tbe/tbe_convert_utils.h @@ -28,8 +28,6 @@ namespace tbe { constexpr auto kProcessorAiCore = "aicore"; TypeId DtypeToTypeId(const std::string &dtypes); -std::string DtypeToString(const std::string &dtypes); - std::string TypeIdToString(TypeId type_id); size_t GetDtypeNbyte(const std::string &dtypes); diff --git a/mindspore/ccsrc/kernel/tbe/tbe_kernel_build.cc b/mindspore/ccsrc/kernel/tbe/tbe_kernel_build.cc index bd5b0d6323..76df819043 100644 --- a/mindspore/ccsrc/kernel/tbe/tbe_kernel_build.cc +++ b/mindspore/ccsrc/kernel/tbe/tbe_kernel_build.cc @@ -15,15 +15,12 @@ */ #include "kernel/tbe/tbe_kernel_build.h" - #include #include #include -#include - #include "operator/ops.h" +#include "parallel/ops_info/ops_utils.h" #include "session/anf_runtime_algorithm.h" -#include "kernel/tbe/tbe_kernel_mod.h" #include "kernel/tbe/tbe_adapter.h" #include "kernel/tbe/tbe_python_funcs.h" #include "kernel/tbe/tbe_convert_utils.h" @@ -37,6 +34,43 @@ constexpr auto kFusionOpList = "op_list"; constexpr auto kFusionKernelNamePrfix = "te_fusion"; constexpr auto kOptional = "optional_"; constexpr auto kOpFormat_FRACTAL_Z = "FRACTAL_Z"; +constexpr auto kPlatform = "platform"; +constexpr auto kPlatTBE = "TBE"; +constexpr auto kGenModel = "gen_model"; +constexpr auto kSingle = "single"; +constexpr auto kImplPath = "impl_path"; +constexpr auto kJInputs = "inputs"; +constexpr auto kJOutputs = "outputs"; +constexpr auto kJAttrs = "attrs"; +constexpr auto kJKernelName = "kernel_name"; +constexpr auto kJOpInfo = "op_info"; +constexpr auto kJDtype = "dtype"; +constexpr auto kJtype = "type"; +constexpr auto kJName = "name"; +constexpr auto kJOriShape = "ori_shape"; +constexpr auto kJOriFormat = "ori_format"; 
+constexpr auto kJShape = "shape"; +constexpr auto kJFormat = "format"; +constexpr auto kJValid = "valid"; +constexpr auto kJParamType = "param_type"; +constexpr auto kParamDynamic = "dynamic"; +constexpr auto kParamRequred = "required"; +constexpr auto kJDataType = "data_type"; +constexpr auto kJOutputIndex = "output_index"; +constexpr auto kJOutputDesc = "output_desc"; +constexpr auto kJInputDesc = "input_desc"; +constexpr auto kVTypeInt = "int"; +constexpr auto kVTypeStr = "str"; +constexpr auto kVTypeBool = "bool"; +constexpr auto kVTypeFloat = "float"; +constexpr auto kVTypeListInt = "listInt"; +constexpr auto kVTypeInt32 = "Int32"; +constexpr auto kVTypeListUInt64 = "listUInt64"; +constexpr auto kVTypeListFloat = "listFloat"; +constexpr auto kVTypeListListInt = "listListInt"; +constexpr auto kJValue = "value"; +constexpr auto kJDynIndex = "dyn_index"; +constexpr auto kJFuncName = "func_name"; std::string NormalizeFullScopeName(const string &full_scope_name) { // exp:Default/ReLU-op0 -->Default_ReLU_op0 @@ -46,51 +80,51 @@ std::string NormalizeFullScopeName(const string &full_scope_name) { return normal_ret; } -bool TbeKernelJsonCreator::GenTbeSingleKernelJson(const shared_ptr &anf_node, +bool TbeKernelJsonCreator::GenTbeSingleKernelJson(const std::shared_ptr &anf_node, nlohmann::json *kernel_json) { MS_EXCEPTION_IF_NULL(anf_node); MS_EXCEPTION_IF_NULL(kernel_json); std::string op_name = AnfAlgo::GetCNodeName(anf_node); auto op_info_ptr = mindspore::kernel::OpLib::FindOp(op_name, OpImplyType::kTBE); MS_EXCEPTION_IF_NULL(op_info_ptr); - (*kernel_json)["platform"] = "TBE"; - (*kernel_json)["gen_model"] = "single"; - (*kernel_json)["impl_path"] = op_info_ptr->impl_path(); + (*kernel_json)[kPlatform] = kPlatTBE; + (*kernel_json)[kGenModel] = kSingle; + (*kernel_json)[kImplPath] = op_info_ptr->impl_path(); nlohmann::json op_info_json; if (op_info_ptr->impl_path().empty()) { tbe::TbeAdapter::NormalizeFuncName(&op_name); } else { op_name = op_info_ptr->kernel_name(); 
} - op_info_json["name"] = op_name; + op_info_json[kJName] = op_name; // generate inputs json nlohmann::json inputs_json; if (!GenTbeInputsJson(anf_node, op_info_ptr, &inputs_json)) { MS_LOG(ERROR) << "Anf Node [" << op_name << "] generate inputs json failed"; return false; } - op_info_json["inputs"] = inputs_json; + op_info_json[kJInputs] = inputs_json; // generate outputs json nlohmann::json outputs_json; if (!GenTbeOutputsJson(anf_node, op_info_ptr, &outputs_json)) { MS_LOG(ERROR) << "Anf Node [" << op_name << "] generate outputs json failed"; return false; } - op_info_json["outputs"] = outputs_json; + op_info_json[kJOutputs] = outputs_json; // generate attrs json nlohmann::json attrs_json; (void)GenTbeAttrJson(anf_node, op_info_ptr, &attrs_json); - op_info_json["attrs"] = attrs_json; + op_info_json[kJAttrs] = attrs_json; std::string json_str = op_info_json.dump(); size_t hash_id = std::hash()(json_str); json_name_ = op_name + "_" + std::to_string(hash_id); json_info_ = json_str; if (creater_type_ == PREBUILD) { - op_info_json["kernel_name"] = NormalizeFullScopeName(anf_node->fullname_with_scope()); + op_info_json[kJKernelName] = NormalizeFullScopeName(anf_node->fullname_with_scope()); } else { - op_info_json["kernel_name"] = json_name_; + op_info_json[kJKernelName] = json_name_; } - (*kernel_json)["op_info"] = op_info_json; + (*kernel_json)[kJOpInfo] = op_info_json; if (creater_type_ == SINGLE_BUILD) { TbeUtils::SaveJsonInfo(json_name_, json_info_); } @@ -101,9 +135,10 @@ bool TbeKernelJsonCreator::GenTbeSingleKernelJson(const shared_ptr &anf_node, size_t real_input_index, bool value, - const shared_ptr &input_ptr, const string &op_input_name, - size_t input_i, vector *input_list) { +bool TbeKernelJsonCreator::GenInputDescJson(const std::shared_ptr &anf_node, size_t real_input_index, + bool value, const std::shared_ptr &input_ptr, + const string &op_input_name, size_t input_i, + std::vector *input_list) { MS_EXCEPTION_IF_NULL(anf_node); 
MS_EXCEPTION_IF_NULL(input_ptr); MS_EXCEPTION_IF_NULL(input_list); @@ -111,51 +146,30 @@ bool TbeKernelJsonCreator::GenInputDescJson(const shared_ptr &anf_node, if (input_ptr->name() == "input_indices" && op_name == kTopKOpName) { TbeAdapter::GenTopKV2IndicesTensorInfo(anf_node, real_input_index, input_list, creater_type_); } else { - // dtype : float16 - auto tensor_dtype = - std::make_shared(TypeIdToType(AnfAlgo::GetInputDeviceDataType(anf_node, real_input_index))); - MS_EXCEPTION_IF_NULL(tensor_dtype); - std::string dtype = tensor_dtype->element()->ToString(); - dtype = tbe::DtypeToString(dtype); - - // format - std::string format = AnfAlgo::GetInputFormat(anf_node, real_input_index); - if (format == kOpFormat_DEFAULT) { - format = kOpFormat_NCHW; - } else if (format == kOpFormat_FRAC_Z) { - format = kOpFormat_FRACTAL_Z; - } - - nlohmann::json input_desc_json; - input_desc_json["dtype"] = dtype; - input_desc_json["name"] = op_input_name + std::to_string(input_i); + auto dtype = GetDeviceInputType(anf_node, real_input_index); + auto format = GetDeviceInputFormat(anf_node, real_input_index); + auto shape = GetDeviceInputShape(anf_node, real_input_index); auto ori_shape = AnfAlgo::GetPrevNodeOutputInferShape(anf_node, real_input_index); if (ori_shape.empty()) { ori_shape.emplace_back(1); } - input_desc_json["ori_shape"] = ori_shape; - input_desc_json["ori_format"] = kOpFormat_NCHW; - auto shape = AnfAlgo::GetInputDeviceShape(anf_node, real_input_index); - if (shape.empty()) { - shape.emplace_back(1); - } - if (creater_type_ == OP_SELECT_FORMAT || creater_type_ == CHECK_SUPPORTED) { - input_desc_json["shape"] = ori_shape; - input_desc_json["format"] = kOpFormat_NCHW; - } else { - input_desc_json["shape"] = shape; - input_desc_json["format"] = format; - } - input_desc_json["valid"] = value; - input_desc_json["param_type"] = input_ptr->param_type(); + nlohmann::json input_desc_json; + input_desc_json[kJDtype] = dtype; + input_desc_json[kJName] = op_input_name + 
std::to_string(input_i); + input_desc_json[kJOriShape] = ori_shape; + input_desc_json[kJOriFormat] = kOpFormat_NCHW; + input_desc_json[kJShape] = shape; + input_desc_json[kJFormat] = format; + input_desc_json[kJValid] = value; + input_desc_json[kJParamType] = input_ptr->param_type(); input_list->emplace_back(input_desc_json); } return true; } -bool TbeKernelJsonCreator::GenInputList(const shared_ptr &anf_node, size_t input_tensor_num, - const shared_ptr &input_ptr, size_t *real_input_index, - string *op_input_name, vector *input_list) { +bool TbeKernelJsonCreator::GenInputList(const std::shared_ptr &anf_node, size_t input_tensor_num, + const std::shared_ptr &input_ptr, size_t *real_input_index, + string *op_input_name, std::vector *input_list) { MS_EXCEPTION_IF_NULL(anf_node); MS_EXCEPTION_IF_NULL(input_ptr); MS_EXCEPTION_IF_NULL(real_input_index); @@ -170,8 +184,8 @@ bool TbeKernelJsonCreator::GenInputList(const shared_ptr &anf_node, siz if (input_ptr->param_type() == "optional") { *op_input_name = input_ptr->name() + "_optional_"; nlohmann::json input_desc_json; - input_desc_json["valid"] = false; - input_desc_json["name"] = *op_input_name + std::to_string(*real_input_index); + input_desc_json[kJValid] = false; + input_desc_json[kJName] = *op_input_name + std::to_string(*real_input_index); input_list->emplace_back(input_desc_json); continue; } @@ -200,7 +214,7 @@ bool TbeKernelJsonCreator::GenInputList(const shared_ptr &anf_node, siz return true; } -bool GetInputNameAndRealNum(const std::shared_ptr &anf_node, const shared_ptr &input_ptr, +bool GetInputNameAndRealNum(const std::shared_ptr &anf_node, const std::shared_ptr &input_ptr, size_t *dyn_input_index, size_t *input_num, std::string *op_input_name) { MS_EXCEPTION_IF_NULL(anf_node); MS_EXCEPTION_IF_NULL(input_ptr); @@ -214,7 +228,7 @@ bool GetInputNameAndRealNum(const std::shared_ptr &anf_node, const shar dyn_input_sizes = GetValue>(primitive->GetAttr(kAttrDynInputSizes)); } - if (input_ptr->param_type() == 
"dynamic") { + if (input_ptr->param_type() == kParamDynamic) { if (*dyn_input_index >= dyn_input_sizes.size()) { MS_LOG(ERROR) << "dyn input index" << *dyn_input_index << "is over dyn input num" << dyn_input_sizes.size(); return false; @@ -280,9 +294,9 @@ bool TbeKernelJsonCreator::GenTbeOutputsJson(const std::shared_ptr &anf return GenOutputDescJson(anf_node, outputs_ptr, outputs_json); } -bool TbeKernelJsonCreator::GenOutputDescJson(const shared_ptr &anf_node, - const vector> &outputs_ptr, - nlohmann::json *outputs_json) { +bool TbeKernelJsonCreator::GenOutputDescJson( + const std::shared_ptr &anf_node, + const std::vector> &outputs_ptr, nlohmann::json *outputs_json) { MS_EXCEPTION_IF_NULL(outputs_json); size_t output_idx = 0; auto op_name = AnfAlgo::GetCNodeName(anf_node); @@ -290,9 +304,9 @@ bool TbeKernelJsonCreator::GenOutputDescJson(const shared_ptrparam_type() == "required") { + if (output_ptr->param_type() == kParamRequred) { output_obj_num = 1; - } else if (output_ptr->param_type() == "dynamic") { + } else if (output_ptr->param_type() == kParamDynamic) { if (outputs_ptr.size() > 1) { MS_LOG(ERROR) << "Dynamic output is unsupported multi output!"; return false; @@ -303,8 +317,8 @@ bool TbeKernelJsonCreator::GenOutputDescJson(const shared_ptrname() << " is optional, output is none."; std::vector output_list; nlohmann::json output_obj; - output_obj["name"] = output_ptr->name(); - output_obj["valid"] = false; + output_obj[kJName] = output_ptr->name(); + output_obj[kJValid] = false; output_list.emplace_back(output_obj); (*outputs_json).push_back(output_list); continue; @@ -319,46 +333,28 @@ bool TbeKernelJsonCreator::GenOutputDescJson(const shared_ptr &anf_node, const size_t &output_obj_num, - const shared_ptr &output_ptr, size_t *output_idx, - vector *output_list) { +void TbeKernelJsonCreator::GenOutputList(const std::shared_ptr &anf_node, const size_t &output_obj_num, + const std::shared_ptr &output_ptr, size_t *output_idx, + std::vector *output_list) { 
MS_EXCEPTION_IF_NULL(output_idx); MS_EXCEPTION_IF_NULL(output_list); for (size_t i = 0; i < output_obj_num; i++) { - nlohmann::json output_obj; - auto type_ptr = std::make_shared(TypeIdToType(AnfAlgo::GetOutputDeviceDataType(anf_node, *output_idx))); - std::string dtype = type_ptr->element()->ToString(); - dtype = tbe::DtypeToString(dtype); - std::string format = AnfAlgo::GetOutputFormat(anf_node, *output_idx); - if (format == kOpFormat_DEFAULT) { - format = kOpFormat_NCHW; - } else if (format == kOpFormat_FRAC_Z) { - format = kOpFormat_FRACTAL_Z; - } - std::vector ori_shape; - if (AnfAlgo::GetOutputInferShape(anf_node, *output_idx).empty()) { + auto dtype = GetDeviceOutputType(anf_node, *output_idx); + auto format = GetDeviceOutputFormat(anf_node, *output_idx); + auto shape = GetDeviceOutputShape(anf_node, *output_idx); + std::vector ori_shape = AnfAlgo::GetOutputInferShape(anf_node, *output_idx); + if (ori_shape.empty()) { ori_shape.emplace_back(1); - } else { - ori_shape = AnfAlgo::GetOutputInferShape(anf_node, *output_idx); } - output_obj["dtype"] = dtype; - auto shape = AnfAlgo::GetOutputDeviceShape(anf_node, *output_idx); - if (shape.empty()) { - shape.emplace_back(1); - } - if (creater_type_ == OP_SELECT_FORMAT || creater_type_ == CHECK_SUPPORTED) { - output_obj["shape"] = ori_shape; - output_obj["format"] = kOpFormat_NCHW; - } else { - output_obj["shape"] = shape; - output_obj["format"] = format; - } - output_obj["ori_shape"] = ori_shape; - output_obj["ori_format"] = kOpFormat_NCHW; - output_obj["name"] = output_ptr->name(); - output_obj["valid"] = true; - output_obj["param_type"] = output_ptr->param_type(); - + nlohmann::json output_obj; + output_obj[kJDtype] = dtype; + output_obj[kJShape] = shape; + output_obj[kJFormat] = format; + output_obj[kJOriShape] = ori_shape; + output_obj[kJOriFormat] = kOpFormat_NCHW; + output_obj[kJName] = output_ptr->name(); + output_obj[kJValid] = true; + output_obj[kJParamType] = output_ptr->param_type(); 
output_list->emplace_back(output_obj); (*output_idx)++; } @@ -379,24 +375,24 @@ bool TbeKernelJsonCreator::GenTbeAttrJson(const std::shared_ptr &anf_no for (const auto &attr_ptr : attrs_ptr) { std::string attr_name = attr_ptr->name(); nlohmann::json attr_obj; - attr_obj["name"] = attr_name; - if (op_name == "LayerNorm" && attr_obj["name"] == "epsilon" && creater_type_ == OP_SELECT_FORMAT) { + attr_obj[kJName] = attr_name; + if (op_name == parallel::LAYER_NORM && attr_obj[kJName] == "epsilon" && creater_type_ == OP_SELECT_FORMAT) { continue; } if (primitive->GetAttr(attr_name) != nullptr) { auto value = primitive->GetAttr(attr_name); std::string type = attr_ptr->type(); ParseAttrValue(type, value, &attr_obj); - attr_obj["valid"] = true; + attr_obj[kJValid] = true; } else { if (op_info->impl_path().empty()) { - attr_obj["valid"] = false; + attr_obj[kJValid] = false; } else { - if (attr_ptr->param_type() == "required" && creater_type_ == SINGLE_BUILD) { + if (attr_ptr->param_type() == kParamRequred && creater_type_ == SINGLE_BUILD) { MS_LOG(EXCEPTION) << "op name: " << op_info->op_name() << " attr: " << attr_name << " is required, but not set."; } else { - attr_obj["valid"] = false; + attr_obj[kJValid] = false; } } } @@ -409,53 +405,134 @@ void TbeKernelJsonCreator::ParseAttrValue(const std::string &type, const mindspo nlohmann::json *attr_obj) { MS_EXCEPTION_IF_NULL(value); MS_EXCEPTION_IF_NULL(attr_obj); - if (type == "int") { + if (type == kVTypeInt) { auto attr_value = GetValue(value); - (*attr_obj)["value"] = attr_value; - } else if (type == "str") { + (*attr_obj)[kJValue] = attr_value; + } else if (type == kVTypeStr) { auto attr_value = GetValue(value); if (attr_value == kOpFormat_FRAC_Z) { attr_value = kOpFormat_FRACTAL_Z; } - (*attr_obj)["value"] = attr_value; - } else if (type == "bool") { + (*attr_obj)[kJValue] = attr_value; + } else if (type == kVTypeBool) { auto attr_value = GetValue(value); - (*attr_obj)["value"] = attr_value; - } else if (type == 
"float") { + (*attr_obj)[kJValue] = attr_value; + } else if (type == kVTypeFloat) { auto attr_value = GetValue(value); - (*attr_obj)["value"] = attr_value; - } else if (type == "listInt") { + (*attr_obj)[kJValue] = attr_value; + } else if (type == kVTypeListInt) { std::vector attr_value; auto value_type = value->type(); MS_EXCEPTION_IF_NULL(value_type); auto value_type_str = value_type->ToString(); - if (value_type_str == "Int32") { + if (value_type_str == kVTypeInt32) { int data = GetValue(value); attr_value.push_back(data); } else { attr_value = GetValue>(value); } - (*attr_obj)["value"] = attr_value; - } else if (type == "listFloat") { + (*attr_obj)[kJValue] = attr_value; + } else if (type == kVTypeListFloat) { std::vector attr_value; auto value_type = value->type(); MS_EXCEPTION_IF_NULL(value_type); auto value_type_str = value_type->ToString(); - if (value_type_str == "float") { + if (value_type_str == kVTypeFloat) { auto data = GetValue(value); attr_value.push_back(data); } else { attr_value = GetValue>(value); } - (*attr_obj)["value"] = attr_value; - } else if (type == "listListInt") { + (*attr_obj)[kJValue] = attr_value; + } else if (type == kVTypeListUInt64) { + auto attr_value = GetValue>(value); + (*attr_obj)[kJValue] = attr_value; + } else if (type == kVTypeListListInt) { auto attr_value = GetValue>>(value); - (*attr_obj)["value"] = attr_value; + (*attr_obj)[kJValue] = attr_value; } else { MS_LOG(EXCEPTION) << "type: " << type << "not support"; } } +std::vector TbeKernelJsonCreator::GetDeviceInputShape(const AnfNodePtr &anf_node, size_t real_index) const { + MS_EXCEPTION_IF_NULL(anf_node); + std::vector shape; + if (creater_type_ == OP_SELECT_FORMAT || creater_type_ == CHECK_SUPPORTED) { + shape = AnfAlgo::GetPrevNodeOutputInferShape(anf_node, real_index); + } else { + shape = AnfAlgo::GetInputDeviceShape(anf_node, real_index); + } + if (shape.empty()) { + shape.emplace_back(1); + } + return shape; +} + +std::string 
TbeKernelJsonCreator::GetDeviceInputType(const AnfNodePtr &anf_node, size_t real_index) const { + MS_EXCEPTION_IF_NULL(anf_node); + TypeId type_id; + if (creater_type_ == OP_SELECT_FORMAT) { + type_id = AnfAlgo::GetPrevNodeOutputInferDataType(anf_node, real_index); + } else { + type_id = AnfAlgo::GetInputDeviceDataType(anf_node, real_index); + } + return tbe::TypeIdToString(type_id); +} + +std::string TbeKernelJsonCreator::GetDeviceInputFormat(const AnfNodePtr &anf_node, size_t real_index) const { + MS_EXCEPTION_IF_NULL(anf_node); + std::string format = kOpFormat_NCHW; + if (creater_type_ != OP_SELECT_FORMAT && creater_type_ != CHECK_SUPPORTED) { + format = AnfAlgo::GetInputFormat(anf_node, real_index); + if (format == kOpFormat_FRAC_Z) { + format = kOpFormat_FRACTAL_Z; + } else if (format == kOpFormat_DEFAULT) { + format = kOpFormat_NCHW; + } + } + return format; +} + +std::vector TbeKernelJsonCreator::GetDeviceOutputShape(const AnfNodePtr &anf_node, size_t real_index) const { + MS_EXCEPTION_IF_NULL(anf_node); + std::vector shape; + if (creater_type_ == OP_SELECT_FORMAT || creater_type_ == CHECK_SUPPORTED) { + shape = AnfAlgo::GetOutputInferShape(anf_node, real_index); + } else { + shape = AnfAlgo::GetOutputDeviceShape(anf_node, real_index); + } + if (shape.empty()) { + shape.emplace_back(1); + } + return shape; +} + +std::string TbeKernelJsonCreator::GetDeviceOutputType(const AnfNodePtr &anf_node, size_t real_index) const { + MS_EXCEPTION_IF_NULL(anf_node); + TypeId type_id; + if (creater_type_ == OP_SELECT_FORMAT) { + type_id = AnfAlgo::GetOutputInferDataType(anf_node, real_index); + } else { + type_id = AnfAlgo::GetOutputDeviceDataType(anf_node, real_index); + } + return tbe::TypeIdToString(type_id); +} + +std::string TbeKernelJsonCreator::GetDeviceOutputFormat(const AnfNodePtr &anf_node, size_t real_index) const { + MS_EXCEPTION_IF_NULL(anf_node); + std::string format = kOpFormat_NCHW; + if (creater_type_ != OP_SELECT_FORMAT && creater_type_ != 
CHECK_SUPPORTED) { + format = AnfAlgo::GetOutputFormat(anf_node, real_index); + if (format == kOpFormat_FRAC_Z) { + format = kOpFormat_FRACTAL_Z; + } else if (format == kOpFormat_DEFAULT) { + format = kOpFormat_NCHW; + } + } + return format; +} + bool TbeKernelBuild::GetIOSize(const nlohmann::json &kernel_json, std::vector *input_size_list, std::vector *output_size_list) { if (input_size_list == nullptr || output_size_list == nullptr) { @@ -464,35 +541,35 @@ bool TbeKernelBuild::GetIOSize(const nlohmann::json &kernel_json, std::vectorclear(); output_size_list->clear(); - for (size_t i = 0; i < kernel_json["op_info"]["inputs"].size(); i++) { - for (size_t m = 0; m < kernel_json["op_info"]["inputs"][i].size(); m++) { + for (size_t i = 0; i < kernel_json[kJOpInfo][kJInputs].size(); i++) { + for (size_t m = 0; m < kernel_json[kJOpInfo][kJInputs][i].size(); m++) { size_t size_i = 1; - if (kernel_json["op_info"]["inputs"][i][m]["valid"] == false) { - std::string input_name = kernel_json["op_info"]["inputs"][i][m]["name"]; + if (kernel_json[kJOpInfo][kJInputs][i][m][kJValid] == false) { + std::string input_name = kernel_json[kJOpInfo][kJInputs][i][m][kJName]; MS_LOG(INFO) << "Input name:" << input_name << "is optional, valid is false."; continue; } - for (const auto &j : kernel_json["op_info"]["inputs"][i][m]["shape"]) { + for (const auto &j : kernel_json[kJOpInfo][kJInputs][i][m][kJShape]) { size_i *= static_cast(j); } - std::string dtype = kernel_json["op_info"]["inputs"][i][m]["dtype"]; + std::string dtype = kernel_json[kJOpInfo][kJInputs][i][m][kJDtype]; size_t nbyte = tbe::GetDtypeNbyte(dtype); size_i *= nbyte; input_size_list->push_back(size_i); } } - for (size_t i = 0; i < kernel_json["op_info"]["outputs"].size(); i++) { - for (size_t m = 0; m < kernel_json["op_info"]["outputs"][i].size(); m++) { + for (size_t i = 0; i < kernel_json[kJOpInfo][kJOutputs].size(); i++) { + for (size_t m = 0; m < kernel_json[kJOpInfo][kJOutputs][i].size(); m++) { size_t size_i = 1; - 
if (kernel_json["op_info"]["outputs"][i][m]["valid"] == false) { - std::string output_name = kernel_json["op_info"]["outputs"][i][m]["name"]; + if (kernel_json[kJOpInfo][kJOutputs][i][m][kJValid] == false) { + std::string output_name = kernel_json[kJOpInfo][kJOutputs][i][m][kJName]; MS_LOG(INFO) << "Output name:" << output_name << " is optional, valid is false."; continue; } - for (const auto &j : kernel_json["op_info"]["outputs"][i][m]["shape"]) { + for (const auto &j : kernel_json[kJOpInfo][kJOutputs][i][m][kJShape]) { size_i *= static_cast(j); } - std::string dtype = kernel_json["op_info"]["outputs"][i][m]["dtype"]; + std::string dtype = kernel_json[kJOpInfo][kJOutputs][i][m][kJDtype]; size_t nbyte = tbe::GetDtypeNbyte(dtype); size_i *= nbyte; output_size_list->push_back(size_i); @@ -501,9 +578,9 @@ bool TbeKernelBuild::GetIOSize(const nlohmann::json &kernel_json, std::vector &input_nodes, - const vector &compute_nodes, nlohmann::json *fusion_str, - std::string *fusion_kernel) { +bool TbeKernelBuild::GenFusionScopeJson(const std::vector &input_nodes, + const std::vector &compute_nodes, + nlohmann::json *fusion_str, std::string *fusion_kernel) { MS_EXCEPTION_IF_NULL(fusion_str); MS_EXCEPTION_IF_NULL(fusion_kernel); // get input layer info @@ -513,7 +590,7 @@ bool TbeKernelBuild::GenFusionScopeJson(const vector &inp return false; } // gen fusion scopre_op jsom - vector compute_list; + std::vector compute_list; (*fusion_kernel) = kFusionKernelNamePrfix; // index: fusion build option input record, next one from 0 static size_t index = 0; @@ -526,7 +603,7 @@ bool TbeKernelBuild::GenFusionScopeJson(const vector &inp } index = 0; // gen data input json - vector data_list; + std::vector data_list; for (const auto &layer : input_layers) { for (const auto &data_input : layer) { nlohmann::json data_str; @@ -549,51 +626,51 @@ void TbeKernelBuild::GenDescJson(const std::shared_ptr &anf_ if (node_out_idx > 0) { output_desc_name = output_desc_name + "_" + 
std::to_string(node_out_idx); } - (*output_desc)["name"] = NormalizeFullScopeName(output_desc_name); + (*output_desc)[kJName] = NormalizeFullScopeName(output_desc_name); auto type_id = AnfAlgo::GetOutputDeviceDataType(anf_node, node_out_idx); - (*output_desc)["data_type"] = tbe::TypeIdToString(type_id); + (*output_desc)[kJDataType] = tbe::TypeIdToString(type_id); auto ori_shape = AnfAlgo::GetOutputInferShape(anf_node, node_out_idx); if (ori_shape.empty()) { ori_shape.emplace_back(1); } - (*output_desc)["ori_shape"] = ori_shape; + (*output_desc)[kJOriShape] = ori_shape; auto shape = AnfAlgo::GetOutputDeviceShape(anf_node, node_out_idx); if (shape.empty()) { shape.emplace_back(1); } - (*output_desc)["shape"] = shape; + (*output_desc)[kJShape] = shape; auto format = AnfAlgo::GetOutputFormat(anf_node, node_out_idx); if (format == kOpFormat_DEFAULT) { format = ori_shape.size() == 4 ? kOpFormat_NCHW : kOpFormat_ND; } - (*output_desc)["format"] = format; - (*output_desc)["ori_format"] = kOpFormat_NCHW; - (*output_desc)["output_index"] = desc_output_idx; + (*output_desc)[kJFormat] = format; + (*output_desc)[kJOriFormat] = kOpFormat_NCHW; + (*output_desc)[kJOutputIndex] = desc_output_idx; if (fusion_data_type == kFusionAddN && format == kOpFormat_NC1HWC0) { std::vector spec_shape = {}; spec_shape.emplace_back(shape[0]); spec_shape.emplace_back(shape[1]); spec_shape.emplace_back(shape[2] * shape[3]); spec_shape.emplace_back(shape[4]); - (*output_desc)["shape"] = spec_shape; - } else if (fusion_data_type == kFusionReLUGradV2 && (*output_desc)["data_type"] == "uint8") { + (*output_desc)[kJShape] = spec_shape; + } else if (fusion_data_type == kFusionReLUGradV2) { std::vector spec_shape = {}; spec_shape.emplace_back(shape[0]); spec_shape.emplace_back(shape[1]); spec_shape.emplace_back(shape[2] * shape[3]); spec_shape.emplace_back(16); - (*output_desc)["shape"] = spec_shape; - (*output_desc)["data_type"] = "bool"; + (*output_desc)[kJShape] = spec_shape; + 
(*output_desc)[kJDataType] = kVTypeBool; } } -void TbeKernelBuild::GenReusedOutputDesc(const shared_ptr &anf_node, size_t index, +void TbeKernelBuild::GenReusedOutputDesc(const std::shared_ptr &anf_node, size_t index, size_t output_index, nlohmann::json *output_desc) { std::string output_desc_name = anf_node->fullname_with_scope() + "_" + std::to_string(index); - (*output_desc)["name"] = NormalizeFullScopeName(output_desc_name); - (*output_desc)["output_index"] = output_index; + (*output_desc)[kJName] = NormalizeFullScopeName(output_desc_name); + (*output_desc)[kJOutputIndex] = output_index; std::vector shape; - (*output_desc)["shape"] = shape; + (*output_desc)[kJShape] = shape; } bool TbeKernelBuild::GetSpecInputLayers(const std::string &op_name, @@ -618,6 +695,8 @@ bool TbeKernelBuild::GetInputLayers(const std::vector &in const std::vector &compute_nodes, std::vector> *input_layers, std::map *spec_data_input) { + MS_EXCEPTION_IF_NULL(input_layers); + MS_EXCEPTION_IF_NULL(spec_data_input); auto result = std::find_if(compute_nodes.begin(), compute_nodes.end(), [](const auto &it) { auto op_name = AnfAlgo::GetCNodeName(it); return op_name == kConv2DBackpropInputOpName; @@ -673,10 +752,10 @@ bool TbeKernelBuild::GenFusionDataInputJson(const std::shared_ptrfullname_with_scope() << " index:" << real_idx; - // "output_desc" + // kJOutputDesc nlohmann::json output_desc; GenDescJson(real_node, real_idx, real_idx, &output_desc, fusion_data_type); output_desc_list.push_back(output_desc); - (*data_str)["name"] = NormalizeFullScopeName(real_node->fullname_with_scope()); + (*data_str)[kJName] = NormalizeFullScopeName(real_node->fullname_with_scope()); } - (*data_str)["output_desc"] = output_desc_list; - (*data_str)["type"] = "Data"; + (*data_str)[kJOutputDesc] = output_desc_list; + (*data_str)[kJtype] = "Data"; return true; } @@ -726,6 +805,7 @@ bool TbeKernelBuild::IsDynamicInput(const mindspore::CNodePtr &cnode) { } size_t TbeKernelBuild::GetOptionalInput(const 
mindspore::CNodePtr &cnode, bool is_dynamic_input) { + MS_EXCEPTION_IF_NULL(cnode); if (is_dynamic_input) { return 0; } @@ -740,8 +820,8 @@ size_t TbeKernelBuild::GetOptionalInput(const mindspore::CNodePtr &cnode, bool i } std::string TbeKernelBuild::GetRealOpType(const std::string &origin_type) { - static std::map buffer_fussion_op_map = {{"DepthwiseConv2dNative", "DepthwiseConv2D"}, - {"TensorAdd", "Add"}}; + static std::map buffer_fussion_op_map = { + {parallel::DEPTHWISE_CONV2D_NATIVE, parallel::DEPTHWISE_CONV2D}, {parallel::TENSOR_ADD, parallel::ADD}}; string result = origin_type; auto iter = buffer_fussion_op_map.find(origin_type); if (iter != buffer_fussion_op_map.end()) { @@ -767,7 +847,7 @@ bool TbeKernelBuild::GenFusionComputeInputJson(const mindspore::CNodePtr &cnode, GenDescJson(real_node, real_idx, real_idx, &input_desc); if (is_dynamic_input) { MS_LOG(INFO) << "node has dynamic input."; - input_desc["dyn_index"] = (i - 1); + input_desc[kJDynIndex] = (i - 1); } input_desc_list_tmp.emplace_back(input_desc); } @@ -776,7 +856,7 @@ bool TbeKernelBuild::GenFusionComputeInputJson(const mindspore::CNodePtr &cnode, MS_LOG(INFO) << "node has optional input."; for (size_t i = 0; i < optional_num; ++i) { nlohmann::json optional_input_desc; - optional_input_desc["name"] = std::string(kOptional) + std::to_string(*index); + optional_input_desc[kJName] = std::string(kOptional) + std::to_string(*index); (*index)++; (*layer_iter)->emplace_back(nullptr); input_desc_list_tmp.emplace_back(optional_input_desc); @@ -802,6 +882,7 @@ std::vector TbeKernelBuild::GetDescOutputIndex(const std::vector &o bool TbeKernelBuild::GenFusionComputeOutputJson(const mindspore::CNodePtr &cnode, std::vector *output_desc_list) { + MS_EXCEPTION_IF_NULL(output_desc_list); auto output_size = AnfAlgo::GetOutputTensorNum(cnode); if (AnfAlgo::HasNodeAttr(kAttrOutputUsedNum, cnode)) { auto output_used_nums = AnfAlgo::GetNodeAttr>(cnode, kAttrOutputUsedNum); @@ -844,22 +925,22 @@ bool 
TbeKernelBuild::GenFusionComputeJson(const mindspore::AnfNodePtr &compute_n // gen input desc std::vector input_desc_list; (void)GenFusionComputeInputJson(cnode, layer_iter, &input_desc_list, index); - (*compute_op_str)["input_desc"] = input_desc_list; + (*compute_op_str)[kJInputDesc] = input_desc_list; // gen output desc std::vector output_desc_list; if (!GenFusionComputeOutputJson(cnode, &output_desc_list)) { MS_LOG(INFO) << "Fusion Error: gen fusion output desc faild, node full name: " << cnode->fullname_with_scope(); return false; } - (*compute_op_str)["output_desc"] = output_desc_list; + (*compute_op_str)[kJOutputDesc] = output_desc_list; // gen others auto origin_type = AnfAlgo::GetCNodeName(cnode); // replace special op type for buffer fusion op auto type = GetRealOpType(origin_type); - (*compute_op_str)["type"] = type; + (*compute_op_str)[kJtype] = type; tbe::TbeAdapter::NormalizeFuncName(&type); - (*compute_op_str)["func_name"] = type; - (*compute_op_str)["name"] = NormalizeFullScopeName(cnode->fullname_with_scope()); + (*compute_op_str)[kJFuncName] = type; + (*compute_op_str)[kJName] = NormalizeFullScopeName(cnode->fullname_with_scope()); (void)(*fusion_kernel_name).append("_"); (void)(*fusion_kernel_name).append(type); return true; @@ -867,16 +948,17 @@ bool TbeKernelBuild::GenFusionComputeJson(const mindspore::AnfNodePtr &compute_n size_t TbeKernelBuild::GetIOSizeImpl(const nlohmann::json &desc) { size_t ret = 1; - for (const auto &shape_item : desc["shape"]) { + for (const auto &shape_item : desc[kJShape]) { ret *= static_cast(shape_item); } - std::string data_type = desc["data_type"]; + std::string data_type = desc[kJDataType]; size_t nbyte = tbe::GetDtypeNbyte(data_type); ret *= nbyte; return ret; } -bool TbeKernelBuild::GetIOSize(const nlohmann::json &fusion_op_list, const vector &output_nodes, +bool TbeKernelBuild::GetIOSize(const nlohmann::json &fusion_op_list, + const std::vector &output_nodes, std::vector *input_size_list, std::vector 
*output_size_list) { MS_EXCEPTION_IF_NULL(input_size_list); MS_EXCEPTION_IF_NULL(output_size_list); @@ -884,15 +966,15 @@ bool TbeKernelBuild::GetIOSize(const nlohmann::json &fusion_op_list, const vecto output_size_list->clear(); for (const auto &op : fusion_op_list) { - if (op["type"] == "Data") { - const auto &data_output_desc = op["output_desc"]; + if (op[kJtype] == "Data") { + const auto &data_output_desc = op[kJOutputDesc]; for (const auto &data_output : data_output_desc) { - if (data_output["shape"] == "NULL") { + if (data_output[kJShape] == "NULL") { break; } auto ret = GetIOSizeImpl(data_output); input_size_list->push_back(ret); - MS_LOG(INFO) << "Fusion info: scope input name: " << op["name"] << ", size: " << ret; + MS_LOG(INFO) << "Fusion info: scope input name: " << op[kJName] << ", size: " << ret; } } } @@ -904,13 +986,13 @@ bool TbeKernelBuild::GetIOSize(const nlohmann::json &fusion_op_list, const vecto auto normal_name = NormalizeFullScopeName(real_node->fullname_with_scope()); MS_LOG(INFO) << "Fusion info: real node name: " << normal_name << ", real output index: " << real_idx; for (const auto &op : fusion_op_list) { - if (op["name"] == normal_name) { - auto op_output_desces = op["output_desc"]; + if (op[kJName] == normal_name) { + auto op_output_desces = op[kJOutputDesc]; if (output_node != real_node) { // tuple_get item MS_LOG(INFO) << "output is a tuple getitem node"; auto output_desc = op_output_desces[real_idx]; - if (output_desc["shape"].empty()) { + if (output_desc[kJShape].empty()) { MS_LOG(INFO) << "Fusion error: output_desc's shape is empty. 
real_index " << real_idx; return false; } @@ -919,7 +1001,7 @@ bool TbeKernelBuild::GetIOSize(const nlohmann::json &fusion_op_list, const vecto MS_LOG(INFO) << "Fusion info: scope output index: " << real_idx << ", size: " << ret; } else { for (const auto &output_desc : op_output_desces) { - if (output_desc["shape"].empty()) { + if (output_desc[kJShape].empty()) { MS_LOG(INFO) << "Fusion info: output_desc's shape is empty, may be this node output"; continue; } diff --git a/mindspore/ccsrc/kernel/tbe/tbe_kernel_build.h b/mindspore/ccsrc/kernel/tbe/tbe_kernel_build.h index 2ddab34d49..eef02efa87 100644 --- a/mindspore/ccsrc/kernel/tbe/tbe_kernel_build.h +++ b/mindspore/ccsrc/kernel/tbe/tbe_kernel_build.h @@ -93,7 +93,7 @@ class TbeKernelJsonCreator { nlohmann::json *outputs_json); bool GenTbeAttrJson(const std::shared_ptr &anf_node, const std::shared_ptr &op_info, nlohmann::json *attrs_json); - void ParseAttrValue(const std::string &type, const ValuePtr &value, nlohmann::json *attr_obj); + static void ParseAttrValue(const std::string &type, const ValuePtr &value, nlohmann::json *attr_obj); bool GenInputDescJson(const std::shared_ptr &anf_node, size_t real_input_index, bool value, const std::shared_ptr &input_ptr, const string &op_input_name, size_t input_i, std::vector *input_list); @@ -105,6 +105,13 @@ class TbeKernelJsonCreator { void GenOutputList(const std::shared_ptr &anf_node, const size_t &output_obj_num, const std::shared_ptr &output_ptr, size_t *output_idx, std::vector *output_list); + std::vector GetDeviceInputShape(const AnfNodePtr &anf_node, size_t real_index) const; + std::string GetDeviceInputType(const AnfNodePtr &anf_node, size_t real_index) const; + std::string GetDeviceInputFormat(const AnfNodePtr &anf_node, size_t real_index) const; + std::vector GetDeviceOutputShape(const AnfNodePtr &anf_node, size_t real_index) const; + std::string GetDeviceOutputType(const AnfNodePtr &anf_node, size_t real_index) const; + std::string GetDeviceOutputFormat(const 
AnfNodePtr &anf_node, size_t real_index) const; + kCreaterType creater_type_; std::string json_name_; std::string json_info_; diff --git a/mindspore/ccsrc/kernel/tbe/tbe_kernel_parallel_build.cc b/mindspore/ccsrc/kernel/tbe/tbe_kernel_parallel_build.cc index 577af45d59..79e5e0e109 100644 --- a/mindspore/ccsrc/kernel/tbe/tbe_kernel_parallel_build.cc +++ b/mindspore/ccsrc/kernel/tbe/tbe_kernel_parallel_build.cc @@ -42,6 +42,41 @@ constexpr auto kStartCompileOp = "start_compile_op"; constexpr auto kWaitOne = "wait_one"; constexpr auto kResetTaskInfo = "reset_task_info"; +bool TbeOpParallelPreBuild(const std::vector &anf_nodes) { + auto build_manger = std::make_shared(); + MS_EXCEPTION_IF_NULL(build_manger); + for (const auto &anf_node : anf_nodes) { + // gen kernel json + MS_EXCEPTION_IF_NULL(anf_node); + nlohmann::json kernel_json; + TbeKernelJsonCreator creator(OP_PRE_COMPILE); + if (!creator.GenTbeSingleKernelJson(anf_node, &kernel_json)) { + MS_LOG(ERROR) << "GenTbeSingleKernelJson failed"; + return false; + } + kernel_json["compile_type"] = "pre_build"; + // op build + auto task_id = build_manger->StartCompileOp(kernel_json); + build_manger->SavePreTaskInfo(task_id, anf_node); + } + while (!build_manger->IsAllPreTaskFinish()) { + int task_id = -1; + char *task_result = nullptr; + char *pre_build_result = nullptr; + auto ret = build_manger->WaitOne(&task_id, &task_result, &pre_build_result); + if (!ret) { + MS_EXCEPTION(ArgumentError) << "Pre Build Failed. 
wait one ret:" << ret << ", task id:" << task_id; + } + + if ((task_result != nullptr) && (strcmp(task_result, "Success") != 0)) { + MS_EXCEPTION(ArgumentError) << "task pre compile Failed, task id:" << task_id << ", cause:" << task_result; + } + + build_manger->PreTaskFinishProcess(task_id, pre_build_result); + } + return true; +} + bool TbeOpParallelBuild(std::vector anf_nodes) { auto build_manger = std::make_shared(); MS_EXCEPTION_IF_NULL(build_manger); @@ -82,7 +117,8 @@ bool TbeOpParallelBuild(std::vector anf_nodes) { while (!build_manger->IsAllTaskFinish()) { int task_id = -1; char *task_result = nullptr; - auto ret = build_manger->WaitOne(&task_id, &task_result); + char *pre_build_result = nullptr; + auto ret = build_manger->WaitOne(&task_id, &task_result, &pre_build_result); if (!ret) { MS_EXCEPTION(ArgumentError) << "Build Failed. wait one ret:" << ret << ", task id:" << task_id; } @@ -116,7 +152,7 @@ int32_t ParallelBuildManager::StartCompileOp(const nlohmann::json &kernel_json) return task_id; } -bool ParallelBuildManager::WaitOne(int *task_id, char **task_result) const { +bool ParallelBuildManager::WaitOne(int *task_id, char **task_result, char **pre_build_result) const { MS_LOG(INFO) << "wait task start."; MS_EXCEPTION_IF_NULL(task_id); MS_EXCEPTION_IF_NULL(task_result); @@ -128,10 +164,15 @@ bool ParallelBuildManager::WaitOne(int *task_id, char **task_result) const { MS_EXCEPTION(ArgumentError) << "Failed to call function wait_one"; return false; } - (void)PyArg_ParseTuple(pRes, "is", task_id, task_result); + (void)PyArg_ParseTuple(pRes, "iss", task_id, task_result, pre_build_result); return true; } +void ParallelBuildManager::SavePreTaskInfo(int32_t task_id, const mindspore::AnfNodePtr &anf_node) { + MS_LOG(INFO) << "SavePreTaskInfo, task id: " << task_id; + pre_task_map_[task_id] = anf_node; +} + void ParallelBuildManager::SaveTaskInfo(int32_t task_id, const mindspore::AnfNodePtr &anf_node, const std::string &json_name, const std::vector 
&input_size_list, const std::vector &output_size_list, int32_t scope_id) { @@ -150,11 +191,42 @@ void ParallelBuildManager::SaveTaskInfo(int32_t task_id, const mindspore::AnfNod task_map_[task_id] = task_info; } +bool ParallelBuildManager::IsAllPreTaskFinish() const { + MS_LOG(INFO) << "wait pre build process task_num: " << pre_task_map_.size(); + return pre_task_map_.empty(); +} + bool ParallelBuildManager::IsAllTaskFinish() const { MS_LOG(INFO) << "wait process task_num: " << task_map_.size(); return task_map_.empty(); } +void ParallelBuildManager::PreTaskFinishProcess(int32_t task_id, const std::string &pre_build_result) { + auto task_iter = pre_task_map_.find(task_id); + if (task_iter == pre_task_map_.end()) { + MS_EXCEPTION(ArgumentError) << "can find pre task_id:" << task_id; + } + auto node = task_iter->second; + auto builder = + std::make_shared(AnfAlgo::GetSelectKernelBuildInfo(node)); + std::string start_flag = "fusion_pattern_start"; + std::string end_flag = "fusion_pattern_end"; + int start = pre_build_result.find(start_flag); + int end = pre_build_result.find(end_flag); + if (start != -1 && end != -1 && end >= start) { + std::string result = pre_build_result.substr(start + start_flag.size(), end - start - start_flag.size()); + if (result == "") { + (void)pre_task_map_.erase(task_iter); + return; + } + transform(result.begin(), result.end(), result.begin(), ::toupper); + FusionType fusion_type = tbe::GetFusionType(result); + builder->SetFusionType(fusion_type); + AnfAlgo::SetSelectKernelBuildInfo(builder->Build(), node.get()); + } + (void)pre_task_map_.erase(task_iter); +} + std::pair ParallelBuildManager::TaskFinishProcess(int32_t task_id, bool set_kernel_mod) { auto task_iter = task_map_.find(task_id); if (task_iter == task_map_.end()) { @@ -167,7 +239,7 @@ std::pair ParallelBuildManager::TaskFinishProcess(int32_t if (set_kernel_mod) { MS_EXCEPTION(ArgumentError) << "build kernel name:" << task_iter->second.json_name << " failed."; } else { - 
MS_LOG(DEBUG) << "fusion build kernel name:" << task_iter->second.json_name << "failed."; + MS_LOG(INFO) << "fusion build kernel name:" << task_iter->second.json_name << "failed."; auto ret = std::make_pair(task_iter->second.scope_id, nullptr); (void)task_map_.erase(task_iter); return ret; @@ -177,7 +249,7 @@ std::pair ParallelBuildManager::TaskFinishProcess(int32_t task_iter->second.output_size_list, kernel_pack); MS_EXCEPTION_IF_NULL(kernel_mod); if (set_kernel_mod) { - AnfAlgo ::SetKernelMod(kernel_mod, task_iter->second.node); + AnfAlgo::SetKernelMod(kernel_mod, task_iter->second.node); } auto ret = std::make_pair(task_iter->second.scope_id, kernel_mod); (void)task_map_.erase(task_iter); @@ -202,7 +274,7 @@ bool ParallelBuildManager::GenSameOpKernelMod() const { bool ret = SearchInCache(task_info.json_name, task_info.processor, task_info.input_size_list, task_info.output_size_list, task_info.node); if (!ret) { - MS_LOG(DEBUG) << "can't find " << task_info.json_name << " in cache."; + MS_LOG(INFO) << "can't find " << task_info.json_name << " in cache."; return false; } } diff --git a/mindspore/ccsrc/kernel/tbe/tbe_kernel_parallel_build.h b/mindspore/ccsrc/kernel/tbe/tbe_kernel_parallel_build.h index 776aa0b1fc..c900baf036 100644 --- a/mindspore/ccsrc/kernel/tbe/tbe_kernel_parallel_build.h +++ b/mindspore/ccsrc/kernel/tbe/tbe_kernel_parallel_build.h @@ -26,6 +26,7 @@ #include namespace mindspore { namespace kernel { +bool TbeOpParallelPreBuild(const std::vector &anf_nodes); bool TbeOpParallelBuild(std::vector anf_nodes); struct KernelBuildTaskInfo { @@ -42,6 +43,7 @@ class ParallelBuildManager { ParallelBuildManager(); ~ParallelBuildManager(); int32_t StartCompileOp(const nlohmann::json &kernel_json) const; + void SavePreTaskInfo(int32_t task_id, const AnfNodePtr &anf_node); void SaveTaskInfo(int32_t task_id, const AnfNodePtr &anf_node, const std::string &json_name, const std::vector &input_size_list, const std::vector &output_size_list, int32_t scope_id = 0); @@ 
-52,8 +54,10 @@ class ParallelBuildManager { const std::vector &input_size_list, const std::vector &output_size_list, AnfNode *node) const; - bool WaitOne(int *task_id, char **task_result) const; + bool WaitOne(int *task_id, char **task_result, char **pre_build_result) const; + bool IsAllPreTaskFinish() const; bool IsAllTaskFinish() const; + void PreTaskFinishProcess(int32_t task_id, const std::string &pre_build_result); std::pair TaskFinishProcess(int32_t task_id, bool set_kernel_mod = true); KernelModPtr GenKernelMod(const string &json_name, const string &processor, const std::vector &input_size_list, const std::vector &output_size_list, @@ -62,6 +66,7 @@ class ParallelBuildManager { private: PyObject *tbe_parallel_compiler_; + std::map pre_task_map_; std::map task_map_; std::vector same_op_list_; }; diff --git a/mindspore/ccsrc/kernel/tbe/tbe_kernel_select.cc b/mindspore/ccsrc/kernel/tbe/tbe_kernel_select.cc deleted file mode 100644 index aedb0b3eaf..0000000000 --- a/mindspore/ccsrc/kernel/tbe/tbe_kernel_select.cc +++ /dev/null @@ -1,664 +0,0 @@ -/** - * Copyright 2019 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "kernel/tbe/tbe_kernel_select.h" - -#include -#include -#include -#include - -#include "session/anf_runtime_algorithm.h" -#include "kernel/oplib/oplib.h" -#include "kernel/tbe/tbe_kernel_build.h" -#include "nlohmann/json.hpp" -#include "common/utils.h" -#include "utils/context/ms_context.h" -#include "kernel/tbe/tbe_python_funcs.h" -#include "pre_activate/common/helper.h" -#include "kernel/tbe/tbe_convert_utils.h" - -namespace mindspore { -namespace kernel { -constexpr auto kName = "name"; -constexpr auto kDtype = "dtype"; -constexpr auto kFormat = "format"; -constexpr auto kPrefixInput = "input"; -constexpr auto kPrefixOutput = "output"; -const std::map DYNAMIC_FORMAT_MAP = {{"NCHW", "DefaultFormat"}, - {"NHWC", "DefaultFormat"}, - {"ND", "DefaultFormat"}, - {"FRACTAL_Z", "FracZ"}, - {"NDHWC", "DefaultFormat"}}; -static const std::vector CHECK_SUPPORTED_OPTYPE{ - "MatMul", "BatchMatMul", "TopK", "InTopK", "Pack", "GatherNd", "UnsortedSegmentMinD", "UnsortedSegmentProdD", "Cast"}; - -bool CheckSupported(const AnfNodePtr &anf_node, const KernelBuildInfoPtr &select_kernel_build_info) { - MS_EXCEPTION_IF_NULL(anf_node); - MS_EXCEPTION_IF_NULL(select_kernel_build_info); - - std::string op_name = AnfAlgo::GetCNodeName(anf_node); - auto iter = std::find(CHECK_SUPPORTED_OPTYPE.begin(), CHECK_SUPPORTED_OPTYPE.end(), op_name); - if (iter == CHECK_SUPPORTED_OPTYPE.end()) { - MS_LOG(DEBUG) << "Op " << op_name << "this op does not need to check op supported."; - return true; - } - - // replace kernel_info with current kernel info - auto ori_select_kernel_info = AnfAlgo::GetSelectKernelBuildInfo(anf_node); - AnfAlgo::SetSelectKernelBuildInfo(select_kernel_build_info, anf_node.get()); - - nlohmann::json kernel_json; - TbeKernelJsonCreator creator(CHECK_SUPPORTED); - bool ret = creator.GenTbeSingleKernelJson(anf_node, &kernel_json); - if (!ret) { - MS_LOG(DEBUG) << "GenTbeSingleKernelJson failed"; - AnfAlgo::SetSelectKernelBuildInfo(ori_select_kernel_info, 
anf_node.get()); - return false; - } - - ret = TbePythonFuncs::CheckSupported(kernel_json); - AnfAlgo::SetSelectKernelBuildInfo(ori_select_kernel_info, anf_node.get()); - return ret; -} - -bool CheckJsonItemValidity(const nlohmann::json &json_obj, const std::string &key_name, - const std::vector &keys) { - if (!json_obj[key_name].is_object()) { - MS_LOG(DEBUG) << key_name << "is not an object!"; - return false; - } - for (auto key : keys) { - if (json_obj[key_name].find(key) == json_obj[key_name].end()) { - MS_LOG(DEBUG) << "Key" << key << "of " << key_name << " is not found!"; - return false; - } - } - return true; -} - -std::vector SplitStr(const std::string &string, const std::string &sep) { - std::vector result; - size_t start = 0; - size_t index = string.find(sep, start); - std::string substr; - while (index != std::string::npos) { - if (string.size() > start) { - substr = string.substr(start, index - start); - } - (void)substr.erase(0, substr.find_first_not_of(' ')); - (void)substr.erase(substr.find_last_not_of(' ') + 1); - auto iter = DYNAMIC_FORMAT_MAP.find(substr); - if (iter != DYNAMIC_FORMAT_MAP.end()) { - substr = iter->second; - } - result.push_back(substr); - start = index + sep.size(); - index = string.find(sep, start); - } - - if (string.size() > start) { - substr = string.substr(start); - } - (void)substr.erase(0, substr.find_first_not_of(' ')); - (void)substr.erase(substr.find_last_not_of(' ') + 1); - auto iter = DYNAMIC_FORMAT_MAP.find(substr); - if (iter != DYNAMIC_FORMAT_MAP.end()) { - substr = iter->second; - } - result.push_back(substr); - return result; -} - -void ConvertFormatDtype(const std::string &format, const std::string &dtype, const std::shared_ptr &io_info) { - MS_EXCEPTION_IF_NULL(io_info); - std::vector format_vec = SplitStr(format, ","); - std::vector dtype_vec = SplitStr(dtype, ","); - io_info->set_formats(format_vec); - io_info->set_dtypes(dtype_vec); -} - -bool ParseDynamicFormatJson(const std::string &jsonStr, std::vector> 
*const inputs, - std::vector> *const outputs) { - nlohmann::json json_obj = nlohmann::json::parse(jsonStr); - if (!json_obj.is_object()) { - MS_LOG(DEBUG) << "JsonStr is not an object, the jsonStr is:" << jsonStr; - return false; - } - std::vector keys = {kName, kDtype, kFormat}; - for (const auto &item : json_obj.items()) { - std::string key_name; - key_name = item.key(); - if (key_name.empty()) { - MS_LOG(DEBUG) << "Key name is empty!"; - return false; - } - if (!CheckJsonItemValidity(json_obj, key_name, keys)) { - return false; - } - if (key_name.compare(0, strlen(kPrefixInput), kPrefixInput) == 0) { - std::shared_ptr input = std::make_shared(); - MS_EXCEPTION_IF_NULL(input); - input->set_name(json_obj[key_name].at(kName)); - ConvertFormatDtype(json_obj[key_name].at(kFormat), json_obj[key_name].at(kDtype), input); - inputs->emplace_back(input); - } else if (key_name.compare(0, strlen(kPrefixOutput), kPrefixOutput) == 0) { - std::shared_ptr output = std::make_shared(); - MS_EXCEPTION_IF_NULL(output); - output->set_name(json_obj[key_name].at(kName)); - ConvertFormatDtype(json_obj[key_name].at(kFormat), json_obj[key_name].at(kDtype), output); - outputs->emplace_back(output); - } else { - MS_LOG(DEBUG) << "Key name:" << key_name << " is undefined!"; - return false; - } - } - return true; -} - -std::string OpSelectFormat(const std::shared_ptr &anf_node) { - nlohmann::json kernel_json; - std::string res_json_str; - TbeKernelJsonCreator creator(OP_SELECT_FORMAT); - bool ret = creator.GenTbeSingleKernelJson(anf_node, &kernel_json); - if (!ret) { - MS_LOG(DEBUG) << "GenTbeSingleKernelJson failed"; - return res_json_str; - } - res_json_str = TbePythonFuncs::OpSelectFormat(kernel_json); - MS_LOG(INFO) << "Dynamic select foramt response result:" << res_json_str; - return res_json_str; -} - -void SetTidyInputsInfo(const std::shared_ptr &anf_node, - const std::shared_ptr &builder, - const std::vector> &inputs) { - std::vector inputs_type; - std::vector inputs_format; - 
std::vector dyn_input_sizes; - size_t dyn_input_idx = 0; - size_t kernel_info_index = 0; - size_t real_input_num = AnfAlgo::GetInputTensorNum(anf_node); - auto primitive = AnfAlgo::GetCNodePrimitive(anf_node); - MS_EXCEPTION_IF_NULL(primitive); - if (primitive->GetAttr("dyn_input_sizes") != nullptr) { - dyn_input_sizes = GetValue>(primitive->GetAttr("dyn_input_sizes")); - } - for (size_t i = 0; i < inputs.size(); i++) { - MS_EXCEPTION_IF_NULL(inputs[i]); - std::string param_type = inputs[i]->param_type(); - if (i >= real_input_num) { - MS_LOG(INFO) << "Input index: " << i << " is out of real_input_num:" << real_input_num; - continue; - } - auto type_id = AnfAlgo::GetPrevNodeOutputInferDataType(anf_node, i); - auto format = kOpFormat_DEFAULT; - if (param_type == "dynamic") { - if (!dyn_input_sizes.empty()) { - for (int t = 0; t < dyn_input_sizes[dyn_input_idx]; t++) { - kernel_info_index++; - inputs_type.emplace_back(type_id); - inputs_format.emplace_back(format); - } - dyn_input_idx++; - } - } else if (param_type == "required") { - kernel_info_index++; - inputs_type.emplace_back(type_id); - inputs_format.emplace_back(format); - } else { - if (kernel_info_index < real_input_num) { - MS_LOG(INFO) << "Input type is optional, input index is :" << kernel_info_index; - kernel_info_index++; - inputs_type.emplace_back(type_id); - inputs_format.emplace_back(format); - } - } - } - builder->SetInputsDeviceType(inputs_type); - builder->SetInputsFormat(inputs_format); -} - -void SetTidyOutputsInfo(const std::shared_ptr &anf_node, - const std::shared_ptr &builder, - const std::vector> &outputs) { - std::vector outputs_type; - std::vector outputs_format; - auto real_output_num = AnfAlgo::GetOutputTensorNum(anf_node); - size_t output_idx = 0; - for (const auto &output : outputs) { - MS_EXCEPTION_IF_NULL(output); - if (output_idx >= real_output_num) { - continue; - } - size_t output_num = 0; - if (output->param_type() == "dynamic") { - if (outputs.size() > 1) { - 
MS_EXCEPTION(ArgumentError) << "Dynamic output is unsupported multi output!"; - } - output_num = real_output_num; - } else if (output->param_type() == "required") { - output_num = 1; - } else { - if (output_idx < real_output_num) { - MS_LOG(INFO) << "Set output kernel builder info, output type is optional, output index is :" << output_idx; - output_num = 1; - } - } - for (size_t i = 0; i < output_num; i++) { - auto type_id = AnfAlgo::GetOutputInferDataType(anf_node, output_idx); - outputs_type.emplace_back(type_id); - outputs_format.emplace_back(kOpFormat_DEFAULT); - output_idx++; - } - } - builder->SetOutputsDeviceType(outputs_type); - builder->SetOutputsFormat(outputs_format); -} - -void GenTidyKernelBuildInfo(const std::shared_ptr &anf_node, - const std::vector> &inputs, - const std::vector> &outputs) { - auto builder_tmp = std::make_shared(); - builder_tmp->SetKernelType(TBE_KERNEL); - SetTidyInputsInfo(anf_node, builder_tmp, inputs); - SetTidyOutputsInfo(anf_node, builder_tmp, outputs); - AnfAlgo::SetSelectKernelBuildInfo(builder_tmp->Build(), anf_node.get()); -} - -void ReplaceByDynamicFormatDtype(const CNodePtr &kernel_node, const std::shared_ptr &op_info_ptr, - const std::shared_ptr &op_info_new_ptr) { - std::vector> inputs_static = op_info_ptr->inputs_ptr(); - std::vector> outputs_static = op_info_ptr->outputs_ptr(); - std::vector> inputs_dyn; - std::vector> outputs_dyn; - if ((op_info_ptr->imply_type() == kTBE) && (!mindspore::opt::IsNopNode(kernel_node->cast()))) { - // 1. 
create tidy kernelBuildInfo in order to generate json for calling op_select_format - auto anf_node = kernel_node->cast>(); - auto kernel_build_info_ptr = AnfAlgo::GetSelectKernelBuildInfo(anf_node); - GenTidyKernelBuildInfo(kernel_node, inputs_static, outputs_static); - - // 2.get dynamic format from op_impl - std::string res_json_str; - auto context_ptr = MsContext::GetInstance(); - MS_EXCEPTION_IF_NULL(context_ptr); - if (context_ptr->execution_mode() != kPynativeMode) { - res_json_str = OpSelectFormat(kernel_node); - } - if (!res_json_str.empty()) { - (void)ParseDynamicFormatJson(res_json_str, &inputs_dyn, &outputs_dyn); - } - if (inputs_static.size() != inputs_dyn.size()) { - inputs_dyn.clear(); - } - if (outputs_static.size() != outputs_dyn.size()) { - outputs_dyn.clear(); - } - - // 3. resume kernel node's SelectKernelBuildInfo - // As it has been replaced by GenTidyKernelBuildInfo in order to call python func - AnfAlgo::SetSelectKernelBuildInfo(kernel_build_info_ptr, anf_node.get()); - } - // 4.replace by dynamic format and dtype - if (inputs_dyn.empty() && outputs_dyn.empty()) { - MS_LOG(INFO) << "Dynamic select format response is empty, use static register info."; - op_info_new_ptr->set_inputs_ptr(inputs_static); - op_info_new_ptr->set_outputs_ptr(outputs_static); - } else { - MS_LOG(INFO) << "Dynamic select format response successful, use dynamic format."; - for (size_t i = 0; i < inputs_static.size(); i++) { - inputs_dyn[i]->set_param_type(inputs_static[i]->param_type()); - inputs_dyn[i]->set_reshape_type(inputs_static[i]->reshape_type()); - } - for (size_t j = 0; j < outputs_static.size(); j++) { - outputs_dyn[j]->set_param_type(outputs_static[j]->param_type()); - outputs_dyn[j]->set_reshape_type(outputs_static[j]->reshape_type()); - } - op_info_new_ptr->set_inputs_ptr(inputs_dyn); - op_info_new_ptr->set_outputs_ptr(outputs_dyn); - } - - // 5.copy other opinfo to new op_info_new - op_info_new_ptr->set_op_name(op_info_ptr->op_name()); - 
op_info_new_ptr->set_imply_type(op_info_ptr->imply_type()); - op_info_new_ptr->set_fusion_type(op_info_ptr->fusion_type()); -} - -bool StringToAxisVector(const std::string &reshape_type_str, std::vector *reshape_type_vec) { - for (const auto &c : reshape_type_str) { - switch (c) { - case 'N': - reshape_type_vec->push_back(kernel::N); - break; - case 'C': - reshape_type_vec->push_back(kernel::C); - break; - case 'H': - reshape_type_vec->push_back(kernel::H); - break; - case 'W': - reshape_type_vec->push_back(kernel::W); - break; - default: - MS_LOG(ERROR) << "Unknown axis " << c << "in reshape type."; - return false; - } - } - return true; -} - -bool SetKernelBuilderInputInfo(const std::vector> &inputs, size_t real_input_num, - size_t builder_idex, const std::vector &dyn_input_sizes, - const std::shared_ptr &builder) { - MS_EXCEPTION_IF_NULL(builder); - - std::vector inputs_device_type; - std::vector inputs_format; - size_t dyn_input_idx = 0; - size_t kernel_info_index = 0; - MS_EXCEPTION_IF_NULL(inputs[0]); - size_t kernel_info_cnt = inputs[0]->dtypes().size(); - - std::vector> reshape_types; - for (const auto &input : inputs) { - MS_EXCEPTION_IF_NULL(input); - std::string param_type = input->param_type(); - std::vector dtypes = input->dtypes(); - std::vector formats = input->formats(); - if (dtypes.size() != kernel_info_cnt || formats.size() != kernel_info_cnt) { - MS_LOG(ERROR) << "Set input kernel builder info, dtyps size != formats size."; - return false; - } - - std::vector reshape_type; - if (!StringToAxisVector(input->reshape_type(), &reshape_type)) { - return false; - } - - if (param_type == "dynamic") { - if (dyn_input_sizes.empty()) { - MS_LOG(ERROR) << "Set input kernel builder info, dyn_input_sizes's size is 0 when param_type is dynamic"; - return false; - } - - for (int t = 0; t < dyn_input_sizes[dyn_input_idx]; t++) { - kernel_info_index++; - auto type_id = tbe::DtypeToTypeId(dtypes[builder_idex]); - inputs_device_type.push_back(type_id); - 
inputs_format.push_back(formats[builder_idex]); - reshape_types.push_back(reshape_type); - } - dyn_input_idx++; - } else if (param_type == "required") { - kernel_info_index++; - auto type_id = tbe::DtypeToTypeId(dtypes[builder_idex]); - inputs_device_type.push_back(type_id); - inputs_format.push_back(formats[builder_idex]); - reshape_types.push_back(reshape_type); - } else { - if (kernel_info_index < real_input_num) { - MS_LOG(INFO) << "Set input kernel builder info, input type is optional, input index is " << kernel_info_index; - kernel_info_index++; - auto type_id = tbe::DtypeToTypeId(dtypes[builder_idex]); - inputs_device_type.push_back(type_id); - inputs_format.push_back(formats[builder_idex]); - reshape_types.push_back(reshape_type); - } - } - } - - builder->SetInputReshapeType(reshape_types); - builder->SetInputsDeviceType(inputs_device_type); - builder->SetInputsFormat(inputs_format); - return true; -} - -bool SetKernelBuilderOutputInfo(const std::vector> &outputs, size_t builder_idex, - const size_t &real_output_num, - const std::shared_ptr &builder) { - // not now but in the next we need to support dynamic output case - MS_EXCEPTION_IF_NULL(builder); - - size_t output_idx = 0; - std::vector outputs_device_type; - std::vector outputs_format; - MS_EXCEPTION_IF_NULL(outputs[0]); - size_t kernel_info_cnt = outputs[0]->dtypes().size(); - - std::vector> reshape_types; - for (const auto &output : outputs) { - MS_EXCEPTION_IF_NULL(output); - if (output_idx >= real_output_num) { - MS_LOG(WARNING) << "real_output_num: " << real_output_num << ", output_idx: " << output_idx << "is out of limit!"; - continue; - } - std::vector reshape_type; - if (!StringToAxisVector(output->reshape_type(), &reshape_type)) { - return false; - } - - size_t output_num = 0; - if (output->param_type() == "dynamic") { - if (outputs.size() > 1) { - MS_LOG(EXCEPTION) << "Dynamic output is unsupported multi output!"; - } - output_num = real_output_num; - } else if (output->param_type() == 
"required") { - output_num = 1; - } else { - if (output_idx < real_output_num) { - MS_LOG(INFO) << "Set output kernel builder info, output type is optional, output index is " << output_idx; - output_num = 1; - } - } - - for (size_t i = 0; i < output_num; i++) { - std::vector dtypes = output->dtypes(); - std::vector formats = output->formats(); - if (dtypes.size() != kernel_info_cnt || formats.size() != kernel_info_cnt) { - MS_LOG(ERROR) << "Set output kernel builder info, dtyps size != formats size."; - return false; - } - auto type_id = tbe::DtypeToTypeId(dtypes[builder_idex]); - outputs_device_type.push_back(type_id); - outputs_format.push_back(formats[builder_idex]); - reshape_types.push_back(reshape_type); - output_idx++; - } - } - - builder->SetOutputReshapeType(reshape_types); - builder->SetOutputsFormat(outputs_format); - builder->SetOutputsDeviceType(outputs_device_type); - return true; -} - -void SetKernelBuildCommonInfo(const std::shared_ptr &builder, - Processor processor, const std::shared_ptr &op_info_ptr) { - MS_EXCEPTION_IF_NULL(builder); - MS_EXCEPTION_IF_NULL(op_info_ptr); - - builder->SetProcessor(processor); - std::string fusion_type = op_info_ptr->fusion_type(); - if (tbe::GetFusionType(fusion_type) != UNKNOWN_FUSION_TYPE) { - builder->SetFusionType(tbe::GetFusionType(fusion_type)); - } - builder->SetOpPattern(op_info_ptr->op_pattern()); - builder->SetKernelType(TBE_KERNEL); -} - -bool ParseMetadata(const CNodePtr &kernel_node, const std::shared_ptr &op_info_ptr, - std::vector> *const kernel_info_list) { - MS_EXCEPTION_IF_NULL(kernel_node); - MS_EXCEPTION_IF_NULL(kernel_info_list); - size_t real_input_num = AnfAlgo::GetInputTensorNum(kernel_node); - size_t real_output_num = AnfAlgo::GetOutputTensorNum(kernel_node); - std::vector> inputs = op_info_ptr->inputs_ptr(); - std::vector> outputs = op_info_ptr->outputs_ptr(); - std::vector dyn_input_sizes; - auto primitive = AnfAlgo::GetCNodePrimitive(kernel_node); - MS_EXCEPTION_IF_NULL(primitive); - if 
(primitive->GetAttr("dyn_input_sizes") != nullptr) { - dyn_input_sizes = GetValue>(primitive->GetAttr("dyn_input_sizes")); - } - if (!inputs.empty()) { - MS_EXCEPTION_IF_NULL(inputs[0]); - size_t kernel_info_cnt = inputs[0]->dtypes().size(); - for (size_t j = 0; j < kernel_info_cnt; j++) { - auto builder = std::make_shared(); - MS_EXCEPTION_IF_NULL(builder); - SetKernelBuildCommonInfo(builder, Processor::AICORE, op_info_ptr); - - if (!SetKernelBuilderInputInfo(inputs, real_input_num, j, dyn_input_sizes, builder)) { - MS_LOG(ERROR) << "Parse kernel metadata, set inputs kernel builder info failed."; - return false; - } - - if (!outputs.empty()) { - if (!SetKernelBuilderOutputInfo(outputs, j, real_output_num, builder)) { - MS_LOG(ERROR) << "Parse kernel metadata, set outputs kernel builder info failed."; - return false; - } - } - - kernel_info_list->push_back(builder->Build()); - } - } else if (!outputs.empty()) { - MS_EXCEPTION_IF_NULL(outputs[0]); - size_t kernel_info_cnt = outputs[0]->dtypes().size(); - for (size_t j = 0; j < kernel_info_cnt; j++) { - auto builder = std::make_shared(); - MS_EXCEPTION_IF_NULL(builder); - SetKernelBuildCommonInfo(builder, Processor::AICORE, op_info_ptr); - - if (!SetKernelBuilderOutputInfo(outputs, j, real_output_num, builder)) { - MS_LOG(ERROR) << "Parse kernel metadata, set outputs kernel builder info failed."; - return false; - } - - kernel_info_list->push_back(builder->Build()); - } - } - return true; -} - -bool IsShapeMatchFormat(const std::vector &shape, const std::string &format) { - // if format is default, it remarkes support all format - if (kOpFormatList.find(format) == kOpFormatList.end()) { - MS_LOG(EXCEPTION) << "Got the unknown format " << format; - } - if (format == kOpFormat_DEFAULT) { - return true; - } - if (format == kOpFormat_NDHWC && shape.size() != kShape5dDims) { - return false; - } - // if shape size is 0, the shape will be a scalar - if (shape.empty()) { - return true; - } - if (shape.size() > kShape4dDims) 
{ - return false; - } - if (format == kOpFormat_FRAC_NZ && shape.size() < 2) { - return false; - } - return true; -} - -bool IsValidKernelInfo(const std::shared_ptr &kernel_node, const kernel::KernelBuildInfo &kernel_build_info) { - MS_EXCEPTION_IF_NULL(kernel_node); - auto kernel_name = AnfAlgo::GetCNodeName(kernel_node); - const size_t kCAxis = 1; - for (size_t index = 0; index < kernel_build_info.GetOutputNum(); ++index) { - auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, index); - if (kernel_build_info.GetOutputFormat(index) == kOpFormat_FRACTAL_Z_C04) { - if (output_shape.size() != kShape4dDims || output_shape[kCAxis] > 4) { - return false; - } - return false; - } - if (!IsShapeMatchFormat(output_shape, kernel_build_info.GetOutputFormat(index))) { - return false; - } - if (kernel_name == "ReduceMean") { - auto keep_dims = AnfAlgo::GetNodeAttr(kernel_node, kAttrKeepDims); - if (!keep_dims && kernel_build_info.GetOutputFormat(index) != kOpFormat_DEFAULT) { - return false; - } - } - } - for (size_t index = 0; index < kernel_build_info.GetInputNum(); ++index) { - auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, index); - if (!IsShapeMatchFormat(input_shape, kernel_build_info.GetInputFormat(index))) { - return false; - } - if (kernel_build_info.GetInputFormat(index) == kOpFormat_FRACTAL_Z_C04) { - if (input_shape.size() != kShape4dDims || input_shape[kCAxis] > 4) { - return false; - } - return false; - } - if (kernel_name == "ReduceMean") { - auto keep_dims = AnfAlgo::GetNodeAttr(kernel_node, kAttrKeepDims); - if (!keep_dims && kernel_build_info.GetInputFormat(index) != kOpFormat_DEFAULT) { - return false; - } - } - } - if (AnfAlgo::GetCNodeName(kernel_node) == prim::kPrimCast->name()) { - return AnfAlgo::GetOutputInferDataType(kernel_node, 0) == kernel_build_info.GetOutputDeviceType(0) && - AnfAlgo::GetPrevNodeOutputInferDataType(kernel_node, 0) == kernel_build_info.GetInputDeviceType(0); - } - return true; -} - -void 
TbeMetadataInfo(const CNodePtr &kernel_node, std::vector> *kernel_info_list) { - MS_EXCEPTION_IF_NULL(kernel_node); - MS_EXCEPTION_IF_NULL(kernel_info_list); - std::vector> parse_info_list; - - std::string op_name = AnfAlgo::GetCNodeName(kernel_node); - auto op_info_ptr = mindspore::kernel::OpLib::FindOp(op_name, OpImplyType::kTBE); - if (op_info_ptr == nullptr) { - return; - } - // dynamic get op format and dtype and replace opinfo - auto op_info_new_ptr = std::make_shared(); - ReplaceByDynamicFormatDtype(kernel_node, op_info_ptr, op_info_new_ptr); - - if (!ParseMetadata(kernel_node, op_info_new_ptr, &parse_info_list)) { - MS_LOG(INFO) << "Tbe parsed metadata of op[" << op_name << "] failed."; - return; - } - - auto context_ptr = MsContext::GetInstance(); - MS_EXCEPTION_IF_NULL(context_ptr); - for (const auto &parse_info : parse_info_list) { - if (IsValidKernelInfo(kernel_node, *(parse_info))) { - if (CheckSupported(kernel_node, parse_info)) { - kernel_info_list->push_back(parse_info); - } else { - MS_LOG(INFO) << "CheckSupported Failed for TBE op" << op_name << " kernel info."; - } - } - if (kernel_info_list->empty()) { - MS_LOG(DEBUG) << "Tbe dose not have op [" << op_name << "]."; - } - } -} -} // namespace kernel -} // namespace mindspore diff --git a/mindspore/ccsrc/kernel/tbe/tbe_kernel_select/common_utils.h b/mindspore/ccsrc/kernel/tbe/tbe_kernel_select/common_utils.h new file mode 100644 index 0000000000..c07197610e --- /dev/null +++ b/mindspore/ccsrc/kernel/tbe/tbe_kernel_select/common_utils.h @@ -0,0 +1,30 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_CCSRC_KERNEL_TBE_KERNEL_SELECT_COMMON_UTILS_H_ +#define MINDSPORE_CCSRC_KERNEL_TBE_KERNEL_SELECT_COMMON_UTILS_H_ +#include +#include +namespace mindspore { +namespace kernel { +struct SupportFormat { + std::vector> input_format; + std::vector> output_format; +}; +using SupportFormatItem = std::vector; +} // namespace kernel +} // namespace mindspore + +#endif // MINDSPORE_CCSRC_KERNEL_TBE_COMMON_UTILS_H_ diff --git a/mindspore/ccsrc/kernel/tbe/tbe_kernel_select/tbe_kernel_broadcast_selecter.cc b/mindspore/ccsrc/kernel/tbe/tbe_kernel_select/tbe_kernel_broadcast_selecter.cc new file mode 100644 index 0000000000..9d28af3f3f --- /dev/null +++ b/mindspore/ccsrc/kernel/tbe/tbe_kernel_select/tbe_kernel_broadcast_selecter.cc @@ -0,0 +1,319 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "kernel/tbe/tbe_kernel_select/tbe_kernel_broadcast_selecter.h" +#include "utils/utils.h" +#include "session/anf_runtime_algorithm.h" +#include "kernel/tbe/tbe_kernel_select/common_utils.h" + +namespace mindspore { +namespace kernel { +constexpr char kDynInputKey[] = "dyn_input_sizes"; +constexpr size_t kInputIndex_0 = 0; +constexpr size_t kChannelN = 0; +constexpr size_t kChannelC = 1; +constexpr size_t kAlignmented16 = 16; +// 1. all shape no scalar and same +// 2. part scalar : no_scalar (shape size > xxx && alig xxx) +// 3. all no_scalar and not same (broad cast xxx dim) +bool TbeKernelBroadCastSelecter::GetShapeInfo(SupportFormat *support_format) { + MS_EXCEPTION_IF_NULL(support_format); + input_num_ = 0; + output_num_ = 0; + input_shapes_.clear(); + output_shapes_.clear(); + if (AnfAlgo::HasNodeAttr(kDynInputKey, cnode_ptr_)) { + MS_LOG(INFO) << "This broadcast node has dynamic input."; + auto dynamic_size_vec = AnfAlgo::GetNodeAttr>(cnode_ptr_, kDynInputKey); + if (dynamic_size_vec.empty() || dynamic_size_vec[0] < 2) { + MS_LOG(EXCEPTION) << "dynamic attr set error, please check."; + } + auto dynamic_input_shape0_ = AnfAlgo::GetPrevNodeOutputInferShape(cnode_ptr_, kInputIndex_0); + PadScalarShape(&dynamic_input_shape0_); + input_shapes_.emplace_back(dynamic_input_shape0_); + input_num_ = 1; + } else { + input_num_ = AnfAlgo::GetInputTensorNum(cnode_ptr_); + for (size_t i = 0; i < input_num_; ++i) { + auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(cnode_ptr_, i); + PadScalarShape(&input_shape); + input_shapes_.emplace_back(input_shape); + } + } + + output_num_ = AnfAlgo::GetOutputTensorNum(cnode_ptr_); + for (size_t i = 0; i < output_num_; ++i) { + auto output = AnfAlgo::GetOutputInferShape(cnode_ptr_, i); + PadScalarShape(&output); + output_shapes_.emplace_back(output); + } + AssignSupportFormat(kOpFormat_DEFAULT, support_format); + return true; +} + +bool TbeKernelBroadCastSelecter::IsBroadCastSupport5HD(SupportFormat *support_format) 
const { + MS_EXCEPTION_IF_NULL(support_format); + if (IsSameShape()) { + if (!HasScalarInput()) { + AssignSupportFormat(kOpFormat_NC1HWC0, support_format); + return true; + } else { + return false; + } + } + SupportFormatItem input_support_format; + SupportFormatItem output_support_format; + if (HasScalarInput()) { + for (const auto &shape : input_shapes_) { + if (IsScalarShape(shape)) { + input_support_format.emplace_back(kOpFormat_DEFAULT); + } else { + if (!Is4DShape(shape)) { + return false; + } + if (shape[kChannelC] % kAlignmented16 != 0) { + return false; + } + input_support_format.emplace_back(kOpFormat_NC1HWC0); + } + } + } else { + for (const auto &shape : input_shapes_) { + if (!Is4DShape(shape)) { + return false; + } + } + auto shape_tmp = input_shapes_[0]; + auto broadcast_c_axis = std::any_of( + input_shapes_.begin(), input_shapes_.end(), + [&shape_tmp](const std::vector &elem) { return shape_tmp.at(kChannelC) != elem.at(kChannelC); }); + if (broadcast_c_axis) { + MS_LOG(INFO) << "This node broadcast c channel."; + return false; + } + input_support_format.assign(input_num_, kOpFormat_NC1HWC0); + } + GenOutputSupportFormat(kOpFormat_NC1HWC0, &output_support_format); + support_format->input_format.emplace_back(input_support_format); + support_format->output_format.emplace_back(output_support_format); + return true; +} + +bool TbeKernelBroadCastSelecter::IsBroadCastSupportFracZ(SupportFormat *support_format) const { + MS_EXCEPTION_IF_NULL(support_format); + if (IsSameShape()) { + if (!HasScalarInput()) { + AssignSupportFormat(kOpFormat_FRAC_Z, support_format); + return true; + } else { + return false; + } + } + SupportFormatItem input_support_format; + SupportFormatItem output_support_format; + if (HasScalarInput()) { + for (const auto &shape : input_shapes_) { + if (IsScalarShape(shape)) { + input_support_format.emplace_back(kOpFormat_DEFAULT); + } else { + if (!Is4DShape(shape)) { + return false; + } + if (shape[kChannelN] % kAlignmented16 != 0 || 
shape[kChannelC] % kAlignmented16 != 0) { + return false; + } + input_support_format.emplace_back(kOpFormat_FRAC_Z); + } + } + } else { + return false; + } + GenOutputSupportFormat(kOpFormat_FRAC_Z, &output_support_format); + support_format->input_format.emplace_back(input_support_format); + support_format->output_format.emplace_back(output_support_format); + return true; +} +bool TbeKernelBroadCastSelecter::IsBroadCastSupportC1HWNCoC0(SupportFormat *support_format) const { + MS_EXCEPTION_IF_NULL(support_format); + if (IsSameShape()) { + if (!HasScalarInput()) { + AssignSupportFormat(kOpFormat_C1HWNCoC0, support_format); + return true; + } else { + return false; + } + } + SupportFormatItem input_support_format; + SupportFormatItem output_support_format; + if (HasScalarInput()) { + for (const auto &shape : input_shapes_) { + if (IsScalarShape(shape)) { + input_support_format.emplace_back(kOpFormat_DEFAULT); + } else { + if (!Is4DShape(shape)) { + return false; + } + if (shape[kChannelN] % kAlignmented16 != 0) { + return false; + } + input_support_format.emplace_back(kOpFormat_C1HWNCoC0); + } + } + } else { + for (const auto &shape : input_shapes_) { + if (!Is4DShape(shape)) { + return false; + } + } + auto shape_tmp = input_shapes_[0]; + auto broadcast_nc_axis = + std::any_of(input_shapes_.begin(), input_shapes_.end(), [&shape_tmp](const std::vector &elem) { + return (shape_tmp.at(kChannelC) != elem.at(kChannelC) || shape_tmp.at(kChannelN) != elem.at(kChannelN)); + }); + if (broadcast_nc_axis) { + MS_LOG(INFO) << "This node broadcast n || c channel."; + return false; + } + input_support_format.assign(input_num_, kOpFormat_C1HWNCoC0); + } + GenOutputSupportFormat(kOpFormat_C1HWNCoC0, &output_support_format); + support_format->input_format.emplace_back(input_support_format); + support_format->output_format.emplace_back(output_support_format); + return true; +} + +bool TbeKernelBroadCastSelecter::IsBroadCastSupportFracNZ(SupportFormat *support_format) const { + 
MS_EXCEPTION_IF_NULL(support_format); + if (IsSameShape()) { + if (!HasScalarInput()) { + AssignSupportFormat(kOpFormat_FRAC_NZ, support_format); + return true; + } else { + return false; + } + } + SupportFormatItem input_support_format; + SupportFormatItem output_support_format; + if (HasScalarInput()) { + for (const auto &shape : input_shapes_) { + if (IsScalarShape(shape)) { + input_support_format.emplace_back(kOpFormat_DEFAULT); + } else { + if (shape.size() < kShape2dDims) { + return false; + } + if (shape[shape.size() - 1] % kAlignmented16 != 0 || shape[shape.size() - 2] % kAlignmented16 != 0) { + return false; + } + input_support_format.emplace_back(kOpFormat_FRAC_NZ); + } + } + } else { + auto less_2dims = std::any_of(input_shapes_.begin(), input_shapes_.end(), + [](const std::vector &elem) { return elem.size() < kShape2dDims; }); + if (less_2dims) { + MS_LOG(INFO) << "This node dim less 2."; + return false; + } + + auto shape_tmp = input_shapes_[0]; + auto broadcast_last_dim = + std::any_of(input_shapes_.begin(), input_shapes_.end(), [&shape_tmp](const std::vector &elem) { + return (shape_tmp.at(shape_tmp.size() - 1) != elem.at(elem.size() - 1)) || + (shape_tmp.at(shape_tmp.size() - 2) != elem.at(elem.size() - 2)); + }); + if (broadcast_last_dim) { + MS_LOG(INFO) << "This node broadcast last channel."; + return false; + } + + input_support_format.assign(input_num_, kOpFormat_FRAC_NZ); + } + GenOutputSupportFormat(kOpFormat_FRAC_NZ, &output_support_format); + support_format->input_format.emplace_back(input_support_format); + support_format->output_format.emplace_back(output_support_format); + return true; +} + +bool TbeKernelBroadCastSelecter::IsBroadCastSupportNDC1HWC0(SupportFormat *support_format) const { + MS_EXCEPTION_IF_NULL(support_format); + return false; +} + +bool TbeKernelBroadCastSelecter::Is4DShape(const std::vector &shape) const { + return shape.size() == kShape4dDims; +} + +bool TbeKernelBroadCastSelecter::IsSameShape() const { + auto shape = 
input_shapes_.begin(); + for (const auto &item : input_shapes_) { + if (shape->size() != item.size()) { + return false; + } + for (size_t i = 0; i < shape->size(); ++i) { + if (shape->at(i) != item.at(i)) { + return false; + } + } + } + return true; +} + +void TbeKernelBroadCastSelecter::PadScalarShape(std::vector *shape) const { + MS_EXCEPTION_IF_NULL(shape); + if (shape->empty()) { + shape->emplace_back(1); + } +} + +bool TbeKernelBroadCastSelecter::IsScalarShape(const std::vector &shape) const { + return (shape.size() == 1 && shape[0] == 1); +} + +bool TbeKernelBroadCastSelecter::HasScalarInput() const { + bool ret = false; + for (const auto &shape : input_shapes_) { + if (IsScalarShape(shape)) { + ret = true; + break; + } + } + return ret; +} + +void TbeKernelBroadCastSelecter::GenOutputSupportFormat(const std::string &support_format, + SupportFormatItem *output_support_item) const { + MS_EXCEPTION_IF_NULL(output_support_item); + for (const auto &shape : output_shapes_) { + if (IsScalarShape(shape)) { + output_support_item->emplace_back(kOpFormat_DEFAULT); + } else { + output_support_item->emplace_back(support_format); + } + } +} + +void TbeKernelBroadCastSelecter::AssignSupportFormat(const std::string &support_format_str, + mindspore::kernel::SupportFormat *support_format) const { + MS_EXCEPTION_IF_NULL(support_format); + SupportFormatItem input_support_format; + SupportFormatItem output_support_format; + input_support_format.assign(input_num_, support_format_str); + output_support_format.assign(output_num_, support_format_str); + support_format->input_format.emplace_back(input_support_format); + support_format->output_format.emplace_back(output_support_format); +} +} // namespace kernel +} // namespace mindspore diff --git a/mindspore/ccsrc/kernel/tbe/tbe_kernel_select/tbe_kernel_broadcast_selecter.h b/mindspore/ccsrc/kernel/tbe/tbe_kernel_select/tbe_kernel_broadcast_selecter.h new file mode 100644 index 0000000000..af711ddf29 --- /dev/null +++ 
b/mindspore/ccsrc/kernel/tbe/tbe_kernel_select/tbe_kernel_broadcast_selecter.h @@ -0,0 +1,56 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_KERNEL_TBE_KERNEL_BROADCAST_SELECTER_H_ +#define MINDSPORE_CCSRC_KERNEL_TBE_KERNEL_BROADCAST_SELECTER_H_ + +#include +#include +#include +#include "ir/anf.h" +#include "kernel/tbe/tbe_kernel_select/common_utils.h" + +namespace mindspore { +namespace kernel { +class TbeKernelBroadCastSelecter { + public: + explicit TbeKernelBroadCastSelecter(CNodePtr cnode_ptr) : cnode_ptr_(std::move(cnode_ptr)) {} + ~TbeKernelBroadCastSelecter() = default; + bool GetShapeInfo(SupportFormat *support_format); + bool IsBroadCastSupport5HD(SupportFormat *support_format) const; + bool IsBroadCastSupportFracZ(SupportFormat *support_format) const; + bool IsBroadCastSupportC1HWNCoC0(SupportFormat *support_format) const; + bool IsBroadCastSupportFracNZ(SupportFormat *support_format) const; + bool IsBroadCastSupportNDC1HWC0(SupportFormat *support_format) const; + + private: + bool IsSameShape() const; + void PadScalarShape(std::vector *shape) const; + bool Is4DShape(const std::vector &shape) const; + bool IsScalarShape(const std::vector &shape) const; + bool HasScalarInput() const; + void GenOutputSupportFormat(const std::string &support_format, SupportFormatItem *output_support_item) const; + void AssignSupportFormat(const std::string &support_format_str, 
SupportFormat *support_format) const; + // broadcast + CNodePtr cnode_ptr_; + size_t input_num_{}; + size_t output_num_{}; + std::vector> input_shapes_; + std::vector> output_shapes_; +}; +} // namespace kernel +} // namespace mindspore +#endif // MINDSPORE_TBE_KERNEL_BROADCAST_SELECTER_HELPER_H diff --git a/mindspore/ccsrc/kernel/tbe/tbe_kernel_select/tbe_kernel_reduce_selecter.cc b/mindspore/ccsrc/kernel/tbe/tbe_kernel_select/tbe_kernel_reduce_selecter.cc new file mode 100644 index 0000000000..da0466feaa --- /dev/null +++ b/mindspore/ccsrc/kernel/tbe/tbe_kernel_select/tbe_kernel_reduce_selecter.cc @@ -0,0 +1,179 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "kernel/tbe/tbe_kernel_select/tbe_kernel_reduce_selecter.h" +#include +#include +#include "utils/utils.h" +#include "session/anf_runtime_algorithm.h" +#include "kernel/tbe/tbe_kernel_select/common_utils.h" + +namespace mindspore { +namespace kernel { +constexpr char kKeepDims[] = "keep_dims"; +constexpr char kAxis[] = "axis"; +constexpr char kTypeInt32[] = "Int32"; +constexpr size_t kInputIndex_0 = 0; +constexpr size_t kOutputIndex_0 = 0; +constexpr size_t kChannelN = 0; +constexpr size_t kChannelC = 1; +constexpr size_t kReduceNZMinDim = 3; + +bool TbeKernelReduceSelecter::GetShapeInfo(SupportFormat *support_format) { + MS_EXCEPTION_IF_NULL(support_format); + input_shape_.clear(); + output_shape_.clear(); + axis_.clear(); + auto input_num = AnfAlgo::GetInputTensorNum(cnode_ptr_); + auto output_num = AnfAlgo::GetOutputTensorNum(cnode_ptr_); + if (input_num != 1 || output_num != 1) { + MS_LOG(EXCEPTION) << "Reduce operator only support one input/output, input num: " << input_num + << ", output num: " << output_num; + } + // get input/output shape + input_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(cnode_ptr_, kInputIndex_0); + PadScalarShape(&input_shape_); + output_shape_ = AnfAlgo::GetOutputInferShape(cnode_ptr_, kOutputIndex_0); + PadScalarShape(&output_shape_); + // get keep dim attr + GetReduceAttrKeepDim(); + // get axis attr + GetReduceAttrAxis(); + AssignSupportFormat(kOpFormat_DEFAULT, support_format); + return true; +} + +bool TbeKernelReduceSelecter::IsReduceSupport5HD(SupportFormat *support_format) const { + MS_EXCEPTION_IF_NULL(support_format); + if (!Is4DShape(input_shape_)) { + return false; + } + if (!keep_dims_ || axis_.empty()) { + return false; + } + auto reduce_c_axis = std::any_of(axis_.begin(), axis_.end(), [](const size_t &elem) { return (elem == kChannelC); }); + if (reduce_c_axis) { + return false; + } + AssignSupportFormat(kOpFormat_NC1HWC0, support_format); + return true; +} + +bool 
TbeKernelReduceSelecter::IsReduceSupportNDC1HWC0(SupportFormat *support_format) const { + MS_EXCEPTION_IF_NULL(support_format); + // like to 5HD + return false; +} + +bool TbeKernelReduceSelecter::IsReduceSupportFracZ(SupportFormat *support_format) const { + return IsFracZAndC1HWNCoC0Common(kOpFormat_FRAC_Z, support_format); +} + +bool TbeKernelReduceSelecter::IsReduceSupportC1HWNCoC0(SupportFormat *support_format) const { + return IsFracZAndC1HWNCoC0Common(kOpFormat_C1HWNCoC0, support_format); +} + +bool TbeKernelReduceSelecter::IsReduceSupportFracNZ(SupportFormat *support_format) const { + MS_EXCEPTION_IF_NULL(support_format); + if (input_shape_.size() < kReduceNZMinDim) { + return false; + } + if (axis_.empty()) { + return false; + } + auto reduce_last_axis = std::any_of(axis_.begin(), axis_.end(), [this](const size_t &elem) { + return (elem == (this->input_shape_.size() - 1) || elem == (this->input_shape_.size() - 2)); + }); + if (reduce_last_axis) { + return false; + } + AssignSupportFormat(kOpFormat_FRAC_NZ, support_format); + return true; +} + +bool TbeKernelReduceSelecter::IsFracZAndC1HWNCoC0Common(const std::string &format, + mindspore::kernel::SupportFormat *support_format) const { + MS_EXCEPTION_IF_NULL(support_format); + if (!Is4DShape(input_shape_)) { + return false; + } + if (!keep_dims_ || axis_.empty()) { + return false; + } + auto reduce_n_c_axis = std::any_of(axis_.begin(), axis_.end(), + [](const size_t &elem) { return (elem == kChannelC || elem == kChannelN); }); + if (reduce_n_c_axis) { + return false; + } + AssignSupportFormat(format, support_format); + return true; +} + +void TbeKernelReduceSelecter::GetReduceAttrAxis() { + auto primitive = AnfAlgo::GetCNodePrimitive(cnode_ptr_); + MS_EXCEPTION_IF_NULL(primitive); + auto axis = primitive->GetAttr(kAxis); + if (axis == nullptr) { + MS_LOG(INFO) << "This node does't have axie attr."; + return; + } + auto type = axis->type(); + MS_EXCEPTION_IF_NULL(type); + std::vector axis_list; + if 
(type->ToString() == kTypeInt32) { + axis_list.emplace_back(GetValue(axis)); + } else { + axis_list = GetValue>(axis); + } + for (const auto &elem : axis_list) { + if (elem < 0) { + axis_.emplace_back(input_shape_.size() + elem); + } else { + axis_.emplace_back(IntToSize(elem)); + } + } +} + +void TbeKernelReduceSelecter::GetReduceAttrKeepDim() { + if (!AnfAlgo::HasNodeAttr(kKeepDims, cnode_ptr_)) { + MS_LOG(INFO) << "This node does't have keep_attr."; + keep_dims_ = false; + return; + } + keep_dims_ = AnfAlgo::GetNodeAttr(cnode_ptr_, kKeepDims); +} + +void TbeKernelReduceSelecter::AssignSupportFormat(const std::string &support_format_str, + mindspore::kernel::SupportFormat *support_format) const { + MS_EXCEPTION_IF_NULL(support_format); + SupportFormatItem input_support_format; + SupportFormatItem output_support_format; + input_support_format.emplace_back(support_format_str); + output_support_format.emplace_back(support_format_str); + support_format->input_format.emplace_back(input_support_format); + support_format->output_format.emplace_back(output_support_format); +} + +bool TbeKernelReduceSelecter::Is4DShape(const std::vector &shape) const { return shape.size() == kShape4dDims; } + +void TbeKernelReduceSelecter::PadScalarShape(std::vector *shape) const { + MS_EXCEPTION_IF_NULL(shape); + if (shape->empty()) { + shape->emplace_back(1); + } +} +} // namespace kernel +} // namespace mindspore diff --git a/mindspore/ccsrc/kernel/tbe/tbe_kernel_select/tbe_kernel_reduce_selecter.h b/mindspore/ccsrc/kernel/tbe/tbe_kernel_select/tbe_kernel_reduce_selecter.h new file mode 100644 index 0000000000..e66525fd64 --- /dev/null +++ b/mindspore/ccsrc/kernel/tbe/tbe_kernel_select/tbe_kernel_reduce_selecter.h @@ -0,0 +1,52 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_KERNEL_TBE_KERNEL_REDUCE_SELECTER_H_ +#define MINDSPORE_CCSRC_KERNEL_TBE_KERNEL_REDUCE_SELECTER_H_ +#include +#include +#include +#include "ir/anf.h" +#include "kernel/tbe/tbe_kernel_select/common_utils.h" +namespace mindspore { +namespace kernel { +class TbeKernelReduceSelecter { + public: + explicit TbeKernelReduceSelecter(CNodePtr cnode_ptr) : cnode_ptr_(std::move(cnode_ptr)) {} + ~TbeKernelReduceSelecter() = default; + bool GetShapeInfo(SupportFormat *support_format); + bool IsReduceSupport5HD(SupportFormat *support_format) const; + bool IsReduceSupportNDC1HWC0(SupportFormat *support_format) const; + bool IsReduceSupportFracZ(SupportFormat *support_format) const; + bool IsReduceSupportC1HWNCoC0(SupportFormat *support_format) const; + bool IsReduceSupportFracNZ(SupportFormat *support_format) const; + + private: + bool IsFracZAndC1HWNCoC0Common(const std::string &format, SupportFormat *support_format) const; + void GetReduceAttrAxis(); + void GetReduceAttrKeepDim(); + void AssignSupportFormat(const std::string &support_format_str, SupportFormat *support_format) const; + bool Is4DShape(const std::vector &shape) const; + void PadScalarShape(std::vector *shape) const; + CNodePtr cnode_ptr_; + std::vector input_shape_{}; + std::vector output_shape_{}; + std::vector axis_{}; + bool keep_dims_ = false; +}; +} // namespace kernel +} // namespace mindspore +#endif // MINDSPORE_TBE_KERNEL_REDUCE_SELECTER_H diff --git a/mindspore/ccsrc/kernel/tbe/tbe_kernel_select/tbe_kernel_select.cc 
b/mindspore/ccsrc/kernel/tbe/tbe_kernel_select/tbe_kernel_select.cc new file mode 100644 index 0000000000..573ad176cf --- /dev/null +++ b/mindspore/ccsrc/kernel/tbe/tbe_kernel_select/tbe_kernel_select.cc @@ -0,0 +1,624 @@ +/** + * Copyright 2019 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel/tbe/tbe_kernel_select/tbe_kernel_select.h" +#include +#include +#include +#include +#include "session/anf_runtime_algorithm.h" +#include "kernel/oplib/oplib.h" +#include "kernel/tbe/tbe_kernel_build.h" +#include "nlohmann/json.hpp" +#include "utils/context/ms_context.h" +#include "kernel/tbe/tbe_python_funcs.h" +#include "pre_activate/common/helper.h" +#include "kernel/tbe/tbe_convert_utils.h" +#include "parallel/ops_info/ops_utils.h" +#include "kernel/tbe/tbe_kernel_select/tbe_kernel_broadcast_selecter.h" +#include "kernel/tbe/tbe_kernel_select/tbe_kernel_reduce_selecter.h" +#include "kernel/tbe/tbe_kernel_select/common_utils.h" + +namespace mindspore { +namespace kernel { +constexpr auto kName = "name"; +constexpr auto kDtype = "dtype"; +constexpr auto kFormat = "format"; +constexpr auto kPrefixInput = "input"; +constexpr auto kPrefixOutput = "output"; +constexpr char kDynInputKey[] = "dyn_input_sizes"; +constexpr char kParamTypeDynamic[] = "dynamic"; +constexpr char kParamTypeRequre[] = "required"; +constexpr char kParamTypeOptional[] = "optional"; +void TbeMetadataInfo(const CNodePtr &kernel_node, std::vector> 
*kernel_info_list) { + auto tbe_selecter = TbeKernelSelect(kernel_node, kernel_info_list); + tbe_selecter.TbeMetadataInfoEx(); +} + +TbeKernelSelect::TbeKernelSelect(CNodePtr kernel_node, std::vector> *kernel_info_list) + : cnode_ptr_(std::move(kernel_node)), kernel_info_list_(kernel_info_list) {} + +void TbeKernelSelect::TbeMetadataInfoEx() { + MS_EXCEPTION_IF_NULL(cnode_ptr_); + MS_EXCEPTION_IF_NULL(kernel_info_list_); + node_name_ = AnfAlgo::GetCNodeName(cnode_ptr_); + auto op_info_ptr = OpLib::FindOp(node_name_, kTBE); + if (!op_info_ptr) { + MS_LOG(INFO) << "Warning: Cann't find tbe core opinfo, node type: " << node_name_; + return; + } + MS_LOG(INFO) << "Start to tbe metadata info. node type: " << node_name_ + << ", node name: " << cnode_ptr_->fullname_with_scope(); + OpPattern pattern = op_info_ptr->op_pattern(); + if (pattern == kCommonPattern) { + GetCommonPatternKernelInfo(*op_info_ptr); + } else if (pattern == kDynamicFormatPattern) { + GetDynamicFormatPatternKernelInfo(*op_info_ptr); + } else if (pattern == kFormatAgnosticPattern) { + GetAgnosticPatternKernelInfo(*op_info_ptr); + } else if (pattern == kBroadcastPattern) { + GetBroadcastPatternKernelInfo(*op_info_ptr); + } else if (pattern == kReducePattern) { + GetReducePatternKernelInfo(*op_info_ptr); + } else { + MS_LOG(INFO) << "Warning: op pattern is invailed."; + } + // check support + FilterInVaildKernelInfo(); + MS_LOG(INFO) << "End get kernel build info size: " << kernel_info_list_->size() << ", after tbe select."; +} + +void TbeKernelSelect::GetCommonPatternKernelInfo(const OpInfo &op_info) { + MS_LOG(INFO) << "start."; + // get dynamic inputs + auto primitive = AnfAlgo::GetCNodePrimitive(cnode_ptr_); + MS_EXCEPTION_IF_NULL(primitive); + std::vector dyn_input_sizes; + if (primitive->HasAttr(kDynInputKey)) { + dyn_input_sizes = GetValue>(primitive->GetAttr(kDynInputKey)); + } + // get real input/output num + size_t real_input_tensor_num = AnfAlgo::GetInputTensorNum(cnode_ptr_); + const auto 
inputs_info = op_info.inputs_ptr(); + size_t real_output_tensor_num = AnfAlgo::GetOutputTensorNum(cnode_ptr_); + const auto outputs_info = op_info.outputs_ptr(); + if (inputs_info.empty() && outputs_info.empty()) { + MS_LOG(EXCEPTION) << "op info input & output is null, please check."; + } + // create kernel build info from opinfo + size_t kernel_build_info_num = + inputs_info.empty() ? outputs_info[0]->dtypes().size() : inputs_info[0]->dtypes().size(); + for (size_t kernel_build_info_index = 0; kernel_build_info_index < kernel_build_info_num; ++kernel_build_info_index) { + auto builder = KernelBuildInfo::KernelBuildInfoBuilder(); + SetTbeBuildCommonInfo(op_info, &builder); + std::vector inputs_format; + std::vector inputs_device_type; + std::vector> inputs_reshape_type; + // input + if (!GenBuilderItem(true, kernel_build_info_index, real_input_tensor_num, inputs_info, dyn_input_sizes, + &inputs_format, &inputs_device_type, &inputs_reshape_type)) { + break; + } + builder.SetInputsDeviceType(inputs_device_type); + builder.SetInputsFormat(inputs_format); + builder.SetInputReshapeType(inputs_reshape_type); + // output + std::vector outputs_format; + std::vector outputs_device_type; + std::vector> outputs_reshape_type; + if (!GenBuilderItem(false, kernel_build_info_index, real_output_tensor_num, outputs_info, dyn_input_sizes, + &outputs_format, &outputs_device_type, &outputs_reshape_type)) { + break; + } + builder.SetOutputsDeviceType(outputs_device_type); + builder.SetOutputsFormat(outputs_format); + builder.SetOutputReshapeType(outputs_reshape_type); + kernel_info_list_->emplace_back(builder.Build()); + } + MS_LOG(INFO) << "end."; +} + +void TbeKernelSelect::GetDynamicFormatPatternKernelInfo(const OpInfo &op_info) { + MS_LOG(INFO) << "start."; + // + OpInfo op_info_new; + CreateNewOpInfo(op_info, &op_info_new); + GetCommonPatternKernelInfo(op_info_new); + MS_LOG(INFO) << "end."; +} + +void TbeKernelSelect::GetAgnosticPatternKernelInfo(const OpInfo &op_info) { + 
MS_LOG(INFO) << "start."; + if (op_info.inputs_ptr().size() != 1) { + MS_LOG(EXCEPTION) << "AgnosticPattern only support one input."; + } + auto format = AnfAlgo::GetPrevNodeOutputFormat(cnode_ptr_, 0); + if (kOpFormatList.find(format) == kOpFormatList.end()) { + MS_LOG(INFO) << "Got the unknown format " << format; + format = kOpFormat_DEFAULT; + } + SupportFormat support_format; + SupportFormatItem input_item; + SupportFormatItem output_item; + input_item.assign(op_info.inputs_ptr().size(), format); + output_item.assign(op_info.outputs_ptr().size(), format); + support_format.input_format.emplace_back(input_item); + support_format.output_format.emplace_back(output_item); + PrintSupportedFormat(support_format); + OpInfo op_info_new; + CreateNewOpInfo(op_info, support_format, &op_info_new); + GetCommonPatternKernelInfo(op_info_new); + MS_LOG(INFO) << "end."; +} + +void TbeKernelSelect::GetBroadcastPatternKernelInfo(const OpInfo &op_info) { + MS_LOG(INFO) << "start."; + auto broadcast_selecter = TbeKernelBroadCastSelecter(cnode_ptr_); + SupportFormat support_format; + broadcast_selecter.GetShapeInfo(&support_format); + if (!broadcast_selecter.IsBroadCastSupport5HD(&support_format)) { + MS_LOG(INFO) << "Node(" << node_name_ << ") does not support 5HD."; + } + if (!broadcast_selecter.IsBroadCastSupportFracZ(&support_format)) { + MS_LOG(INFO) << "Node(" << node_name_ << ") does not support FracZ."; + } + if (!broadcast_selecter.IsBroadCastSupportC1HWNCoC0(&support_format)) { + MS_LOG(INFO) << "Node(" << node_name_ << ") does not support C1HWNCoC0."; + } + if (!broadcast_selecter.IsBroadCastSupportFracNZ(&support_format)) { + MS_LOG(INFO) << "Node(" << node_name_ << ") does not support FracNZ."; + } + PrintSupportedFormat(support_format); + OpInfo op_info_new; + CreateNewOpInfo(op_info, support_format, &op_info_new); + GetCommonPatternKernelInfo(op_info_new); + MS_LOG(INFO) << "end."; +} + +void TbeKernelSelect::GetReducePatternKernelInfo(const OpInfo &op_info) { + 
MS_LOG(INFO) << "start."; + auto reduce_selecter = TbeKernelReduceSelecter(cnode_ptr_); + SupportFormat support_format; + reduce_selecter.GetShapeInfo(&support_format); + if (!reduce_selecter.IsReduceSupport5HD(&support_format)) { + MS_LOG(INFO) << "Node (" << node_name_ << ") reduce not support 5HD."; + } + if (reduce_selecter.IsReduceSupportFracZ(&support_format)) { + MS_LOG(INFO) << "Node (" << node_name_ << ") reduce not support FracZ."; + } + if (reduce_selecter.IsReduceSupportC1HWNCoC0(&support_format)) { + MS_LOG(INFO) << "Node (" << node_name_ << ") reduce not support C1HWNCoC0."; + } + if (reduce_selecter.IsReduceSupportFracNZ(&support_format)) { + MS_LOG(INFO) << "Node (" << node_name_ << ") reduce not support FracNZ."; + } + PrintSupportedFormat(support_format); + OpInfo op_info_new; + CreateNewOpInfo(op_info, support_format, &op_info_new); + GetCommonPatternKernelInfo(op_info_new); + MS_LOG(INFO) << "end."; +} + +void TbeKernelSelect::FilterInVaildKernelInfo() { + if (kernel_info_list_->empty()) { + MS_LOG(INFO) << "Warning: get kernel build info failed."; + return; + } + auto kernel_build_info_iter = kernel_info_list_->begin(); + while (kernel_build_info_iter != kernel_info_list_->end()) { + if (!FilterInVaildShape(kernel_build_info_iter)) { + MS_LOG(INFO) << "Filter invaild shape, filter item info: " << (*kernel_build_info_iter)->ToString(); + kernel_build_info_iter = kernel_info_list_->erase(kernel_build_info_iter); + continue; + } + if (!TbeCheckSupported(kernel_build_info_iter)) { + MS_LOG(INFO) << "Check support shape, filter item info: " << (*kernel_build_info_iter)->ToString(); + kernel_build_info_iter = kernel_info_list_->erase(kernel_build_info_iter); + continue; + } + kernel_build_info_iter++; + } +} + +bool TbeKernelSelect::FilterInVaildShape( + const mindspore::kernel::TbeKernelSelect::KernelBuildInfoIter &kernel_build_info_iter) { + MS_EXCEPTION_IF_NULL((*kernel_build_info_iter)); + auto kernel_build_info_inputs_format = 
(*kernel_build_info_iter)->GetAllInputFormats(); + for (size_t i = 0; i < kernel_build_info_inputs_format.size(); ++i) { + auto shape = AnfAlgo::GetPrevNodeOutputInferShape(cnode_ptr_, i); + auto format = kernel_build_info_inputs_format.at(i); + if (!IsShapeMatchFormat(shape, format)) { + MS_LOG(INFO) << "The " << i << "th input check failed."; + return false; + } + } + auto kernel_build_info_outputs_format = (*kernel_build_info_iter)->GetAllOutputFormats(); + for (size_t j = 0; j < kernel_build_info_outputs_format.size(); ++j) { + auto shape = AnfAlgo::GetOutputInferShape(cnode_ptr_, j); + auto format = kernel_build_info_outputs_format.at(j); + if (!IsShapeMatchFormat(shape, format)) { + MS_LOG(INFO) << "The " << j << "th input check failed."; + return false; + } + } + return true; +} + +bool TbeKernelSelect::IsShapeMatchFormat(const std::vector &shape, const std::string &format) { + if (format == kOpFormat_DEFAULT) { + return true; + } + static std::set kServerNotSupportFormat = {kOpFormat_NC1HWC0_C04, kOpFormat_FRACTAL_Z_C04}; + // if format is default, it remarkes support all format + if (kOpFormatList.find(format) == kOpFormatList.end()) { + MS_LOG(EXCEPTION) << "Got the unknown format " << format; + } + // server not support format with C04 suffix + if (std::find(kServerNotSupportFormat.begin(), kServerNotSupportFormat.end(), format) != + kServerNotSupportFormat.end()) { + MS_LOG(INFO) << "Warning: Server not support format with C04 suffix."; + return false; + } + // not support format: + // 1 NDHWC with shape size != 5 + // 2 FRAC_NZ with shape size < 2 + // 3 !NDHWC with shape size > 4 + if ((format == kOpFormat_NDHWC && shape.size() != kShape5dDims) || + (format == kOpFormat_FRAC_NZ && shape.size() < kShape2dDims) || + (format != kOpFormat_NDHWC && shape.size() > kShape4dDims)) { + MS_LOG(INFO) << "Warning: Shape format check failed, format: " << format << ", size: " << shape.size(); + return false; + } + return true; +} + +bool 
TbeKernelSelect::TbeCheckSupported( + const mindspore::kernel::TbeKernelSelect::KernelBuildInfoIter &kernel_build_info_iter) { + MS_EXCEPTION_IF_NULL((*kernel_build_info_iter)); + static const std::set kCheckSupportedOpType = {parallel::MATMUL, + parallel::BATCHMATMUL, + parallel::TOPK, + parallel::IN_TOPK, + parallel::PACK, + parallel::GATHER_ND, + parallel::UNSORTEF_SEGMENT_MIND, + parallel::UNSORTEF_SEGMENT_PRODD, + parallel::CAST}; + auto iter = std::find(kCheckSupportedOpType.begin(), kCheckSupportedOpType.end(), node_name_); + if (iter == kCheckSupportedOpType.end()) { + return true; + } + MS_LOG(INFO) << "Check support start."; + // replace kernel_info with current kernel info + auto kernel_build_info_tmp = AnfAlgo::GetSelectKernelBuildInfo(cnode_ptr_); + AnfAlgo::SetSelectKernelBuildInfo(*kernel_build_info_iter, cnode_ptr_.get()); + nlohmann::json kernel_json; + TbeKernelJsonCreator creator(CHECK_SUPPORTED); + bool ret = creator.GenTbeSingleKernelJson(cnode_ptr_, &kernel_json); + if (!ret) { + MS_LOG(EXCEPTION) << "Gen tbe single kernel json for check support failed."; + } + ret = TbePythonFuncs::CheckSupported(kernel_json); + AnfAlgo::SetSelectKernelBuildInfo(kernel_build_info_tmp, cnode_ptr_.get()); + return ret; +} + +void TbeKernelSelect::SetTbeBuildCommonInfo(const mindspore::kernel::OpInfo &op_info, + mindspore::kernel::KernelBuildInfo::KernelBuildInfoBuilder *builder) { + MS_EXCEPTION_IF_NULL(builder); + builder->SetProcessor(AICORE); + std::string fusion_type = op_info.fusion_type(); + if (tbe::GetFusionType(fusion_type) != UNKNOWN_FUSION_TYPE) { + builder->SetFusionType(tbe::GetFusionType(fusion_type)); + } + builder->SetOpPattern(op_info.op_pattern()); + builder->SetKernelType(TBE_KERNEL); +} + +bool TbeKernelSelect::GenBuilderItem(bool is_input, size_t kernel_build_info_index, size_t real_io_tensor_num, + const std::vector> &ios_info, + const std::vector &dyn_input_sizes, std::vector *formats, + std::vector *device_types, std::vector> 
*reshape_types) { + MS_EXCEPTION_IF_NULL(formats); + MS_EXCEPTION_IF_NULL(device_types); + MS_EXCEPTION_IF_NULL(reshape_types); + size_t dynamic_input_index = 0; + size_t real_io_tensor_index = 0; + size_t io_info_index = 0; + size_t io_info_num = ios_info.size(); + for (; io_info_index < io_info_num && real_io_tensor_index < real_io_tensor_num; io_info_index++) { + std::shared_ptr io_info_item = ios_info[io_info_index]; + auto kernel_build_info_dtype = io_info_item->dtypes().at(kernel_build_info_index); + std::string kernel_build_info_format; + if (!io_info_item->formats().empty()) { + kernel_build_info_format = io_info_item->formats().at(kernel_build_info_index); + } + std::string io_param_type = io_info_item->param_type(); + std::vector reshape_type; + StringToAxisVector(io_info_item->reshape_type(), &reshape_type); + if (io_param_type == kParamTypeDynamic) { + // dynamic io + if (is_input) { + if (dynamic_input_index >= dyn_input_sizes.size()) { + MS_LOG(EXCEPTION) << "dyn_input_sizes attr set error, dynamic_input_index: " << dynamic_input_index + << ", dyn_input_sizes size: " << dyn_input_sizes.size(); + } + int dynamic_input_size = dyn_input_sizes[dynamic_input_index]; + for (int i = 0; i < dynamic_input_size; ++i) { + device_types->emplace_back(tbe::DtypeToTypeId(kernel_build_info_dtype)); + formats->emplace_back(kernel_build_info_format); + reshape_types->emplace_back(reshape_type); + } + dynamic_input_index++; + real_io_tensor_index += dynamic_input_size; + } else { + if (ios_info.size() != 1) { + MS_LOG(EXCEPTION) << "if output is dynamic, so output must has one output."; + } + for (size_t i = 0; i < real_io_tensor_num; ++i) { + device_types->emplace_back(tbe::DtypeToTypeId(kernel_build_info_dtype)); + formats->emplace_back(kernel_build_info_format); + reshape_types->emplace_back(reshape_type); + } + real_io_tensor_index += real_io_tensor_num; + } + } else if (io_param_type == kParamTypeRequre || io_param_type == kParamTypeOptional) { + // requre or 
optional io + device_types->emplace_back(tbe::DtypeToTypeId(kernel_build_info_dtype)); + formats->emplace_back(kernel_build_info_format); + reshape_types->emplace_back(reshape_type); + real_io_tensor_index++; + } else { + MS_LOG(EXCEPTION) << "op info's param type is not match: " << io_param_type; + } + } + + if (io_info_index != io_info_num) { + MS_LOG(INFO) << "Warning: io_info_index(" << io_info_index << ") != io_info_num(" << io_info_num + << "), this node may has optional input/output."; + } + if (real_io_tensor_index != real_io_tensor_num) { + std::string io_type = is_input ? "inputs " : "outputs"; + MS_LOG(INFO) << node_name_ << "'s " << io_type << "op io info num: " << io_info_num + << ", real io tensor num:" << real_io_tensor_num << "real_io_tensor_index(" << real_io_tensor_index + << ") != real_io_tensor_num(" << real_io_tensor_num << ")"; + return false; + } + return true; +} + +void TbeKernelSelect::StringToAxisVector(const std::string &reshape_type_str, std::vector *reshape_type_vec) { + MS_EXCEPTION_IF_NULL(reshape_type_vec); + for (const auto &c : reshape_type_str) { + switch (c) { + case 'N': + reshape_type_vec->push_back(kernel::N); + break; + case 'C': + reshape_type_vec->push_back(kernel::C); + break; + case 'H': + reshape_type_vec->push_back(kernel::H); + break; + case 'W': + reshape_type_vec->push_back(kernel::W); + break; + default: + MS_LOG(EXCEPTION) << "Unknown axis " << c << "in reshape type."; + } + } +} + +void TbeKernelSelect::CreateNewOpIOInfo(const mindspore::kernel::OpIOInfo &op_io_info, + const std::vector> &support_format_item, size_t index, + mindspore::kernel::OpIOInfo *op_io_info_new) { + MS_EXCEPTION_IF_NULL(op_io_info_new); + op_io_info_new->set_index(op_io_info.index()); + op_io_info_new->set_name(op_io_info.name()); + op_io_info_new->set_param_type(op_io_info.param_type()); + op_io_info_new->set_need_compile(op_io_info.need_compile()); + op_io_info_new->set_reshape_type(op_io_info.reshape_type()); + 
op_io_info_new->set_shape(op_io_info.shape()); + // dtype + std::vector dtype_new; + auto dtype = op_io_info.dtypes(); + for (size_t i = 0; i < support_format_item.size(); ++i) { + dtype_new.insert(dtype_new.end(), dtype.begin(), dtype.end()); + } + op_io_info_new->set_dtypes(dtype_new); + // format + std::vector format_new; + for (const auto &formats : support_format_item) { + auto format = formats.at(index); + for (size_t j = 0; j < dtype.size(); ++j) { + format_new.emplace_back(format); + } + } + op_io_info_new->set_formats(format_new); +} + +std::vector TbeKernelSelect::SplitStrToVec(const std::string &op_select_json_item) { + const std::map kDynamicFormatMap = { + {"NCHW", "DefaultFormat"}, {"ND", "DefaultFormat"}, {"FRACTAL_Z", "FracZ"}}; + if (op_select_json_item.empty()) { + MS_LOG(EXCEPTION) << "Op select ret item is null."; + } + const char space = ' '; + const char sep = ','; + std::string op_select_tmp = op_select_json_item + ","; + std::vector ret; + auto begin = op_select_tmp.find_first_not_of(space, 0); + auto sep_pos = op_select_tmp.find(sep); + if (begin >= sep_pos) { + MS_LOG(EXCEPTION) << "Select ret json is error."; + } + while (sep_pos != std::string::npos) { + auto obj = op_select_tmp.substr(begin, sep_pos - begin); + if (kDynamicFormatMap.find(obj) != kDynamicFormatMap.end()) { + obj = kDynamicFormatMap.at(obj); + } + ret.emplace_back(obj); + begin = op_select_tmp.find_first_not_of(space, sep_pos + 1); + sep_pos = op_select_tmp.find(sep, begin); + } + return ret; +} + +std::string TbeKernelSelect::OpSelectFormat() { + nlohmann::json kernel_json; + std::string res_json_str; + TbeKernelJsonCreator creator(OP_SELECT_FORMAT); + bool ret = creator.GenTbeSingleKernelJson(cnode_ptr_, &kernel_json); + if (!ret) { + MS_LOG(EXCEPTION) << "GenTbeSingleKernelJson failed."; + } + res_json_str = TbePythonFuncs::OpSelectFormat(kernel_json); + if (res_json_str.empty()) { + MS_LOG(EXCEPTION) << "op select format error."; + } + MS_LOG(INFO) << "Dynamic select 
foramt response result:" << res_json_str; + return res_json_str; +} + +void TbeKernelSelect::CreateNewOpInfo(const mindspore::kernel::OpInfo &op_info, const SupportFormat &support_format, + mindspore::kernel::OpInfo *op_info_new) { + MS_EXCEPTION_IF_NULL(op_info_new); + if (op_info.inputs_ptr().size() != support_format.input_format[0].size() || + op_info.outputs_ptr().size() != support_format.output_format[0].size()) { + MS_LOG(EXCEPTION) << "BroadCast input/output size not match, op info input size:" << op_info.inputs_ptr().size() + << ", input support size: " << support_format.input_format[0].size() + << ", op info output size: " << op_info.outputs_ptr().size() + << ", output support size: " << support_format.output_format[0].size(); + } + *op_info_new = op_info; + op_info_new->ClearInputs(); + op_info_new->ClearOutputs(); + for (size_t i = 0; i < op_info.inputs_ptr().size(); ++i) { + auto input = op_info.inputs_ptr().at(i); + auto input_new = std::make_shared(); + CreateNewOpIOInfo(*input, support_format.input_format, i, input_new.get()); + op_info_new->add_inputs_ptr(input_new); + } + for (size_t j = 0; j < op_info.outputs_ptr().size(); ++j) { + auto output = op_info.outputs_ptr().at(j); + auto output_new = std::make_shared(); + CreateNewOpIOInfo(*output, support_format.output_format, j, output_new.get()); + op_info_new->add_outputs_ptr(output_new); + } +} + +struct SelectOpIOInfo { + std::string name; + std::vector dtypes; + std::vector formats; +}; + +void TbeKernelSelect::CreateNewOpInfo(const mindspore::kernel::OpInfo &op_info, + mindspore::kernel::OpInfo *op_info_new) { + MS_EXCEPTION_IF_NULL(op_info_new); + auto op_seclect_json = OpSelectFormat(); + if (!op_seclect_json.empty()) { + nlohmann::json json_obj = nlohmann::json::parse(op_seclect_json); + if (!json_obj.is_object()) { + MS_LOG(EXCEPTION) << "JsonStr is not an object, the jsonStr is:" << op_seclect_json; + } + std::vector inputs; + std::vector outputs; + for (const auto &item : json_obj.items()) 
{ + const std::string &item_name = item.key(); + bool is_input = (item_name.find(kPrefixInput) != std::string::npos); + bool is_output = (item_name.find(kPrefixOutput) != std::string::npos); + if (!is_input && !is_output) { + MS_LOG(EXCEPTION) << "op select ret json is error."; + } + if (is_input) { + SelectOpIOInfo select_input; + select_input.name = item.value().at(kName); + std::string input_dtype_item = item.value().at(kDtype); + select_input.dtypes = SplitStrToVec(input_dtype_item); + std::string input_format_item = item.value().at(kFormat); + select_input.formats = SplitStrToVec(input_format_item); + inputs.emplace_back(select_input); + } else if (is_output) { + SelectOpIOInfo select_output; + select_output.name = item.value().at(kName); + std::string input_dtype_item = item.value().at(kDtype); + select_output.dtypes = SplitStrToVec(input_dtype_item); + std::string input_format_item = item.value().at(kFormat); + select_output.formats = SplitStrToVec(input_format_item); + outputs.emplace_back(select_output); + } + } + + if (op_info.inputs_ptr().size() != inputs.size() || op_info.outputs_ptr().size() != outputs.size()) { + MS_LOG(EXCEPTION) << "select format input/output size not equal, please check register."; + } + + *op_info_new = op_info; + op_info_new->ClearInputs(); + op_info_new->ClearOutputs(); + for (size_t i = 0; i < op_info.inputs_ptr().size(); ++i) { + auto input_new = std::make_shared(); + CreateNewOpIOInfo(*op_info.inputs_ptr().at(i), inputs.at(i).dtypes, inputs.at(i).formats, input_new.get()); + op_info_new->add_inputs_ptr(input_new); + } + for (size_t i = 0; i < op_info.outputs_ptr().size(); ++i) { + auto output_new = std::make_shared(); + CreateNewOpIOInfo(*op_info.outputs_ptr().at(i), outputs.at(i).dtypes, outputs.at(i).formats, output_new.get()); + op_info_new->add_outputs_ptr(output_new); + } + } +} + +void TbeKernelSelect::CreateNewOpIOInfo(const mindspore::kernel::OpIOInfo &op_io_info, + const std::vector &support_dtype, + const 
std::vector &support_format, + mindspore::kernel::OpIOInfo *op_io_info_new) { + MS_EXCEPTION_IF_NULL(op_io_info_new); + op_io_info_new->set_index(op_io_info.index()); + op_io_info_new->set_name(op_io_info.name()); + op_io_info_new->set_param_type(op_io_info.param_type()); + op_io_info_new->set_need_compile(op_io_info.need_compile()); + op_io_info_new->set_reshape_type(op_io_info.reshape_type()); + op_io_info_new->set_shape(op_io_info.shape()); + // dtype && format + op_io_info_new->set_dtypes(support_dtype); + op_io_info_new->set_formats(support_format); +} + +void TbeKernelSelect::PrintSupportedFormat(const SupportFormat &support_format) { + if (support_format.input_format.size() != support_format.output_format.size()) { + MS_LOG(EXCEPTION) << "Input(" << support_format.input_format.size() << ")Output(" + << support_format.output_format.size() << ") size not match."; + } + for (size_t i = 0; i < support_format.input_format.size(); ++i) { + auto input_items = support_format.input_format.at(i); + auto output_items = support_format.output_format.at(i); + std::string print_str = "["; + for (const auto &input : input_items) { + print_str.append(input); + print_str.append(", "); + } + print_str.append("] -->"); + for (const auto &output : output_items) { + print_str.append(output); + print_str.append(", "); + } + MS_LOG(INFO) << "Support format: " << print_str; + } +} +} // namespace kernel +} // namespace mindspore diff --git a/mindspore/ccsrc/kernel/tbe/tbe_kernel_select/tbe_kernel_select.h b/mindspore/ccsrc/kernel/tbe/tbe_kernel_select/tbe_kernel_select.h new file mode 100644 index 0000000000..c400bdbb6f --- /dev/null +++ b/mindspore/ccsrc/kernel/tbe/tbe_kernel_select/tbe_kernel_select.h @@ -0,0 +1,77 @@ +/** + * Copyright 2019 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_TBE_KERNEL_SELECT_H +#define MINDSPORE_TBE_KERNEL_SELECT_H + +#include +#include +#include +#include "kernel/oplib/opinfo.h" +#include "kernel/kernel_build_info.h" +#include "kernel/tbe/tbe_kernel_select/common_utils.h" + +namespace mindspore { +namespace kernel { +void TbeMetadataInfo(const CNodePtr &kernel_node, std::vector> *kernel_info_list); + +class TbeKernelSelect { + using OpInfoPtr = std::shared_ptr; + using KernelBuildInfoIter = std::vector>::iterator; + + public: + TbeKernelSelect(CNodePtr kernel_node, std::vector> *kernel_info_list); + ~TbeKernelSelect() = default; + void TbeMetadataInfoEx(); + + private: + void GetCommonPatternKernelInfo(const OpInfo &op_info); + void GetDynamicFormatPatternKernelInfo(const OpInfo &op_info); + void GetAgnosticPatternKernelInfo(const OpInfo &op_info); + void GetBroadcastPatternKernelInfo(const OpInfo &op_info); + void GetReducePatternKernelInfo(const OpInfo &op_info); + void FilterInVaildKernelInfo(); + bool FilterInVaildShape(const KernelBuildInfoIter &kernel_build_info_iter); + static bool IsShapeMatchFormat(const std::vector &shape, const std::string &format); + bool TbeCheckSupported(const KernelBuildInfoIter &kernel_build_info_iter); + static void SetTbeBuildCommonInfo(const OpInfo &op_info, KernelBuildInfo::KernelBuildInfoBuilder *builder); + bool GenBuilderItem(bool is_input, size_t kernel_build_info_index, size_t real_io_tensor_num, + const std::vector> &ios_info, const std::vector &dyn_input_sizes, + std::vector *formats, std::vector *device_types, + std::vector> 
*reshape_types); + static void StringToAxisVector(const std::string &reshape_type_str, std::vector *reshape_type_vec); + static void CreateNewOpInfo(const OpInfo &op_info, const SupportFormat &support_format, OpInfo *op_info_new); + static void CreateNewOpIOInfo(const OpIOInfo &op_io_info, + const std::vector> &support_format_item, size_t index, + OpIOInfo *op_io_info_new); + // op select(dynamic) + void CreateNewOpInfo(const mindspore::kernel::OpInfo &op_info, mindspore::kernel::OpInfo *op_info_new); + static void CreateNewOpIOInfo(const OpIOInfo &op_io_info, const std::vector &support_dtype, + const std::vector &support_format, OpIOInfo *op_io_info_new); + static std::vector SplitStrToVec(const std::string &op_select_json_item); + std::string OpSelectFormat(); + + static void PrintSupportedFormat(const SupportFormat &support_format); + + private: + CNodePtr cnode_ptr_; + std::vector> *kernel_info_list_; + std::string node_name_; +}; +} // namespace kernel +} // namespace mindspore + +#endif // MINDSPORE_TBE_KERNEL_SELECT_H diff --git a/mindspore/ccsrc/kernel/tbe/tbe_utils.cc b/mindspore/ccsrc/kernel/tbe/tbe_utils.cc index 5980a0fd88..a930fd3dca 100644 --- a/mindspore/ccsrc/kernel/tbe/tbe_utils.cc +++ b/mindspore/ccsrc/kernel/tbe/tbe_utils.cc @@ -67,12 +67,12 @@ void TbeUtils::SaveJsonInfo(const std::string &json_name, const std::string &inf filewrite << info << std::endl; filewrite.close(); if (nullptr == realpath(path.c_str(), real_path)) { - MS_LOG(DEBUG) << "dir: " << path << "does not exit."; + MS_LOG(INFO) << "dir: " << path << "does not exit."; return; } MS_LOG(INFO) << "real path is: " << real_path; if (chmod(real_path, S_IRUSR) == -1) { - MS_LOG(DEBUG) << "modify file: " << real_path << "to read only fail."; + MS_LOG(INFO) << "modify file: " << real_path << "to read only fail."; } } @@ -93,7 +93,7 @@ KernelPackPtr TbeUtils::SearchCache(const std::string &kernel_name, const std::s // search cache. 
KernelMeta *bin_map = KernelMeta::GetInstance(); if (bin_map == nullptr) { - MS_LOG(DEBUG) << "kernel cache is invalid."; + MS_LOG(INFO) << "kernel cache is invalid."; return nullptr; } return bin_map->GetKernelPack(kernel_name, processor); @@ -118,14 +118,14 @@ int KernelManager::BinaryRegister(const mindspore::kernel::FlexArray &kernel_buf dev_bin.data = kernel_buffer.contents; auto iter = magic_maps.find(magic); if (iter == magic_maps.end()) { - MS_LOG(DEBUG) << "Invalid magic number: " << magic; + MS_LOG(INFO) << "Invalid magic number: " << magic; return -1; } dev_bin.magic = iter->second; dev_bin.length = kernel_buffer.len; dev_bin.version = 2; if (RT_ERROR_NONE != rtDevBinaryRegister(&dev_bin, module)) { - MS_LOG(DEBUG) << "Call runtime rtDevBinaryRegister error."; + MS_LOG(INFO) << "Call runtime rtDevBinaryRegister error."; return -1; } return 0; @@ -158,14 +158,14 @@ uintptr_t KernelManager::GenFuncStub(const mindspore::kernel::KernelPack &kernel } void *module = nullptr; if (0 != BinaryRegister((*kernel_pack.GetKernel()), &module, magic)) { - MS_LOG(DEBUG) << "Call runtime BinaryRegister error."; + MS_LOG(INFO) << "Call runtime BinaryRegister error."; return 0; } // to diff different funcs. uintptr_t funcstub = ++kernel_stub_gen_; if (RT_ERROR_NONE != rtFunctionRegister(module, reinterpret_cast(funcstub), funcname.c_str(), funcname.c_str(), 0)) { - MS_LOG(DEBUG) << "Call runtime rtFunctionRegister error."; + MS_LOG(INFO) << "Call runtime rtFunctionRegister error."; return 0; } // cache the registered kernelmeta. 
@@ -236,7 +236,7 @@ KernelPackPtr KernelMeta::GetKernelPack(const std::string &kernel_name, const st (void)cce_json.append(kernel_name).append(kJsonSuffix); ret = std::make_shared(); if (!ret->LoadKernelMeta(cce_json, processor)) { - MS_LOG(DEBUG) << "Read cache json and bin file failed[" << cce_json << "]"; + MS_LOG(INFO) << "Read cache json and bin file failed[" << cce_json << "]"; return nullptr; } kernel_pack_map_[kernel_name] = ret; diff --git a/mindspore/ccsrc/mindrecord/include/common/shard_utils.h b/mindspore/ccsrc/mindrecord/include/common/shard_utils.h index 65a8d53e72..8aa5bdfbda 100644 --- a/mindspore/ccsrc/mindrecord/include/common/shard_utils.h +++ b/mindspore/ccsrc/mindrecord/include/common/shard_utils.h @@ -73,6 +73,10 @@ enum ShardType { kCV = 1, }; +enum TaskType { + kCommonTask = 0, + kPaddedTask = 1, +}; enum SamplerType { kCustomTopNSampler, kCustomTopPercentSampler, kSubsetRandomSampler, kPKSampler }; enum ShuffleType { kShuffleCategory, kShuffleSample }; diff --git a/mindspore/ccsrc/mindrecord/include/shard_column.h b/mindspore/ccsrc/mindrecord/include/shard_column.h index 496e7ec3ea..968d82e717 100644 --- a/mindspore/ccsrc/mindrecord/include/shard_column.h +++ b/mindspore/ccsrc/mindrecord/include/shard_column.h @@ -67,7 +67,7 @@ class ShardColumn { /// \brief get column value by column name MSRStatus GetColumnValueByName(const std::string &column_name, const std::vector &columns_blob, const json &columns_json, const unsigned char **data, - std::unique_ptr *data_ptr, uint64_t *n_bytes, + std::unique_ptr *data_ptr, uint64_t *const n_bytes, ColumnDataType *column_data_type, uint64_t *column_data_type_size, std::vector *column_shape); @@ -88,13 +88,17 @@ class ShardColumn { /// \brief get column value from blob MSRStatus GetColumnFromBlob(const std::string &column_name, const std::vector &columns_blob, const unsigned char **data, std::unique_ptr *data_ptr, - uint64_t *n_bytes); + uint64_t *const n_bytes); + std::pair GetColumnTypeByName(const 
std::string &column_name, + ColumnDataType *column_data_type, + uint64_t *column_data_type_size, + std::vector *column_shape); - private: /// \brief get column value from json MSRStatus GetColumnFromJson(const std::string &column_name, const json &columns_json, std::unique_ptr *data_ptr, uint64_t *n_bytes); + private: /// \brief get float value from json template MSRStatus GetFloat(std::unique_ptr *data_ptr, const json &json_column_value, bool use_double); @@ -115,7 +119,7 @@ class ShardColumn { /// \brief uncompress integer array column template - static MSRStatus UncompressInt(const uint64_t &column_id, std::unique_ptr *data_ptr, + static MSRStatus UncompressInt(const uint64_t &column_id, std::unique_ptr *const data_ptr, const std::vector &columns_blob, uint64_t *num_bytes, uint64_t shift_idx); /// \brief convert big-endian bytes to unsigned int diff --git a/mindspore/ccsrc/mindrecord/include/shard_distributed_sample.h b/mindspore/ccsrc/mindrecord/include/shard_distributed_sample.h new file mode 100644 index 0000000000..ef0ad738c4 --- /dev/null +++ b/mindspore/ccsrc/mindrecord/include/shard_distributed_sample.h @@ -0,0 +1,53 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDRECORD_INCLUDE_SHARD_DISTRIBUTED_SAMPLE_H_ +#define MINDRECORD_INCLUDE_SHARD_DISTRIBUTED_SAMPLE_H_ + +#include +#include +#include +#include +#include "mindrecord/include/shard_operator.h" +#include "mindrecord/include/shard_shuffle.h" +#include "mindrecord/include/shard_sample.h" + +namespace mindspore { +namespace mindrecord { +class ShardDistributedSample : public ShardSample { + public: + ShardDistributedSample(int num_shards, int shard_id, int no_of_padded_samples, bool shuffle, uint32_t seed); + + ShardDistributedSample(int num_shards, int shard_id, bool shuffle, uint32_t seed); + + void SetNumPaddedSamples(int no_of_padded_samples) { no_of_padded_samples_ = no_of_padded_samples; } + + ~ShardDistributedSample() override{}; + + MSRStatus PreExecute(ShardTask &tasks) override; + + int64_t GetNumSamples(int64_t dataset_size, int64_t num_classes) override; + + private: + bool shuffle_; + int no_of_padded_samples_; + bool first_epoch_; // check (num_sample + num_padded) % num_shards == 0 in first epoch + ShardTask task_; // maintain the input tasks in first epoch +}; +} // namespace mindrecord +} // namespace mindspore + +#endif // MINDRECORD_INCLUDE_SHARD_DISTRIBUTED_SAMPLE_H_ diff --git a/mindspore/ccsrc/mindrecord/include/shard_index_generator.h b/mindspore/ccsrc/mindrecord/include/shard_index_generator.h index f91d0f17a7..b081b7a0a0 100644 --- a/mindspore/ccsrc/mindrecord/include/shard_index_generator.h +++ b/mindspore/ccsrc/mindrecord/include/shard_index_generator.h @@ -91,7 +91,7 @@ class ShardIndexGenerator { INDEX_FIELDS GenerateIndexFields(const std::vector &schema_detail); - MSRStatus ExecuteTransaction(const int &shard_no, const std::pair &db, + MSRStatus ExecuteTransaction(const int &shard_no, std::pair &db, const std::vector &raw_page_ids, const std::map &blob_id_to_page_id); MSRStatus CreateShardNameTable(sqlite3 *db, const std::string &shard_name); diff --git a/mindspore/ccsrc/mindrecord/include/shard_operator.h 
b/mindspore/ccsrc/mindrecord/include/shard_operator.h index 59c77074a1..f33e3db5f4 100644 --- a/mindspore/ccsrc/mindrecord/include/shard_operator.h +++ b/mindspore/ccsrc/mindrecord/include/shard_operator.h @@ -17,6 +17,7 @@ #ifndef MINDRECORD_INCLUDE_SHARD_OPERATOR_H_ #define MINDRECORD_INCLUDE_SHARD_OPERATOR_H_ +#include #include "mindrecord/include/shard_task.h" namespace mindspore { @@ -37,6 +38,14 @@ class ShardOperator { } return SUCCESS; } + virtual bool HasChildOp() { return child_op_ != nullptr; } + + virtual MSRStatus SetChildOp(std::shared_ptr child_op) { + if (child_op != nullptr) child_op_ = child_op; + return SUCCESS; + } + + virtual std::shared_ptr GetChildOp() { return child_op_; } virtual MSRStatus PreExecute(ShardTask &tasks) { return SUCCESS; } @@ -44,7 +53,10 @@ class ShardOperator { virtual MSRStatus SufExecute(ShardTask &tasks) { return SUCCESS; } - virtual int64_t GetNumSamples(int64_t dataset_size, int64_t num_classes) { return -1; } + virtual int64_t GetNumSamples(int64_t dataset_size, int64_t num_classes) { return 0; } + + private: + std::shared_ptr child_op_ = nullptr; }; } // namespace mindrecord } // namespace mindspore diff --git a/mindspore/ccsrc/mindrecord/include/shard_reader.h b/mindspore/ccsrc/mindrecord/include/shard_reader.h index 8db7761fb8..1f2138d6d5 100644 --- a/mindspore/ccsrc/mindrecord/include/shard_reader.h +++ b/mindspore/ccsrc/mindrecord/include/shard_reader.h @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -44,6 +45,7 @@ #include "mindrecord/include/common/shard_utils.h" #include "mindrecord/include/shard_category.h" #include "mindrecord/include/shard_column.h" +#include "mindrecord/include/shard_distributed_sample.h" #include "mindrecord/include/shard_error.h" #include "mindrecord/include/shard_index_generator.h" #include "mindrecord/include/shard_operator.h" @@ -58,7 +60,8 @@ using ROW_GROUPS = std::tuple>>, std::vector>>; using ROW_GROUP_BRIEF = std::tuple>, std::vector>; -using 
TASK_RETURN_CONTENT = std::pair, json>>>; +using TASK_RETURN_CONTENT = + std::pair, json>>>>; const int kNumBatchInMap = 1000; // iterator buffer size in row-reader mode const int kNumPageInBuffer = 16; // page buffer size in block-reader mode @@ -78,7 +81,8 @@ class ShardReader { /// \return MSRStatus the status of MSRStatus MSRStatus Open(const std::vector &file_paths, bool load_dataset, int n_consumer = 4, const std::vector &selected_columns = {}, - const std::vector> &operators = {}, const bool &block_reader = false); + const std::vector> &operators = {}, const bool &block_reader = false, + const int num_padded = 0); /// \brief open files and initialize reader, python API /// \param[in] file_paths the path of ONE file, any file in dataset is fine or file list @@ -127,7 +131,7 @@ class ShardReader { /// \param[out] count # of rows /// \return MSRStatus the status of MSRStatus MSRStatus CountTotalRows(const std::vector &file_paths, bool load_dataset, - const std::shared_ptr &op, int64_t *count); + const std::shared_ptr &op, int64_t *count, const int num_padded); /// \brief shuffle task with incremental seed /// \return void @@ -182,7 +186,8 @@ class ShardReader { /// \brief return a row by id /// \return a batch of images and image data - std::vector, json>> GetNextById(const int64_t &task_id, const int32_t &consumer_id); + std::pair, json>>> GetNextById(const int64_t &task_id, + const int32_t &consumer_id); /// \brief return a batch in block-reader mode, given that one is ready /// \return a batch of images and image data @@ -330,6 +335,8 @@ class ShardReader { bool all_in_index_ = true; // if all columns are stored in index-table bool interrupt_ = false; // reader interrupted + int num_padded_; // number of padding samples + // Delivery/Iterator mode begin const std::string kThreadName = "THRD_ITER_"; // prefix of thread name std::vector thread_set_; // thread list diff --git a/mindspore/ccsrc/mindrecord/include/shard_sample.h 
b/mindspore/ccsrc/mindrecord/include/shard_sample.h index 7905f328f9..a32acbff6e 100644 --- a/mindspore/ccsrc/mindrecord/include/shard_sample.h +++ b/mindspore/ccsrc/mindrecord/include/shard_sample.h @@ -38,22 +38,22 @@ class ShardSample : public ShardOperator { ~ShardSample() override{}; - const std::pair GetPartitions() const; - MSRStatus Execute(ShardTask &tasks) override; MSRStatus SufExecute(ShardTask &tasks) override; int64_t GetNumSamples(int64_t dataset_size, int64_t num_classes) override; - private: + protected: int numerator_; int denominator_; - int no_of_samples_; int partition_id_; + int no_of_samples_; + std::shared_ptr shuffle_op_; + + private: std::vector indices_; SamplerType sampler_type_; - std::shared_ptr shuffle_op_; }; } // namespace mindrecord } // namespace mindspore diff --git a/mindspore/ccsrc/mindrecord/include/shard_sequential_sample.h b/mindspore/ccsrc/mindrecord/include/shard_sequential_sample.h new file mode 100644 index 0000000000..a8ee3a36db --- /dev/null +++ b/mindspore/ccsrc/mindrecord/include/shard_sequential_sample.h @@ -0,0 +1,48 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDRECORD_INCLUDE_SHARD_SEQUENTIAL_SAMPLE_H_ +#define MINDRECORD_INCLUDE_SHARD_SEQUENTIAL_SAMPLE_H_ + +#include +#include +#include +#include +#include "mindrecord/include/shard_sample.h" + +namespace mindspore { +namespace mindrecord { +class ShardSequentialSample : public ShardSample { + public: + ShardSequentialSample(int n, int offset); + + ShardSequentialSample(float per, float per_offset); + + ~ShardSequentialSample() override{}; + + MSRStatus Execute(ShardTask &tasks) override; + + int64_t GetNumSamples(int64_t dataset_size, int64_t num_classes) override; + + private: + int offset_; + float per_; + float per_offset_; +}; +} // namespace mindrecord +} // namespace mindspore + +#endif // MINDRECORD_INCLUDE_SHARD_SEQUENTIAL_SAMPLE_H_ diff --git a/mindspore/ccsrc/mindrecord/include/shard_shuffle.h b/mindspore/ccsrc/mindrecord/include/shard_shuffle.h index a9c54e6239..adb172bdcc 100644 --- a/mindspore/ccsrc/mindrecord/include/shard_shuffle.h +++ b/mindspore/ccsrc/mindrecord/include/shard_shuffle.h @@ -26,12 +26,20 @@ class ShardShuffle : public ShardOperator { public: explicit ShardShuffle(uint32_t seed = 0, ShuffleType shuffle_type = kShuffleCategory); + ShardShuffle(uint32_t seed, int64_t no_of_samples, bool replacement, bool reshuffle_each_epoch, + ShuffleType shuffle_type = kShuffleSample); + ~ShardShuffle() override{}; MSRStatus Execute(ShardTask &tasks) override; + int64_t GetNumSamples(int64_t dataset_size, int64_t num_classes) override; + private: uint32_t shuffle_seed_; + int64_t no_of_samples_; + bool replacement_; + bool reshuffle_each_epoch_; ShuffleType shuffle_type_; }; } // namespace mindrecord diff --git a/mindspore/ccsrc/mindrecord/include/shard_task.h b/mindspore/ccsrc/mindrecord/include/shard_task.h index d48c25c9cd..4a12eb9e45 100644 --- a/mindspore/ccsrc/mindrecord/include/shard_task.h +++ b/mindspore/ccsrc/mindrecord/include/shard_task.h @@ -17,6 +17,7 @@ #ifndef MINDRECORD_INCLUDE_SHARD_TASK_H_ #define 
MINDRECORD_INCLUDE_SHARD_TASK_H_ +#include #include #include #include @@ -27,11 +28,20 @@ namespace mindspore { namespace mindrecord { class ShardTask { public: + ShardTask(); + + ShardTask(const ShardTask &task); // copy construction + + ShardTask &operator=(const ShardTask &task); // assignment operator + + ~ShardTask() = default; + void MakePerm(); - void InsertTask(int shard_id, int group_id, const std::vector &offset, const json &label); + void InsertTask(TaskType task_type, int shard_id, int group_id, const std::vector &offset, + const json &label); - void InsertTask(std::tuple, std::vector, json> task); + void InsertTask(std::tuple, std::vector, json> task); void PopBack(); @@ -39,16 +49,17 @@ class ShardTask { uint32_t SizeOfRows() const; - std::tuple, std::vector, json> &GetTaskByID(size_t id); + std::tuple, std::vector, json> &GetTaskByID(size_t id); - std::tuple, std::vector, json> &GetRandomTask(); + std::tuple, std::vector, json> &GetRandomTask(); static ShardTask Combine(std::vector &category_tasks, bool replacement, int64_t num_elements); - uint32_t categories = 1; + uint32_t categories; - std::vector, std::vector, json>> task_list_; std::vector permutation_; + + std::vector, std::vector, json>> task_list_; }; } // namespace mindrecord } // namespace mindspore diff --git a/mindspore/ccsrc/mindrecord/io/shard_index_generator.cc b/mindspore/ccsrc/mindrecord/io/shard_index_generator.cc index 905968e3a2..16c730bd4c 100644 --- a/mindspore/ccsrc/mindrecord/io/shard_index_generator.cc +++ b/mindspore/ccsrc/mindrecord/io/shard_index_generator.cc @@ -335,15 +335,15 @@ MSRStatus ShardIndexGenerator::BindParameterExecuteSQL( int index = sqlite3_bind_parameter_index(stmt, common::SafeCStr(place_holder)); if (field_type == "INTEGER") { - if (sqlite3_bind_int(stmt, index, std::stoi(field_value)) != SQLITE_OK) { + if (sqlite3_bind_int64(stmt, index, std::stoll(field_value)) != SQLITE_OK) { MS_LOG(ERROR) << "SQL error: could not bind parameter, index: " << index - 
<< ", field value: " << std::stoi(field_value); + << ", field value: " << std::stoll(field_value); return FAILED; } } else if (field_type == "NUMERIC") { - if (sqlite3_bind_double(stmt, index, std::stod(field_value)) != SQLITE_OK) { + if (sqlite3_bind_double(stmt, index, std::stold(field_value)) != SQLITE_OK) { MS_LOG(ERROR) << "SQL error: could not bind parameter, index: " << index - << ", field value: " << std::stoi(field_value); + << ", field value: " << std::stold(field_value); return FAILED; } } else if (field_type == "NULL") { @@ -514,7 +514,7 @@ INDEX_FIELDS ShardIndexGenerator::GenerateIndexFields(const std::vector &s return {SUCCESS, std::move(fields)}; } -MSRStatus ShardIndexGenerator::ExecuteTransaction(const int &shard_no, const std::pair &db, +MSRStatus ShardIndexGenerator::ExecuteTransaction(const int &shard_no, std::pair &db, const std::vector &raw_page_ids, const std::map &blob_id_to_page_id) { // Add index data to database @@ -556,6 +556,7 @@ MSRStatus ShardIndexGenerator::ExecuteTransaction(const int &shard_no, const std MS_LOG(ERROR) << "Close database failed"; return FAILED; } + db.second = nullptr; return SUCCESS; } diff --git a/mindspore/ccsrc/mindrecord/io/shard_reader.cc b/mindspore/ccsrc/mindrecord/io/shard_reader.cc index fcb588fff8..99fa0c447d 100644 --- a/mindspore/ccsrc/mindrecord/io/shard_reader.cc +++ b/mindspore/ccsrc/mindrecord/io/shard_reader.cc @@ -14,6 +14,7 @@ * limitations under the License. 
*/ +#include "mindrecord/include/shard_distributed_sample.h" #include "mindrecord/include/shard_reader.h" #include "common/utils.h" @@ -45,6 +46,7 @@ ShardReader::ShardReader() { row_id_ = 0; num_blocks_ = 0; block_reader_ = false; + num_padded_ = 0; } std::pair> ShardReader::GetMeta(const std::string &file_path, json &meta_data) { @@ -113,6 +115,7 @@ MSRStatus ShardReader::Init(const std::vector &file_paths, bool loa MS_LOG(ERROR) << "Error in select statement, sql: " << sql << ", error: " << errmsg; sqlite3_free(errmsg); sqlite3_close(db); + db = nullptr; return FAILED; } else { MS_LOG(DEBUG) << "Get " << static_cast(name.size()) << " records from index."; @@ -121,6 +124,7 @@ MSRStatus ShardReader::Init(const std::vector &file_paths, bool loa MS_LOG(ERROR) << "DB file can not match file " << file; sqlite3_free(errmsg); sqlite3_close(db); + db = nullptr; return FAILED; } } @@ -218,7 +222,11 @@ void ShardReader::FileStreamsOperator() { } for (int i = static_cast(database_paths_.size()) - 1; i >= 0; --i) { if (database_paths_[i] != nullptr) { - (void)sqlite3_close(database_paths_[i]); + auto ret = sqlite3_close(database_paths_[i]); + if (ret != SQLITE_OK) { + MS_LOG(ERROR) << "Close db failed. 
Error code: " << ret << "."; + } + database_paths_[i] = nullptr; } } } @@ -346,6 +354,7 @@ MSRStatus ShardReader::ReadAllRowsInShard(int shard_id, const std::string &sql, MS_LOG(ERROR) << "Error in select statement, sql: " << sql << ", error: " << errmsg; sqlite3_free(errmsg); sqlite3_close(db); + db = nullptr; return FAILED; } MS_LOG(INFO) << "Get " << static_cast(labels.size()) << " records from shard " << shard_id << " index."; @@ -399,6 +408,7 @@ void ShardReader::GetClassesInShard(sqlite3 *db, int shard_id, const std::string if (ret != SQLITE_OK) { sqlite3_free(errmsg); sqlite3_close(db); + db = nullptr; MS_LOG(ERROR) << "Error in select sql statement, sql:" << common::SafeCStr(sql) << ", error: " << errmsg; return; } @@ -523,6 +533,7 @@ std::vector> ShardReader::GetImageOffset(int page_id, int MS_LOG(ERROR) << "Error in select statement, sql: " << sql << ", error: " << errmsg; sqlite3_free(errmsg); sqlite3_close(db); + db = nullptr; return std::vector>(); } else { MS_LOG(DEBUG) << "Get " << static_cast(image_offsets.size()) << "records from index."; @@ -662,6 +673,7 @@ std::pair> ShardReader::GetLabelsFromPage( MS_LOG(ERROR) << "Error in select statement, sql: " << sql << ", error: " << errmsg; sqlite3_free(errmsg); sqlite3_close(db); + db = nullptr; return {FAILED, {}}; } MS_LOG(DEBUG) << "Get " << label_offsets.size() << "records from index."; @@ -698,6 +710,7 @@ std::pair> ShardReader::GetLabels(int page_id, int MS_LOG(ERROR) << "Error in select statement, sql: " << sql << ", error: " << errmsg; sqlite3_free(errmsg); sqlite3_close(db); + db = nullptr; return {FAILED, {}}; } else { MS_LOG(DEBUG) << "Get " << static_cast(labels.size()) << "records from index."; @@ -790,23 +803,51 @@ int64_t ShardReader::GetNumClasses(const std::string &category_field) { } MSRStatus ShardReader::CountTotalRows(const std::vector &file_paths, bool load_dataset, - const std::shared_ptr &op, int64_t *count) { + const std::shared_ptr &ops, int64_t *count, const int num_padded) { 
if (SUCCESS != Init(file_paths, load_dataset)) { return FAILED; } int64_t num_samples = num_rows_; - if (std::dynamic_pointer_cast(op)) { - auto category_op = std::dynamic_pointer_cast(op); - std::string category_field = category_op->GetCategoryField(); - auto num_classes = GetNumClasses(category_field); - num_samples = category_op->GetNumSamples(num_rows_, num_classes); - } else if (std::dynamic_pointer_cast(op)) { - num_samples = op->GetNumSamples(num_rows_, 0); - } else { - } - if (-1 == num_samples) { - MS_LOG(ERROR) << "Failed to get dataset size."; - return FAILED; + bool root = true; + std::stack> stack_ops; + std::shared_ptr op(ops); + while (op != nullptr) { + stack_ops.push(op); + op = op->GetChildOp(); + } + while (!stack_ops.empty()) { + op = stack_ops.top(); + stack_ops.pop(); + if (std::dynamic_pointer_cast(op)) { + num_samples = op->GetNumSamples(num_samples, 0); + if (num_padded > 0 && root == true) { + num_samples += num_padded; + MS_LOG(DEBUG) << "Padding samples work on shuffle sampler."; + root = false; + } + } else if (std::dynamic_pointer_cast(op)) { + auto category_op = std::dynamic_pointer_cast(op); + std::string category_field = category_op->GetCategoryField(); + auto num_classes = GetNumClasses(category_field); + num_samples = category_op->GetNumSamples(num_samples, num_classes); + } else if (std::dynamic_pointer_cast(op)) { + if (std::dynamic_pointer_cast(op)) { + auto sampler_op = std::dynamic_pointer_cast(op); + if (root == true) { + sampler_op->SetNumPaddedSamples(num_padded); + num_samples = op->GetNumSamples(num_samples, 0); + if (-1 == num_samples) { + MS_LOG(ERROR) << "Dataset size plus number of padded samples is not divisible by number of shards."; + return FAILED; + } + root = false; + } + } else { + num_samples = op->GetNumSamples(num_samples, 0); + } + } else { + if (num_padded > 0) num_samples += num_padded; + } } *count = num_samples; return SUCCESS; @@ -814,7 +855,8 @@ MSRStatus ShardReader::CountTotalRows(const std::vector 
&file_paths MSRStatus ShardReader::Open(const std::vector &file_paths, bool load_dataset, int n_consumer, const std::vector &selected_columns, - const std::vector> &operators, const bool &block_reader) { + const std::vector> &operators, const bool &block_reader, + int num_padded) { // Open file and set header by ShardReader auto ret = Init(file_paths, load_dataset); if (SUCCESS != ret) { @@ -844,6 +886,7 @@ MSRStatus ShardReader::Open(const std::vector &file_paths, bool loa // Initialize argument shard_count_ = static_cast(file_paths_.size()); n_consumer_ = n_consumer; + num_padded_ = num_padded; operators_ = operators; @@ -935,7 +978,7 @@ MSRStatus ShardReader::CreateTasksByBlock(const std::vector(rg); auto group_id = std::get<1>(rg); auto n_Rows = std::get<3>(rg); - tasks_.InsertTask(shard_id, group_id, std::vector{n_Rows}, json{}); + tasks_.InsertTask(TaskType::kCommonTask, shard_id, group_id, std::vector{n_Rows}, json{}); } return SUCCESS; } @@ -986,7 +1029,7 @@ MSRStatus ShardReader::CreateTasksByCategory(const std::vector(details)[iStart], + categoryTasks[categoryNo].InsertTask(TaskType::kCommonTask, shard_id, group_id, std::get<4>(details)[iStart], std::get<5>(details)[iStart]); category_index++; } @@ -1014,7 +1057,7 @@ MSRStatus ShardReader::CreateTasksByRow(const std::vector{offsets[shard_id][i][2], offsets[shard_id][i][3]}, local_columns[shard_id][i]); } @@ -1044,6 +1087,11 @@ MSRStatus ShardReader::CreateTasks(const std::vector 0) { + for (int i = 0; i < num_padded_; ++i) { + tasks_.InsertTask(TaskType::kPaddedTask, 0, 0, {}, json()); + } + } } else { if (SUCCESS != CreateTasksByCategory(row_group_summary, operators[category_operator])) { return FAILED; @@ -1070,18 +1118,27 @@ MSRStatus ShardReader::CreateTasks(const std::vector= static_cast(tasks_.Size())) { - return std::make_pair(FAILED, std::vector, json>>()); + return std::make_pair(FAILED, + std::make_pair(TaskType::kCommonTask, std::vector, json>>())); } // Pick up task from task list auto task = 
tasks_.GetTaskByID(tasks_.permutation_[task_id]); - auto shard_id = std::get<0>(std::get<0>(task)); - auto group_id = std::get<1>(std::get<0>(task)); - auto addr = std::get<1>(task); + // check task type + auto task_type = std::get<0>(task); + if (task_type == TaskType::kPaddedTask) { + return std::make_pair(SUCCESS, + std::make_pair(TaskType::kPaddedTask, std::vector, json>>())); + } + + auto shard_id = std::get<0>(std::get<1>(task)); + auto group_id = std::get<1>(std::get<1>(task)); + auto addr = std::get<2>(task); const auto &ret = shard_header_->GetPageByGroupId(group_id, shard_id); if (SUCCESS != ret.first) { - return std::make_pair(FAILED, std::vector, json>>()); + return std::make_pair(FAILED, + std::make_pair(TaskType::kCommonTask, std::vector, json>>())); } const std::shared_ptr &page = ret.second; @@ -1093,7 +1150,8 @@ TASK_RETURN_CONTENT ShardReader::ConsumerOneTask(int task_id, uint32_t consumer_ if (!io_seekg.good() || io_seekg.fail() || io_seekg.bad()) { MS_LOG(ERROR) << "File seekg failed"; file_streams_random_[consumer_id][shard_id]->close(); - return std::make_pair(FAILED, std::vector, json>>()); + return std::make_pair(FAILED, + std::make_pair(TaskType::kCommonTask, std::vector, json>>())); } auto &io_read = @@ -1101,14 +1159,15 @@ TASK_RETURN_CONTENT ShardReader::ConsumerOneTask(int task_id, uint32_t consumer_ if (!io_read.good() || io_read.fail() || io_read.bad()) { MS_LOG(ERROR) << "File read failed"; file_streams_random_[consumer_id][shard_id]->close(); - return std::make_pair(FAILED, std::vector, json>>()); + return std::make_pair(FAILED, + std::pair(TaskType::kCommonTask, std::vector, json>>())); } // Deliver batch data to output map std::vector, json>> batch; - batch.emplace_back(std::move(images), std::move(std::get<2>(task))); + batch.emplace_back(std::move(images), std::move(std::get<3>(task))); - return std::make_pair(SUCCESS, std::move(batch)); + return std::make_pair(SUCCESS, std::make_pair(TaskType::kCommonTask, std::move(batch))); } 
MSRStatus ShardReader::ConsumerByRow(int consumer_id) { @@ -1133,7 +1192,7 @@ MSRStatus ShardReader::ConsumerByRow(int consumer_id) { if (SUCCESS != ret.first) { return FAILED; } - const auto &batch = ret.second; + const auto &batch = (ret.second).second; // Hanging if maximum map size exceeded // otherwise, set batch data in map { @@ -1193,8 +1252,8 @@ MSRStatus ShardReader::ConsumerByBlock(int consumer_id) { // Pick up task from task list auto task = tasks_.GetTaskByID(tasks_.permutation_[task_id]); - auto shard_id = std::get<0>(std::get<0>(task)); - auto group_id = std::get<1>(std::get<0>(task)); + auto shard_id = std::get<0>(std::get<1>(task)); + auto group_id = std::get<1>(std::get<1>(task)); auto row_group_brief = ReadRowGroupBrief(group_id, shard_id, selected_columns_); if (SUCCESS != std::get<0>(row_group_brief)) { return FAILED; @@ -1302,17 +1361,17 @@ std::vector, json>> ShardReader::GetNext() { return *res; } -std::vector, json>> ShardReader::GetNextById(const int64_t &task_id, - const int32_t &consumer_id) { +std::pair, json>>> ShardReader::GetNextById( + const int64_t &task_id, const int32_t &consumer_id) { if (interrupt_) { - return std::vector, json>>(); + return std::make_pair(TaskType::kCommonTask, std::vector, json>>()); } if (block_reader_) { - return GetBlockNext(); + return std::make_pair(TaskType::kCommonTask, GetBlockNext()); } const auto &ret = ConsumerOneTask(task_id, consumer_id); if (SUCCESS != ret.first) { - return std::vector, json>>(); + return std::make_pair(TaskType::kCommonTask, std::vector, json>>()); } return std::move(ret.second); } @@ -1364,12 +1423,26 @@ void ShardReader::Reset() { } void ShardReader::ShuffleTask() { + if (block_reader_) return; + // exist shuffle and distributed sampler in ops, skip shuffle + bool has_sharding = false; for (const auto &op : operators_) { - if (block_reader_ || !std::dynamic_pointer_cast(op)) continue; - if (SUCCESS != (*op)(tasks_)) { - MS_LOG(WARNING) << "Reshuffle reader tasks failed."; + if 
(std::dynamic_pointer_cast(op)) { + has_sharding = true; } } + for (const auto &op : operators_) { + if (std::dynamic_pointer_cast(op) && has_sharding == false) { + if (SUCCESS != (*op)(tasks_)) { + MS_LOG(WARNING) << "Redo randomSampler failed."; + } + } else if (std::dynamic_pointer_cast(op)) { + if (SUCCESS != (*op)(tasks_)) { + MS_LOG(WARNING) << "Redo distributeSampler failed."; + } + } + } + if (tasks_.permutation_.empty()) tasks_.MakePerm(); } } // namespace mindrecord diff --git a/mindspore/ccsrc/mindrecord/io/shard_segment.cc b/mindspore/ccsrc/mindrecord/io/shard_segment.cc index 86c79ca05a..fb1120b178 100644 --- a/mindspore/ccsrc/mindrecord/io/shard_segment.cc +++ b/mindspore/ccsrc/mindrecord/io/shard_segment.cc @@ -43,6 +43,7 @@ std::pair> ShardSegment::GetCategoryFields() { MS_LOG(ERROR) << "Error in select statement, sql: " << sql << ", error: " << errmsg; sqlite3_free(errmsg); sqlite3_close(database_paths_[0]); + database_paths_[0] = nullptr; return {FAILED, vector{}}; } else { MS_LOG(INFO) << "Get " << static_cast(field_names.size()) << " records from index."; @@ -53,6 +54,7 @@ std::pair> ShardSegment::GetCategoryFields() { if (field_names[idx].size() < 2) { sqlite3_free(errmsg); sqlite3_close(database_paths_[0]); + database_paths_[0] = nullptr; return {FAILED, vector{}}; } candidate_category_fields_.push_back(field_names[idx][1]); @@ -107,6 +109,7 @@ std::pair>> ShardSegmen MS_LOG(ERROR) << "Error in select statement, sql: " << sql << ", error: " << errmsg; sqlite3_free(errmsg); sqlite3_close(db); + db = nullptr; return {FAILED, std::vector>()}; } else { MS_LOG(INFO) << "Get " << static_cast(field_count.size()) << " records from index."; diff --git a/mindspore/ccsrc/mindrecord/io/shard_writer.cc b/mindspore/ccsrc/mindrecord/io/shard_writer.cc index 9756b475e5..913caab550 100644 --- a/mindspore/ccsrc/mindrecord/io/shard_writer.cc +++ b/mindspore/ccsrc/mindrecord/io/shard_writer.cc @@ -90,7 +90,7 @@ MSRStatus ShardWriter::OpenDataFiles(bool append) { 
fs->close(); // open the mindrecord file to write - fs->open(common::SafeCStr(file), std::ios::out | std::ios::binary); + fs->open(common::SafeCStr(file), std::ios::out | std::ios::in | std::ios::binary | std::ios::trunc); if (!fs->good()) { MS_LOG(ERROR) << "MindRecord file could not opened."; return FAILED; diff --git a/mindspore/ccsrc/mindrecord/meta/shard_category.cc b/mindspore/ccsrc/mindrecord/meta/shard_category.cc index dfca92a08c..bd427a330a 100644 --- a/mindspore/ccsrc/mindrecord/meta/shard_category.cc +++ b/mindspore/ccsrc/mindrecord/meta/shard_category.cc @@ -41,7 +41,7 @@ int64_t ShardCategory::GetNumSamples(int64_t dataset_size, int64_t num_classes) if (dataset_size > 0 && num_classes > 0 && num_categories_ > 0 && num_elements_ > 0) { return std::min(num_categories_, num_classes) * num_elements_; } - return -1; + return 0; } } // namespace mindrecord } // namespace mindspore diff --git a/mindspore/ccsrc/mindrecord/meta/shard_column.cc b/mindspore/ccsrc/mindrecord/meta/shard_column.cc index 86ad0c96d7..28dc243e17 100644 --- a/mindspore/ccsrc/mindrecord/meta/shard_column.cc +++ b/mindspore/ccsrc/mindrecord/meta/shard_column.cc @@ -66,9 +66,28 @@ ShardColumn::ShardColumn(const std::shared_ptr &shard_header, bool num_blob_column_ = blob_column_.size(); } +std::pair ShardColumn::GetColumnTypeByName(const std::string &column_name, + ColumnDataType *column_data_type, + uint64_t *column_data_type_size, + std::vector *column_shape) { + // Skip if column not found + auto column_category = CheckColumnName(column_name); + if (column_category == ColumnNotFound) { + return {FAILED, ColumnNotFound}; + } + + // Get data type and size + auto column_id = column_name_id_[column_name]; + *column_data_type = column_data_type_[column_id]; + *column_data_type_size = ColumnDataTypeSize[*column_data_type]; + *column_shape = column_shape_[column_id]; + + return {SUCCESS, column_category}; +} + MSRStatus ShardColumn::GetColumnValueByName(const std::string &column_name, const 
std::vector &columns_blob, const json &columns_json, const unsigned char **data, - std::unique_ptr *data_ptr, uint64_t *n_bytes, + std::unique_ptr *data_ptr, uint64_t *const n_bytes, ColumnDataType *column_data_type, uint64_t *column_data_type_size, std::vector *column_shape) { // Skip if column not found @@ -231,7 +250,7 @@ MSRStatus ShardColumn::GetInt(std::unique_ptr *data_ptr, const MSRStatus ShardColumn::GetColumnFromBlob(const std::string &column_name, const std::vector &columns_blob, const unsigned char **data, std::unique_ptr *data_ptr, - uint64_t *n_bytes) { + uint64_t *const n_bytes) { uint64_t offset_address = 0; auto column_id = column_name_id_[column_name]; if (GetColumnAddressInBlock(column_id, columns_blob, n_bytes, &offset_address) == FAILED) { @@ -304,7 +323,7 @@ std::vector ShardColumn::CompressBlob(const std::vector &blob) } vector ShardColumn::CompressInt(const vector &src_bytes, const IntegerType &int_type) { - uint64_t i_size = kUnsignedOne << int_type; + uint64_t i_size = kUnsignedOne << static_cast(int_type); // Get number of elements uint64_t src_n_int = src_bytes.size() / i_size; // Calculate bitmap size (bytes) @@ -325,20 +344,20 @@ vector ShardColumn::CompressInt(const vector &src_bytes, const // Initialize destination data type IntegerType dst_int_type = kInt8Type; // Shift to next int position - uint64_t pos = i * (kUnsignedOne << int_type); + uint64_t pos = i * (kUnsignedOne << static_cast(int_type)); // Narrow down this int int64_t i_n = BytesLittleToMinIntType(src_bytes, pos, int_type, &dst_int_type); // Write this int to destination blob uint64_t u_n = *reinterpret_cast(&i_n); auto temp_bytes = UIntToBytesLittle(u_n, dst_int_type); - for (uint64_t j = 0; j < (kUnsignedOne << dst_int_type); j++) { + for (uint64_t j = 0; j < (kUnsignedOne << static_cast(dst_int_type)); j++) { dst_bytes[i_dst++] = temp_bytes[j]; } // Update date type in bit map dst_bytes[i / kNumDataOfByte + kBytesOfColumnLen] |= - (dst_int_type << (kDataTypeBits * 
(kNumDataOfByte - kUnsignedOne - (i % kNumDataOfByte)))); + (static_cast(dst_int_type) << (kDataTypeBits * (kNumDataOfByte - kUnsignedOne - (i % kNumDataOfByte)))); } // Resize destination blob dst_bytes.resize(i_dst); @@ -366,7 +385,7 @@ MSRStatus ShardColumn::GetColumnAddressInBlock(const uint64_t &column_id, const } template -MSRStatus ShardColumn::UncompressInt(const uint64_t &column_id, std::unique_ptr *data_ptr, +MSRStatus ShardColumn::UncompressInt(const uint64_t &column_id, std::unique_ptr *const data_ptr, const std::vector &columns_blob, uint64_t *num_bytes, uint64_t shift_idx) { auto num_elements = BytesBigToUInt64(columns_blob, shift_idx, kInt32Type); @@ -387,7 +406,10 @@ MSRStatus ShardColumn::UncompressInt(const uint64_t &column_id, std::unique_ptr< auto data = reinterpret_cast(array_data.get()); *data_ptr = std::make_unique(*num_bytes); - memcpy(data_ptr->get(), data, *num_bytes); + int ret_code = memcpy_s(data_ptr->get(), *num_bytes, data, *num_bytes); + if (ret_code != 0) { + MS_LOG(ERROR) << "Failed to copy data!"; + } return SUCCESS; } @@ -395,14 +417,14 @@ MSRStatus ShardColumn::UncompressInt(const uint64_t &column_id, std::unique_ptr< uint64_t ShardColumn::BytesBigToUInt64(const std::vector &bytes_array, const uint64_t &pos, const IntegerType &i_type) { uint64_t result = 0; - for (uint64_t i = 0; i < (kUnsignedOne << i_type); i++) { + for (uint64_t i = 0; i < (kUnsignedOne << static_cast(i_type)); i++) { result = (result << kBitsOfByte) + bytes_array[pos + i]; } return result; } std::vector ShardColumn::UIntToBytesBig(uint64_t value, const IntegerType &i_type) { - uint64_t n_bytes = kUnsignedOne << i_type; + uint64_t n_bytes = kUnsignedOne << static_cast(i_type); std::vector result(n_bytes, 0); for (uint64_t i = 0; i < n_bytes; i++) { result[n_bytes - 1 - i] = value & std::numeric_limits::max(); @@ -412,7 +434,7 @@ std::vector ShardColumn::UIntToBytesBig(uint64_t value, const IntegerTy } std::vector ShardColumn::UIntToBytesLittle(uint64_t value, 
const IntegerType &i_type) { - uint64_t n_bytes = kUnsignedOne << i_type; + uint64_t n_bytes = kUnsignedOne << static_cast(i_type); std::vector result(n_bytes, 0); for (uint64_t i = 0; i < n_bytes; i++) { result[i] = value & std::numeric_limits::max(); @@ -424,8 +446,9 @@ std::vector ShardColumn::UIntToBytesLittle(uint64_t value, const Intege int64_t ShardColumn::BytesLittleToMinIntType(const std::vector &bytes_array, const uint64_t &pos, const IntegerType &src_i_type, IntegerType *dst_i_type) { uint64_t u_temp = 0; - for (uint64_t i = 0; i < (kUnsignedOne << src_i_type); i++) { - u_temp = (u_temp << kBitsOfByte) + bytes_array[pos + (kUnsignedOne << src_i_type) - kUnsignedOne - i]; + for (uint64_t i = 0; i < (kUnsignedOne << static_cast(src_i_type)); i++) { + u_temp = (u_temp << kBitsOfByte) + + bytes_array[pos + (kUnsignedOne << static_cast(src_i_type)) - kUnsignedOne - i]; } int64_t i_out; diff --git a/mindspore/ccsrc/mindrecord/meta/shard_distributed_sample.cc b/mindspore/ccsrc/mindrecord/meta/shard_distributed_sample.cc new file mode 100644 index 0000000000..b7e890da7c --- /dev/null +++ b/mindspore/ccsrc/mindrecord/meta/shard_distributed_sample.cc @@ -0,0 +1,79 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "mindrecord/include/shard_distributed_sample.h" + +using mindspore::LogStream; +using mindspore::ExceptionType::NoExceptionType; +using mindspore::MsLogLevel::ERROR; + +namespace mindspore { +namespace mindrecord { +ShardDistributedSample::ShardDistributedSample(int num_shards, int shard_id, int no_of_padded_samples, bool shuffle, + uint32_t seed) + : ShardSample(1, num_shards, shard_id), + shuffle_(shuffle), + no_of_padded_samples_(no_of_padded_samples), + first_epoch_(true) { + shuffle_op_ = std::make_shared(seed, kShuffleSample); +} + +ShardDistributedSample::ShardDistributedSample(int num_shards, int shard_id, bool shuffle, uint32_t seed) + : ShardDistributedSample(num_shards, shard_id, 0, shuffle, seed) {} + +int64_t ShardDistributedSample::GetNumSamples(int64_t dataset_size, int64_t num_classes) { + if (no_of_padded_samples_ <= 0) { + if (dataset_size % denominator_ == 0) { + return dataset_size / denominator_ * numerator_; + } else { + return dataset_size / denominator_ * numerator_ + 1; + } + } else { + auto padded_size = dataset_size + no_of_padded_samples_; + if (padded_size % denominator_ == 0) { + return padded_size / denominator_ * numerator_; + } else { + return -1; + } + } + return 0; +} + +MSRStatus ShardDistributedSample::PreExecute(ShardTask &tasks) { + auto total_no = tasks.Size(); + if (no_of_padded_samples_ > 0 && first_epoch_) { + if (total_no % denominator_ != 0) { + MS_LOG(ERROR) << "Dataset size plus number of padded samples is not divisible by number of shards. 
" + << "task size: " << total_no << ", number padded: " << no_of_padded_samples_ + << ", denominator: " << denominator_; + return FAILED; + } + } + if (first_epoch_) { + first_epoch_ = false; + task_ = tasks; + } else { + tasks = task_; + } + if (shuffle_ == true) { + if (SUCCESS != (*shuffle_op_)(tasks)) { + return FAILED; + } + } + return SUCCESS; +} +} // namespace mindrecord +} // namespace mindspore diff --git a/mindspore/ccsrc/mindrecord/meta/shard_sample.cc b/mindspore/ccsrc/mindrecord/meta/shard_sample.cc index d7842a11a3..c207747194 100644 --- a/mindspore/ccsrc/mindrecord/meta/shard_sample.cc +++ b/mindspore/ccsrc/mindrecord/meta/shard_sample.cc @@ -25,32 +25,32 @@ namespace mindrecord { ShardSample::ShardSample(int n) : numerator_(0), denominator_(0), - no_of_samples_(n), partition_id_(0), + no_of_samples_(n), indices_({}), sampler_type_(kCustomTopNSampler) {} ShardSample::ShardSample(int num, int den) : numerator_(num), denominator_(den), - no_of_samples_(0), partition_id_(0), + no_of_samples_(0), indices_({}), sampler_type_(kCustomTopPercentSampler) {} ShardSample::ShardSample(int num, int den, int par) : numerator_(num), denominator_(den), - no_of_samples_(0), partition_id_(par), + no_of_samples_(0), indices_({}), sampler_type_(kCustomTopPercentSampler) {} ShardSample::ShardSample(const std::vector &indices, uint32_t seed) : numerator_(0), denominator_(0), - no_of_samples_(0), partition_id_(0), + no_of_samples_(0), indices_(indices), sampler_type_(kSubsetRandomSampler) { shuffle_op_ = std::make_shared(seed); @@ -71,19 +71,12 @@ int64_t ShardSample::GetNumSamples(int64_t dataset_size, int64_t num_classes) { if (sampler_type_ == kSubsetRandomSampler) { return indices_.size(); } - return -1; -} - -const std::pair ShardSample::GetPartitions() const { - if (numerator_ == 1 && denominator_ > 1) { - return std::pair(denominator_, partition_id_); - } - return std::pair(-1, -1); + return 0; } MSRStatus ShardSample::Execute(ShardTask &tasks) { int 
no_of_categories = static_cast(tasks.categories); - int total_no = static_cast(tasks.Size()); + int total_no = static_cast(tasks.Size()); // make sure task_size int taking = 0; if (sampler_type_ == kCustomTopNSampler) { // non sharding case constructor #1 @@ -97,7 +90,7 @@ MSRStatus ShardSample::Execute(ShardTask &tasks) { } else { // constructor TopPercent if (numerator_ > 0 && denominator_ > 0 && numerator_ <= denominator_) { if (numerator_ == 1 && denominator_ > 1) { // sharding - taking = (total_no / denominator_) + (total_no % denominator_ == 0 ? 0 : 1); + taking = (total_no + denominator_ - 1) / denominator_; } else { // non sharding taking = total_no * numerator_ / denominator_; taking -= (taking % no_of_categories); diff --git a/mindspore/ccsrc/mindrecord/meta/shard_sequential_sample.cc b/mindspore/ccsrc/mindrecord/meta/shard_sequential_sample.cc new file mode 100644 index 0000000000..a7fa4e7343 --- /dev/null +++ b/mindspore/ccsrc/mindrecord/meta/shard_sequential_sample.cc @@ -0,0 +1,74 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "mindrecord/include/shard_sequential_sample.h" + +using mindspore::LogStream; +using mindspore::ExceptionType::NoExceptionType; +using mindspore::MsLogLevel::ERROR; + +namespace mindspore { +namespace mindrecord { +ShardSequentialSample::ShardSequentialSample(int n, int offset) + : ShardSample(n), offset_(offset), per_(0.0f), per_offset_(0.0f) {} + +ShardSequentialSample::ShardSequentialSample(float per, float per_offset) + : ShardSample(0), offset_(0), per_(per), per_offset_(per_offset) {} + +int64_t ShardSequentialSample::GetNumSamples(int64_t dataset_size, int64_t num_classes) { + if (no_of_samples_ == 0 && (per_ >= -kEpsilon && per_ <= kEpsilon)) { + return dataset_size; + } + if (per_ > kEpsilon && per_ <= 1.0f) { + return dataset_size * kEpsilon; + } + return no_of_samples_; +} + +MSRStatus ShardSequentialSample::Execute(ShardTask &tasks) { + int total_no = static_cast(tasks.Size()); + int taking; + if (no_of_samples_ == 0 && (per_ >= -kEpsilon && per_ <= kEpsilon)) { + taking = total_no; + } else if (per_ > kEpsilon && per_ <= 1.0f) { + taking = total_no * kEpsilon; + } else { + taking = no_of_samples_; + } + + if (tasks.permutation_.empty()) { + ShardTask new_tasks; + total_no = static_cast(tasks.Size()); + for (int i = offset_; i < taking + offset_; ++i) { + new_tasks.InsertTask(tasks.GetTaskByID(i % total_no)); + } + std::swap(tasks, new_tasks); + } else { // shuffled + ShardTask new_tasks; + if (taking > static_cast(tasks.permutation_.size())) { + return FAILED; + } + total_no = static_cast(tasks.permutation_.size()); + for (size_t i = offset_; i < taking + offset_; ++i) { + new_tasks.InsertTask(tasks.GetTaskByID(tasks.permutation_[i % total_no])); + } + std::swap(tasks, new_tasks); + } + return SUCCESS; +} + +} // namespace mindrecord +} // namespace mindspore diff --git a/mindspore/ccsrc/mindrecord/meta/shard_shuffle.cc b/mindspore/ccsrc/mindrecord/meta/shard_shuffle.cc index d33400ef38..5cf49b04f0 100644 --- 
a/mindspore/ccsrc/mindrecord/meta/shard_shuffle.cc +++ b/mindspore/ccsrc/mindrecord/meta/shard_shuffle.cc @@ -21,17 +21,53 @@ namespace mindspore { namespace mindrecord { ShardShuffle::ShardShuffle(uint32_t seed, ShuffleType shuffle_type) - : shuffle_seed_(seed), shuffle_type_(shuffle_type) {} + : shuffle_seed_(seed), + no_of_samples_(0), + replacement_(false), + reshuffle_each_epoch_(true), + shuffle_type_(shuffle_type) {} + +ShardShuffle::ShardShuffle(uint32_t seed, int64_t no_of_samples, bool replacement, bool reshuffle_each_epoch, + ShuffleType shuffle_type) + : shuffle_seed_(seed), + no_of_samples_(no_of_samples), + replacement_(replacement), + reshuffle_each_epoch_(reshuffle_each_epoch), + shuffle_type_(shuffle_type) {} + +int64_t ShardShuffle::GetNumSamples(int64_t dataset_size, int64_t num_classes) { + if (replacement_) { + return no_of_samples_ == 0 ? dataset_size : no_of_samples_; + } + return dataset_size; +} MSRStatus ShardShuffle::Execute(ShardTask &tasks) { + if (reshuffle_each_epoch_) shuffle_seed_++; if (tasks.categories < 1) { return FAILED; } - if (shuffle_type_ == kShuffleSample) { + if (shuffle_type_ == kShuffleSample) { // shuffle each sample if (tasks.permutation_.empty() == true) { tasks.MakePerm(); } - std::shuffle(tasks.permutation_.begin(), tasks.permutation_.end(), std::default_random_engine(shuffle_seed_)); + if (replacement_ == true) { + ShardTask new_tasks; + if (no_of_samples_ == 0) { + no_of_samples_ = static_cast(tasks.Size()); + } + if (no_of_samples_ <= 0) { + MS_LOG(ERROR) << "no_of_samples need to be positive."; + return FAILED; + } + new_tasks.task_list_.reserve(no_of_samples_); + for (uint32_t i = 0; i < no_of_samples_; ++i) { + new_tasks.InsertTask(tasks.GetRandomTask()); + } + std::swap(tasks, new_tasks); + } else { + std::shuffle(tasks.permutation_.begin(), tasks.permutation_.end(), std::default_random_engine(shuffle_seed_)); + } } else { // shuffle unit like: (a1, b1, c1),(a2, b2, c2),..., (an, bn, cn) uint32_t 
individual_size = tasks.Size() / tasks.categories; std::vector> new_permutations(tasks.categories, std::vector(individual_size)); @@ -46,7 +82,6 @@ MSRStatus ShardShuffle::Execute(ShardTask &tasks) { } } } - shuffle_seed_++; return SUCCESS; } } // namespace mindrecord diff --git a/mindspore/ccsrc/mindrecord/meta/shard_task.cc b/mindspore/ccsrc/mindrecord/meta/shard_task.cc index 3abc725a7b..8baa3c26cd 100644 --- a/mindspore/ccsrc/mindrecord/meta/shard_task.cc +++ b/mindspore/ccsrc/mindrecord/meta/shard_task.cc @@ -24,6 +24,19 @@ using mindspore::MsLogLevel::DEBUG; namespace mindspore { namespace mindrecord { +ShardTask::ShardTask() : categories(1) {} + +ShardTask::ShardTask(const ShardTask &other) + : categories(other.categories), permutation_(other.permutation_), task_list_(other.task_list_) {} + +ShardTask &ShardTask::operator=(const ShardTask &other) { + ShardTask tmp(other); + std::swap(categories, tmp.categories); + permutation_.swap(tmp.permutation_); + task_list_.swap(tmp.task_list_); + return *this; +} + void ShardTask::MakePerm() { permutation_ = std::vector(task_list_.size()); for (uint32_t i = 0; i < task_list_.size(); i++) { @@ -31,16 +44,18 @@ void ShardTask::MakePerm() { } } -void ShardTask::InsertTask(int shard_id, int group_id, const std::vector &offset, const json &label) { +void ShardTask::InsertTask(TaskType task_type, int shard_id, int group_id, const std::vector &offset, + const json &label) { MS_LOG(DEBUG) << "Into insert task, shard_id: " << shard_id << ", group_id: " << group_id << ", label: " << label.dump() << ", size of task_list_: " << task_list_.size() << "."; - task_list_.emplace_back(std::make_tuple(shard_id, group_id), offset, label); + task_list_.emplace_back(task_type, std::make_tuple(shard_id, group_id), offset, label); } -void ShardTask::InsertTask(std::tuple, std::vector, json> task) { - MS_LOG(DEBUG) << "Into insert task, shard_id: " << std::get<0>(std::get<0>(task)) - << ", group_id: " << std::get<1>(std::get<0>(task)) << ", 
label: " << std::get<2>(task).dump() +void ShardTask::InsertTask(std::tuple, std::vector, json> task) { + MS_LOG(DEBUG) << "Into insert task, shard_id: " << std::get<0>(std::get<1>(task)) + << ", group_id: " << std::get<1>(std::get<1>(task)) << ", label: " << std::get<3>(task).dump() << ", size of task_list_: " << task_list_.size() << "."; + task_list_.push_back(std::move(task)); } @@ -52,24 +67,25 @@ uint32_t ShardTask::SizeOfRows() const { if (task_list_.size() == 0) return static_cast(0); // 1 task is 1 page - auto sum_num_rows = [](int x, std::tuple, std::vector, json> y) { - return x + std::get<1>(y)[0]; + auto sum_num_rows = [](int x, std::tuple, std::vector, json> y) { + return x + std::get<2>(y)[0]; }; uint32_t nRows = std::accumulate(task_list_.begin(), task_list_.end(), 0, sum_num_rows); return nRows; } -std::tuple, std::vector, json> &ShardTask::GetTaskByID(size_t id) { +std::tuple, std::vector, json> &ShardTask::GetTaskByID(size_t id) { MS_ASSERT(id < task_list_.size()); return task_list_[id]; } -std::tuple, std::vector, json> &ShardTask::GetRandomTask() { +std::tuple, std::vector, json> &ShardTask::GetRandomTask() { std::random_device rd; std::mt19937 gen(rd()); std::uniform_int_distribution<> dis(0, task_list_.size() - 1); return task_list_[dis(gen)]; } + ShardTask ShardTask::Combine(std::vector &category_tasks, bool replacement, int64_t num_elements) { ShardTask res; if (category_tasks.empty()) return res; diff --git a/mindspore/ccsrc/minnie/tensor_minnie.h b/mindspore/ccsrc/minnie/tensor_minnie.h deleted file mode 100644 index 1d4ff705d2..0000000000 --- a/mindspore/ccsrc/minnie/tensor_minnie.h +++ /dev/null @@ -1,77 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef MINDSPORE_CCSRC_MINNIE_TENSOR_MINNIE_H_ -#define MINDSPORE_CCSRC_MINNIE_TENSOR_MINNIE_H_ - -#include - -#include "ir/meta_tensor.h" - -namespace mindspore { -namespace tensor { -// definition of Tensor Minnie -class TensorMinnie : public MetaTensor { - public: - TensorMinnie() : MetaTensor() {} - ~TensorMinnie() override = default; - MS_DECLARE_PARENT(TensorMinnie, MetaTensor) - - // brief Overloads operator = for TensorMinnie. - // - // The constructed TensorMinnie object has the same type and shape with tensor_base. - // - // param meta_tensor An existing TensorMinnie object. - virtual TensorMinnie &operator=(const TensorMinnie &tensor); - - // brief Compares two TensorMinnie objects. - // - // The constructed TensorMinnie object has the same type and shape with tensor_base. - // - // param meta_tensor The TensorMinnie object to be compared. - // return true: If having same type and shape, return true, or return false. - virtual bool operator==(const TensorMinnie &tensor); - - // brief Get the tensor's size for C++ - // - // return size_t - size_t tensor_size() const { return tensor_size_; } - - // brief Set Tensor data size for c++ type - void set_tensor_size(size_t size) { tensor_size_ = size; } - - // brief Get Tensor data pointer for c++ type - // - // return The pointer to the object - void *tensor_addr() const { return tensor_addr_; } - - // brief Set Tensor data pointer for c++ type - void set_tensor_addr(void *addr) { tensor_addr_ = addr; } - - protected: - // brief Data addr of the tensor. 
- void *tensor_addr_; - - // brief Data size of the tensor. - size_t tensor_size_; -}; - -using TensorMinniePtr = std::shared_ptr; - -} // namespace tensor -} // namespace mindspore - -#endif // MINDSPORE_CCSRC_MINNIE_TENSOR_MINNIE_H_ diff --git a/mindspore/ccsrc/onnx/ir_exporter.cc b/mindspore/ccsrc/onnx/ir_exporter.cc new file mode 100644 index 0000000000..d74233d79a --- /dev/null +++ b/mindspore/ccsrc/onnx/ir_exporter.cc @@ -0,0 +1,621 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ir/param_value_py.h" +#include "debug/anf_ir_utils.h" +#include "operator/ops.h" +#include "proto/onnx.pb.h" + +namespace mindspore { +using FloatPtr = std::shared_ptr; +using IntPtr = std::shared_ptr; + +// anf type to onnx type map +static std::unordered_map g_data_type_map = { + {kNumberTypeBool, onnx::TensorProto_DataType_BOOL}, {kNumberTypeInt8, onnx::TensorProto_DataType_INT8}, + {kNumberTypeInt16, onnx::TensorProto_DataType_INT16}, {kNumberTypeInt32, onnx::TensorProto_DataType_INT32}, + {kNumberTypeInt64, onnx::TensorProto_DataType_INT64}, {kNumberTypeUInt8, onnx::TensorProto_DataType_UINT8}, + {kNumberTypeUInt16, onnx::TensorProto_DataType_UINT16}, {kNumberTypeUInt32, onnx::TensorProto_DataType_UINT32}, + {kNumberTypeUInt64, onnx::TensorProto_DataType_UINT64}, {kNumberTypeFloat16, onnx::TensorProto_DataType_FLOAT16}, + {kNumberTypeFloat32, onnx::TensorProto_DataType_FLOAT}, {kNumberTypeFloat64, onnx::TensorProto_DataType_DOUBLE}, + {kObjectTypeString, onnx::TensorProto_DataType_STRING}, +}; + +static std::unordered_map g_data_bits_int_map = { + {8, onnx::TensorProto_DataType_INT8}, + {16, onnx::TensorProto_DataType_INT16}, + {32, onnx::TensorProto_DataType_INT32}, + {64, onnx::TensorProto_DataType_INT64}, +}; + +static std::unordered_map g_data_bits_float_map = { + {16, onnx::TensorProto_DataType_FLOAT16}, + {32, onnx::TensorProto_DataType_FLOAT}, +}; + +// Can build different builder according to format +class IrExportBuilder; +using IrExportBuilderPtr = std::shared_ptr; + +class IrExporter { + public: + explicit IrExporter(IrExportBuilderPtr builder) : builder_(builder) {} + virtual ~IrExporter() = default; + std::string GetDumpString(const FuncGraphPtr &func_graph); + + private: + IrExportBuilderPtr builder_; +}; + +class IrExportBuilder { + public: + IrExportBuilder() = default; + ~IrExportBuilder() { google::protobuf::ShutdownProtobufLibrary(); } + 
std::string GetProtoString(const FuncGraphPtr &func_graph); + void BuildModelInfo(); + void BuildModel(const FuncGraphPtr &func_graph); + + private: + void BuildFuncGraph(const FuncGraphPtr &func_graph, onnx::GraphProto *const graph_proto); + void BuildParameters(const FuncGraphPtr &func_graph, onnx::GraphProto *const graph_proto); + void BuildNodes(const FuncGraphPtr &func_graph, onnx::GraphProto *const graph_proto); + void BuildOutput(const CNodePtr &node, onnx::GraphProto *const graph_proto); + void BuildCNode(const CNodePtr &node, onnx::GraphProto *const graph_proto); + std::string BuildInputNode(const AnfNodePtr &node, onnx::GraphProto *const graph_proto); + + void SetValueInfoProto(const AnfNodePtr &node, onnx::ValueInfoProto *const value_proto); + void SetValueInfoProto(const TypePtr &type, const BaseShapePtr &shape, onnx::ValueInfoProto *const value_proto); + void SetParamToTensorProto(const ParameterPtr ¶m, onnx::TensorProto *const tensor_proto); + void SetTensorProto(const TypePtr &type, const BaseShapePtr &shape, onnx::TensorProto *const tensor_proto); + void SetAttributeProto(const AnfNodePtr &node, onnx::NodeProto *const node_proto); + void SetShapeToNodeProto(const CNodePtr &node, onnx::NodeProto *const node_proto); + void SetShapeToNodeProto(const TypePtr &type, const BaseShapePtr &shape, onnx::NodeProto *const node_proto, + std::string suffix = "0"); + void SetValueToAttributeProto(const ValuePtr &value, onnx::AttributeProto *const attr_proto); + void SetTypeToAttributeProto(const ValuePtr &value, onnx::AttributeProto *const attr_proto); + void SetScalarToAttributeProto(const ValuePtr &value, onnx::AttributeProto *const attr_proto); + void SetTensorToAttributeProto(const ValuePtr &value, onnx::AttributeProto *const attr_proto); + void SetScalarToProto(const ValuePtr &value, onnx::TensorProto *const tensor_proto); + void SetSequenceToAttributeProto(const ValueSequeuePtr &value, onnx::AttributeProto *const attr_proto); + + onnx::TensorProto_DataType 
GetOnnxDataType(TypeId type_id); + onnx::TensorProto_DataType GetOnnxDataBitsIntType(int bits); + onnx::TensorProto_DataType GetOnnxDataBitsFloatType(int bits); + std::string GetNodeName(const AnfNodePtr &node); + std::string GetUniqueNodeName(const AnfNodePtr &node); + std::string GetOpTypeName(const AnfNodePtr &node); + size_t AllocateIndex() { return ++node_index_; } + void ResetIndex() { node_index_ = 0; } + + private: + onnx::ModelProto model_; + onnx::NodeProto *last_node_{nullptr}; + std::list todo_; + std::map node_index_map_; + size_t node_index_{0}; +}; + +using IrExporterPtr = std::shared_ptr; + +std::string IrExporter::GetDumpString(const FuncGraphPtr &func_graph) { + if ((builder_ == nullptr) || (func_graph == nullptr)) { + MS_LOG(EXCEPTION) << "Input params is null."; + } + + // Export model info + builder_->BuildModelInfo(); + + // Export model and return string + builder_->BuildModel(func_graph); + + return builder_->GetProtoString(func_graph); +} + +std::string IrExportBuilder::GetProtoString(const FuncGraphPtr &func_graph) { + MS_LOG(DEBUG) << "BuildModel complete!"; + return model_.SerializeAsString(); +} + +void IrExportBuilder::BuildModelInfo() { + model_.set_ir_version(onnx::IR_VERSION_2019_1_22); + model_.set_producer_name("MindSpore"); + model_.set_model_version(1); +} + +void IrExportBuilder::BuildModel(const FuncGraphPtr &func_graph) { + onnx::GraphProto *graph_proto = model_.mutable_graph(); + graph_proto->set_name(func_graph->ToString()); + ResetIndex(); + todo_.clear(); + todo_.push_back(func_graph); + while (!todo_.empty()) { + FuncGraphPtr fg = todo_.back(); + todo_.pop_back(); + BuildFuncGraph(fg, graph_proto); + } +} + +void IrExportBuilder::BuildFuncGraph(const FuncGraphPtr &func_graph, onnx::GraphProto *const graph_proto) { + // Export parameters + // 1. parameters should be mapped to ValueInfoProto + // 2. 
parameters with default value should be mapped to Initializer + BuildParameters(func_graph, graph_proto); + + // Export operator nodes(include output) + BuildNodes(func_graph, graph_proto); +} + +void IrExportBuilder::BuildParameters(const FuncGraphPtr &func_graph, onnx::GraphProto *const graph_proto) { + for (auto &item : func_graph->parameters()) { + auto param = item->cast(); + if (param == nullptr) { + MS_LOG(EXCEPTION) << "Parameter: '" << item->ToString() << "' could not cast to parameter."; + } + onnx::ValueInfoProto *input_proto = graph_proto->add_input(); + std::string param_name = GetUniqueNodeName(param); + input_proto->set_name(param_name); + SetValueInfoProto(param, input_proto); + if (!param->has_default()) { + MS_LOG(DEBUG) << "Parameter: '" << item->ToString() << "' has no default"; + continue; + } + + // Using ONNX initializer to set parameter's default value + onnx::TensorProto *initializer_proto = graph_proto->add_initializer(); + initializer_proto->set_name(param_name); + SetParamToTensorProto(param, initializer_proto); + auto param_value = std::dynamic_pointer_cast(param->default_param()); + py::object obj = param_value->value(); + py::object data = obj.attr("data"); + if (py::isinstance(data)) { + auto method = data.attr("asnumpy"); + py::array npy_data = method(); + initializer_proto->set_raw_data(npy_data.request(true).ptr, static_cast(npy_data.nbytes())); + } + } +} + +onnx::TensorProto_DataType IrExportBuilder::GetOnnxDataType(TypeId type_id) { + auto iter = g_data_type_map.find(type_id); + if (iter == g_data_type_map.end()) { + MS_LOG(EXCEPTION) << "Convert type error, unsupported type! " << type_id; + } + return iter->second; +} + +onnx::TensorProto_DataType IrExportBuilder::GetOnnxDataBitsIntType(int bits) { + auto iter = g_data_bits_int_map.find(bits); + if (iter == g_data_bits_int_map.end()) { + MS_LOG(EXCEPTION) << "Convert bits int error, unsupported bits! 
" << bits; + } + return iter->second; +} + +onnx::TensorProto_DataType IrExportBuilder::GetOnnxDataBitsFloatType(int bits) { + auto iter = g_data_bits_float_map.find(bits); + if (iter == g_data_bits_float_map.end()) { + MS_LOG(EXCEPTION) << "Convert bits float error, unsupported bits! " << bits; + } + return iter->second; +} + +void IrExportBuilder::SetValueInfoProto(const AnfNodePtr &node, onnx::ValueInfoProto *const value_proto) { + if (node == nullptr || value_proto == nullptr) { + MS_LOG(EXCEPTION) << "AnfNode or ValueInfo is null!"; + } + MS_LOG(DEBUG) << "SetValueInfoProto: " << node->DebugString(); + SetValueInfoProto(node->Type(), node->Shape(), value_proto); +} + +void IrExportBuilder::SetValueInfoProto(const TypePtr &type, const BaseShapePtr &shape, + onnx::ValueInfoProto *const value_proto) { + onnx::TypeProto *type_proto = value_proto->mutable_type(); + if (type->isa() && shape->isa()) { + auto tensor = type->cast(); + auto elem_type = tensor->element(); + const auto &dims = shape->cast()->shape(); + type_proto->mutable_tensor_type()->set_elem_type(GetOnnxDataType(elem_type->type_id())); + for (const auto &dim : dims) { + MS_LOG(DEBUG) << "SetValueInfoProto dim: " << dim; + type_proto->mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(dim); + } + } else if (type->isa()) { + auto tup_shape = shape->cast(); + type_proto->set_denotation(std::to_string(tup_shape->shape().size())); + } else { + MS_LOG(EXCEPTION) << "Value type: " << type->type_name() << " is not supported!"; + } +} + +void IrExportBuilder::SetTensorToAttributeProto(const ValuePtr &value, onnx::AttributeProto *const attr_proto) { + if (value == nullptr || attr_proto == nullptr) { + MS_LOG(EXCEPTION) << "ValuePtr or AttributeProto is null!"; + } + attr_proto->set_ref_attr_name("tensor"); + attr_proto->set_type(onnx::AttributeProto_AttributeType_TENSOR); + onnx::TensorProto *tensor_proto = attr_proto->mutable_t(); + auto data = value->cast(); + 
tensor_proto->set_raw_data(data->data().request(true).ptr, static_cast(data->data().nbytes())); + auto dtype = data->data_type(); + auto shape = data->shape_c(); + tensor_proto->set_data_type(GetOnnxDataType(dtype)); + for (const auto &dim : shape) { + tensor_proto->add_dims(dim); + } +} + +void IrExportBuilder::SetTensorProto(const TypePtr &type, const BaseShapePtr &shape, + onnx::TensorProto *const tensor_proto) { + if (!type->isa() || !shape->isa()) { + MS_LOG(EXCEPTION) << "Type or shape is not supported! " << type->ToString(); + } + auto tensor = type->cast(); + const auto &dims = shape->cast()->shape(); + tensor_proto->set_data_type(GetOnnxDataType(tensor->element()->type_id())); + for (const auto &dim : dims) { + tensor_proto->add_dims(dim); + } +} + +void IrExportBuilder::SetParamToTensorProto(const ParameterPtr ¶m, onnx::TensorProto *const tensor_proto) { + if (param == nullptr || tensor_proto == nullptr) { + MS_LOG(EXCEPTION) << "Parameter or TensorProto is null!"; + } + MS_LOG(DEBUG) << "SetParamToTensorProto: " << param->DebugString(); + SetTensorProto(param->Type(), param->Shape(), tensor_proto); +} + +void IrExportBuilder::BuildNodes(const FuncGraphPtr &func_graph, onnx::GraphProto *const graph_proto) { + std::vector nodes = TopoSort(func_graph->get_return(), SuccIncoming, AlwaysInclude); + for (const AnfNodePtr &node : nodes) { + if (!node->isa()) { + MS_LOG(DEBUG) << "Node: '" << node->ToString() << "' is not cnode"; + continue; + } + auto cnode = node->cast(); + if (cnode == func_graph->get_return()) { + BuildOutput(cnode, graph_proto); + } else { + BuildCNode(cnode, graph_proto); + } + } +} + +void IrExportBuilder::BuildOutput(const CNodePtr &node, onnx::GraphProto *const graph_proto) { + if (node->size() != 2) { + MS_LOG(EXCEPTION) << "Number of inputs of return node is not equal to 2."; + } + AnfNodePtr arg = node->input(1); + // Using make_tuple to set multi-output + if (IsPrimitiveCNode(arg, prim::kPrimMakeTuple)) { + auto tuple_node = 
arg->cast(); + for (size_t i = 1; i < tuple_node->size(); i++) { + auto input_node = arg->cast()->input(i); + onnx::ValueInfoProto *output_proto = graph_proto->add_output(); + auto output_name = GetUniqueNodeName(tuple_node->input(i)); + output_proto->set_name(output_name); + last_node_->add_output(output_name); + SetValueInfoProto(tuple_node->input(i), output_proto); + } + } else { + onnx::ValueInfoProto *output_proto = graph_proto->add_output(); + std::string output_name = GetUniqueNodeName(node); + output_proto->set_name(output_name); + last_node_->add_output(output_name); + SetValueInfoProto(arg, output_proto); + } +} + +std::string IrExportBuilder::GetOpTypeName(const AnfNodePtr &node) { + // May be ValueNode/CNode/Parameter + std::string type_name = ""; + if (IsValueNode(node)) { + PrimitivePtr prim = GetValueNode(node); + type_name = prim->ToString(); + } else if (IsValueNode(node)) { + FuncGraphPtr fg = GetValueNode(node); + todo_.push_back(fg); + type_name = fg->ToString(); + } else if (node->isa() || node->isa()) { + type_name = node->ToString(); + } else { + MS_LOG(EXCEPTION) << "Need to support op type: " << node->type_name(); + } + MS_LOG(DEBUG) << "ExportType: " << type_name; + return type_name; +} + +void IrExportBuilder::SetShapeToNodeProto(const TypePtr &type, const BaseShapePtr &shape, + onnx::NodeProto *const node_proto, std::string suffix) { + onnx::AttributeProto *attr_proto = node_proto->add_attribute(); + attr_proto->set_ref_attr_name("shape"); + if (suffix.compare("0") != 0) { + attr_proto->set_name("shape" + suffix); + } else { + attr_proto->set_name("shape"); + } + onnx::TensorProto *tensor_proto = attr_proto->mutable_t(); + SetTensorProto(type, shape, tensor_proto); +} + +void IrExportBuilder::SetShapeToNodeProto(const CNodePtr &node, onnx::NodeProto *const node_proto) { + // Get shape of cnode + // 1. prim ArgMaxWithValue need to get shape from tuple element + // 2. some cnode doesn't has shape, such as LayerNorm + // 3. 
other cnodes have shape + if (node->IsApply(prim::kPrimArgMaxWithValue) || node->IsApply(prim::kPrimLayerNorm)) { + auto type = node->Type(); + auto shape = node->Shape(); + if (!type->isa()) { + MS_LOG(EXCEPTION) << "Output data of ArgMaxWithValue cnode must be tuple: " << type->type_name(); + } + auto elements = type->cast()->elements(); + auto tuple_shape = shape->cast()->shape(); + for (size_t i = 0; i < elements.size(); i++) { + SetShapeToNodeProto(elements[i], tuple_shape[i], node_proto, std::to_string(i)); + } + } else { + auto type = node->Type(); + auto shape = node->Shape(); + if (!type->isa() || !shape->isa()) { + MS_LOG(DEBUG) << "Cnode has no shape: " << node->ToString(); + return; + } + SetShapeToNodeProto(type, shape, node_proto); + } +} + +void IrExportBuilder::BuildCNode(const CNodePtr &node, onnx::GraphProto *const graph_proto) { + auto inputs_size = node->size(); + if (inputs_size < 1) { + MS_LOG(EXCEPTION) << "Inputs of apply node is empty"; + } + + // Need to build input node before dealing with cnode + std::vector op_inputs; + std::vector input_names; + for (size_t i = 1; i < inputs_size; i++) { + auto input = node->input(i); + op_inputs.push_back(input); + input_names.push_back(BuildInputNode(input, graph_proto)); + } + + // Build cnode + onnx::NodeProto *node_proto = graph_proto->add_node(); + std::string output_name = GetUniqueNodeName(node); + node_proto->add_output(output_name); + node_proto->set_name(output_name); + node_proto->set_domain(node->fullname_with_scope()); + AnfNodePtr op = node->input(0); + std::string type_name = GetOpTypeName(op); + node_proto->set_op_type(type_name); + last_node_ = node_proto; + SetShapeToNodeProto(node, node_proto); + (void)std::for_each(input_names.begin(), input_names.end(), + [&node_proto](const string &name) { node_proto->add_input(name); }); + + // Add primitive attrs + if (IsValueNode(op)) { + auto prim = GetValueNode(op); + for (auto attr : prim->attrs()) { + MS_LOG(DEBUG) << "attr: " << 
attr.first << " " << attr.second->DumpText() << " " << attr.second->type_name(); + onnx::AttributeProto *attr_proto = node_proto->add_attribute(); + attr_proto->set_name(attr.first); + SetValueToAttributeProto(attr.second, attr_proto); + } + } else { + MS_LOG(EXCEPTION) << "Need to support op type: " << op->type_name(); + } +} + +std::string IrExportBuilder::BuildInputNode(const AnfNodePtr &node, onnx::GraphProto *const graph_proto) { + std::string node_name = GetUniqueNodeName(node); + if (node->isa()) { + // When node input is a ValueNode, need to create a Constant Node + onnx::NodeProto *node_proto = graph_proto->add_node(); + node_proto->add_output(node_name); + SetAttributeProto(node, node_proto); + } + return node_name; +} + +std::string IrExportBuilder::GetUniqueNodeName(const AnfNodePtr &node) { + // Naming anfnode + // 1. parameter is unique in one func_graph + // 2. cnode and valuenode may be reduplicative, so add index to identify. + std::string node_name = ""; + if (node->isa()) { + node_name = GetNodeName(node); + } else if (node->isa() || node->isa()) { + auto iter = node_index_map_.find(node); + if (iter != node_index_map_.end()) { + node_name = GetNodeName(node) + ":" + std::to_string(iter->second); + } else { + auto node_idx = AllocateIndex(); + node_index_map_[node] = node_idx; + node_name = GetNodeName(node) + ":" + std::to_string(node_idx); + } + } else { + MS_LOG(EXCEPTION) << "Can not support type of node:" << node->ToString(); + } + MS_LOG(DEBUG) << "Node name: " << node_name; + return node_name; +} + +std::string IrExportBuilder::GetNodeName(const AnfNodePtr &node) { + std::string node_name = ""; + if ((node != nullptr) && (node->func_graph() != nullptr)) { + node_name = node->func_graph()->ToString() + ":"; + } + node_name += node->ToString(); + MS_LOG(DEBUG) << "GetNodeName: " << node_name; + return node_name; +} + +void IrExportBuilder::SetAttributeProto(const AnfNodePtr &node, onnx::NodeProto *const node_proto) { + if (node == nullptr || 
node_proto == nullptr) { + MS_LOG(EXCEPTION) << "AnfNode or NodeProto is null!"; + } + auto value = node->cast()->value(); + node_proto->set_op_type("Constant"); + onnx::AttributeProto *attr_proto = node_proto->add_attribute(); + attr_proto->set_name("value"); + MS_LOG(DEBUG) << "Set Constant attribute: " << value->ToString(); + SetValueToAttributeProto(value, attr_proto); +} + +void IrExportBuilder::SetTypeToAttributeProto(const ValuePtr &value, onnx::AttributeProto *const attr_proto) { + if (value == nullptr || attr_proto == nullptr) { + MS_LOG(EXCEPTION) << "ValuePtr or AttributeProto is null!"; + } + attr_proto->set_ref_attr_name("type"); + attr_proto->set_type(onnx::AttributeProto_AttributeType_TENSOR); + onnx::TensorProto *tensor_proto = attr_proto->mutable_t(); + if (value->isa()) { + auto int_value = value->cast(); + tensor_proto->set_data_type(GetOnnxDataBitsIntType(int_value->nbits())); + } else if (value->isa()) { + auto float_value = value->cast(); + tensor_proto->set_data_type(GetOnnxDataBitsFloatType(float_value->nbits())); + } else if (value->isa()) { + tensor_proto->set_name("tensor"); + auto elem_type = value->cast()->element(); + if (elem_type->isa()) { + auto int_value = elem_type->cast(); + tensor_proto->set_data_type(GetOnnxDataBitsIntType(int_value->nbits())); + } else if (elem_type->isa()) { + auto float_value = elem_type->cast(); + tensor_proto->set_data_type(GetOnnxDataBitsFloatType(float_value->nbits())); + } else { + MS_LOG(EXCEPTION) << "Unsupported type " << elem_type->type_name(); + } + } else { + MS_LOG(EXCEPTION) << "Unsupported type: " << value->type_name(); + } +} + +void IrExportBuilder::SetValueToAttributeProto(const ValuePtr &value, onnx::AttributeProto *const attr_proto) { + if (value == nullptr || attr_proto == nullptr) { + MS_LOG(EXCEPTION) << "ValuePtr or AttributeProto is null!"; + } + if (value->isa() || value->isa()) { + SetScalarToAttributeProto(value, attr_proto); + } else if (value->isa() || value->isa()) { + 
SetTypeToAttributeProto(value, attr_proto); + } else if (value->isa()) { + SetSequenceToAttributeProto(value->cast(), attr_proto); + } else if (value->isa()) { + SetTensorToAttributeProto(value, attr_proto); + } else { + MS_LOG(EXCEPTION) << "Unsupported type: " << value->type_name(); + } +} + +void IrExportBuilder::SetScalarToAttributeProto(const ValuePtr &value, onnx::AttributeProto *const attr_proto) { + if (value == nullptr || attr_proto == nullptr) { + MS_LOG(EXCEPTION) << "ValuePtr or AttributeProto is null!"; + } + attr_proto->set_ref_attr_name("scalar"); + attr_proto->set_type(onnx::AttributeProto_AttributeType_TENSOR); + onnx::TensorProto *tensor_proto = attr_proto->mutable_t(); + SetScalarToProto(value, tensor_proto); +} + +void IrExportBuilder::SetScalarToProto(const ValuePtr &value, onnx::TensorProto *const tensor_proto) { + if (value == nullptr || tensor_proto == nullptr) { + MS_LOG(EXCEPTION) << "ValuePtr or TensorProto is null!"; + } + if (value->isa()) { + tensor_proto->set_data_type(onnx::TensorProto_DataType_STRING); + tensor_proto->add_string_data(GetValue(value)); + } else if (value->isa()) { + tensor_proto->set_data_type(onnx::TensorProto_DataType_BOOL); + tensor_proto->add_int32_data(GetValue(value)); + } else if (value->isa()) { + tensor_proto->set_data_type(onnx::TensorProto_DataType_INT8); + tensor_proto->add_int32_data(value->cast()->value()); + } else if (value->isa()) { + tensor_proto->set_data_type(onnx::TensorProto_DataType_INT16); + tensor_proto->add_int32_data(value->cast()->value()); + } else if (value->isa()) { + tensor_proto->set_data_type(onnx::TensorProto_DataType_INT32); + tensor_proto->add_int32_data(value->cast()->value()); + } else if (value->isa()) { + tensor_proto->set_data_type(onnx::TensorProto_DataType_INT64); + tensor_proto->add_int64_data(value->cast()->value()); + } else if (value->isa()) { + tensor_proto->set_data_type(onnx::TensorProto_DataType_FLOAT); + tensor_proto->add_float_data(GetValue(value)); + } else { + 
MS_LOG(EXCEPTION) << "Unsupported scalar type: " << value->type_name(); + } +} + +void IrExportBuilder::SetSequenceToAttributeProto(const ValueSequeuePtr &value, + onnx::AttributeProto *const attr_proto) { + if (value == nullptr || attr_proto == nullptr) { + MS_LOG(EXCEPTION) << "ValueSequeuePtr or AttributeProto is null!"; + } + attr_proto->set_ref_attr_name("scalar"); + attr_proto->set_type(onnx::AttributeProto_AttributeType_TENSOR); + onnx::TensorProto *tensor_proto = attr_proto->mutable_t(); + if (value->isa()) { + const ValueTuplePtr &tuple_value = value->cast(); + if (tuple_value->value().size() == 0) { + MS_LOG(DEBUG) << "SetSequenceToAttributeProto tuple size is 0"; + return; + } + auto type_id = tuple_value->value()[0]->type()->type_id(); + tensor_proto->set_data_type(GetOnnxDataType(type_id)); + for (const auto &item : tuple_value->value()) { + SetScalarToProto(item, tensor_proto); + } + } else if (value->isa()) { + const ValueListPtr &list_value = value->cast(); + if (list_value->value().size() == 0) { + MS_LOG(DEBUG) << "SetSequenceToAttributeProto list size is 0"; + return; + } + auto type_id = list_value->value()[0]->type()->type_id(); + tensor_proto->set_data_type(GetOnnxDataType(type_id)); + for (const auto &item : list_value->value()) { + SetScalarToProto(item, tensor_proto); + } + } +} + +std::string GetBinaryProtoString(const FuncGraphPtr &func_graph) { + auto builder = std::make_shared(); + if (builder == nullptr) { + MS_LOG(ERROR) << "Create ir exporter failed!"; + return ""; + } + auto exporter = std::make_shared(builder); + if (exporter == nullptr) { + return ""; + } + return exporter->GetDumpString(func_graph); +} +} // namespace mindspore diff --git a/mindspore/ccsrc/operator/composite/composite.cc b/mindspore/ccsrc/operator/composite/composite.cc index 31ba49fa0b..75532b9fbd 100644 --- a/mindspore/ccsrc/operator/composite/composite.cc +++ b/mindspore/ccsrc/operator/composite/composite.cc @@ -334,8 +334,8 @@ ArgsPairList 
HyperMap::Harmonize(const FuncGraphPtr &func_graph, const ArgsPairL FuncGraphPtr HyperMap::GenerateFromTypes(const TypePtrList &args_spec_list) { FuncGraphPtr ptrGraph = std::make_shared(); - ptrGraph->set_flags(FUNC_GRAPH_FLAG_CORE, true); - ptrGraph->set_flags(FUNC_GRAPH_FLAG_SPECIALIZE_PARAMETER, true); + ptrGraph->set_flag(FUNC_GRAPH_FLAG_CORE, true); + ptrGraph->set_flag(FUNC_GRAPH_FLAG_SPECIALIZE_PARAMETER, true); ptrGraph->debug_info()->set_name("hyper_map"); AnfNodePtr ptrFnArg = nullptr; @@ -389,7 +389,7 @@ FuncGraphPtr Tail::GenerateTupleFuncGraph(const abstract::AbstractTuplePtr &a_tu MS_EXCEPTION_IF_NULL(a_tuple); FuncGraphPtr ret = std::make_shared(); - ret->set_flags(FUNC_GRAPH_FLAG_CORE, true); + ret->set_flag(FUNC_GRAPH_FLAG_CORE, true); ret->debug_info()->set_name("tail"); AnfNodePtr ptrTup = ret->add_parameter(); @@ -409,7 +409,7 @@ FuncGraphPtr Tail::GenerateListFuncGraph(const abstract::AbstractListPtr &a_list MS_EXCEPTION_IF_NULL(a_list); FuncGraphPtr ret = std::make_shared(); - ret->set_flags(FUNC_GRAPH_FLAG_CORE, true); + ret->set_flag(FUNC_GRAPH_FLAG_CORE, true); ret->debug_info()->set_name("tail"); AnfNodePtr ptrList = ret->add_parameter(); @@ -481,10 +481,10 @@ FuncGraphPtr MakeTupleGradient::GenerateFuncGraph(const AbstractBasePtrList &arg grads.push_back(b->NewCNode({NewValueNode(prim::kPrimTupleGetItem), dout, NewValueNode(i)})); } - b->set_flags(FUNC_GRAPH_FLAG_CORE, true); + b->set_flag(FUNC_GRAPH_FLAG_CORE, true); b->set_output(b->NewCNode(grads)); - fg->set_flags(FUNC_GRAPH_FLAG_CORE, true); + fg->set_flag(FUNC_GRAPH_FLAG_CORE, true); fg->set_output(fg->NewCNode({NewValueNode(prim::kPrimMakeTuple), out, NewValueNode(b)})); (void)fg->transforms().emplace("primal", FuncGraphTransform(prim::kPrimMakeTuple)); return fg; @@ -501,9 +501,15 @@ GradOperation::GradOperation(const std::string &name, bool get_all, bool get_by_ } FuncGraphPtr GradOperation::GetGrad(AnfNodePtr node, const AnfNodePtr &weights, - const std::vector ¶ms_list, bool 
applyJ) { + const std::vector ¶ms_list, const std::vector &args, + bool applyJ) { FuncGraphPtr ret = std::make_shared(); - ret->set_flags(FUNC_GRAPH_FLAG_CORE, true); + ret->set_flag(FUNC_GRAPH_FLAG_CORE, true); + + auto weights_node = weights; + if (weights == nullptr && !args.empty()) { + weights_node = ret->NewCNode(args); + } ValueNodePtr opsJ = NewValueNode(prim::kPrimJ); ValueNodePtr opsTupleItem = NewValueNode(prim::kPrimTupleGetItem); @@ -537,7 +543,7 @@ FuncGraphPtr GradOperation::GetGrad(AnfNodePtr node, const AnfNodePtr &weights, inputs.push_back(NewValueNode(1)); AnfNodePtr ptrBprop = ret->NewCNode(inputs); - doGetGrad(ret, out, ptrBprop, weights, opsTupleItem); + doGetGrad(ret, out, ptrBprop, weights_node, opsTupleItem); return ret; } @@ -619,7 +625,7 @@ FuncGraphPtr GradOperation::GenerateFuncGraph(const AbstractBasePtrList &args_sp std::ostringstream ss; ss << "grad{" << nparam << "}"; - dfBuilder->set_flags(FUNC_GRAPH_FLAG_CORE, true); + dfBuilder->set_flag(FUNC_GRAPH_FLAG_CORE, true); dfBuilder->debug_info()->set_name(ss.str()); ParameterPtr param_graph = dfBuilder->add_parameter(); @@ -665,7 +671,7 @@ FuncGraphPtr ListMap::GenerateFuncGraph(const AbstractBasePtrList &args_spec_lis } FuncGraphPtr fg_ptr = std::make_shared(); - fg_ptr->set_flags(FUNC_GRAPH_FLAG_CORE, true); + fg_ptr->set_flag(FUNC_GRAPH_FLAG_CORE, true); fg_ptr->debug_info()->set_name("list_map"); AnfNodePtr fn = fg_ptr->add_parameter(); @@ -735,7 +741,7 @@ void ListMap::MakeCond(const std::vector &lists, const FuncGraphPtr // cond = reduce(lambda a, b: g.apply(P.bool_and, a, b), hasnexts) FuncGraphPtr fgtrue_ptr = std::make_shared(); fgtrue_ptr->debug_info()->set_name("ftrue"); - fgtrue_ptr->set_flags(FUNC_GRAPH_FLAG_CORE, true); + fgtrue_ptr->set_flag(FUNC_GRAPH_FLAG_CORE, true); CNodePtr fgtrue_output_cnode = fgtrue_ptr->NewCNode({NewValueNode(fgnext_ptr), fn, resl}); auto inputs = fgtrue_output_cnode->inputs(); @@ -745,7 +751,7 @@ void ListMap::MakeCond(const std::vector &lists, 
const FuncGraphPtr FuncGraphPtr fgfalse_ptr = std::make_shared(); fgfalse_ptr->debug_info()->set_name("ffalse"); - fgfalse_ptr->set_flags(FUNC_GRAPH_FLAG_CORE, true); + fgfalse_ptr->set_flag(FUNC_GRAPH_FLAG_CORE, true); fgfalse_ptr->set_output(resl); AnfNodePtr output_cnode = fg_ptr->NewCNode({NewValueNode(prim::kPrimSwitch), NewValueNode(std::string("cond")), @@ -802,7 +808,7 @@ FuncGraphPtr TupleAdd::GenerateFuncGraph(const AbstractBasePtrList &args_spec_li } FuncGraphPtr ret = std::make_shared(); - ret->set_flags(FUNC_GRAPH_FLAG_CORE, true); + ret->set_flag(FUNC_GRAPH_FLAG_CORE, true); AnfNodePtr p_tup_a = ret->add_parameter(); AnfNodePtr p_tup_b = ret->add_parameter(); @@ -906,7 +912,7 @@ FuncGraphPtr TupleSlice::GenerateFuncGraph(const AbstractBasePtrList &args_spec_ GenerateTupleSliceParameter(tuple, slice, &start_index, &stop_index, &step_value); FuncGraphPtr ret = std::make_shared(); - ret->set_flags(FUNC_GRAPH_FLAG_CORE, true); + ret->set_flag(FUNC_GRAPH_FLAG_CORE, true); AnfNodePtr p_tuple = ret->add_parameter(); (void)ret->add_parameter(); @@ -926,206 +932,6 @@ FuncGraphPtr TupleSlice::GenerateFuncGraph(const AbstractBasePtrList &args_spec_ return ret; } -int ConvertBinaryToDecimal(const std::vector &number_bin) { - unsigned int number_dec = 0; - for (size_t index = 0; index < number_bin.size(); index++) { - number_dec |= number_bin[index] << index; - } - return static_cast(number_dec); -} - -void ParseSlice(const AbstractSlicePtr &slice, std::vector *begin, std::vector *end, - std::vector *strides, int length) { - MS_EXCEPTION_IF_NULL(slice); - MS_EXCEPTION_IF_NULL(begin); - MS_EXCEPTION_IF_NULL(end); - MS_EXCEPTION_IF_NULL(strides); - if (length <= 0) { - MS_LOG(EXCEPTION) << "Could not slice a dim when it's length less than 1"; - } - - int start_default = 0; - int stop_default = length; - int step_default = 1; - int step_value = CheckSliceMember(slice->step(), step_default, "step"); - if (step_value < 0) { - start_default = -1; - stop_default = 
-(length + 1); - } - - begin->push_back(CheckSliceMember(slice->start(), start_default, "begin")); - end->push_back(CheckSliceMember(slice->stop(), stop_default, "stop")); - strides->push_back(step_value); -} - -int GenerateStridedSliceParametersFromTuple(const AbstractTuplePtr &slice_tuple, const std::vector &shape, - std::vector *begin, std::vector *end, std::vector *strides) { - MS_EXCEPTION_IF_NULL(slice_tuple); - MS_EXCEPTION_IF_NULL(begin); - MS_EXCEPTION_IF_NULL(end); - MS_EXCEPTION_IF_NULL(strides); - - size_t slice_tuple_size = slice_tuple->size(); - size_t shape_size = shape.size(); - if (slice_tuple_size > shape_size) { - MS_LOG(EXCEPTION) << "The number of slice data to slice tensor should be less than the rank of tensor," - "when the rank of tensor is " - << shape_size << ", the number of slice is " << slice_tuple_size; - } - - std::vector shrink; - auto slice_tuple_eles = slice_tuple->elements(); - size_t ellipsis_num = 0; - - for (size_t index = 0; index < slice_tuple_size; index++) { - if (slice_tuple_eles[index]->isa()) { - AbstractSlicePtr slice = dyn_cast(slice_tuple_eles[index]); - ParseSlice(slice, begin, end, strides, shape[index]); - shrink.push_back(0); - continue; - } - - if (slice_tuple_eles[index]->isa()) { - int ele_index = GetArgScalarValue(dyn_cast(slice_tuple_eles[index]), "slice_tuple"); - begin->push_back(ele_index); - end->push_back(ele_index + 1); - strides->push_back(1); - shrink.push_back(1); - continue; - } - - if (slice_tuple_eles[index]->isa()) { - ellipsis_num++; - if (ellipsis_num > 1) { - MS_LOG(EXCEPTION) << "Tensor slice supports at most one ellipsis"; - } - size_t ellipsis_len = shape_size - (slice_tuple_size - 1); - begin->insert(begin->end(), ellipsis_len, 0); - end->insert(end->end(), shape.begin() + index, shape.begin() + index + ellipsis_len); - strides->insert(strides->end(), ellipsis_len, 1); - shrink.insert(shrink.end(), ellipsis_len, 0); - continue; - } - - MS_LOG(EXCEPTION) << "Slice tuple only could contain 
slice, int number or ellipsis, but got " - << slice_tuple_eles[index]->ToString(); - } - - if (ellipsis_num == 0) { - for (size_t index = slice_tuple_size; index < shape_size; index++) { - begin->push_back(0); - end->push_back(shape[index]); - strides->push_back(1); - } - } - return ConvertBinaryToDecimal(shrink); -} - -int GenerateStridedSliceParametersFromSlice(const AbstractSlicePtr &slice, const std::vector &shape, - std::vector *begin, std::vector *end, std::vector *strides) { - MS_EXCEPTION_IF_NULL(begin); - MS_EXCEPTION_IF_NULL(end); - MS_EXCEPTION_IF_NULL(strides); - size_t shape_size = shape.size(); - if (shape_size == 0) { - MS_LOG(EXCEPTION) << "Could slice a scalar tensor"; - } - - ParseSlice(slice, begin, end, strides, shape[0]); - - for (size_t index = 1; index < shape_size; index++) { - begin->push_back(0); - end->push_back(shape[index]); - strides->push_back(1); - } - - return 0; -} - -int GenerateStridedSliceParametersFromNumber(const AbstractScalarPtr &scalar, const std::vector &shape, - std::vector *begin, std::vector *end, - std::vector *strides) { - MS_EXCEPTION_IF_NULL(begin); - MS_EXCEPTION_IF_NULL(end); - MS_EXCEPTION_IF_NULL(strides); - int ele_index = GetArgScalarValue(scalar, "slice_tuple"); - - begin->push_back(ele_index); - end->push_back(ele_index + 1); - strides->push_back(1); - - for (size_t index = 1; index < shape.size(); index++) { - begin->push_back(0); - end->push_back(shape[index]); - strides->push_back(1); - } - - return 1; -} - -FuncGraphPtr ExpandADim(const FuncGraphPtr &ret_graph, const AnfNodePtr &tensor_node) { - auto PrimExpandDims = GetPythonOps("expand_dims", "mindspore.ops.functional"); - ret_graph->set_output(NewCNode({NewValueNode(PrimExpandDims), tensor_node, NewValueNode(0)}, ret_graph)); - return ret_graph; -} - -FuncGraphPtr TensorSlice::GenerateFuncGraph(const AbstractBasePtrList &args_spec_list) { - // slice a tensor - // args: tensor, slice or slice tuple - const std::string op_name = 
std::string("TensorSlice"); - abstract::CheckArgsSize(op_name, args_spec_list, 2); - AbstractTensorPtr tensorPtr = abstract::CheckArg(op_name, args_spec_list, 0); - - FuncGraphPtr ret_graph = std::make_shared(); - ret_graph->set_flags(FUNC_GRAPH_FLAG_CORE, true); - AnfNodePtr tensor_node = ret_graph->add_parameter(); - (void)ret_graph->add_parameter(); - - auto shape = tensorPtr->shape()->shape(); - std::vector begin; - std::vector end; - std::vector strides; - int shrink_axis_mask; - - if (args_spec_list[1]->isa()) { - AbstractTuplePtr tuple_ptr = dyn_cast(args_spec_list[1]); - shrink_axis_mask = GenerateStridedSliceParametersFromTuple(tuple_ptr, shape, &begin, &end, &strides); - } else if (args_spec_list[1]->isa()) { - AbstractSlicePtr slice_ptr = dyn_cast(args_spec_list[1]); - shrink_axis_mask = GenerateStridedSliceParametersFromSlice(slice_ptr, shape, &begin, &end, &strides); - } else if (args_spec_list[1]->isa()) { - AbstractScalarPtr scalar_ptr = dyn_cast(args_spec_list[1]); - if (scalar_ptr->BuildValue()->isa()) { - if (scalar_ptr->BuildValue()->cast()->value()) { - return ExpandADim(ret_graph, tensor_node); - } - MS_LOG(EXCEPTION) << "TensorSlice not support the index is False."; - } - shrink_axis_mask = GenerateStridedSliceParametersFromNumber(scalar_ptr, shape, &begin, &end, &strides); - } else if (args_spec_list[1]->isa()) { - ret_graph->set_output(tensor_node); - return ret_graph; - } else if (args_spec_list[1]->isa()) { - return ExpandADim(ret_graph, tensor_node); - } else { - std::ostringstream args_info; - for (const auto &arg : args_spec_list) { - MS_EXCEPTION_IF_NULL(arg); - args_info << arg->ToString() << "\n"; - } - MS_LOG(EXCEPTION) - << "TensorSlice requires the input should be one of [slice, ellipsis, int number, bool, none, tuple] , but got " - << args_info.str(); - } - - auto PrimStridedSliceClass = prim::GetPythonOps("StridedSlice", "mindspore.ops.operations"); - auto PrimStridedSlice = 
ret_graph->NewCNode({NewValueNode(PrimStridedSliceClass), NewValueNode(0), NewValueNode(0), - NewValueNode(0), NewValueNode(0), NewValueNode(shrink_axis_mask)}); - ret_graph->set_output(ret_graph->NewCNode( - {PrimStridedSlice, tensor_node, NewValueNode(begin), NewValueNode(end), NewValueNode(strides)})); - return ret_graph; -} - FuncGraphPtr TupleGetItemTensor::GenerateFuncGraph(const AbstractBasePtrList &args_spec_list) { // select indexed item // args: tuple of items, index @@ -1135,7 +941,7 @@ FuncGraphPtr TupleGetItemTensor::GenerateFuncGraph(const AbstractBasePtrList &ar AbstractBasePtrList branches = branches_abs->elements(); if (branches.size() > 0 && branches[0] != nullptr && branches[0]->isa()) { FuncGraphPtr ret_graph = std::make_shared(); - ret_graph->set_flags(FUNC_GRAPH_FLAG_CORE, true); + ret_graph->set_flag(FUNC_GRAPH_FLAG_CORE, true); AnfNodePtr functions = ret_graph->add_parameter(); auto index = ret_graph->add_parameter(); @@ -1156,11 +962,6 @@ REGISTER_PYBIND_DEFINE(TupleSlice_, ([](const py::module *m) { .def(py::init()); })); -REGISTER_PYBIND_DEFINE(TensorSlice_, ([](const py::module *m) { - (void)py::class_>(*m, "TensorSlice_") - .def(py::init()); - })); - REGISTER_PYBIND_DEFINE(TupleGetItemTensor_, ([](const py::module *m) { (void)py::class_>( *m, "TupleGetItemTensor_") diff --git a/mindspore/ccsrc/operator/composite/composite.h b/mindspore/ccsrc/operator/composite/composite.h index 0ec8723396..5944c81fb0 100644 --- a/mindspore/ccsrc/operator/composite/composite.h +++ b/mindspore/ccsrc/operator/composite/composite.h @@ -129,7 +129,7 @@ class GradOperation : public MetaFuncGraph { MS_DECLARE_PARENT(GradOperation, MetaFuncGraph) FuncGraphPtr GetGrad(AnfNodePtr ptrNode, const AnfNodePtr &weights, const std::vector &ptrParams, - bool applyJ = false); + const std::vector &args = {}, bool applyJ = false); FuncGraphPtr GenerateFuncGraph(const AbstractBasePtrList &args_spec_list) override; bool sens_param() const { return sens_param_; } bool 
get_all_; @@ -175,16 +175,6 @@ class TupleSlice : public MetaFuncGraph { }; using TupleSlicePtr = std::shared_ptr; -class TensorSlice : public MetaFuncGraph { - public: - explicit TensorSlice(const std::string &name) : MetaFuncGraph(name) {} - ~TensorSlice() override = default; - MS_DECLARE_PARENT(TensorSlice, MetaFuncGraph) - FuncGraphPtr GenerateFuncGraph(const AbstractBasePtrList &args_spec_list) override; - friend bool operator==(const TensorSlice &lhs, const TensorSlice &rhs) { return lhs.name_ == rhs.name_; } -}; -using TensorSlicePtr = std::shared_ptr; - class TupleGetItemTensor : public MetaFuncGraph { public: explicit TupleGetItemTensor(const std::string &name) : MetaFuncGraph(name) {} diff --git a/mindspore/ccsrc/operator/composite/do_signature.cc b/mindspore/ccsrc/operator/composite/do_signature.cc index 0cc4ee0483..d9bcef3031 100644 --- a/mindspore/ccsrc/operator/composite/do_signature.cc +++ b/mindspore/ccsrc/operator/composite/do_signature.cc @@ -65,55 +65,57 @@ void ProcessDefault(const std::string &func_name, const AbstractBasePtrList &arg } } } -bool CompareTensorScalarType(const TypeId &tensor_type, const size_t &t_type_number, const TypeId &scalar_type, - const size_t &s_type_number) { - if (scalar_type == kNumberTypeFloat16 || scalar_type == kNumberTypeFloat32 || scalar_type == kNumberTypeFloat64) { - if (tensor_type == kNumberTypeFloat16 || tensor_type == kNumberTypeFloat32 || tensor_type == kNumberTypeFloat64) { - return t_type_number >= s_type_number; - } - return false; - } - return true; -} -void setMaxType(TypeId *max_type_id, TypeId *max_type, size_t *max_type_number, const TypeId type_id, const TypeId type, - const size_t type_number) { +void SetMaxType(TypeId *max_type_id, size_t *max_type_number, const TypeId type_id, const size_t type_number) { *max_type_id = type_id; - *max_type = type; *max_type_number = type_number; } -TypeId GetMaxTypeId(const abstract::AbstractBasePtrList &args_spec_list, std::vector indexs, - const std::set 
&write_indexs) { +bool GetTensorOrScalarTypeInfo(AbstractBasePtr arg_value, bool is_write, TypeId *arg_type_id, + TypeId *arg_type = nullptr) { + if (arg_value->isa()) { + if (is_write) { + arg_value = arg_value->cast()->ref_origin(); + } else { + arg_value = arg_value->cast()->ref(); + } + } + if (arg_value->isa()) { + auto tensor = arg_value->cast(); + auto tensor_type = tensor->element()->BuildType(); + MS_EXCEPTION_IF_NULL(tensor_type); + *arg_type_id = tensor_type->type_id(); + if (arg_type != nullptr) { + *arg_type = kObjectTypeTensorType; + } + return true; + } + if (arg_value->isa()) { + auto scalar = arg_value->cast(); + auto scalar_type = scalar->BuildType(); + MS_EXCEPTION_IF_NULL(scalar_type); + *arg_type_id = scalar_type->type_id(); + if (arg_type != nullptr) { + *arg_type = kObjectTypeNumber; + } + return true; + } + return false; +} + +TypeId GetMaxTypeId(const abstract::AbstractBasePtrList &args_spec_list, std::vector indices, + const std::set &write_indices) { TypeId max_type_id = kTypeUnknown; - TypeId max_type = kTypeUnknown; size_t max_type_number = 0; bool has_int8 = false; - for (const auto &index : indexs) { + for (const auto &index : indices) { TypeId arg_type_id = kTypeUnknown; TypeId arg_type = kTypeUnknown; - AbstractBasePtr arg_value = args_spec_list[index]; - if (arg_value->isa()) { - auto is_write = (write_indexs.find(index) != write_indexs.end()); - if (is_write) { - arg_value = arg_value->cast()->ref_origin(); - } else { - arg_value = arg_value->cast()->ref(); - } + auto is_write = (write_indices.find(index) != write_indices.end()); + if (!GetTensorOrScalarTypeInfo(args_spec_list[index], is_write, &arg_type_id, &arg_type)) { + continue; } - if (arg_value->isa()) { - auto tensor = arg_value->cast(); - auto tensor_type = tensor->element()->BuildType(); - MS_EXCEPTION_IF_NULL(tensor_type); - arg_type_id = tensor_type->type_id(); - arg_type = kObjectTypeTensorType; - } else if (arg_value->isa()) { - auto scalar = arg_value->cast(); - 
auto scalar_type = scalar->BuildType(); - MS_EXCEPTION_IF_NULL(scalar_type); - arg_type_id = scalar_type->type_id(); - arg_type = kObjectTypeNumber; - } else { + if (arg_type != kObjectTypeTensorType) { continue; } auto it = type_map.find(arg_type_id); @@ -124,24 +126,11 @@ TypeId GetMaxTypeId(const abstract::AbstractBasePtrList &args_spec_list, std::ve has_int8 = true; } if (max_type_id == kTypeUnknown) { - setMaxType(&max_type_id, &max_type, &max_type_number, arg_type_id, arg_type, it->second); + SetMaxType(&max_type_id, &max_type_number, arg_type_id, it->second); continue; } - - if (max_type == arg_type) { - if (it->second > max_type_number) { - setMaxType(&max_type_id, &max_type, &max_type_number, arg_type_id, arg_type, it->second); - } - } else { - if (arg_type == kObjectTypeTensorType) { - if (CompareTensorScalarType(arg_type_id, it->second, max_type_id, max_type_number)) { - setMaxType(&max_type_id, &max_type, &max_type_number, arg_type_id, arg_type, it->second); - } - } else { - if (!CompareTensorScalarType(max_type_id, max_type_number, arg_type_id, it->second)) { - setMaxType(&max_type_id, &max_type, &max_type_number, arg_type_id, arg_type, it->second); - } - } + if (it->second > max_type_number) { + SetMaxType(&max_type_id, &max_type_number, arg_type_id, it->second); } } @@ -154,28 +143,28 @@ TypeId GetMaxTypeId(const abstract::AbstractBasePtrList &args_spec_list, std::ve // Get the largest type of index in the same SignatureEnumDType of arguments. std::map GetMaxDtype(const std::vector &dtypes, const abstract::AbstractBasePtrList &args_spec_list, - const std::set &write_indexs) { + const std::set &write_indices) { // record index for signature.dtypes of the same type // eg. 
[T, T1, T, T2, T, T1, T3] -> {{T:(0,2,4)}, {T1:(1,5)}, {T2:(3)}, {T3:(6)}} - std::map> type_indexs; + std::map> type_indices; for (size_t i = 0; i < dtypes.size(); ++i) { - auto it = type_indexs.find(dtypes[i]); - if (it == type_indexs.end()) { - (void)type_indexs.insert(std::make_pair(dtypes[i], std::vector{i})); + auto it = type_indices.find(dtypes[i]); + if (it == type_indices.end()) { + (void)type_indices.insert(std::make_pair(dtypes[i], std::vector{i})); } else { it->second.push_back(i); } } std::map dst_type; - for (auto it = type_indexs.begin(); it != type_indexs.end(); (void)++it) { + for (auto it = type_indices.begin(); it != type_indices.end(); (void)++it) { auto type = it->first; - auto indexs = it->second; + auto indices = it->second; // If the number of arguments belonging to the same SignatureEnumDType is less than 2, skip it. - if (indexs.size() < 2) { + if (indices.size() < 2) { continue; } bool has_tensor = false; - for (const auto &index : indexs) { + for (const auto &index : indices) { AbstractBasePtr arg_value = args_spec_list[index]; if (arg_value->isa()) { arg_value = arg_value->cast()->ref(); @@ -189,7 +178,7 @@ std::map GetMaxDtype(const std::vector &signature, const abstract::AbstractBasePtrList &args_spec_list, const FuncGraphPtr &graph, - std::vector *const op_inputs, const std::set &write_indexs) { + std::vector *const op_inputs, const std::set &write_indices) { std::vector dtypes; (void)std::transform(signature.begin(), signature.end(), std::back_inserter(dtypes), [](const Signature &sig) { return sig.dtype; }); @@ -213,54 +202,40 @@ void DoAutoCast(const std::string &func_name, const std::vector &sign return; } // Stat the index of the arguments with the largest type in the same SignatureEnumDType. 
- std::map dst_type = GetMaxDtype(dtypes, args_spec_list, write_indexs); + std::map dst_type = GetMaxDtype(dtypes, args_spec_list, write_indices); // Identify which arg requires auto cast for (size_t i = 0; i < args_spec_list.size(); ++i) { auto it = dst_type.find(dtypes[i]); if (it == dst_type.end() || it->second == kTypeUnknown) { continue; } - auto rw_it = write_indexs.find(i); - auto is_write = (rw_it != write_indexs.end()); + auto rw_it = write_indices.find(i); + auto is_write = (rw_it != write_indices.end()); - AbstractBasePtr arg_value = args_spec_list[i]; - if (arg_value->isa()) { - if (is_write) { - arg_value = arg_value->cast()->ref_origin(); - } else { - arg_value = arg_value->cast()->ref(); - } - } TypeId arg_type_id = kTypeUnknown; - if (arg_value->isa()) { - auto tensor = arg_value->cast(); - auto tensor_type = tensor->element()->BuildType(); - MS_EXCEPTION_IF_NULL(tensor_type); - arg_type_id = tensor_type->type_id(); - } else if (arg_value->isa()) { - auto scalar = arg_value->cast(); - auto scalar_type = scalar->BuildType(); - MS_EXCEPTION_IF_NULL(scalar_type); - arg_type_id = scalar_type->type_id(); - } - auto it_map = type_map.find(arg_type_id); - if (it_map == type_map.end()) { + AbstractBasePtr arg_value = args_spec_list[i]; + (void)GetTensorOrScalarTypeInfo(arg_value, is_write, &arg_type_id); + auto it_map = type_name_map.find(arg_type_id); + if (it_map == type_name_map.end()) { continue; } if (is_write) { if (arg_type_id != it->second) { - MS_LOG(EXCEPTION) << "In op '" << func_name << "', argument '" << args_spec_list[i] - << "' can not cast type from '" << TypeIdLabel(arg_type_id) << "' to '" - << TypeIdLabel(it->second) << "' automatically."; + auto it_name_map = type_name_map.find(it->second); + if (it_name_map == type_name_map.end()) { + continue; + } + MS_LOG(EXCEPTION) << "In op '" << func_name << "', \n" + << "the type of writable argument is '" << it_map->second << "', " + << "but the largest type in the same SignatureEumDtype is '" << 
it_name_map->second + << "'. The writable arg type is not equal to the largest type, " + << "so can not cast automatically."; } continue; } if (arg_value->isa() && arg_type_id == it->second) { continue; } - if ((arg_type_id == kNumberTypeBool || it->second == kNumberTypeBool) && arg_type_id != it->second) { - continue; - } (*op_inputs)[i + 1] = DoCast((*op_inputs)[i + 1], it->second, graph); } } @@ -282,12 +257,16 @@ AnfNodePtr BuildNewCNode(const FuncGraphPtr &func_graph, const std::string &func } } std::vector op_inputs; - std::set write_indexs; + std::set write_indices; op_inputs.push_back(NewValueNode(function)); // Assume, the write input of op is always the first input. We check if any write op, // and add cast op on other inputs to keep the same type with assigned parameter. for (size_t i = 0; i < args_spec_list.size(); ++i) { AnfNodePtr param = params_list[i]; + if (args_spec_list[i] == nullptr) { + op_inputs.push_back(param); + continue; + } SignatureEnumRW sig = SignatureEnumRW::kRWDefault; // If sig_size is 0 use defalut. if (sig_size > 0 && i < sig_size) { @@ -295,13 +274,14 @@ AnfNodePtr BuildNewCNode(const FuncGraphPtr &func_graph, const std::string &func } else if (has_var && i >= sig_size) { sig = signature[sig_size - 1].rw; } + TypePtr type = args_spec_list[i]->GetTypeTrack(); if (type && type->type_id() == kObjectTypeRef) { if (sig == SignatureEnumRW::kRWRead) { param = func_graph->NewCNode({NewValueNode(prim::kPrimGetRefValue), param}); } else if (sig == SignatureEnumRW::kRWWrite) { param = func_graph->NewCNode({NewValueNode(prim::kPrimGetRefOrigin), param}); - write_indexs.insert(i); + write_indices.insert(i); } // If sig is SignatureEnumRW::kRWRef, not do anything. 
} else if (sig == SignatureEnumRW::kRWWrite && type->type_id() != kObjectTypeRefKey) { @@ -311,7 +291,7 @@ AnfNodePtr BuildNewCNode(const FuncGraphPtr &func_graph, const std::string &func } // process default ProcessDefault(func_name, args_spec_list, signature, has_var, &op_inputs); - DoAutoCast(func_name, signature, args_spec_list, func_graph, &op_inputs, write_indexs); + DoAutoCast(func_name, signature, args_spec_list, func_graph, &op_inputs, write_indices); return func_graph->NewCNode(op_inputs); } } // namespace @@ -330,7 +310,7 @@ FuncGraphPtr DoSignatureMetaFuncGraph::GenerateFuncGraph(const AbstractBasePtrLi } auto new_cnode = BuildNewCNode(func_graph, name_, function_, args_spec_list, func_graph->parameters()); func_graph->set_output(new_cnode); - func_graph->set_flags(FUNC_GRAPH_FLAG_CORE, true); + func_graph->set_flag(FUNC_GRAPH_FLAG_CORE, true); return func_graph; } } // namespace prim diff --git a/mindspore/ccsrc/operator/composite/list_append_operation.cc b/mindspore/ccsrc/operator/composite/list_append_operation.cc index b5a4fc626e..236a5b7062 100644 --- a/mindspore/ccsrc/operator/composite/list_append_operation.cc +++ b/mindspore/ccsrc/operator/composite/list_append_operation.cc @@ -35,7 +35,7 @@ FuncGraphPtr ListAppend::GenerateFuncGraph(const abstract::AbstractBasePtrList & MS_EXCEPTION_IF_NULL(arg0_list); FuncGraphPtr ret = std::make_shared(); - ret->set_flags(FUNC_GRAPH_FLAG_CORE, true); + ret->set_flag(FUNC_GRAPH_FLAG_CORE, true); ret->debug_info()->set_name("append"); AnfNodePtr arg0_node = ret->add_parameter(); diff --git a/mindspore/ccsrc/operator/composite/map.cc b/mindspore/ccsrc/operator/composite/map.cc new file mode 100644 index 0000000000..a054da5f4d --- /dev/null +++ b/mindspore/ccsrc/operator/composite/map.cc @@ -0,0 +1,289 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "operator/composite/map.h" +#include +#include +#include +#include + +#include "ir/anf.h" +#include "ir/func_graph.h" +#include "pipeline/static_analysis/abstract_value.h" +#include "pipeline/static_analysis/abstract_function.h" +#include "pipeline/static_analysis/dshape.h" +#include "pybind_api/api_register.h" +#include "debug/trace.h" +#include "operator/ops.h" +#include "./common.h" + +namespace mindspore { +// namespace to support composite operators definition +namespace prim { +using FuncGraphAbstractClosure = mindspore::abstract::FuncGraphAbstractClosure; + +AnfNodePtr Map::FullMakeLeaf(const FuncGraphPtr &func_graph, const AnfNodePtr &fn_arg, const AnfNodePtrList &args) { + MS_LOG(DEBUG) << "Map FullMakeLeaf non recursive.\n"; + MS_EXCEPTION_IF_NULL(func_graph); + std::vector inputs; + if (fn_arg != nullptr) { + inputs.emplace_back(fn_arg); + } else { + inputs.emplace_back(NewValueNode(fn_leaf_)); + } + inputs.insert(inputs.end(), args.begin(), args.end()); + return func_graph->NewCNode(inputs); +} + +FuncGraphPtr Map::GenerateLeafFunc(const size_t &args_size) { + // Generate func for leaf nodes + FuncGraphPtr ptrGraph = std::make_shared(); + ptrGraph->set_flag(FUNC_GRAPH_FLAG_CORE, true); + ptrGraph->set_flag(FUNC_GRAPH_FLAG_SPECIALIZE_PARAMETER, true); + ptrGraph->debug_info()->set_name("map"); + AnfNodePtr ptrFnArg = nullptr; + if (fn_leaf_ == nullptr) { + ptrFnArg = ptrGraph->add_parameter(); + } + AnfNodePtrList args; + for (size_t i = 0; i < args_size; ++i) { + args.emplace_back(ptrGraph->add_parameter()); + } + 
ptrGraph->set_output(FullMakeLeaf(ptrGraph, ptrFnArg, args)); + return ptrGraph; +} + +AnfNodePtr Map::FullMakeList(const std::shared_ptr &type, const FuncGraphPtr &func_graph, + const AnfNodePtr &fn_arg, const ArgsPairList &arg_pairs) { + MS_EXCEPTION_IF_NULL(func_graph); + MS_EXCEPTION_IF_NULL(type); + + std::size_t size = type->elements().size(); + bool is_not_same = + std::any_of(arg_pairs.begin(), arg_pairs.end(), [size](const std::pair &item) { + auto lhs = std::dynamic_pointer_cast(item.second); + MS_EXCEPTION_IF_NULL(lhs); + return lhs->elements().size() != size; + }); + if (is_not_same) { + MS_LOG(EXCEPTION) << "List in Map should have same length"; + } + + std::vector inputs; + inputs.push_back(NewValueNode(prim::kPrimMakeList)); + + for (int i = 0; i < SizeToInt(size); ++i) { + MS_LOG(DEBUG) << "GenerateLeafFunc for the " << i << "th arg of the target"; + auto ptrGraph = GenerateLeafFunc(arg_pairs.size()); + auto fn = NewValueNode(ptrGraph); + + std::vector inputs2; + inputs2.push_back(fn); + if (fn_arg != nullptr) { + inputs2.push_back(fn_arg); + } + + (void)std::transform( + arg_pairs.begin(), arg_pairs.end(), std::back_inserter(inputs2), + [&func_graph, i](const std::pair &item) { + return func_graph->NewCNode({NewValueNode(prim::kPrimListGetItem), item.first, NewValueNode(i)}); + }); + + inputs.push_back(func_graph->NewCNode(inputs2)); + } + return func_graph->NewCNode(inputs); +} + +AnfNodePtr Map::FullMakeTuple(const std::shared_ptr &type, const FuncGraphPtr &func_graph, + const AnfNodePtr &fn_arg, const ArgsPairList &arg_pairs) { + MS_EXCEPTION_IF_NULL(func_graph); + MS_EXCEPTION_IF_NULL(type); + + std::size_t size = type->elements().size(); + bool is_not_same = + std::any_of(arg_pairs.begin(), arg_pairs.end(), [size](const std::pair &item) { + auto lhs = std::dynamic_pointer_cast(item.second); + MS_EXCEPTION_IF_NULL(lhs); + return lhs->elements().size() != size; + }); + if (is_not_same) { + MS_LOG(EXCEPTION) << "tuple in Map should have same 
length"; + } + + std::vector inputs; + inputs.push_back(NewValueNode(prim::kPrimMakeTuple)); + + for (int i = 0; i < SizeToInt(size); ++i) { + MS_LOG(DEBUG) << "GenerateLeafFunc for the " << i << "th arg of the tuple inputs"; + auto ptrGraph = GenerateLeafFunc(arg_pairs.size()); + auto fn = NewValueNode(ptrGraph); + + std::vector inputs2; + inputs2.push_back(fn); + if (fn_arg != nullptr) { + inputs2.push_back(fn_arg); + } + + (void)std::transform( + arg_pairs.begin(), arg_pairs.end(), std::back_inserter(inputs2), + [&func_graph, &i](std::pair item) { + return func_graph->NewCNode({NewValueNode(prim::kPrimTupleGetItem), item.first, NewValueNode(i)}); + }); + + inputs.push_back(func_graph->NewCNode(inputs2)); + } + return func_graph->NewCNode(inputs); +} + +AnfNodePtr Map::FullMakeClass(const std::shared_ptr &type, const FuncGraphPtr &func_graph, + const AnfNodePtr &fn_arg, const ArgsPairList &arg_pairs) { + MS_EXCEPTION_IF_NULL(type); + MS_EXCEPTION_IF_NULL(func_graph); + + std::vector inputs; + inputs.push_back(NewValueNode(prim::kPrimMakeRecord)); + inputs.push_back(NewValueNode(type)); + + std::size_t attrSize = type->GetAttributes().size(); + for (std::size_t i = 0; i < attrSize; ++i) { + MS_LOG(DEBUG) << "GenerateLeafFunc for the " << i << "th element of the inputs"; + auto ptrGraph = GenerateLeafFunc(arg_pairs.size()); + auto fn = NewValueNode(ptrGraph); + + std::vector inputs2; + inputs2.push_back(fn); + if (fn_arg != nullptr) { + inputs2.push_back(fn_arg); + } + + int j = 0; + for (auto item : arg_pairs) { + inputs2.push_back(func_graph->NewCNode({NewValueNode(prim::kPrimGetAttr), item.first, NewValueNode(j)})); + j++; + } + + inputs.push_back(func_graph->NewCNode(inputs2)); + } + return func_graph->NewCNode(inputs); +} + +AnfNodePtr Map::Make(const FuncGraphPtr &func_graph, const AnfNodePtr &fn_arg, const ArgsPairList &arg_pairs) { + bool found = false; + TypeId id = kObjectTypeEnd; + std::pair pair; + for (auto &item : arg_pairs) { + pair = item; + 
MS_LOG(DEBUG) << "Map " << pair.second->ToString(); + id = item.second->type_id(); + if (nonleaf_.count(id)) { + found = true; + break; + } + } + + if (found) { + // In a nonleaf situation, all arguments must have the same generic. + bool is_not_same = + std::any_of(arg_pairs.begin(), arg_pairs.end(), [pair](const std::pair &item) { + if (item.first != pair.first) { + return item.second->type_id() != pair.second->type_id(); + } + return false; + }); + if (is_not_same) { + std::ostringstream oss; + oss << "There are " << arg_pairs.size() << " inputs of `" << name_ << "`, corresponding type info:\n" + << trace::GetDebugInfo(func_graph->debug_info()) << "\n"; + int idx = 0; + for (auto &item : arg_pairs) { + oss << ++idx << ": " << item.second->ToString() << "\n"; + } + MS_LOG(EXCEPTION) << "Map cannot match up all input types of arguments.\n" + << oss.str() << pair.second->ToString() << "\n"; + } + } + + switch (id) { + case kObjectTypeList: { + auto type = std::static_pointer_cast(pair.second); + return FullMakeList(type, func_graph, fn_arg, arg_pairs); + } + case kObjectTypeTuple: { + auto type = std::static_pointer_cast(pair.second); + return FullMakeTuple(type, func_graph, fn_arg, arg_pairs); + } + case kObjectTypeClass: { + auto type = std::static_pointer_cast(pair.second); + return FullMakeClass(type, func_graph, fn_arg, arg_pairs); + } + default: + MS_LOG(EXCEPTION) << "Map can only be applied to list, tuple and class " + << ", but got " << pair.second->ToString(); + } +} + +FuncGraphPtr Map::GenerateFromTypes(const TypePtrList &args_spec_list) { + FuncGraphPtr ptrGraph = std::make_shared(); + ptrGraph->set_flag(FUNC_GRAPH_FLAG_CORE, true); + ptrGraph->set_flag(FUNC_GRAPH_FLAG_SPECIALIZE_PARAMETER, true); + ptrGraph->debug_info()->set_name("map"); + + AnfNodePtr ptrFnArg = nullptr; + std::size_t i = 0; + if (fn_leaf_ == nullptr) { + ptrFnArg = ptrGraph->add_parameter(); + i = 1; + } + ArgsPairList arg_pairs; + std::size_t size = args_spec_list.size(); + for (; 
i < size; ++i) { + MS_LOG(DEBUG) << "GenerateFromTypes for elements from " << args_spec_list[i]->ToString(); + arg_pairs.push_back(std::make_pair(ptrGraph->add_parameter(), args_spec_list[i])); + } + + ptrGraph->set_output(Make(ptrGraph, ptrFnArg, arg_pairs)); + return ptrGraph; +} + +abstract::AbstractBasePtrList Map::NormalizeArgs(const AbstractBasePtrList &args_spec_list) const { + if (fn_leaf_ == nullptr) { + MS_EXCEPTION_IF_NULL(args_spec_list[0]); + // Assert that map's function param does not contain free variables + if (args_spec_list[0]->isa()) { + auto graph_func = dyn_cast(args_spec_list[0]); + auto func_graph = graph_func->func_graph(); + if (func_graph->parent() != nullptr) { + MS_LOG(EXCEPTION) << "Map don't support Closure with free variable yet."; + } + } + } + + AbstractBasePtrList broadened; + (void)std::transform(args_spec_list.begin(), args_spec_list.end(), std::back_inserter(broadened), + [](const AbstractBasePtr &arg) -> AbstractBasePtr { + MS_EXCEPTION_IF_NULL(arg); + return arg->Broaden(); + }); + return broadened; +} + +REGISTER_PYBIND_DEFINE(Map_, ([](const py::module *m) { + (void)py::class_>(*m, "Map_") + .def(py::init>(), py::arg("leaf")) + .def(py::init<>()); + })); +} // namespace prim +} // namespace mindspore diff --git a/mindspore/ccsrc/operator/composite/map.h b/mindspore/ccsrc/operator/composite/map.h new file mode 100644 index 0000000000..02d374214a --- /dev/null +++ b/mindspore/ccsrc/operator/composite/map.h @@ -0,0 +1,98 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_OPERATOR_COMPOSITE_MAP_H_ +#define MINDSPORE_CCSRC_OPERATOR_COMPOSITE_MAP_H_ + +#include +#include +#include +#include + +#include "ir/dtype.h" +#include "ir/meta_func_graph.h" +#include "operator/composite/multitype_funcgraph.h" + +namespace mindspore { +// namespace to support composite operators definition +namespace prim { +using ArgsPairList = std::vector>; + +class Map : public MetaFuncGraph { + public: + explicit Map(const std::shared_ptr &fn_leaf = nullptr) + : MetaFuncGraph("map"), + fn_leaf_(fn_leaf), + broadcast_(false), + nonleaf_({kObjectTypeList, kObjectTypeTuple, kObjectTypeClass}) { + Init(); + } + Map(const Map &h) : MetaFuncGraph("map"), fn_leaf_(h.fn_leaf_), broadcast_(h.broadcast_), nonleaf_(h.nonleaf_) { + Init(); + } + Map &operator=(const Map &h) { + if (this != &h) { + fn_leaf_ = h.fn_leaf_; + broadcast_ = h.broadcast_; + nonleaf_ = h.nonleaf_; + if (fn_leaf_) { + name_ = "map[" + fn_leaf_->name() + "]"; + } + } + return *this; + } + ~Map() override = default; + MS_DECLARE_PARENT(Map, MetaFuncGraph) + abstract::AbstractBasePtrList NormalizeArgs(const abstract::AbstractBasePtrList &args_spec_list) const override; + FuncGraphPtr GenerateFromTypes(const TypePtrList &args_spec_list) override; + MetaFuncGraphPtr GetFnLeaf() { return fn_leaf_; } + + private: + FuncGraphPtr GenerateLeafFunc(const size_t &args_size); + AnfNodePtr FullMakeLeaf(const FuncGraphPtr &func_graph, const AnfNodePtr &fn_arg, const AnfNodePtrList &args); + AnfNodePtr FullMakeList(const std::shared_ptr &type, const FuncGraphPtr &func_graph, const AnfNodePtr &fn_arg, + const ArgsPairList &arg_pairs); + AnfNodePtr FullMakeTuple(const std::shared_ptr &type, const FuncGraphPtr &func_graph, const AnfNodePtr &fn_arg, + const ArgsPairList &arg_pairs); + AnfNodePtr FullMakeClass(const std::shared_ptr &type, const FuncGraphPtr &func_graph, const 
AnfNodePtr &fn_arg, + const ArgsPairList &arg_pairs); + AnfNodePtr Make(const FuncGraphPtr &graph, const AnfNodePtr &fn_arg, const ArgsPairList &arg_pairs); + void Init() { + if (fn_leaf_ != nullptr) { + name_ = "map[" + fn_leaf_->name() + "]"; + } + signatures_ = + // def map(func:read, *args:ref): + std::vector({{"func", SignatureEnumRW::kRWRead, SignatureEnumKind::kKindDefault}, + {"args", SignatureEnumRW::kRWRef, SignatureEnumKind::kKindVarPositional}}); + } + + MultitypeFuncGraphPtr fn_leaf_; + bool broadcast_; + std::set nonleaf_; +}; +using MapPtr = std::shared_ptr; +class MapPy : public Map { + public: + explicit MapPy(const std::shared_ptr &fn_leaf = nullptr) : Map(fn_leaf) {} + ~MapPy() override = default; + MS_DECLARE_PARENT(MapPy, Map) +}; +using MapPyPtr = std::shared_ptr; +} // namespace prim +} // namespace mindspore + +#endif // MINDSPORE_CCSRC_OPERATOR_COMPOSITE_MAP_H_ diff --git a/mindspore/ccsrc/operator/composite/multitype_funcgraph.cc b/mindspore/ccsrc/operator/composite/multitype_funcgraph.cc index e3957d044a..88b3134508 100644 --- a/mindspore/ccsrc/operator/composite/multitype_funcgraph.cc +++ b/mindspore/ccsrc/operator/composite/multitype_funcgraph.cc @@ -39,7 +39,6 @@ namespace mindspore { // namespace to support composite operators definition namespace prim { - MultitypeFuncGraph::MultitypeFuncGraph(const std::string &name) : MetaFuncGraph(name) { fn_cache_.clear(); signatures_ = std::vector({// def multitype(*args:ref): @@ -148,6 +147,5 @@ REGISTER_PYBIND_DEFINE(MultitypeFuncGraph_, ([](const py::module *m) { .def(py::init()) .def("register_fn", &MultitypeFuncGraph::PyRegister); })); - } // namespace prim } // namespace mindspore diff --git a/mindspore/ccsrc/operator/composite/multitype_funcgraph.h b/mindspore/ccsrc/operator/composite/multitype_funcgraph.h index b38625d62c..feb38f17ba 100644 --- a/mindspore/ccsrc/operator/composite/multitype_funcgraph.h +++ b/mindspore/ccsrc/operator/composite/multitype_funcgraph.h @@ -34,7 +34,6 @@ 
namespace mindspore { // namespace to support composite operators definition namespace prim { - class MultitypeFuncGraph : public MetaFuncGraph { public: explicit MultitypeFuncGraph(const std::string &name); @@ -59,7 +58,6 @@ class MultitypeFuncGraph : public MetaFuncGraph { std::unordered_map fn_cache_py_; }; using MultitypeFuncGraphPtr = std::shared_ptr; - } // namespace prim } // namespace mindspore diff --git a/mindspore/ccsrc/operator/composite/unpack_call.cc b/mindspore/ccsrc/operator/composite/unpack_call.cc index 6363d495c5..3993d41597 100644 --- a/mindspore/ccsrc/operator/composite/unpack_call.cc +++ b/mindspore/ccsrc/operator/composite/unpack_call.cc @@ -51,7 +51,7 @@ FuncGraphPtr UnpackCall::GenerateFuncGraph(const AbstractBasePtrList &args_spec_ (void)abstract::CheckArg(op_name, args_spec_list, 0); auto ret_graph = std::make_shared(); - ret_graph->set_flags(FUNC_GRAPH_FLAG_CORE, true); + ret_graph->set_flag(FUNC_GRAPH_FLAG_CORE, true); AnfNodePtr fnNode = ret_graph->add_parameter(); std::vector elems; diff --git a/mindspore/ccsrc/operator/composite/zip_operation.cc b/mindspore/ccsrc/operator/composite/zip_operation.cc index 4d34163f28..33e21da044 100644 --- a/mindspore/ccsrc/operator/composite/zip_operation.cc +++ b/mindspore/ccsrc/operator/composite/zip_operation.cc @@ -57,7 +57,7 @@ FuncGraphPtr ZipOperation::GenerateFuncGraph(const AbstractBasePtrList &args_spe return (x->cast()->size() < y->cast()->size()); }); FuncGraphPtr ret_graph = std::make_shared(); - ret_graph->set_flags(FUNC_GRAPH_FLAG_CORE, true); + ret_graph->set_flag(FUNC_GRAPH_FLAG_CORE, true); for (size_t idx = 0; idx < args_spec_list.size(); idx++) { (void)ret_graph->add_parameter(); } diff --git a/mindspore/ccsrc/operator/ops.cc b/mindspore/ccsrc/operator/ops.cc index 36bbe1a650..f86cbd7fd2 100755 --- a/mindspore/ccsrc/operator/ops.cc +++ b/mindspore/ccsrc/operator/ops.cc @@ -50,6 +50,12 @@ const PrimitivePtr kPrimBoolNot = std::make_shared("bool_not"); const PrimitivePtr kPrimBoolAnd 
= std::make_shared("bool_and"); const PrimitivePtr kPrimBoolOr = std::make_shared("bool_or"); const PrimitivePtr kPrimBoolEq = std::make_shared("bool_eq"); +const PrimitivePtr kPrimGreater = std::make_shared("Greater"); +const PrimitivePtr kPrimGreaterEqual = std::make_shared("GreaterEqual"); +const PrimitivePtr kPrimLess = std::make_shared("Less"); +const PrimitivePtr kPrimLessEqual = std::make_shared("LessEqual"); +const PrimitivePtr kPrimEqual = std::make_shared("Equal"); +const PrimitivePtr kPrimNotEqual = std::make_shared("NotEqual"); // Type introspection const PrimitivePtr kPrimTypeOf = std::make_shared("typeof"); @@ -133,6 +139,8 @@ const PrimitivePtr kPrimConcat = std::make_shared("Concat"); const PrimitivePtr kPrimSqueeze = std::make_shared("Squeeze"); const PrimitivePtr kPrimTranspose = std::make_shared("Transpose"); const PrimitivePtr kPrimGatherV2 = std::make_shared("GatherV2"); +const PrimitivePtr kPrimEmbeddingLookup = std::make_shared("EmbeddingLookup"); +const PrimitivePtr kPrimEmbeddingLookupCommGrad = std::make_shared("EmbeddingLookupCommGrad"); const PrimitivePtr kPrimSize = std::make_shared("Size"); const PrimitivePtr kPrimArgMax = std::make_shared("Argmax"); const PrimitivePtr kPrimPack = std::make_shared("Pack"); @@ -145,6 +153,7 @@ const PrimitivePtr kPrimAddN = std::make_shared("AddN"); const PrimitivePtr KPrimTransData = std::make_shared("TransData"); const PrimitivePtr kPrimNMSWithMask = std::make_shared("NMSWithMask"); const PrimitivePtr kPrimPad = std::make_shared("Pad"); +const PrimitivePtr kPrimArgMaxWithValue = std::make_shared("ArgMaxWithValue"); // Maths const PrimitivePtr kPrimTensorAdd = std::make_shared("TensorAdd"); @@ -163,14 +172,20 @@ const PrimitivePtr kPrimMul = std::make_shared("Mul"); const PrimitivePtr kPrimMinimum = std::make_shared("Minimum"); const PrimitivePtr kPrimMaximum = std::make_shared("Maximum"); const PrimitivePtr kPrimSquare = std::make_shared("Square"); -const PrimitivePtr kPrimEqual = 
std::make_shared("Equal"); -const PrimitivePtr kPrimLess = std::make_shared("Less"); -const PrimitivePtr kPrimLessEqual = std::make_shared("LessEqual"); const PrimitivePtr kPrimCumSum = std::make_shared("CumSum"); const PrimitivePtr kPrimCumProd = std::make_shared("CumProd"); +const PrimitivePtr kPrimSubscalar = std::make_shared("Subscalar"); +const PrimitivePtr kPrimInplaceAdd = std::make_shared("InplaceAdd"); +const PrimitivePtr kPrimInplaceSub = std::make_shared("InplaceSub"); +const PrimitivePtr kPrimPow = std::make_shared("Pow"); +const PrimitivePtr kPrimRealDiv = std::make_shared("RealDiv"); +const PrimitivePtr kPrimSqrt = std::make_shared("Sqrt"); +const PrimitivePtr kPrimReciprocal = std::make_shared("Reciprocal"); +const PrimitivePtr kPrimExpandDims = std::make_shared("ExpandDims"); // NN const PrimitivePtr kPrimFlatten = std::make_shared("Flatten"); +const PrimitivePtr kPrimSoftmax = std::make_shared("Softmax"); const PrimitivePtr kPrimLogSoftmax = std::make_shared("LogSoftmax"); const PrimitivePtr kPrimLogSoftmaxGrad = std::make_shared("LogSoftmaxGrad"); const PrimitivePtr kPrimTanh = std::make_shared("Tanh"); @@ -205,18 +220,21 @@ const PrimitivePtr kPrimLayerNormGrad = std::make_shared("LayerNormGr const PrimitivePtr kPrimLayerNormXBackprop = std::make_shared("LayerNormXBackprop"); const PrimitivePtr kPrimLayerNormBetaGammaBackprop = std::make_shared("LayerNormBetaGammaBackprop"); const PrimitivePtr kPrimDropoutGenMask = std::make_shared("DropoutGenMask"); +const PrimitivePtr kPrimDropoutDoMask = std::make_shared("DropoutDoMask"); const PrimitivePtr kPrimOneHot = std::make_shared("OneHot"); const PrimitivePtr kPrimGelu = std::make_shared("Gelu"); const PrimitivePtr kPrimGeluGrad = std::make_shared("GeluGrad"); const PrimitivePtr kPrimRelu = std::make_shared("ReLU"); const PrimitivePtr kPrimReluV2 = std::make_shared("ReLUV2"); -const PrimitivePtr kPrimZerosLikeTensor = std::make_shared("zeros_like_tensor"); +const PrimitivePtr kPrimZerosLike = 
std::make_shared("ZerosLike"); const PrimitivePtr kPrimFakeBprop = std::make_shared("fake_bprop"); const PrimitivePtr kPrimBpropCut = std::make_shared("bprop_cut"); +const PrimitivePtr kPrimFakeQuantPerLayer = std::make_shared("FakeQuantPerLayer"); +const PrimitivePtr kPrimFakeQuantPerChannel = std::make_shared("FakeQuantPerChannel"); // Other miscellaneous const PrimitivePtr kPrimIdentity = std::make_shared("identity"); -const PrimitivePtr kPrimPartial = std::make_shared("partial"); +const PrimitivePtr kPrimPartial = std::make_shared("Partial"); const PrimitivePtr kPrimJ = std::make_shared("J"); const PrimitivePtr kPrimEnvSetItem = std::make_shared("env_setitem"); const PrimitivePtr kPrimEnvGetItem = std::make_shared("env_getitem"); @@ -233,7 +251,7 @@ const PrimitivePtr kPrimCheckBprop = std::make_shared("CheckBprop"); const PrimitivePtr kPrimPrint = std::make_shared("Print"); const PrimitivePtr kPrimMakeRef = std::make_shared("make_ref"); -const PrimitivePtr kPrimDepend = std::make_shared("depend"); +const PrimitivePtr kPrimDepend = std::make_shared("Depend"); const PrimitivePtr kPrimStateSetItem = std::make_shared("state_setitem"); const PrimitivePtr kPrimBroadcastGradientArgs = std::make_shared("BroadcastGradientArgs"); @@ -242,11 +260,15 @@ const PrimitivePtr kPrimIs_ = std::make_shared("is_"); const PrimitivePtr kPrimIsNot = std::make_shared("is_not"); const PrimitivePtr kPrimInDict = std::make_shared("in_dict"); const PrimitivePtr kPrimNotInDict = std::make_shared("not_in_dict"); +const PrimitivePtr kPrimMixedPrecisionCast = std::make_shared("mixed_precision_cast"); +const PrimitivePtr kPrimIsConsant = std::make_shared("is_constant"); +const PrimitivePtr kPrimEquivFormat = std::make_shared("EquivFormat"); // Comm ops const PrimitivePtr kPrimMirror = std::make_shared("_MirrorOperator"); const PrimitivePtr kPrimVirtualDiv = std::make_shared("_VirtualDiv"); const PrimitivePtr kPrimVirtualDataset = std::make_shared("_VirtualDataset"); +const PrimitivePtr 
kPrimAllReduce = std::make_shared("AllReduce"); // Debug ops const PrimitivePtr kPrimScalarSummary = std::make_shared("ScalarSummary"); diff --git a/mindspore/ccsrc/operator/ops.h b/mindspore/ccsrc/operator/ops.h index 03527f7be2..65327cf407 100755 --- a/mindspore/ccsrc/operator/ops.h +++ b/mindspore/ccsrc/operator/ops.h @@ -27,7 +27,8 @@ namespace mindspore { // namespace to support primitive operators namespace prim { ValuePtr GetPythonOps(const std::string &op_name, - const std::string &module_name = "mindspore._extends.parse.standard_method"); + const std::string &module_name = "mindspore._extends.parse.standard_method", + bool use_signature = false); // Arithmetic extern const PrimitivePtr kPrimScalarAdd; @@ -58,6 +59,12 @@ extern const PrimitivePtr kPrimBoolNot; extern const PrimitivePtr kPrimBoolAnd; extern const PrimitivePtr kPrimBoolOr; extern const PrimitivePtr kPrimBoolEq; +extern const PrimitivePtr kPrimGreater; +extern const PrimitivePtr kPrimGreaterEqual; +extern const PrimitivePtr kPrimLess; +extern const PrimitivePtr kPrimLessEqual; +extern const PrimitivePtr kPrimEqual; +extern const PrimitivePtr kPrimNotEqual; // Type introspection extern const PrimitivePtr kPrimTypeOf; @@ -140,6 +147,8 @@ extern const PrimitivePtr kPrimConcat; extern const PrimitivePtr kPrimSqueeze; extern const PrimitivePtr kPrimTranspose; extern const PrimitivePtr kPrimGatherV2; +extern const PrimitivePtr kPrimEmbeddingLookup; +extern const PrimitivePtr kPrimEmbeddingLookupCommGrad; extern const PrimitivePtr kPrimSize; extern const PrimitivePtr kPrimArgMax; extern const PrimitivePtr kPrimPack; @@ -153,6 +162,11 @@ extern const PrimitivePtr kPrimAddN; extern const PrimitivePtr KPrimTransData; extern const PrimitivePtr kPrimNMSWithMask; extern const PrimitivePtr kPrimPad; +extern const PrimitivePtr kPrimArgMaxWithValue; +extern const PrimitivePtr kPrimRealDiv; +extern const PrimitivePtr kPrimSqrt; +extern const PrimitivePtr kPrimReciprocal; +extern const PrimitivePtr 
kPrimExpandDims; // Maths extern const PrimitivePtr kPrimTensorAdd; @@ -176,9 +190,14 @@ extern const PrimitivePtr kPrimLess; extern const PrimitivePtr kPrimLessEqual; extern const PrimitivePtr kPrimCumSum; extern const PrimitivePtr kPrimCumProd; +extern const PrimitivePtr kPrimSubscalar; +extern const PrimitivePtr kPrimInplaceAdd; +extern const PrimitivePtr kPrimInplaceSub; +extern const PrimitivePtr kPrimPow; // NN extern const PrimitivePtr kPrimFlatten; +extern const PrimitivePtr kPrimSoftmax; extern const PrimitivePtr kPrimLogSoftmax; extern const PrimitivePtr kPrimLogSoftmaxGrad; extern const PrimitivePtr kPrimApplyCenteredRMSProp; @@ -211,15 +230,18 @@ extern const PrimitivePtr kPrimLayerNormGrad; extern const PrimitivePtr kPrimLayerNormXBackprop; extern const PrimitivePtr kPrimLayerNormBetaGammaBackprop; extern const PrimitivePtr kPrimDropoutGenMask; +extern const PrimitivePtr kPrimDropoutDoMask; extern const PrimitivePtr kPrimOneHot; extern const PrimitivePtr kPrimGelu; extern const PrimitivePtr kPrimGeluGrad; extern const PrimitivePtr kPrimRelu; extern const PrimitivePtr kPrimReluV2; extern const PrimitivePtr kPrimActivation; -extern const PrimitivePtr kPrimZerosLikeTensor; +extern const PrimitivePtr kPrimZerosLike; extern const PrimitivePtr kPrimFakeBprop; extern const PrimitivePtr kPrimBpropCut; +extern const PrimitivePtr kPrimFakeQuantPerLayer; +extern const PrimitivePtr kPrimFakeQuantPerChannel; // Other Miscellaneous extern const PrimitivePtr kPrimIdentity; @@ -251,8 +273,12 @@ extern const PrimitivePtr kPrimIs_; extern const PrimitivePtr kPrimIsNot; extern const PrimitivePtr kPrimInDict; extern const PrimitivePtr kPrimNotInDict; +extern const PrimitivePtr kPrimMixedPrecisionCast; +extern const PrimitivePtr kPrimIsConsant; +extern const PrimitivePtr kPrimEquivFormat; // Comm ops +extern const PrimitivePtr kPrimAllReduce; extern const PrimitivePtr kPrimMirror; extern const PrimitivePtr kPrimVirtualDiv; extern const PrimitivePtr kPrimVirtualDataset; 
diff --git a/mindspore/ccsrc/operator/ops_extends.cc b/mindspore/ccsrc/operator/ops_extends.cc index 6a192eca10..d415b45adf 100755 --- a/mindspore/ccsrc/operator/ops_extends.cc +++ b/mindspore/ccsrc/operator/ops_extends.cc @@ -23,10 +23,10 @@ namespace mindspore { // namespace to support primitive operators namespace prim { -ValuePtr GetPythonOps(const std::string &op_name, const std::string &module_name) { +ValuePtr GetPythonOps(const std::string &op_name, const std::string &module_name, bool use_signature) { py::object obj = parse::python_adapter::GetPyFn(module_name, op_name); ValuePtr node = nullptr; - bool succ = parse::ConvertData(obj, &node); + bool succ = parse::ConvertData(obj, &node, use_signature); if (!succ) { MS_LOG(EXCEPTION) << "get Python op " << op_name << " from " << module_name << " fail"; } diff --git a/mindspore/ccsrc/operator/prim_nn.cc b/mindspore/ccsrc/operator/prim_nn.cc index d057fd925d..d9a0071757 100644 --- a/mindspore/ccsrc/operator/prim_nn.cc +++ b/mindspore/ccsrc/operator/prim_nn.cc @@ -271,8 +271,8 @@ AbstractBasePtr InferImplRelu(const AnalysisEnginePtr &, const PrimitivePtr &pri return args_spec_list[0]->Broaden(); } -AbstractBasePtr InferImplZerosLikeTensor(const AnalysisEnginePtr &, const PrimitivePtr &primitive, - const AbstractBasePtrList &args_spec_list) { +AbstractBasePtr InferImplZerosLike(const AnalysisEnginePtr &, const PrimitivePtr &primitive, + const AbstractBasePtrList &args_spec_list) { // Inputs: a tensor. CheckArgsSize(primitive->name(), args_spec_list, 1); return args_spec_list[0]->Broaden(); diff --git a/mindspore/ccsrc/operator/prim_others.cc b/mindspore/ccsrc/operator/prim_others.cc index b8e89378e6..432b12f83b 100644 --- a/mindspore/ccsrc/operator/prim_others.cc +++ b/mindspore/ccsrc/operator/prim_others.cc @@ -14,9 +14,14 @@ * limitations under the License. 
*/ +#include +#include + +#include "ir/dtype.h" +#include "common/utils.h" +#include "operator/ops.h" #include "pipeline/static_analysis/param_validator.h" #include "pipeline/static_analysis/prim.h" -#include "operator/ops.h" #include "pipeline/static_analysis/utils.h" #include "utils/symbolic.h" @@ -50,6 +55,81 @@ AbstractBasePtr InferImplJ(const AnalysisEnginePtr &, const PrimitivePtr &primit return AbstractFunction::MakeAbstractFunction(jv); } +class UndeterminedShapeType { + public: + explicit UndeterminedShapeType(const std::string &env_str) { + // param_name indices_shape indices_type values_shape values_type dense_shape + // export UNDETERMINED_SPARSE_SHAPE_TYPES="sparse_key_w1:2:Int32:2 1 2:Float32:3 1 2;sparse_key_w2:2:Int32:2 1 + // 2:Float32:3 1 2" + std::vector fields; + string tmp; + std::stringstream input(env_str); + while (std::getline(input, tmp, ':')) { + fields.push_back(tmp); + } + if (fields.size() != fields_num) { + MS_LOG(EXCEPTION) << "Expect " << fields_num << " fields, but got " << fields.size(); + } + + param_name_ = fields[0]; + + indices_shape_ = GetShape(fields[1]); + indices_type_ = StringToType(fields[2]); + + values_shape_ = GetShape(fields[3]); + values_type_ = StringToType(fields[4]); + + auto dense_shape_vec = GetShape(fields[5]); + AbstractBasePtrList dense_shape_list; + (void)std::transform(dense_shape_vec.begin(), dense_shape_vec.end(), std::back_inserter(dense_shape_list), + [](const auto &elem) { return FromValue(elem, false); }); + dense_shape_ = dense_shape_list; + } + ~UndeterminedShapeType() = default; + const std::string ¶m_name() { return param_name_; } + const std::vector &indices_shape() { return indices_shape_; } + const TypePtr &indices_type() { return indices_type_; } + const std::vector &values_shape() { return values_shape_; } + const TypePtr &values_type() { return values_type_; } + const AbstractBasePtrList &dense_shape() { return dense_shape_; } + + private: + std::string param_name_; + std::vector 
indices_shape_; + TypePtr indices_type_; + std::vector values_shape_; + TypePtr values_type_; + AbstractBasePtrList dense_shape_; + static const size_t fields_num; + + std::vector GetShape(const std::string &shape_str); +}; +std::vector UndeterminedShapeType::GetShape(const std::string &shape_str) { + std::vector ret; + std::istringstream iss(shape_str); + int elem; + while (iss.good()) { + iss >> elem; + ret.emplace_back(elem); + } + return ret; +} +const size_t UndeterminedShapeType::fields_num = 6; + +std::unordered_map g_undetermined_configs; +void InitUndeterminedFromEnv(const std::string &sparse_shape_types) { + if (!g_undetermined_configs.empty()) { + return; + } + std::string tmp; + std::stringstream input(sparse_shape_types); + while (std::getline(input, tmp, ';')) { + auto config = UndeterminedShapeType(tmp); + g_undetermined_configs.insert(std::make_pair(config.param_name(), config)); + MS_LOG(DEBUG) << "Undetermined config from env: " << tmp; + } +} + AbstractBasePtr InferImplEnvGetItem(const AnalysisEnginePtr &, const PrimitivePtr &primitive, const AbstractBasePtrList &args_spec_list) { MS_EXCEPTION_IF_NULL(primitive); @@ -62,6 +142,37 @@ AbstractBasePtr InferImplEnvGetItem(const AnalysisEnginePtr &, const PrimitivePt if (type->type_id() != kObjectTypeSymbolicKeyType) { MS_LOG(EXCEPTION) << "EnvGetItem evaluator args[1] should be a SymbolicKeyInstance but: " << key->ToString(); } + + if (!key->sparse_grad().empty()) { + // Will be fixed once undetermined type ready + auto sparse_shape_types = common::GetEnv("UNDETERMINED_SPARSE_SHAPE_TYPES"); + if (sparse_shape_types.empty()) { + sparse_shape_types = "sparse_key_w1:2:Int32:2 1 2:Float32:3 1 2;sparse_key_w2:2:Int32:2 1 2:Float32:3 1 2"; + } + InitUndeterminedFromEnv(sparse_shape_types); + + auto shape_types = g_undetermined_configs.find(key->sparse_grad()); + if (shape_types == g_undetermined_configs.end()) { + MS_LOG(EXCEPTION) << "Param " << key->ToString() + << " has sparse_grad, but shape/type is 
not configured in env UNDETERMINED_SPARSE_SHAPE_TYPES: " + << sparse_shape_types; + } + MS_LOG(DEBUG) << "EnvGetItem is sparse_grad " << key->ToString(); + AbstractBasePtrList sparse_list; + // indices + auto indices_ele = std::make_shared(kAnyValue, shape_types->second.indices_type()); + auto indices = + std::make_shared(indices_ele, std::make_shared(shape_types->second.indices_shape())); + sparse_list.emplace_back(indices); + // values + auto dout_ele = std::make_shared(kAnyValue, shape_types->second.values_type()); + auto dout = std::make_shared(dout_ele, std::make_shared(shape_types->second.values_shape())); + sparse_list.emplace_back(dout); + // dense_shape + sparse_list.emplace_back(std::make_shared(shape_types->second.dense_shape())); + return std::make_shared(sparse_list); + } + if (!key->GetValueTrack()->isa()) { return dflt; } @@ -80,8 +191,6 @@ AbstractBasePtr InferImplEnvSetItem(const AnalysisEnginePtr &, const PrimitivePt CheckArgsSize(primitive->name(), args_spec_list, 3); auto key = args_spec_list[1]; - auto value = args_spec_list[2]; - ValuePtr key_value_ptr = key->GetValueTrack(); MS_EXCEPTION_IF_NULL(key_value_ptr); auto key_value_track = key_value_ptr->cast(); @@ -91,7 +200,6 @@ AbstractBasePtr InferImplEnvSetItem(const AnalysisEnginePtr &, const PrimitivePt } auto expected = key_value_track->abstract(); MS_EXCEPTION_IF_NULL(expected); - (void)expected->Join(value); return std::make_shared(kAnyValue, std::make_shared()); } @@ -126,7 +234,9 @@ AbstractBasePtr InferImplMakeRef(const AnalysisEnginePtr &, const PrimitivePtr & if (type->type_id() != kObjectTypeRefKey) { MS_LOG(EXCEPTION) << "First input of make_ref should be a RefKey but a " << type->ToString(); } - return std::make_shared(args_spec_list[0], args_spec_list[1], args_spec_list[2]); + auto ret = std::make_shared(args_spec_list[0], args_spec_list[1], args_spec_list[2]); + ret->set_sparse_grad(args_spec_list[2]->sparse_grad()); + return ret; } AbstractBasePtr InferImplGetRefKey(const 
AnalysisEnginePtr &, const PrimitivePtr &, diff --git a/mindspore/ccsrc/operator/prim_statement.cc b/mindspore/ccsrc/operator/prim_statement.cc index c297e128e2..5eb8d39996 100644 --- a/mindspore/ccsrc/operator/prim_statement.cc +++ b/mindspore/ccsrc/operator/prim_statement.cc @@ -110,7 +110,8 @@ AbstractBasePtr InferImplSwitch(const AnalysisEnginePtr &, const PrimitivePtr &, ValuePtr v = cond->GetValueTrack(); MS_EXCEPTION_IF_NULL(v); - if (v->isa()) { + // for tensor as condition, keeps both true and false branch. + if (v->isa() || cond->isa()) { MS_EXCEPTION_IF_NULL(tb); return tb->Join(fb); } @@ -228,5 +229,15 @@ AbstractBasePtr InferImplNotInDict(const AnalysisEnginePtr &, const PrimitivePtr // Inputs: x, t return std::make_shared(!IsInDict(primitive, args_spec_list)); } +AbstractBasePtr InferImplIsConstant(const AnalysisEnginePtr &, const PrimitivePtr &primitive, + const AbstractBasePtrList &args_spec_list) { + // statement: isconstant(x) + // Inputs: x + if (args_spec_list.size() != 1) { + MS_LOG(EXCEPTION) << "IsConstant requires args input size = 1"; + } + ValuePtr v = args_spec_list[0]->BuildValue(); + return std::make_shared(!v->isa()); +} } // namespace abstract } // namespace mindspore diff --git a/mindspore/ccsrc/operator/prim_structures.cc b/mindspore/ccsrc/operator/prim_structures.cc index 7b0bba98a5..33c7a1e209 100644 --- a/mindspore/ccsrc/operator/prim_structures.cc +++ b/mindspore/ccsrc/operator/prim_structures.cc @@ -205,13 +205,14 @@ AbstractBasePtr InferTupleOrListGetItem(const std::string &op_name, const Abstra ValuePtr index_value = index->BuildValue(); if (!index_value->isa()) { - MS_LOG(EXCEPTION) << op_name << " evaluator index should be an int32 number, but got " << index_value->ToString(); + MS_EXCEPTION(IndexError) << op_name << " evaluator index should be an int32 number, but got " + << index_value->ToString(); } int idx_v = GetValue(index_value); std::size_t nelems = queue->elements().size(); if (idx_v >= SizeToInt(nelems) || idx_v < 
-SizeToInt(nelems)) { - MS_LOG(EXCEPTION) << op_name << " evaluator index should be in range[-" << SizeToInt(nelems) << ", " - << SizeToInt(nelems) << "), but got " << idx_v << "."; + MS_EXCEPTION(IndexError) << op_name << " evaluator index should be in range[-" << SizeToInt(nelems) << ", " + << SizeToInt(nelems) << "), but got " << idx_v << "."; } std::size_t uidx_v = 0; @@ -232,18 +233,21 @@ AbstractBasePtr InferTupleOrListSetItem(const std::string &op_name, const Abstra ValuePtr index_value = index->BuildValue(); if (!index_value->isa()) { - MS_LOG(EXCEPTION) << op_name << " evaluator index should be an int32 number, but got " << index_value->ToString(); + MS_EXCEPTION(IndexError) << op_name << " evaluator index should be an int32 number, but got " + << index_value->ToString(); } int idx_v = GetValue(index_value); if (idx_v < 0) { - MS_LOG(EXCEPTION) << "The index of " << typeid(T).name() << " should be positive number, but got " << idx_v << "."; + MS_EXCEPTION(IndexError) << "The index of " << typeid(T).name() << " should be positive number, but got " << idx_v + << "."; } size_t uidx_v = IntToSize(idx_v); AbstractBasePtrList elements = queue->elements(); std::size_t nelems = elements.size(); if (uidx_v >= nelems) { - MS_LOG(EXCEPTION) << op_name << " evaluator the index: " << uidx_v << " to set out of range: " << nelems - 1 << "."; + MS_EXCEPTION(IndexError) << op_name << " evaluator the index: " << uidx_v << " to set out of range: " << nelems - 1 + << "."; } elements[uidx_v] = args_spec_list[2]; return std::make_shared(elements); diff --git a/mindspore/ccsrc/optimizer/ad/dfunctor.cc b/mindspore/ccsrc/optimizer/ad/dfunctor.cc index bdefcfeba1..e192f3912e 100644 --- a/mindspore/ccsrc/optimizer/ad/dfunctor.cc +++ b/mindspore/ccsrc/optimizer/ad/dfunctor.cc @@ -45,17 +45,26 @@ DFunctor::DFunctor(const FuncGraphPtr &primal_graph, const pipeline::ResourceBas : primal_graph_(primal_graph), resources_(resources), need_cut_(false), is_top_(false) { 
TraceManager::DebugTrace(std::make_shared(primal_graph->debug_info())); k_graph_ = std::make_shared(); + if (primal_graph->has_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL)) { + std::string grad_op_name = GetValue(primal_graph->get_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL)); + k_graph_->set_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL, MakeValue(grad_op_name)); + } TraceManager::EndTrace(); TraceManager::DebugTrace(std::make_shared(primal_graph->debug_info())); tape_ = std::make_shared(); + // Add "_Grad" postfix + if (primal_graph->has_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL)) { + std::string grad_op_name = GetValue(primal_graph->get_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL)) + "_Grad"; + tape_->set_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL, MakeValue(grad_op_name)); + } TraceManager::EndTrace(); dout_ = tape_->add_parameter(); } -void DFunctor::Init(const DFunctorPtr &functor, bool is_top) { - func_graph_to_functor_[primal_graph_] = functor; +void DFunctor::Init(bool is_top) { + func_graph_to_functor_[primal_graph_] = shared_from_this(); is_top_ = is_top; if (is_top) { scope_ = primal_graph_->scope(); @@ -368,10 +377,10 @@ FuncGraphPtr DFunctor::KUserDefined(const FuncGraphPtr &primal) { (void)primal->transforms().insert(std::make_pair("grad", FuncGraphTransform(fg))); (void)fg->transforms().insert(std::make_pair("primal", FuncGraphTransform(primal))); // Reset defer_inline to enable successive inlining - primal->set_flags(FUNC_GRAPH_FLAG_DEFER_INLINE, false); + primal->set_flag(FUNC_GRAPH_FLAG_DEFER_INLINE, false); auto functor = std::make_shared(primal, resources_); - functor->Init(functor); + functor->Init(); functor->k_graph_ = fg; return fg; @@ -394,7 +403,7 @@ AnfNodePtr DFunctor::MapToK(const FuncGraphPtr &primal) { } auto functor = std::make_shared(primal, resources_); - functor->Init(functor); + functor->Init(); functor->MapObject(); functor->MapMorphism(); @@ -551,6 +560,10 @@ AdjointPtr DFunctor::FindAdjoint(const AnfNodePtr &primal) { } void DFunctor::CallDoutHoleOnTape() { + if (!is_top_) { + 
return; + } + // Call dout hole of all adjoint. for (auto &f : func_graph_to_functor_) { for (auto &adjoint : f.second->anfnode_to_adjoin_) { diff --git a/mindspore/ccsrc/optimizer/ad/dfunctor.h b/mindspore/ccsrc/optimizer/ad/dfunctor.h index d11926b379..13a6d5388b 100644 --- a/mindspore/ccsrc/optimizer/ad/dfunctor.h +++ b/mindspore/ccsrc/optimizer/ad/dfunctor.h @@ -35,14 +35,40 @@ namespace mindspore { namespace ad { -using Registry = std::unordered_map; +struct PrimitiveTotalEqual { + bool operator()(PrimitivePtr const &t1, PrimitivePtr const &t2) const { + if (t1->name() != t2->name()) { + return false; + } + + auto const &attrs1 = t1->attrs(); + auto const &attrs2 = t2->attrs(); + if (attrs1.size() != attrs2.size()) { + return false; + } + + for (auto &attr : attrs1) { + if (!t2->HasAttr(attr.first)) { + return false; + } + + if (!(*(attr.second) == *(t2->GetAttr(attr.first)))) { + return false; + } + } + + return true; + } +}; + +using Registry = std::unordered_map; class KPrim; extern KPrim g_k_prims; class DFunctor; using DFunctorPtr = std::shared_ptr; // D Functor's rules to map closure object and morphisms. -class DFunctor { +class DFunctor : public std::enable_shared_from_this { public: DFunctor(const FuncGraphPtr &primal_graph, const pipeline::ResourceBasePtr &resources); ~DFunctor() = default; @@ -54,7 +80,9 @@ class DFunctor { // Construct user defined k object. FuncGraphPtr KUserDefined(const FuncGraphPtr &primal); // Register functor objects to form a global view. - void Init(const DFunctorPtr &functor, bool is_top = false); + void Init(bool is_top = false); + bool IsInScope(const AnfNodePtr &node); + // Clear resources. static void Clear(); @@ -62,7 +90,6 @@ class DFunctor { // Map one morphism. AdjointPtr MapMorphism(const AnfNodePtr &morph); bool IsFreeMorphism(const AnfNodePtr &node); - bool IsInScope(const AnfNodePtr &node); // Map morphism that's not attached to output. 
void MapFreeMorphism(); void BackPropagateFv(const AnfNodePtr &fv, const AnfNodePtr &din); diff --git a/mindspore/ccsrc/optimizer/ad/grad.cc b/mindspore/ccsrc/optimizer/ad/grad.cc index 7e1fdb842e..d141dc6eea 100644 --- a/mindspore/ccsrc/optimizer/ad/grad.cc +++ b/mindspore/ccsrc/optimizer/ad/grad.cc @@ -23,7 +23,7 @@ namespace mindspore { namespace ad { -FuncGraphPtr Grad(const FuncGraphPtr &func_graph, const pipeline::ResourceBasePtr &resources) { +FuncGraphPtr Grad(const FuncGraphPtr &func_graph, const pipeline::ResourceBasePtr &resources, bool is_top) { MS_EXCEPTION_IF_NULL(func_graph); auto gradkv = func_graph->transforms().find("grad"); if (gradkv != func_graph->transforms().end()) { @@ -37,7 +37,7 @@ FuncGraphPtr Grad(const FuncGraphPtr &func_graph, const pipeline::ResourceBasePt auto multi_graph_sink = [&func_graph](const FuncGraphPtr &f) { if (MsContext::GetInstance()->is_multi_graph_sink()) { if (func_graph->has_flag(FUNC_GRAPH_FLAG_IGNORE_VALUES)) { - f->set_flags(FUNC_GRAPH_FLAG_IGNORE_VALUES, true); + f->set_flag(FUNC_GRAPH_FLAG_IGNORE_VALUES, true); } } }; @@ -46,14 +46,18 @@ FuncGraphPtr Grad(const FuncGraphPtr &func_graph, const pipeline::ResourceBasePt auto user_defined = f->KUserDefined(func_graph); if (user_defined != nullptr) { multi_graph_sink(user_defined); - DFunctor::Clear(); + if (is_top) { + DFunctor::Clear(); + } return user_defined; } - f->Init(f, true); + f->Init(is_top); f->MapObject(); f->MapMorphism(); auto ret = f->k_graph(); - DFunctor::Clear(); + if (is_top) { + DFunctor::Clear(); + } multi_graph_sink(ret); return ret; @@ -71,5 +75,7 @@ MetaFuncGraphPtr Kmeta(const PrimitivePtr &prim, const pipeline::ResourceBasePtr MetaFuncGraphPtr fg = g_k_prims.KMetaFuncGraph(prim); return fg; } + +void CleanRes() { DFunctor::Clear(); } } // namespace ad } // namespace mindspore diff --git a/mindspore/ccsrc/optimizer/ad/grad.h b/mindspore/ccsrc/optimizer/ad/grad.h index 12826311dc..a878aa9df7 100644 --- a/mindspore/ccsrc/optimizer/ad/grad.h +++ 
b/mindspore/ccsrc/optimizer/ad/grad.h @@ -28,9 +28,10 @@ namespace mindspore { namespace ad { using ResourcePtr = std::shared_ptr; -FuncGraphPtr Grad(const FuncGraphPtr &func_graph, const pipeline::ResourceBasePtr &resources); +FuncGraphPtr Grad(const FuncGraphPtr &func_graph, const pipeline::ResourceBasePtr &resources, bool is_top = true); FuncGraphPtr Kprim(const ValueNodePtr &value_node, const pipeline::ResourceBasePtr &resources); MetaFuncGraphPtr Kmeta(const PrimitivePtr &prim, const pipeline::ResourceBasePtr &); +void CleanRes(); } // namespace ad } // namespace mindspore diff --git a/mindspore/ccsrc/optimizer/ad/kprim.cc b/mindspore/ccsrc/optimizer/ad/kprim.cc index 600c4f5cc9..a9883cbf63 100644 --- a/mindspore/ccsrc/optimizer/ad/kprim.cc +++ b/mindspore/ccsrc/optimizer/ad/kprim.cc @@ -82,7 +82,7 @@ MetaFuncGraphPtr KPrim::KMetaFuncGraph(const PrimitivePtr &prim) { return iter->second; } - if (prim->name() == "make_tuple") { + if (prim->Hash() == prim::kPrimMakeTuple->Hash() && prim->name() == prim::kPrimMakeTuple->name()) { MetaFuncGraphPtr meta = std::make_shared("make_tuple_gradient"); bprop_registry_meta_[prim::kPrimMakeTuple] = meta; return meta; @@ -111,7 +111,7 @@ FuncGraphPtr KPrim::KPrimitive(const ValueNodePtr &value_node, const pipeline::R return fprop; } - if (prim->name() == "make_tuple") { + if (prim->Hash() == prim::kPrimMakeTuple->Hash() && prim->name() == prim::kPrimMakeTuple->name()) { return nullptr; } @@ -238,8 +238,12 @@ FuncGraphPtr KPrim::BpropCut(const ValueNodePtr &value_node, const pipeline::Res auto func_graph = std::make_shared(); std::vector outputs; - auto bprop_cut = std::make_shared("bprop_cut"); - bprop_cut->set_hook(prim->hook()); + auto bprop_cut = std::make_shared("bprop_cut", py::object()); + if (!prim->is_base()) { + PrimitivePyPtr prim_py = dyn_cast(prim); + bprop_cut->set_hook(prim_py->hook()); + } + auto cell_id = GetValue(prim->GetAttr("cell_id")); if (cell_id != "") { (void)bprop_cut->AddAttr("cell_hook", 
MakeValue(true)); diff --git a/mindspore/ccsrc/optimizer/clean.cc b/mindspore/ccsrc/optimizer/clean.cc index fafe26e2ed..6a54597282 100644 --- a/mindspore/ccsrc/optimizer/clean.cc +++ b/mindspore/ccsrc/optimizer/clean.cc @@ -78,7 +78,10 @@ AnfNodePtr ConvertGetAttrToTupleGetItem(const CNodePtr &node) { MS_EXCEPTION_IF_NULL(cons); auto dt = data->abstract(); - MS_EXCEPTION_IF_NULL(dt); + if (dt == nullptr) { + return nullptr; + } + if (!dt->isa()) { MS_LOG(EXCEPTION) << "First parameter of getattr is not AbstractClass, but " << dt->type_name() << "."; } diff --git a/mindspore/ccsrc/optimizer/graph_kernel_reuse.cc b/mindspore/ccsrc/optimizer/graph_kernel_reuse.cc new file mode 100644 index 0000000000..dc20ad925e --- /dev/null +++ b/mindspore/ccsrc/optimizer/graph_kernel_reuse.cc @@ -0,0 +1,157 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "optimizer/graph_kernel_reuse.h" +#include +#include +#include +#include "./common.h" +#include "utils/graph_utils.h" + +namespace mindspore { +/* namespace to support opt */ +namespace opt { + +bool GraphKernelReuse::CompareNode(const AnfNodePtr a, const AnfNodePtr b) { + if (a->abstract() && b->abstract()) { + auto a_type = a->abstract()->GetTypeTrack(); + auto b_type = b->abstract()->GetTypeTrack(); + + if (a_type != b_type) { + return false; + } + + auto a_shape = a->abstract()->GetShapeTrack(); + auto b_shape = b->abstract()->GetShapeTrack(); + if (a_shape != nullptr && a_shape == b_shape) { + return true; + } + + if (a_shape != nullptr && b_shape != nullptr && a_shape->isa() && + b_shape->isa()) { + return a_shape->cast()->shape() == b_shape->cast()->shape(); + } + } + return false; +} + +bool GraphKernelReuse::DoReplace(const FuncGraphManagerPtr manager) { + bool changed = false; + auto fgs = manager->func_graphs(); + for (FuncGraphPtr &fg : fgs) { + if (!fg->has_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL)) { + continue; + } + std::string key = GetValue(fg->get_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL)); + if (graph_kernel_ops.find(key) != graph_kernel_ops.end()) { + if (find(graph_kernel_ops[key].begin(), graph_kernel_ops[key].end(), fg) == graph_kernel_ops[key].end()) { + FuncGraphPtr new_fg = nullptr; + for (auto &cfg : graph_kernel_ops[key]) { + // If two graphs have different size then continue + auto fg_topos = TopoSort(fg->get_return()); + auto cfg_topos = TopoSort(cfg->get_return()); + if (fg_topos.size() != cfg_topos.size()) { + continue; + } + + // Compare const tensor + bool has_same = true; + for (size_t i = 0; i < fg_topos.size(); ++i) { + if (IsValueNode(fg_topos[i])) { + if (!IsValueNode(cfg_topos[i])) { + has_same = false; + break; + } + + auto tensor1 = GetValueNode(fg_topos[i]); + auto tensor2 = GetValueNode(cfg_topos[i]); + if (!tensor1->ValueEqual(*tensor2)) { + has_same = false; + break; + } + } + } + + if (!has_same) { + continue; + } 
+ + auto fg_input = fg->parameters(); + auto cfg_input = cfg->parameters(); + if (fg_input.size() != cfg_input.size()) { + continue; + } + // Compare input + for (size_t i = 0; i < fg_input.size(); ++i) { + if (!CompareNode(fg_input[i], cfg_input[i])) { + has_same = false; + break; + } + } + if (!has_same) { + continue; + } + + // Compare output + if (!CompareNode(fg->output(), cfg->output())) { + continue; + } + + // Find reusable fg + new_fg = cfg; + break; + } + + if (new_fg != nullptr) { + // Replace current fg with existing fg + auto users = fg->func_graph_cnodes_index(); + for (auto &iter : users) { + auto cnode = iter.first->first->cast(); + auto new_input = cnode->inputs(); + auto main_graph = cnode->func_graph(); + MS_EXCEPTION_IF_NULL(main_graph); + if (IsPrimitiveCNode(cnode, prim::kPrimPartial)) { + new_input[1] = NewValueNode(new_fg); + } else { + new_input[0] = NewValueNode(new_fg); + } + auto new_cnode = main_graph->NewCNode(new_input); + manager->Replace(iter.first->first, new_cnode); + changed = true; + } + + } else { + // Add current fg to map + graph_kernel_ops[key].push_back(fg); + } + } + } else { + graph_kernel_ops[key] = {fg}; + } + } + + return changed; +} + +bool GraphKernelReuse::ReuseGraphKernel(const FuncGraphPtr root, const FuncGraphManagerPtr manager) { + MS_EXCEPTION_IF_NULL(manager); + manager->AddFuncGraph(root); + + return DoReplace(manager); +} + +} // namespace opt +} // namespace mindspore diff --git a/mindspore/ccsrc/optimizer/graph_kernel_reuse.h b/mindspore/ccsrc/optimizer/graph_kernel_reuse.h new file mode 100644 index 0000000000..ed5cc93d18 --- /dev/null +++ b/mindspore/ccsrc/optimizer/graph_kernel_reuse.h @@ -0,0 +1,53 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_OPTIMIZER_GRAPH_KERNEL_OP_REUSE_H +#define MINDSPORE_CCSRC_OPTIMIZER_GRAPH_KERNEL_OP_REUSE_H + +#include +#include +#include +#include + +#include "optimizer/optimizer.h" + +namespace mindspore { +namespace opt { + +// Common subexpression elimination. +class GraphKernelReuse { + public: + GraphKernelReuse() : count(0) {} + virtual ~GraphKernelReuse() = default; + + bool operator()(const FuncGraphPtr &root, const OptimizerPtr &optimizer) { + bool chg = ReuseGraphKernel(root, optimizer->resource()->manager()); + return chg; + } + + bool CompareNode(const AnfNodePtr a, const AnfNodePtr other); + bool DoReplace(const FuncGraphManagerPtr manager); + + bool ReuseGraphKernel(const FuncGraphPtr root, const FuncGraphManagerPtr manager); + + private: + std::unordered_map> graph_kernel_ops; + int count; +}; + +} // namespace opt +} // namespace mindspore +#endif // MINDSPORE_CCSRC_OPTIMIZER_GRAPH_KERNEL_OP_REUSE_H diff --git a/mindspore/ccsrc/optimizer/irpass.cc b/mindspore/ccsrc/optimizer/irpass.cc index 107bf1eb57..72177ccb06 100644 --- a/mindspore/ccsrc/optimizer/irpass.cc +++ b/mindspore/ccsrc/optimizer/irpass.cc @@ -41,6 +41,8 @@ #include "optimizer/irpass/incorporate_call.h" #include "optimizer/irpass/grad_var_prepare.h" #include "optimizer/irpass/param_replace.h" +#include "optimizer/irpass/mark_interface_fusion.h" +#include "optimizer/opt.h" namespace mindspore { namespace opt { @@ -48,12 +50,13 @@ namespace irpass { OptimizeIRPassLib::OptimizeIRPassLib() { arithmetic_simplify_ = MakeSubstitution(ArithmeticSimplify(), 
"arithmetic_simplify", {prim::kPrimScalarAdd, prim::kPrimScalarMul, prim::kPrimTensorAdd, - prim::kPrimIdentity, prim::kPrimMomentum, prim::kPrimMul}); + prim::kPrimIdentity, prim::kPrimMomentum, prim::kPrimMul, prim::kPrimPow}); special_op_eliminate_ = MakeSubstitution(SpecialOpEliminater(), "special_op_eliminate", - {prim::kPrimInsertGradientOf, prim::kPrimHookBackward, prim::kPrimPrintShapeType, - prim::kPrimGetRefKey, prim::kPrimMirror, prim::kPrimVirtualDiv}); - zero_like_fill_zero_ = MakeSubstitution(ZeroLikeFillZero(), "zero_like_fill_zero", prim::kPrimZerosLikeTensor); + {prim::kPrimInsertGradientOf, prim::kPrimStopGradient, prim::kPrimHookBackward, + prim::kPrimPrintShapeType, prim::kPrimGetRefKey, prim::kPrimMirror, prim::kPrimVirtualDiv}); + zero_like_fill_zero_ = MakeSubstitution(ZeroLikeFillZero(), "zero_like_fill_zero", prim::kPrimZerosLike); + adjust_all_reduce_mul_add_ = MakeSubstitution(AdjustAllReduceMulAdd(), "adjust_all_reduce_mul_add", prim::kPrimAddN); // ops eliminate item_tuple_eliminate_ = @@ -69,11 +72,11 @@ OptimizeIRPassLib::OptimizeIRPassLib() { same_eliminate_ = MakeSubstitution(SameEliminater(), "same_eliminate", prim::kPrimSameTypeShape); check_bprop_eliminate_ = MakeSubstitution(CheckBpropEliminater(), "check_bprop_eliminate", prim::kPrimCheckBprop); reset_defer_inline_ = MakeSubstitution(ResetDeferInline(), "reset_defer_inline", IsValueNode); + depend_value_elim_ = MakeSubstitution(DependValueElim(), "depend_value_elim", prim::kPrimDepend); // Env Item Eliminate + env_get_item_eliminate_ = MakeSubstitution(EnvGetItemEliminater(), "env_get_item_eliminate", prim::kPrimEnvGetItem); new_env_get_item_ = MakeSubstitution(NewEnvGetItem(), "new_env_get_item", prim::kPrimEnvGetItem); - add_env_get_item_ = MakeSubstitution(AddEnvGetItem(), "add_env_get_item", prim::kPrimEnvGetItem); - env_get_set_item_ = MakeSubstitution(EnvGetSetItem(), "env_get_set_item", prim::kPrimEnvGetItem); incorporate_env_getitem_ = 
MakeSubstitution(IncorporateEnvGetitem(), "incorporate_env_get_item", prim::kPrimEnvGetItem); incorporate_env_getitem_switch_ = @@ -81,17 +84,16 @@ OptimizeIRPassLib::OptimizeIRPassLib() { // Ref eliminate make_ref_eliminate_ = MakeSubstitution(MakeRefEliminater(), "make_ref_eliminate", prim::kPrimMakeRef); + get_ref_param_eliminate_ = MakeSubstitution(GetRefParamEliminater(), "get_ref_param_eliminate", + {prim::kPrimGetRefValue, prim::kPrimGetRefOrigin}); get_make_ref_eliminate_ = MakeSubstitution(GetMakeRefEliminater(), "get_make_ref_eliminate", {prim::kPrimGetRefKey, prim::kPrimGetRefValue, prim::kPrimGetRefOrigin}); replace_refkey_by_param_ = MakeSubstitution(ReplaceRefkeyByParam(), "replace_refkey_by_param", IsValueNode, opt::FORCE_RENORM); replace_old_param_ = MakeSubstitution(ReplaceOldParam(), "replace_old_param", IsParam); - // Gradient transforms expand_jprim_ = MakeSubstitution(ExpandJPrim(), "expand_jprim", prim::kPrimJ); - stop_gradient_eliminate_ = - MakeSubstitution(StopGradientEliminater(), "stop_gradient_eliminate", prim::kPrimStopGradient); minmaximum_grad_ = MakeSubstitution(MinMaximumGrad(), "minmaximum_grad", prim::kPrimTupleGetItem); // branch culling @@ -112,9 +114,10 @@ OptimizeIRPassLib::OptimizeIRPassLib() { specialize_transform_ = MakeSubstitution(SpecializeOnGraphArguments(), "specialize_transform", IsCNodeGraph); // Incorporation - incorporate_getitem_ = MakeSubstitution(IncorporateGetitem(), "incorporate_getitem", prim::kPrimTupleGetItem); - incorporate_getitem_switch_ = - MakeSubstitution(IncorporateGetitemSwitch(), "incorporate_getitem_switch", prim::kPrimTupleGetItem); + incorporate_getitem_set_ = + MakeSubstitution(IncorporateGetitemSet(), "incorporate_getitem_set", prim::kPrimTupleGetItem); + incorporate_getitem_from_param_ = + MakeSubstitution(IncorporateGetitemFromParam(), "incorporate_getitem_from_param", IsCNodeGraphKernel); incorporate_call_ = MakeSubstitution(IncorporateCall(), "incorporate_call", IsCNodeDup); 
incorporate_call_switch_ = MakeSubstitution(IncorporateCallSwitch(), "incorporate_call_switch", IsCNodeDup); @@ -124,6 +127,17 @@ OptimizeIRPassLib::OptimizeIRPassLib() { // Convert print_tuple_wrapper_ = MakeSubstitution(PrintTupleWrapper(), "print_tuple_wrapper", prim::kPrimPrint); + + // Unused parameter eliminate + unused_parameter_eliminate_ = + MakeSubstitution(UnusedParasEliminater(), "unused_parameter_eliminate", IsCNodeGraphKernel); + unused_output_eliminate_ = MakeSubstitution(UnusedOutputEliminater(), "unused_output_eliminate", IsCNodeGraphKernel); + + // AddN eliminate + addn_eliminate_ = MakeSubstitution(AddNEliminater(), "addn_eliminate", IsCNodeGraphKernel); + + // Mark interface fusion + mark_interface_fusion_ = MakeSubstitution(MarkInterfaceFusion(), "mark_interface_fusion", prim::kPrimSelect); } ResolveIRPassLib::ResolveIRPassLib() { diff --git a/mindspore/ccsrc/optimizer/irpass.h b/mindspore/ccsrc/optimizer/irpass.h index 02bfee65d6..5e1550c883 100644 --- a/mindspore/ccsrc/optimizer/irpass.h +++ b/mindspore/ccsrc/optimizer/irpass.h @@ -35,6 +35,7 @@ class OptimizeIRPassLib { SubstitutionPtr arithmetic_simplify_; SubstitutionPtr special_op_eliminate_; SubstitutionPtr zero_like_fill_zero_; + SubstitutionPtr adjust_all_reduce_mul_add_; // ops eliminate SubstitutionPtr item_tuple_eliminate_; @@ -47,16 +48,17 @@ class OptimizeIRPassLib { SubstitutionPtr same_eliminate_; SubstitutionPtr check_bprop_eliminate_; SubstitutionPtr reset_defer_inline_; + SubstitutionPtr depend_value_elim_; // Env Item Eliminate + SubstitutionPtr env_get_item_eliminate_; SubstitutionPtr new_env_get_item_; - SubstitutionPtr add_env_get_item_; - SubstitutionPtr env_get_set_item_; SubstitutionPtr incorporate_env_getitem_; SubstitutionPtr incorporate_env_getitem_switch_; // Ref eliminate SubstitutionPtr make_ref_eliminate_; + SubstitutionPtr get_ref_param_eliminate_; SubstitutionPtr get_make_ref_eliminate_; SubstitutionPtr replace_refkey_by_param_; SubstitutionPtr 
replace_old_param_; @@ -73,7 +75,6 @@ class OptimizeIRPassLib { // Gradient irpasses SubstitutionPtr expand_jprim_; - SubstitutionPtr stop_gradient_eliminate_; SubstitutionPtr minmaximum_grad_; // inline @@ -82,8 +83,8 @@ class OptimizeIRPassLib { SubstitutionPtr specialize_transform_; // Incorporation - SubstitutionPtr incorporate_getitem_; - SubstitutionPtr incorporate_getitem_switch_; + SubstitutionPtr incorporate_getitem_set_; + SubstitutionPtr incorporate_getitem_from_param_; SubstitutionPtr incorporate_call_; SubstitutionPtr incorporate_call_switch_; @@ -92,6 +93,16 @@ class OptimizeIRPassLib { // Convert SubstitutionPtr print_tuple_wrapper_; + + // Unused parameter eliminate + SubstitutionPtr unused_parameter_eliminate_; + SubstitutionPtr unused_output_eliminate_; + + // AddN eliminate + SubstitutionPtr addn_eliminate_; + + // Fusion + SubstitutionPtr mark_interface_fusion_; }; // the collection of irpass for resolve action @@ -141,9 +152,23 @@ inline bool IsCNodeGraph(const AnfNodePtr &node) { return false; } + auto inp0 = node->cast()->input(0); + return IsValueNode(inp0); +} + +// Check if CNode Input 0 is Func Graph of graph kernel. 
+inline bool IsCNodeGraphKernel(const AnfNodePtr &node) { + if (node == nullptr || !node->isa()) { + return false; + } + auto inp0 = node->cast()->input(0); if (IsValueNode(inp0)) { - return true; + auto fg = GetValueNode(inp0); + if (fg == nullptr) { + return false; + } + return fg->has_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL); } return false; } @@ -155,10 +180,7 @@ inline bool IsCNodeDup(const AnfNodePtr &node) { } auto inp0 = node->cast()->input(0); - if (inp0 != nullptr && inp0->isa()) { - return true; - } - return false; + return (inp0 != nullptr) && inp0->isa(); } } // namespace irpass } // namespace opt diff --git a/mindspore/ccsrc/optimizer/irpass/arithmetic_simplify.h b/mindspore/ccsrc/optimizer/irpass/arithmetic_simplify.h index ab191aab20..1836a88dbc 100644 --- a/mindspore/ccsrc/optimizer/irpass/arithmetic_simplify.h +++ b/mindspore/ccsrc/optimizer/irpass/arithmetic_simplify.h @@ -83,6 +83,216 @@ class MultiplyByZeroOrOne : public AnfVisitor { AnfNodePtr x_{nullptr}; }; +// Support class used for checking if all values of a Tensor are equal `check_value_` +// Supported data types: double, float/float32, int/int32 +class CheckTensorConstant { + public: + explicit CheckTensorConstant(int _check_value = 0) : check_value_(_check_value) {} + ~CheckTensorConstant() = default; + bool IsTensorConstant(const ValuePtr &value) { + if (!value->isa()) { + return false; + } + auto tensor_ptr = dyn_cast(value); + TypeId tensor_type = tensor_ptr->Dtype()->type_id(); + if ((tensor_type == TypeId::kNumberTypeFloat32) || (tensor_type == TypeId::kNumberTypeFloat)) { + float *data2 = reinterpret_cast(tensor_ptr->data_c()); + for (int i = 0; i < tensor_ptr->DataSize(); i++) { + if (fabs(data2[i] - check_value_) > FLT_EPSILON) { + return false; + } + } + return true; + } else if (tensor_type == TypeId::kNumberTypeFloat64) { + double *data2 = reinterpret_cast(tensor_ptr->data_c()); + for (int i = 0; i < tensor_ptr->DataSize(); i++) { + if (fabs(data2[i] - check_value_) > DBL_EPSILON) 
{ + return false; + } + } + return true; + } else if ((tensor_type == TypeId::kNumberTypeInt32) || (tensor_type == TypeId::kNumberTypeInt)) { + int *data2 = reinterpret_cast(tensor_ptr->data_c()); + for (int i = 0; i < tensor_ptr->DataSize(); i++) { + if (data2[i] != check_value_) { + return false; + } + } + return true; + } + // Un-support Data Types + return false; + } + + bool IsTensorScalarConstant(const ValuePtr &value) { + if (!value->isa()) { + return false; + } + auto tensor_ptr = dyn_cast(value); + if ((tensor_ptr->DataSize() > 1) || (tensor_ptr->DataDim() > 0)) { + return false; + } + return IsTensorConstant(value); + } + + private: + int check_value_; +}; + +// {prim::kPrimMul, 0, X}, {prim::kPrimMul, X, 0} +// {prim::kPrimMul, 1, X}, {prim::kPrimMul, X, 1} +class TensorMultiplyByZeroOrOne : public AnfVisitor { + public: + TensorMultiplyByZeroOrOne() : zero_(MakeValue(0)) {} + ~TensorMultiplyByZeroOrOne() override = default; + AnfNodePtr operator()(const OptimizerPtr &, const AnfNodePtr &node) override { + Reset(); + AnfVisitor::Match(prim::kPrimMul)(node); + + if (is_zero_) { + if (x_->func_graph() != node->func_graph()) { + return nullptr; + } + return NewTensorFilledWithData(node); + } + if (is_one_) { + return NewTensorFilledWithData(node, x_); + } + return nullptr; + } + + void Visit(const AnfNodePtr &node) override { + if (is_zero_ || is_one_) { + x_ = node; + return; + } + + if (IsParam(node)) { + x_ = node; + return; + } + + if (IsCNode(node)) { + CNodePtr cnode = node->cast(); + if (IsPrimitive(cnode->input(0), prim::kPrimZerosLike)) { + is_zero_ = true; + return; + } + x_ = node; + return; + } + auto value = node->cast()->value(); + if (CheckTensorConstant(0).IsTensorConstant(value)) { + is_zero_ = true; + return; + } else if (CheckTensorConstant(1).IsTensorConstant(value)) { + is_one_ = true; + return; + } + x_ = node; + } + + void Visit(const ValueNodePtr &vnode) override { + auto value = vnode->value(); + if 
(CheckTensorConstant(0).IsTensorConstant(value)) { + is_zero_ = true; + return; + } else if (CheckTensorConstant(1).IsTensorConstant(value)) { + is_one_ = true; + return; + } + x_ = vnode; + } + void Reset() { + x_ = nullptr; + is_one_ = false; + is_zero_ = false; + } + + void *GetPointerToTensorData(const AnfNodePtr &node, bool writable = false) { + if (!node->isa()) { + return nullptr; + } + + auto value = node->cast()->value(); + + if (!value->isa()) { + return nullptr; + } + + tensor::TensorPtr tensor_ptr = dyn_cast(value); + return tensor_ptr->data_c(writable); + } + + // Make a new tensor (when possible) with the same shape as of `node` + // If x is nullptr then fill new tensor will "0" + // If x is a tensor with empty shape then fill new tensor with the single value of x + // If x is a tensor with same shape as `node` then return x as result + AnfNodePtr NewTensorFilledWithData(const AnfNodePtr &node, const AnfNodePtr &x = nullptr) { + if ((node->abstract() == nullptr) || !node->abstract()->isa()) { + return nullptr; + } + + auto tensor_abstract = node->abstract()->cast(); + TypePtr tensor_type_ptr = tensor_abstract->element()->BuildType(); + std::vector tensor_shape = tensor_abstract->shape()->shape(); + + auto new_tensor_ptr = std::make_shared(tensor_type_ptr->type_id(), tensor_shape); + size_t mem_size = GetTypeByte(tensor_type_ptr) * IntToSize(new_tensor_ptr->ElementsNum()); + char *data = reinterpret_cast(new_tensor_ptr->data_c(true)); + + if (x == nullptr) { + std::memset(data, 0, mem_size); + auto new_vnode = NewValueNode(new_tensor_ptr); + new_vnode->set_abstract(new_tensor_ptr->ToAbstract()); + return new_vnode; + } + // x is not nullptr + if (x->isa()) { + if ((x->abstract() == nullptr) || !x->abstract()->isa()) { + return nullptr; + } + auto x_abstract = x->abstract()->cast(); + std::vector x_shape = x_abstract->shape()->shape(); + + if (x_shape != tensor_shape) { + return nullptr; + } + return x; + } + + if (!x->isa()) { + return nullptr; + } + 
auto x_value = x->cast()->value(); + if (!x_value->isa()) { + return nullptr; + } + + auto x_tensor_ptr = dyn_cast(x_value); + + if ((x_tensor_ptr->DataSize() > 1) && (x_tensor_ptr->DataSize() != new_tensor_ptr->DataSize())) { + return nullptr; + } + char *source_data = reinterpret_cast(GetPointerToTensorData(x)); + if (x_tensor_ptr->DataSize() == 1) { + for (int i = 0; i < new_tensor_ptr->ElementsNum(); i++) { + memcpy(source_data, data + i * GetTypeByte(tensor_type_ptr), GetTypeByte(tensor_type_ptr)); + } + } else { + memcpy(source_data, data, mem_size); + } + auto new_vnode = NewValueNode(new_tensor_ptr); + new_vnode->set_abstract(new_tensor_ptr->ToAbstract()); + return new_vnode; + } + + private: + bool is_zero_{false}, is_one_{false}; + ValuePtr zero_; + AnfNodePtr x_{nullptr}; +}; + // {prim::kPrimScalarAdd, X, 0} // {prim::kPrimScalarAdd, 0, X} class AddByZero : public AnfVisitor { @@ -101,7 +311,8 @@ class AddByZero : public AnfVisitor { } void Visit(const AnfNodePtr &node) override { - if (node->isa() && *GetValueNode(node) == *zero_) { + if (node->isa() && + ((*GetValueNode(node) == *zero_) || CheckTensorConstant(0).IsTensorScalarConstant(GetValueNode(node)))) { is_zero_ = true; return; } @@ -120,8 +331,8 @@ class AddByZero : public AnfVisitor { AnfNodePtr x_{nullptr}; }; -// {prim::kPrimTensorAdd, {PrimZerosLikeTensor, Y}, X}, -// {prim::kPrimTensorAdd, X, {PrimZerosLikeTensor, Y}} +// {prim::kPrimTensorAdd, {kPrimZerosLike, Y}, X}, +// {prim::kPrimTensorAdd, X, {kPrimZerosLike, Y}} class TensorAddByZero : public AnfVisitor { public: AnfNodePtr operator()(const OptimizerPtr &, const AnfNodePtr &node) override { @@ -135,7 +346,11 @@ class TensorAddByZero : public AnfVisitor { } void Visit(const AnfNodePtr &node) override { - if (IsPrimitive(node, prim::kPrimZerosLikeTensor)) { + if (IsPrimitive(node, prim::kPrimZerosLike)) { + is_zero_ = true; + return; + } + if (node->isa() && CheckTensorConstant(0).IsTensorScalarConstant(GetValueNode(node))) { is_zero_ 
= true; return; } @@ -143,6 +358,14 @@ class TensorAddByZero : public AnfVisitor { x_ = node; } + void Visit(const ValueNodePtr &vnode) override { + auto value = vnode->value(); + if (CheckTensorConstant(0).IsTensorConstant(value)) { + is_zero_ = true; + return; + } + } + void Reset() { x_ = nullptr; is_zero_ = false; @@ -153,7 +376,7 @@ class TensorAddByZero : public AnfVisitor { AnfNodePtr x_{nullptr}; }; -// {PrimMomentum, {PrimZerosLikeTensor, X}, Y, Z, Xs} -> {prim::kPrimMakeTuple, Z, Y} +// {PrimMomentum, {kPrimZerosLike, X}, Y, Z, Xs} -> {prim::kPrimMakeTuple, Z, Y} class OptUpdateZeroTensor : public AnfVisitor { public: AnfNodePtr operator()(const OptimizerPtr &, const AnfNodePtr &node) override { @@ -163,13 +386,13 @@ class OptUpdateZeroTensor : public AnfVisitor { // {PrimMomentum, {...}, Y, Z, Xs} auto &inputs = node->cast()->inputs(); - if (inputs.size() < 4 || !IsPrimitiveCNode(inputs[1], prim::kPrimZerosLikeTensor)) { + if (inputs.size() < 4 || !IsPrimitiveCNode(inputs[1], prim::kPrimZerosLike)) { return nullptr; } auto y = inputs[2]; auto z = inputs[3]; - // {PrimZerosLikeTensor, X} + // {kPrimZerosLike, X} if (inputs[1]->cast()->size() != 2) { return nullptr; } @@ -183,29 +406,143 @@ class OptUpdateZeroTensor : public AnfVisitor { // {prim::kPrimMul, {...}, {prim::kPrimMul, Tensor1, Tensor2}} class ConstantDuplicateMul : public AnfVisitor { public: + // Support function to multiply two constant tensors: partially support broadcasting shapes + template + void Multiply(void *in_data_1, int in_data_1_size, void *in_data_2, int in_data_2_size, void **out_data, + int out_data_size) { + T *data_1 = reinterpret_cast(in_data_1); + T *data_2 = reinterpret_cast(in_data_2); + T *data_out = new T[out_data_size]; + + if (in_data_1_size == 1) { + for (int i = 0; i < out_data_size; i++) { + data_out[i] = data_1[0]; + } + } else { + for (int i = 0; i < out_data_size; i++) { + data_out[i] = data_1[i]; + } + } + if (in_data_2_size == 1) { + for (int i = 0; i < 
out_data_size; i++) { + data_out[i] *= data_2[0]; + } + } else { + for (int i = 0; i < out_data_size; i++) { + data_out[i] *= data_2[i]; + } + } + *out_data = reinterpret_cast(data_out); + return; + } + + AnfNodePtr MulConstantTensors(const AnfNodePtr &vnode_1, const AnfNodePtr &vnode_2, const AnfNodePtr &node_3) { + if (!vnode_1->isa() || !vnode_2->isa() || (vnode_1->abstract() == nullptr) || + (vnode_2->abstract() == nullptr) || (node_3->abstract() == nullptr)) { + return nullptr; + } + + auto value_1 = GetValueNode(vnode_1); + auto value_2 = GetValueNode(vnode_2); + + if (!value_1->isa() || !value_2->isa()) { + return nullptr; + } + + auto tensor_ptr_1 = dyn_cast(value_1); + auto tensor_ptr_2 = dyn_cast(value_2); + + auto tensor_1_abstract = vnode_1->abstract()->cast(); + auto tensor_2_abstract = vnode_1->abstract()->cast(); + auto tensor_3_abstract = node_3->abstract()->cast(); + + TypePtr tensor_1_type_ptr = tensor_1_abstract->element()->BuildType(); + TypePtr tensor_2_type_ptr = tensor_2_abstract->element()->BuildType(); + TypePtr tensor_3_type_ptr = tensor_3_abstract->element()->BuildType(); + + if ((tensor_1_type_ptr->type_id() != tensor_3_type_ptr->type_id()) || + (tensor_2_type_ptr->type_id() != tensor_3_type_ptr->type_id())) { + return nullptr; + } + + std::vector tensor_out_shape = tensor_3_abstract->shape()->shape(); + + int data_out_size = 1; + for (auto it : tensor_out_shape) { + data_out_size *= it; + } + if ((tensor_ptr_1->DataSize() > 1) && (tensor_ptr_1->DataSize() != data_out_size)) { + return nullptr; + } + if ((tensor_ptr_2->DataSize() > 1) && (tensor_ptr_2->DataSize() != data_out_size)) { + return nullptr; + } + + void *data_out; + + if ((tensor_3_type_ptr->type_id() == TypeId::kNumberTypeFloat32) || + (tensor_3_type_ptr->type_id() == TypeId::kNumberTypeFloat)) { + Multiply(tensor_ptr_1->data_c(), tensor_ptr_1->DataSize(), tensor_ptr_2->data_c(), + tensor_ptr_2->DataSize(), &data_out, data_out_size); + } else { + if 
(tensor_3_type_ptr->type_id() == TypeId::kNumberTypeFloat64) { + Multiply(tensor_ptr_1->data_c(), tensor_ptr_1->DataSize(), tensor_ptr_2->data_c(), + tensor_ptr_2->DataSize(), &data_out, data_out_size); + } else { + if ((tensor_3_type_ptr->type_id() == TypeId::kNumberTypeInt32) || + (tensor_3_type_ptr->type_id() == TypeId::kNumberTypeInt)) { + Multiply(tensor_ptr_1->data_c(), tensor_ptr_1->DataSize(), tensor_ptr_2->data_c(), + tensor_ptr_2->DataSize(), &data_out, data_out_size); + } else { + // Un-support data types + return nullptr; + } + } + } + + auto new_tensor_ptr = std::make_shared(tensor_3_type_ptr->type_id(), tensor_out_shape); + size_t mem_size = GetTypeByte(tensor_3_type_ptr) * IntToSize(new_tensor_ptr->ElementsNum()); + char *data = reinterpret_cast(new_tensor_ptr->data_c(true)); + memcpy(data, data_out, mem_size); + + auto new_vnode = NewValueNode(new_tensor_ptr); + new_vnode->set_abstract(new_tensor_ptr->ToAbstract()); + return new_vnode; + } + AnfNodePtr operator()(const OptimizerPtr &, const AnfNodePtr &node) override { Reset(); // {prim::kPrimMul, Tensor1, {...}} AnfVisitor::Match(prim::kPrimMul, {IsNode, IsNode})(node); - if (vnode_ == nullptr || cnode_ == nullptr) { + if (vnode_ == nullptr || c_p_node_ == nullptr) { + return nullptr; + } + + if (!IsCNode(c_p_node_)) { return nullptr; } + auto tensor1 = vnode_; - auto mul = cnode_; + auto mul = c_p_node_->cast(); Reset(); // {prim::kPrimMul, Tensor2, {...}} AnfVisitor::Match(prim::kPrimMul, {IsNode, IsNode})(mul); - if (vnode_ == nullptr || cnode_ == nullptr) { + if (vnode_ == nullptr || c_p_node_ == nullptr) { return nullptr; } auto tensor2 = vnode_; - auto cnode = cnode_; + auto c_p_node = c_p_node_; auto PrimMul = GetValueNode(mul->input(0)); auto fg = node->func_graph(); - auto ttmul = NewCNode({NewValueNode(PrimMul), tensor1, tensor2}, fg); - return NewCNode({NewValueNode(PrimMul), cnode, ttmul}, fg); + + auto new_mul_tensor = MulConstantTensors(tensor1, tensor2, c_p_node); + if 
(new_mul_tensor == nullptr) { + auto ttmul = NewCNode({NewValueNode(PrimMul), tensor1, tensor2}, fg); + return NewCNode({NewValueNode(PrimMul), c_p_node, ttmul}, fg); + } + return NewCNode({NewValueNode(PrimMul), c_p_node, new_mul_tensor}, fg); } void Visit(const AnfNodePtr &node) override { @@ -213,36 +550,170 @@ class ConstantDuplicateMul : public AnfVisitor { vnode_ = node; } - if (IsCNode(node)) { - cnode_ = node->cast(); + if (IsCNode(node) || IsParam(node)) { + c_p_node_ = node; } } void Reset() { vnode_ = nullptr; - cnode_ = nullptr; + c_p_node_ = nullptr; } private: AnfNodePtr vnode_; - CNodePtr cnode_; + AnfNodePtr c_p_node_; +}; + +class PowerOneEliminate : public AnfVisitor { + public: + AnfNodePtr operator()(const OptimizerPtr &, const AnfNodePtr &node) override { + if (!IsPrimitiveCNode(node, prim::kPrimPow) || node->func_graph() == nullptr) { + return nullptr; + } + + auto &inputs = node->cast()->inputs(); + if (!IsValueNode(inputs[2])) { + return nullptr; + } + auto scalar = GetValueNode(inputs[2]); + if (scalar->isa() && GetValue(scalar) == 1.0) { + return inputs[1]; + } else if (scalar->isa() && GetValue(scalar) == 1) { + return inputs[1]; + } + return nullptr; + } +}; + +// grad = AllReduce(grad) / worker_number +// grad = grad + weight * decy +// -> +// grad = grad + weight * decy +// grad = AllReduce(grad) / worker_number + +// {prim::kPrimAddN, {prim::kPrimMakeTuple, {prim::kPrimMul, {prim::kPrimAllReduce, X}, Y}, Z}} -> +// {prim::kPrimMul, {prim::kPrimAllReduce, {prim::kPrimAddN,{prim::kPrimMakeTuple, Z, X}}}, Y} +class AdjustAllReduceMulAdd : public AnfVisitor { + public: + AnfNodePtr operator()(const OptimizerPtr &, const AnfNodePtr &node) override { + Reset(); + // {prim::kPrimAddN, Zs} + if (!IsPrimitiveCNode(node, prim::kPrimAddN)) { + return nullptr; + } + auto addn = node->cast(); + if (addn->size() != 2) { + return nullptr; + } + AnfVisitor::Match(prim::kPrimMakeTuple, {IsNode, IsNode})(addn->input(1)); + if (x_ == nullptr || y_ == 
nullptr || z_ == nullptr || all_reduce_fg_ == nullptr) { + return nullptr; + } + auto addn_maketuple = addn->input(1); + + auto fg = all_reduce_fg_; + // addn inputs cross the graph, make the inputs same as allreduce node. + if (z_->isa() && fg != z_->func_graph()) { + auto cnode_z = z_->cast(); + z_ = NewCNode(cnode_z->inputs(), fg); + } + + auto addn_op_node = addn->input(0); + auto make_tuple_op_node = addn->input(1)->cast()->input(0); + + AnfNodePtr tuple = NewCNode({make_tuple_op_node, z_, x_}, fg); + AnfNodePtr add = NewCNode({addn_op_node, tuple}, fg); + AnfNodePtr all_reduce = NewCNode({all_reduce_, add}, fg); + AnfNodePtr mul = NewCNode({mul_, all_reduce, y_}, fg); + ProcessDependEdge(fg, addn_maketuple, all_reduce); + return mul; + } + void ProcessDependEdge(const FuncGraphPtr &fg, const AnfNodePtr &addn_maketuple, const AnfNodePtr &new_node) { + // If has dynamic loss scale. + auto &users_map = fg->manager()->node_users(); + auto it = users_map.find(mul_cnode_); + if (it != users_map.end()) { + auto users = it->second; + for (auto &user_pair : users) { + auto node = user_pair.first; + if (node != addn_maketuple) { + if (IsPrimitiveCNode(node, prim::kPrimMakeTuple)) { + fg->manager()->SetEdge(node, user_pair.second, new_node); + } + } + } + } + } + void Visit(const AnfNodePtr &node) override { + if (level_ == 0) { + level_ = 1; + is_reduce_match_ = false; + // {prim::kPrimMul, {prim::kPrimAllReduce, X}, Y} + AnfVisitor::Match(prim::kPrimMul)(node); + level_ = 0; + if (is_reduce_match_) { + mul_ = node->cast()->input(0); + mul_cnode_ = node->cast(); + y_ = tmp_; + } else { + z_ = node; + } + } + + if (level_ == 1) { + // {prim::kPrimAllReduce, X} + if (IsPrimitiveCNode(node, prim::kPrimAllReduce)) { + auto cnode = node->cast(); + if (cnode->size() > 1) { + all_reduce_ = cnode->input(0); + x_ = cnode->input(1); + is_reduce_match_ = true; + all_reduce_fg_ = cnode->func_graph(); + } + } else { + tmp_ = node; + } + } + } + + void Reset() { + level_ = 0; + 
is_reduce_match_ = false; + x_ = nullptr; + y_ = nullptr; + z_ = nullptr; + tmp_ = nullptr; + all_reduce_fg_ = nullptr; + } + + private: + int level_{0}; + bool is_reduce_match_{false}; + AnfNodePtr x_{nullptr}, y_{nullptr}, z_{nullptr}, tmp_{nullptr}; + AnfNodePtr all_reduce_{nullptr}, mul_{nullptr}, mul_cnode_{nullptr}; + FuncGraphPtr all_reduce_fg_{nullptr}; }; class ArithmeticSimplify { public: ArithmeticSimplify() : multiply_by_zero_or_one_(), + tensor_multiply_by_zero_or_one_(), add_by_zero_(), tensor_add_by_zero_(), identity_(prim::kPrimIdentity), opt_update_zero_tensor_(), - constant_duplicate_mul_() { + constant_duplicate_mul_(), + power_one_() { eliminaters_.emplace_back(multiply_by_zero_or_one_); + eliminaters_.emplace_back(tensor_multiply_by_zero_or_one_); eliminaters_.emplace_back(add_by_zero_); eliminaters_.emplace_back(tensor_add_by_zero_); eliminaters_.emplace_back(identity_); eliminaters_.emplace_back(opt_update_zero_tensor_); eliminaters_.emplace_back(constant_duplicate_mul_); + eliminaters_.emplace_back(power_one_); } ~ArithmeticSimplify() = default; @@ -259,11 +730,13 @@ class ArithmeticSimplify { private: MultiplyByZeroOrOne multiply_by_zero_or_one_; + TensorMultiplyByZeroOrOne tensor_multiply_by_zero_or_one_; AddByZero add_by_zero_; TensorAddByZero tensor_add_by_zero_; PrimEliminater identity_; OptUpdateZeroTensor opt_update_zero_tensor_; ConstantDuplicateMul constant_duplicate_mul_; + PowerOneEliminate power_one_; std::vector eliminaters_{}; }; } // namespace irpass diff --git a/mindspore/ccsrc/optimizer/irpass/branch_culling.h b/mindspore/ccsrc/optimizer/irpass/branch_culling.h index b2d6718857..2b5b30bdbf 100644 --- a/mindspore/ccsrc/optimizer/irpass/branch_culling.h +++ b/mindspore/ccsrc/optimizer/irpass/branch_culling.h @@ -20,147 +20,65 @@ #include #include -#include "optimizer/optimizer.h" -#include "optimizer/irpass.h" -#include "ir/visitor.h" #include "ir/func_graph.h" #include "ir/func_graph_cloner.h" +#include 
"ir/optimizer_caller.h" +#include "ir/pattern_matcher.h" #include "operator/ops.h" +#include "optimizer/irpass.h" namespace mindspore { namespace opt { namespace irpass { // {prim::kPrimSwitch, true, X, Y} // {prim::kPrimSwitch, false, X, Y} -class SwitchSimplify : public AnfVisitor { +class SwitchSimplify : public OptimizerCaller { public: AnfNodePtr operator()(const OptimizerPtr &, const AnfNodePtr &node) override { - Reset(); - auto getx = [this](const AnfNodePtr &node) -> bool { - this->x_ = node; - return true; - }; - auto gety = [this](const AnfNodePtr &node) -> bool { - this->y_ = node; - return true; + PatternNode cond, true_br, false_br; + auto SwitchSimplLambda = [&node, &cond, &true_br, &false_br]() -> AnfNodePtr { + auto cond_value_ = GetValue(GetValueNode(cond.GetNode(node))); + if (cond_value_) { + return true_br.GetNode(node); + } + return false_br.GetNode(node); }; - AnfVisitor::Match(prim::kPrimSwitch, {IsValueNode, getx, gety})(node); - // simplify the switch - if (is_match_) { - if (cond_) { - return x_; - } - return y_; - } + MATCH_REPLACE_LAMBDA_IF(node, PPrimitive(prim::kPrimSwitch, cond, true_br, false_br), SwitchSimplLambda, + cond.CheckFunc(IsValueNode, node)); return nullptr; } - - void Visit(const AnfNodePtr &node) override { - if (!is_match_ && IsValueNode(node)) { - cond_ = GetValue(GetValueNode(node)); - is_match_ = true; - } - } - - void Reset() { - x_ = nullptr; - y_ = nullptr; - cond_ = false; - is_match_ = false; - } - - private: - bool is_match_{false}, cond_{false}; - AnfNodePtr x_{nullptr}, y_{nullptr}; }; // {prim::kPrimTupleGetItem, {prim::kPrimSwith, X0, X1, X2}, C} => // {prim::kPrimSwith, X0, {prim::kPrimTupleGetItem, X1, C}, {prim::kPrimTupleGetItem, X2, C}} -class FloatTupleGetItemSwitch : public AnfVisitor { +class FloatTupleGetItemSwitch : public OptimizerCaller { public: AnfNodePtr operator()(const OptimizerPtr &, const AnfNodePtr &node) override { - Reset(); - AnfVisitor::Match(prim::kPrimTupleGetItem, {IsCNode, 
IsVNode})(node); - - auto fg = node->func_graph(); - if (Xs_.empty() || c_ == nullptr || fg == nullptr) { - return nullptr; - } - - auto true_node = fg->NewCNode({NewValueNode(prim::kPrimTupleGetItem), Xs_[1], c_}); - auto false_node = fg->NewCNode({NewValueNode(prim::kPrimTupleGetItem), Xs_[2], c_}); - - return fg->NewCNode({NewValueNode(prim::kPrimSwitch), Xs_[0], true_node, false_node}); - } - - void Visit(const CNodePtr &cnode) override { - // {prim::kPrimSwith, X1, X2, X3} - if (!IsPrimitiveCNode(cnode, prim::kPrimSwitch) || cnode->size() != 4) { - return; - } - - // copy X1, X2, X3 - auto &inputs = cnode->inputs(); - (void)std::copy(inputs.begin() + 1, inputs.end(), std::back_inserter(Xs_)); - } - - void Visit(const ValueNodePtr &vnode) override { c_ = vnode; } - - void Reset() { - Xs_.clear(); - c_ = nullptr; + PatternNode cond, true_br, false_br, x; + MATCH_REPLACE_IF(node, + PPrimitive(prim::kPrimTupleGetItem, PPrimitive(prim::kPrimSwitch, cond, true_br, false_br), x), + PPrimitive(prim::kPrimSwitch, cond, PPrimitive(prim::kPrimTupleGetItem, true_br, x), + PPrimitive(prim::kPrimTupleGetItem, false_br, x)), + x.CheckFunc(IsVNode, node)); + return nullptr; } - - private: - AnfNodePtr c_{nullptr}; - std::vector Xs_{}; }; // {prim::kPrimEnvGetItem, {prim::kPrimSwitch, X1, X2, X3}, X4, X5} => // {prim::kPrimSwitch, X1, {prim::kPrimEnvGetItem, X2, X4, X5}, {prim::kPrimEnvGetItem, X3, X4, X5}} -class FloatEnvGetItemSwitch : public AnfVisitor { +class FloatEnvGetItemSwitch : public OptimizerCaller { public: AnfNodePtr operator()(const OptimizerPtr &, const AnfNodePtr &node) override { - is_match_ = false; - AnfVisitor::Match(prim::kPrimEnvGetItem, {IsCNode, IsNode, IsNode})(node); - if (!is_match_) { - return nullptr; - } - - // {prim::kPrimEnvGetItem, {...}, X4, X5} - auto cnode = node->cast(); - auto sw_node = cnode->input(1)->cast(); - auto x4 = cnode->input(2); - auto x5 = cnode->input(3); + PatternNode cond, true_br, false_br, x, x2; + MATCH_REPLACE(node, + 
PPrimitive(prim::kPrimEnvGetItem, PPrimitive(prim::kPrimSwitch, cond, true_br, false_br), x, x2), + PPrimitive(prim::kPrimSwitch, cond, PPrimitive(prim::kPrimEnvGetItem, true_br, x, x2), + PPrimitive(prim::kPrimEnvGetItem, false_br, x, x2))); - is_match_ = false; - AnfVisitor::Match(prim::kPrimSwitch, {IsNode, IsNode, IsNode})(sw_node); - if (!is_match_) { - return nullptr; - } - - // {prim::kPrimSwitch, X1, X2, X3} - auto x1 = sw_node->input(1); - auto x2 = sw_node->input(2); - auto x3 = sw_node->input(3); - - auto fg = node->func_graph(); - if (fg == nullptr) { - return nullptr; - } - - auto true_node = fg->NewCNode({NewValueNode(prim::kPrimEnvGetItem), x2, x4, x5}); - auto false_node = fg->NewCNode({NewValueNode(prim::kPrimEnvGetItem), x3, x4, x5}); - - return fg->NewCNode({NewValueNode(prim::kPrimSwitch), x1, true_node, false_node}); + return nullptr; } - - void Visit(const AnfNodePtr &) override { is_match_ = true; } - - private: - bool is_match_{false}; }; namespace internal { @@ -173,79 +91,64 @@ AnfNodePtr TransformMergeBranches(const AnfNodePtr &true_output_node, const AnfN } // namespace internal // {{prim::kPrimSwitch, X, G1, G2}, Xs} -class ConvertSwitchReplacement : public AnfVisitor { +class ConvertSwitchReplacement : public OptimizerCaller { public: AnfNodePtr operator()(const OptimizerPtr &, const AnfNodePtr &node) override { if (!node->isa() || node->func_graph() == nullptr) { return nullptr; } - Reset(); - auto cnode = node->cast(); - if (cnode->size() < 1) { + auto cnode_ = node->cast(); + if (cnode_->size() < 1) { return nullptr; } - // {prim::kPrimSwitch, X, G1, G2} - AnfVisitor::Match(prim::kPrimSwitch, {IsNode, IsValueNode, IsValueNode})(cnode->input(0)); - if (g2_ == nullptr || g1_->output() == nullptr || g2_->output() == nullptr) { - return nullptr; - } - // for switch replace method, only graphs without graph inside can be replaced - for (auto &item : g1_->value_nodes()) { - auto value_node = item.first; - if (IsValueNode(value_node)) { - 
return nullptr; + auto node_ = cnode_->input(0); + + PatternNode cond, true_br, false_br; + + auto ConvertSwitchLambda = [&node_, &cond, &true_br, &false_br]() -> AnfNodePtr { + auto g1_ = GetValueNode(true_br.GetNode(node_)); + auto g2_ = GetValueNode(false_br.GetNode(node_)); + auto x_ = cond.GetNode(node_); + + // for switch replace method, only graphs without graph inside can be replaced + for (auto &item : g1_->value_nodes()) { + auto value_node = item.first; + if (IsValueNode(value_node)) { + return nullptr; + } } - } - for (auto &item : g2_->value_nodes()) { - auto value_node = item.first; - if (IsValueNode(value_node)) { - return nullptr; + for (auto &item : g2_->value_nodes()) { + auto value_node = item.first; + if (IsValueNode(value_node)) { + return nullptr; + } } - } - auto true_output = g1_->output()->abstract(); - auto false_output = g2_->output()->abstract(); - auto trans_g1 = internal::TransformGraphCondTrueBranchNodes(g1_, x_); - auto trans_g2 = internal::TransformGraphCondFalseBranchNodes(g2_, x_); - - std::vector params; - auto fg = node->func_graph(); - auto cloned_g1 = InlineClone(trans_g1, fg, params); - auto cloned_g2 = InlineClone(trans_g2, fg, params); - auto nnode = internal::TransformMergeBranches(cloned_g1, cloned_g2, true_output, false_output, x_, fg); - return nnode; - } + auto true_output = g1_->output()->abstract(); + auto false_output = g2_->output()->abstract(); + auto trans_g1 = internal::TransformGraphCondTrueBranchNodes(g1_, x_); + auto trans_g2 = internal::TransformGraphCondFalseBranchNodes(g2_, x_); - void Visit(const AnfNodePtr &node) override { - if (x_ == nullptr) { - x_ = node; - return; - } - AnfVisitor::Visit(node); - } + std::vector params; + auto fg = node_->func_graph(); + auto cloned_g1 = InlineClone(trans_g1, fg, params); + auto cloned_g2 = InlineClone(trans_g2, fg, params); + auto nnode = internal::TransformMergeBranches(cloned_g1, cloned_g2, true_output, false_output, x_, fg); - void Visit(const ValueNodePtr 
&vnode) override { - auto g = GetValueNode(vnode); - if (g1_ == nullptr) { - g1_ = g; - } else { - g2_ = g; - } - } + return nnode; + }; - void Reset() { - x_ = nullptr; - g1_ = nullptr; - g2_ = nullptr; - } + MATCH_REPLACE_LAMBDA_IF( + node_, PPrimitive(prim::kPrimSwitch, cond, true_br, false_br), ConvertSwitchLambda, + true_br.CheckFunc(IsValueNode, node_) && false_br.CheckFunc(IsValueNode, node_)); - private: - AnfNodePtr x_{nullptr}; - FuncGraphPtr g1_{nullptr}, g2_{nullptr}; + return nullptr; + } }; + } // namespace irpass } // namespace opt } // namespace mindspore diff --git a/mindspore/ccsrc/optimizer/irpass/env_item_eliminate.h b/mindspore/ccsrc/optimizer/irpass/env_item_eliminate.h index ce29b32d14..0f59c69fef 100644 --- a/mindspore/ccsrc/optimizer/irpass/env_item_eliminate.h +++ b/mindspore/ccsrc/optimizer/irpass/env_item_eliminate.h @@ -225,6 +225,33 @@ class EnvGetSetItem : public AnfVisitor { bool is_match_{false}; }; +class EnvGetItemEliminater { + public: + EnvGetItemEliminater() : new_env_get_item_(), add_env_get_item_(), env_get_set_item_() { + eliminaters_.emplace_back(new_env_get_item_); + eliminaters_.emplace_back(add_env_get_item_); + eliminaters_.emplace_back(env_get_set_item_); + } + ~EnvGetItemEliminater() = default; + + AnfNodePtr operator()(const OptimizerPtr &optimizer, const AnfNodePtr &node) { + AnfNodePtr new_node; + for (auto &eliminater : eliminaters_) { + new_node = eliminater(optimizer, node); + if (new_node != nullptr) { + return new_node; + } + } + return nullptr; + } + + private: + NewEnvGetItem new_env_get_item_; + AddEnvGetItem add_env_get_item_; + EnvGetSetItem env_get_set_item_; + std::vector eliminaters_{}; +}; + // {prim::kPrimEnvGetItem, {G, Xs}, C, Y} class IncorporateEnvGetitem : public AnfVisitor { public: diff --git a/mindspore/ccsrc/optimizer/irpass/gradient_eliminate.h b/mindspore/ccsrc/optimizer/irpass/gradient_eliminate.h index 651dc3a2f2..671d9bde49 100644 --- 
a/mindspore/ccsrc/optimizer/irpass/gradient_eliminate.h +++ b/mindspore/ccsrc/optimizer/irpass/gradient_eliminate.h @@ -55,21 +55,6 @@ class ExpandJPrim : public AnfVisitor { private: ValueNodePtr x_{nullptr}; }; - -// stop_gradient(x) ==> x -class StopGradientEliminater : public AnfVisitor { - public: - AnfNodePtr operator()(const OptimizerPtr &, const AnfNodePtr &node) override { - x_ = nullptr; - AnfVisitor::Match(prim::kPrimStopGradient)(node); - return x_; - } - - void Visit(const AnfNodePtr &node) override { x_ = node; } - - private: - AnfNodePtr x_{nullptr}; -}; } // namespace irpass } // namespace opt } // namespace mindspore diff --git a/mindspore/ccsrc/optimizer/irpass/incorporate_getitem.h b/mindspore/ccsrc/optimizer/irpass/incorporate_getitem.h index 77f3fa7b36..5afee45e95 100644 --- a/mindspore/ccsrc/optimizer/irpass/incorporate_getitem.h +++ b/mindspore/ccsrc/optimizer/irpass/incorporate_getitem.h @@ -21,6 +21,7 @@ #include #include #include +#include #include "optimizer/irpass.h" #include "optimizer/optimizer.h" @@ -28,7 +29,6 @@ #include "ir/func_graph.h" #include "ir/func_graph_cloner.h" #include "operator/ops.h" - namespace mindspore { namespace opt { namespace irpass { @@ -81,13 +81,32 @@ class IncorporateGetitem : public AnfVisitor { AnfNodePtr operator()(const OptimizerPtr &, const AnfNodePtr &node) override { Reset(); AnfVisitor::Match(prim::kPrimTupleGetItem, {IsCNode, IsValueNode})(node); + if (node->func_graph() == nullptr || idx_ == -1 || fg_ == nullptr) { + return nullptr; + } - if (node->func_graph() != nullptr && idx_ >= 0 && fg_ != nullptr) { - auto new_fg = getitem_transform_(fg_, idx_); - (void)args_.insert(args_.begin(), NewValueNode(new_fg)); - return node->func_graph()->NewCNode(args_); + if (fg_->has_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL)) { + // If graph kernel has muti output, do not split. + // some graph kernel output has EnvInstance node or DeadCode node should split. 
+ auto output = fg_->output(); + if (IsPrimitiveCNode(output, prim::kPrimMakeTuple)) { + auto output_cnode = output->cast(); + auto outputs = output_cnode->inputs(); + int real_output_cnt = 0; + for (size_t i = 1; i < outputs.size(); ++i) { + if (IsCNode(outputs[i]) || IsValueNode(outputs[i]) || IsParam(outputs[i])) { + real_output_cnt++; + if (real_output_cnt > 1) { + return nullptr; + } + } + } + } } - return nullptr; + + auto new_fg = getitem_transform_(fg_, idx_); + (void)args_.insert(args_.begin(), NewValueNode(new_fg)); + return node->func_graph()->NewCNode(args_); } void Visit(const CNodePtr &cnode) override { @@ -115,6 +134,172 @@ class IncorporateGetitem : public AnfVisitor { internal::GetitemTransform getitem_transform_; }; +class IncorporateGetitemFromParam : public AnfVisitor { + public: + void Process(const FuncGraphPtr &func_graph, const CNodePtr &cnode, const AnfNodePtr ¶m, size_t input_idx) { + auto mng = func_graph->manager(); + MS_EXCEPTION_IF_NULL(mng); + auto &node_users = mng->node_users(); + if (node_users.find(param) == node_users.end() || node_users[param].empty()) { + args_.push_back(cnode->input(input_idx + 1)); + return; + } + + for (auto &user : node_users[param]) { + if (!IsPrimitiveCNode(user.first, prim::kPrimTupleGetItem)) { + // we do not process this case. + args_.push_back(cnode->input(input_idx + 1)); + return; + } + } + + // update new args. 
+ if (IsPrimitiveCNode(cnode->input(input_idx + 1), prim::kPrimMakeTuple)) { + // case 1 + replace_parameters_[input_idx] = true; + need_update_ = true; + auto make_tuple_cnode = cnode->input(input_idx + 1)->cast(); + auto &make_tuple_cnode_inputs = make_tuple_cnode->inputs(); + inputs_num_[input_idx] = make_tuple_cnode_inputs.size() - 1; + args_.insert(args_.end(), make_tuple_cnode_inputs.begin() + 1, make_tuple_cnode_inputs.end()); + } else { + // case 2 + auto prev_cnode = cnode->input(input_idx + 1)->cast(); + auto prev_fg = GetValueNode(prev_cnode->input(0)); + auto fg_output = prev_fg->output(); + if (!IsPrimitiveCNode(fg_output, prim::kPrimMakeTuple)) { + MS_LOG(ERROR) << "The return of: " << prev_fg->ToString() + << " should be a make tuple, but got: " << fg_output->DebugString(); + return; + } + replace_parameters_[input_idx] = true; + need_update_ = true; + auto make_tuple_cnode = fg_output->cast(); + inputs_num_[input_idx] = make_tuple_cnode->inputs().size() - 1; + for (size_t output_i = 0; output_i < inputs_num_[input_idx]; ++output_i) { + auto new_getitem = + func_graph->NewCNode({NewValueNode(prim::kPrimTupleGetItem), prev_cnode, NewValueNode(SizeToInt(output_i))}); + auto aptr = std::make_shared(std::make_shared(SizeToInt(output_i))); + new_getitem->input(2)->set_abstract(aptr); + new_getitem->set_abstract(make_tuple_cnode->input(output_i + 1)->abstract()); + args_.push_back(new_getitem); + } + } + } + + AnfNodePtr operator()(const OptimizerPtr &, const AnfNodePtr &node) override { + if (node->func_graph() == nullptr) { + return nullptr; + } + + Reset(); + + auto cnode = node->cast(); + if (cnode == nullptr) { + return nullptr; + } + auto &inputs = cnode->inputs(); + auto fg = GetValueNode(inputs[0]); + if (fg == nullptr) { + return nullptr; + } + auto mng = fg->manager(); + MS_EXCEPTION_IF_NULL(mng); + auto parameters = fg->parameters(); + if (parameters.size() != inputs.size() - 1) { + return nullptr; + } + replace_parameters_ = 
std::vector(parameters.size(), false); + inputs_num_ = std::vector(parameters.size(), 1); + auto node_fg = node->func_graph(); + + for (size_t i = 1; i < inputs.size(); ++i) { + if (IsPrimitiveCNode(inputs[i], prim::kPrimMakeTuple) || IsCNodeGraphKernel(inputs[i])) { + Process(node_fg, cnode, parameters[i - 1], i - 1); + } else { + args_.push_back(inputs[i]); + } + } + + if (!need_update_) { + return nullptr; + } + + FuncGraphPtr new_fg = TransformableClone(fg, std::make_shared("sp")); + mng->AddFuncGraph(new_fg); + + auto node_users = mng->node_users(); + std::vector new_fg_parameters = new_fg->parameters(); + std::vector new_parameters; + size_t curr_input_idx{0}; + for (size_t param_i = 0; param_i < new_fg_parameters.size(); ++param_i) { + if (!replace_parameters_[param_i]) { + if (parameters[param_i]->abstract() != nullptr) { + new_fg_parameters[param_i]->set_abstract(parameters[param_i]->abstract()); + } + new_parameters.push_back(new_fg_parameters[param_i]); + curr_input_idx++; + continue; + } + + // make a new parameter. + for (size_t input_i = 0; input_i < inputs_num_[param_i]; ++input_i) { + auto new_param = std::make_shared(new_fg); + new_param->set_abstract(args_.at(curr_input_idx)->abstract()); + + // update users of new parameter. 
+ for (auto &user : node_users[new_fg_parameters[param_i]]) { + idx_ = -1; + AnfVisitor::Match(prim::kPrimTupleGetItem, {IsParam, IsValueNode})(user.first); + if (idx_ == -1) { + MS_LOG(ERROR) << "User of: " << new_fg_parameters[param_i]->DebugString() + << " must be tuple getitem here, but got: " << user.first->DebugString(); + return nullptr; + } + + if (input_i == IntToSize(idx_)) { + for (auto &sub_user : node_users[user.first]) { + auto sub_user_cnode = sub_user.first->cast(); + MS_EXCEPTION_IF_NULL(sub_user_cnode); + sub_user_cnode->set_input(sub_user.second, new_param); + (void)mng->Replace(sub_user.first, sub_user_cnode); + } + } + } + + // (void)mng->Replace(new_fg_parameters[param_i], new_param); + new_parameters.push_back(new_param); + curr_input_idx++; + } + } + + mng->SetParameters(new_fg, new_parameters); + (void)args_.insert(args_.begin(), NewValueNode(new_fg)); + auto new_call = node_fg->NewCNode(args_); + new_call->set_abstract(node->abstract()); + return new_call; + } + + void Visit(const ValueNodePtr &vnode) override { idx_ = GetValue(vnode->value()); } + + void Visit(const CNodePtr &cnode) override {} + + void Reset() { + replace_parameters_.clear(); + args_.clear(); + inputs_num_.clear(); + need_update_ = false; + idx_ = -1; + } + + private: + std::vector replace_parameters_{}; + std::vector args_{}; + std::vector inputs_num_{}; + bool need_update_{false}; + int idx_{-1}; +}; + // {prim::kPrimTupleGetItem, {{prim::kPrimSwitch, X, G1, G2}, Xs}, C} class IncorporateGetitemSwitch : public AnfVisitor { public: @@ -197,6 +382,31 @@ class IncorporateGetitemSwitch : public AnfVisitor { std::vector args_{}; internal::GetitemTransform getitem_transform_; }; + +class IncorporateGetitemSet { + public: + IncorporateGetitemSet() : incorporate_getitem_(), incorporate_getitem_switch_() { + eliminaters_.emplace_back(incorporate_getitem_); + eliminaters_.emplace_back(incorporate_getitem_switch_); + } + ~IncorporateGetitemSet() = default; + + AnfNodePtr 
operator()(const OptimizerPtr &optimizer, const AnfNodePtr &node) { + AnfNodePtr new_node; + for (auto &eliminater : eliminaters_) { + new_node = eliminater(optimizer, node); + if (new_node != nullptr) { + return new_node; + } + } + return nullptr; + } + + private: + IncorporateGetitem incorporate_getitem_; + IncorporateGetitemSwitch incorporate_getitem_switch_; + std::vector eliminaters_{}; +}; } // namespace irpass } // namespace opt } // namespace mindspore diff --git a/mindspore/ccsrc/optimizer/irpass/inline.h b/mindspore/ccsrc/optimizer/irpass/inline.h index 8ebd0f6eb7..64f192347c 100644 --- a/mindspore/ccsrc/optimizer/irpass/inline.h +++ b/mindspore/ccsrc/optimizer/irpass/inline.h @@ -71,11 +71,7 @@ class ReplaceApplicator : public AnfVisitor { using CriterionFuncType = std::function; bool IsTrivial(const FuncGraphPtr &fg, AnfNodePtr) { - auto &s = fg->nodes(); - int n_cnode = std::count_if(s.begin(), s.end(), [](const AnfNodePtr &n) { - MS_EXCEPTION_IF_NULL(n); - return n->isa(); - }); + auto n_cnode = fg->nodes().size() - fg->parameters().size(); // There is at least one CNode(return, other_node). 
return n_cnode <= 2; } @@ -90,20 +86,10 @@ bool IsUniqueUse(const FuncGraphPtr &fg, AnfNodePtr) { bool IsInside(FuncGraphPtr, const AnfNodePtr &node) { MS_EXCEPTION_IF_NULL(node->func_graph()); - auto &flags = node->func_graph()->flags(); - if (flags.find("inline_inside") != flags.end()) { - return flags["inline_inside"]; - } - return false; + return node->func_graph()->has_flag("inline_inside"); } -bool IsCore(const FuncGraphPtr &fg, AnfNodePtr) { - auto &flags = fg->flags(); - if (flags.find("core") != flags.end()) { - return flags["core"]; - } - return false; -} +bool IsCore(const FuncGraphPtr &fg, AnfNodePtr) { return fg->has_flag("core"); } bool NoCriterion(FuncGraphPtr, AnfNodePtr) { return true; } @@ -127,6 +113,13 @@ class InlinerBase : public AnfVisitor { if (fg->has_flag(FUNC_GRAPH_FLAG_DEFER_INLINE)) { return nullptr; } + // Do not inline GraphKernel to Cell. + if (fg->has_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL) && !node->func_graph()->has_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL)) { + // If the GraphKernel only contains a return node, we make it inlined. + if (fg->nodes().size() - fg->parameters().size() > 1) { + return nullptr; + } + } Reset(); bool is_match = false; @@ -167,7 +160,8 @@ class InlinerBase : public AnfVisitor { auto params = fg->parameters(); auto old_size = params.size(); if (old_size != new_params.size()) { - MS_LOG(EXCEPTION) << "Parameter size not match."; + MS_LOG(EXCEPTION) << "Parameter size not match." 
<< old_size << " new " << new_params.size() + << fg->output()->DebugString(10); } for (size_t i = 0; i < old_size; i++) { (void)mng->Replace(params[i], new_params[i]); diff --git a/mindspore/ccsrc/optimizer/irpass/mark_interface_fusion.h b/mindspore/ccsrc/optimizer/irpass/mark_interface_fusion.h new file mode 100644 index 0000000000..6f2bcc187f --- /dev/null +++ b/mindspore/ccsrc/optimizer/irpass/mark_interface_fusion.h @@ -0,0 +1,86 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_OPTIMIZER_IRPASS_MARK_INTERFACE_FUSION_H +#define MINDSPORE_CCSRC_OPTIMIZER_IRPASS_MARK_INTERFACE_FUSION_H + +#include +#include +#include + +#include "session/anf_runtime_algorithm.h" +#include "optimizer/optimizer.h" +#include "optimizer/irpass.h" +#include "ir/visitor.h" +#include "operator/ops.h" +#include "utils/graph_utils.h" +#include "operator/composite/composite.h" + +namespace mindspore { +namespace opt { +namespace irpass { + +static int count = 0; + +std::string GetFusionNumber() { + std::stringstream ss; + ss << std::setw(4) << std::setfill('0') << count; + std::string num = ss.str(); + ++count; + + return "_" + num; +} + +// Mark CNodes which can be merged in kernel build +class MarkInterfaceFusion : public AnfVisitor { + public: + AnfNodePtr operator()(const OptimizerPtr &, const AnfNodePtr &node) override { + if (node->func_graph()->has_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL) && IsPrimitiveCNode(node, prim::kPrimSelect)) { + auto cnode = node->cast(); + auto condition = cnode->input(1); + std::string cmp; + std::unordered_map cmp_list = {{"GreaterEqual", "GE"}, {"Greater", "GT"}, + {"LessEqual", "LE"}, {"Less", "LT"}, + {"Equal", "EQ"}, {"NotEqual", "NE"}}; + if (IsPrimitiveCNode(condition)) { + auto prim_name = GetCNodeFuncName(condition->cast()); + if (cmp_list.count(prim_name) != 0) { + // Mark Select and compare node + cmp = cmp_list[prim_name]; + auto cnt = GetFusionNumber(); + AnfAlgo::SetNodeAttr("fusion", MakeValue("Select" + cmp + cnt), condition); + AnfAlgo::SetNodeAttr("fusion", MakeValue("Select" + cmp + cnt + "_end"), node); + for (size_t i = 1; i < cnode->inputs().size(); ++i) { + if (IsPrimitiveCNode(cnode->input(i), prim::kPrimZerosLike)) { + AnfAlgo::SetNodeAttr("fusion", MakeValue("Select" + cmp + cnt), cnode->input(i)); + } + } + } + } + } + return nullptr; + } + + void Visit(const AnfNodePtr &) override {} + + private: + AnfNodePtr y_{nullptr}; +}; + +} // namespace irpass +} // namespace opt +} // 
namespace mindspore +#endif // MINDSPORE_CCSRC_OPTIMIZER_IRPASS_MARK_INTERFACE_FUSION_H diff --git a/mindspore/ccsrc/optimizer/irpass/merge_addn.h b/mindspore/ccsrc/optimizer/irpass/merge_addn.h index 7a7c62f6f6..94f9e26c5b 100644 --- a/mindspore/ccsrc/optimizer/irpass/merge_addn.h +++ b/mindspore/ccsrc/optimizer/irpass/merge_addn.h @@ -19,6 +19,7 @@ #include #include +#include #include "optimizer/irpass.h" #include "optimizer/optimizer.h" @@ -177,7 +178,7 @@ class AddNZeroFilter : public AnfVisitor { // {kPrimMakeTuple, X1, X2, ...} filtered_Xs_.push_back(NewValueNode(prim::kPrimMakeTuple)); for (auto &x : Xs_) { - if (!IsPrimitiveCNode(x, prim::kPrimZerosLikeTensor)) { + if (!IsPrimitiveCNode(x, prim::kPrimZerosLike)) { filtered_Xs_.push_back(x); } else { has_zero_like_ = true; @@ -196,6 +197,131 @@ class AddNZeroFilter : public AnfVisitor { std::vector filtered_Xs_{}, Xs_{}; bool has_zero_like_{false}; }; + +// {PrimAddN, {kPrimMakeTuple, Xs}} +// Akg don't support AddN(ValueNode, Tensor, ...), converted to TensorAdd. 
+// case0: AddN(inputs)(inputs size < 2) -> error +// case1: AddN(inputs)(all inputs is ValueNode) -> error +// case2: AddN(inputs)(inputs size = 2) -> TensorAdd(Tensor, Tensor) +// case3: AddN(ValueNode, Tensor, Tensor, ...)(has one ValueNode input) +// -> TensorAdd(ValueNode, AddN(Tensor, Tensor, ...)) +class AddNEliminater : public AnfVisitor { + public: + AnfNodePtr operator()(const OptimizerPtr &, const AnfNodePtr &node) override { + if (!node->isa() || node->func_graph() == nullptr) { + return nullptr; + } + + auto &inputs = node->cast()->inputs(); + auto fg = GetValueNode(inputs[0]); + MS_EXCEPTION_IF_NULL(fg); + auto mng = fg->manager(); + MS_EXCEPTION_IF_NULL(mng); + if (fg->recursive()) { + return nullptr; + } + + auto new_fg = TransformableClone(fg, std::make_shared("fg")); + mng->AddFuncGraph(new_fg); + need_update_ = false; + bool changed = false; + do { + changed = false; + changed |= Process(new_fg); + } while (changed); + + if (!need_update_) { + return nullptr; + } else { + auto new_sx = inputs; + new_sx[0] = NewValueNode(new_fg); + return node->func_graph()->NewCNode(new_sx); + } + } + + bool Process(const FuncGraphPtr &func_graph) { + auto mng = func_graph->manager(); + MS_EXCEPTION_IF_NULL(mng); + auto nodes = TopoSort(func_graph->output()); + bool changed = false; + + for (size_t i = 0; i < nodes.size(); ++i) { + auto node = nodes[i]; + if (!IsPrimitiveCNode(node, prim::kPrimAddN)) { + continue; + } + + auto cnode = node->cast(); + MS_EXCEPTION_IF_NULL(cnode); + auto &tuple_input = cnode->input(1); + MS_EXCEPTION_IF_NULL(tuple_input); + auto tuple_input_cnode = tuple_input->cast(); + MS_EXCEPTION_IF_NULL(tuple_input_cnode); + auto &tuple_inputs = tuple_input_cnode->inputs(); + if (tuple_inputs.size() < 3) { + // case0: inputs size < 2, error + MS_EXCEPTION(ArgumentError) << "Inputs size of AddN less than 2. 
" << cnode->DebugString(2); + } + + int valuenode_num = + std::accumulate(tuple_inputs.begin() + 1, tuple_inputs.end(), 0, [](int accumulator, const AnfNodePtr &node) { + if (IsValueNode(node)) { + return accumulator + 1; + } else { + return accumulator; + } + }); + if (IntToSize(valuenode_num) == tuple_inputs.size()) { + // case1: all inputs is ValueNode, error + MS_EXCEPTION(ArgumentError) << "All inputs of AddN is ValueNode. " << cnode->DebugString(2); + } + + if (tuple_inputs.size() == 3) { + // case2: inputs size = 2, -> TensorAdd(Tensor, Tensor) + MS_LOG(DEBUG) << "Replace AddN with two inputs with TensorAdd. " << cnode->DebugString(2); + ValuePtr prim_tensoradd = prim::GetPythonOps("TensorAdd", "mindspore.ops.operations"); + std::vector new_xs{func_graph->NewCNode({NewValueNode(prim_tensoradd)}), tuple_inputs[1], + tuple_inputs[2]}; + mng->Replace(node, func_graph->NewCNode(new_xs)); + changed = true; + continue; + } + + auto first_valuenode = std::find_if(tuple_inputs.begin() + 1, tuple_inputs.end(), + [](const AnfNodePtr &node) { return IsValueNode(node); }); + if (first_valuenode == tuple_inputs.end()) { + // no ValueNode input found. 
+ continue; + } else { + // case3: has one ValueNode input -> TensorAdd(ValueNode, AddN(Tensor, Tensor, ...)) + std::vector make_tuple_new_xs{ + NewValueNode(prim::kPrimMakeTuple), + }; + std::for_each(tuple_inputs.begin() + 1, tuple_inputs.end(), + [&make_tuple_new_xs, &first_valuenode](const AnfNodePtr &node) { + if (node != *first_valuenode) { + make_tuple_new_xs.push_back(node); + } + }); + ValuePtr prim_addn = prim::GetPythonOps("AddN", "mindspore.ops.operations"); + auto new_addn = func_graph->NewCNode( + {func_graph->NewCNode({NewValueNode(prim_addn)}), func_graph->NewCNode(make_tuple_new_xs)}); + ValuePtr prim_tensoradd = prim::GetPythonOps("TensorAdd", "mindspore.ops.operations"); + auto new_add = + func_graph->NewCNode({func_graph->NewCNode({NewValueNode(prim_tensoradd)}), *first_valuenode, new_addn}); + (void)mng->Replace(node, new_add); + changed = true; + continue; + } + } + + need_update_ |= changed; + return changed; + } + + private: + bool need_update_{false}; +}; } // namespace irpass } // namespace opt } // namespace mindspore diff --git a/mindspore/ccsrc/optimizer/irpass/reduce_eliminate.h b/mindspore/ccsrc/optimizer/irpass/reduce_eliminate.h index 73dbc152e5..d2e1d15f91 100644 --- a/mindspore/ccsrc/optimizer/irpass/reduce_eliminate.h +++ b/mindspore/ccsrc/optimizer/irpass/reduce_eliminate.h @@ -79,7 +79,7 @@ class ReduceOneEliminater : public AnfVisitor { } void Visit(const AnfNodePtr &node) override { - if (x_ == nullptr) { + if (!IsVNode(node) && x_ == nullptr) { if (IsValueNode(node)) { is_tensor_ = true; } diff --git a/mindspore/ccsrc/optimizer/irpass/ref_eliminate.h b/mindspore/ccsrc/optimizer/irpass/ref_eliminate.h index 201992ef13..599ee8c339 100644 --- a/mindspore/ccsrc/optimizer/irpass/ref_eliminate.h +++ b/mindspore/ccsrc/optimizer/irpass/ref_eliminate.h @@ -21,86 +21,69 @@ #include "optimizer/optimizer.h" #include "optimizer/irpass.h" -#include "ir/visitor.h" -#include "operator/ops.h" +#include "ir/pattern_matcher.h" namespace 
mindspore { namespace opt { namespace irpass { // {prim::kPrimMakeRef, X, Y, Z} -> Y -class MakeRefEliminater : public AnfVisitor { +class MakeRefEliminater : public OptimizerCaller { public: AnfNodePtr operator()(const OptimizerPtr &, const AnfNodePtr &node) override { - y_ = nullptr; - auto gety = [this](const AnfNodePtr &node) -> bool { - this->y_ = node; - return true; - }; - AnfVisitor::Match(prim::kPrimMakeRef, {IsNode, gety, IsNode})(node); - return y_; + PatternNode x, y, z; + MATCH_REPLACE(node, PPrimitive(prim::kPrimMakeRef, x, y, z), y); + return nullptr; } +}; - void Visit(const AnfNodePtr &) override {} - - private: - AnfNodePtr y_{nullptr}; +// {prim::kPrimGetRefValue, Parameter} -> Parameter +// {prim::kPrimGetRefOrigin, Parameter} -> Parameter +class GetRefParamEliminater : public OptimizerCaller { + public: + AnfNodePtr operator()(const OptimizerPtr &, const AnfNodePtr &node) override { + PatternNode x; + MATCH_REPLACE_IF(node, PPrimitive(prim::kPrimGetRefValue, x), x, x.CheckFunc(IsParam, node)); + MATCH_REPLACE_IF(node, PPrimitive(prim::kPrimGetRefOrigin, x), x, x.CheckFunc(IsParam, node)); + return nullptr; + } }; // {prim::kPrimGetRefKey, {prim::kPrimMakeRef, X, Y, Z}} -> X // {prim::kPrimGetRefValue, {prim::kPrimMakeRef, X, Y, Z}} -> Y // {prim::kPrimGetRefOrigin, {prim::kPrimMakeRef, X, Y, Z}} -> Z -class GetMakeRefEliminater : public AnfVisitor { +class GetMakeRefEliminater : public OptimizerCaller { public: AnfNodePtr operator()(const OptimizerPtr &, const AnfNodePtr &node) override { - auto cnode = node->cast(); - if (cnode == nullptr || cnode->size() != 2) { - return nullptr; - } - - // {prim::kPrimGetRefKey/Value, {...}} - auto ref = cnode->input(1)->cast(); - if (ref == nullptr || !ref->IsApply(prim::kPrimMakeRef) || ref->size() != 4) { - return nullptr; - } - - // {prim::kPrimMakeRef, X, Y, Z} - if (cnode->IsApply(prim::kPrimGetRefKey)) { - return ref->input(1); - } - - if (cnode->IsApply(prim::kPrimGetRefValue)) { - return 
ref->input(2); - } - - if (cnode->IsApply(prim::kPrimGetRefOrigin)) { - return ref->input(3); - } - + PatternNode x, y, z; + MATCH_REPLACE(node, PPrimitive(prim::kPrimGetRefKey, PPrimitive(prim::kPrimMakeRef, x, y, z)), x); + MATCH_REPLACE(node, PPrimitive(prim::kPrimGetRefValue, PPrimitive(prim::kPrimMakeRef, x, y, z)), y); + MATCH_REPLACE(node, PPrimitive(prim::kPrimGetRefOrigin, PPrimitive(prim::kPrimMakeRef, x, y, z)), z); return nullptr; } }; // IsValueNode -class ReplaceRefkeyByParam : public AnfVisitor { +class ReplaceRefkeyByParam : public OptimizerCaller { public: AnfNodePtr operator()(const OptimizerPtr &optimizer, const AnfNodePtr &node) override { - if (!IsValueNode(node)) { - return nullptr; - } - - auto refkey = GetValueNode(node); - auto resource = std::dynamic_pointer_cast(optimizer->resource()); - MS_EXCEPTION_IF_NULL(resource); - - auto top_graph = resource->func_graph(); - MS_EXCEPTION_IF_NULL(top_graph); - - for (const auto &tnode : top_graph->parameters()) { - auto para = tnode->cast(); - if (para != nullptr && para->name() == refkey->tag()) { - return para; + auto RefKeyLambda = [&node, &optimizer]() -> AnfNodePtr { + auto refkey = GetValueNode(node); + auto resource = std::dynamic_pointer_cast(optimizer->resource()); + MS_EXCEPTION_IF_NULL(resource); + + auto top_graph = resource->func_graph(); + MS_EXCEPTION_IF_NULL(top_graph); + + for (const auto &tnode : top_graph->parameters()) { + auto para = tnode->cast(); + if (para != nullptr && para->name() == refkey->tag()) { + return para; + } } - } + return nullptr; + }; + PatternNode x; + MATCH_REPLACE_LAMBDA_IF(node, x, RefKeyLambda, x.CheckFunc(IsValueNode, node)); return nullptr; } }; diff --git a/mindspore/ccsrc/optimizer/irpass/reshape_eliminate.h b/mindspore/ccsrc/optimizer/irpass/reshape_eliminate.h index f1f73de4d9..fb43f6ffd8 100644 --- a/mindspore/ccsrc/optimizer/irpass/reshape_eliminate.h +++ b/mindspore/ccsrc/optimizer/irpass/reshape_eliminate.h @@ -50,11 +50,15 @@ class 
ReshapeSameShapeEliminater : public AnfVisitor { } auto src_shape = src_shape_abs->GetShapeTrack(); - auto tgt_shape = GetValueNode(shape_); - if (src_shape != nullptr && tgt_shape != nullptr && src_shape->isa()) { - auto elements = GetValue>(tgt_shape); + auto tgt_shape_abs = node->abstract(); + if (tgt_shape_abs == nullptr) { + return nullptr; + } + auto tgt_shape = tgt_shape_abs->GetShapeTrack(); + if (src_shape != nullptr && tgt_shape != nullptr && src_shape->isa() && tgt_shape->isa()) { + auto elements = tgt_shape->cast(); auto shape = src_shape->cast(); - if (shape->shape() == elements) { + if (shape->shape() == elements->shape()) { return x_; } } diff --git a/mindspore/ccsrc/optimizer/irpass/special_op_eliminate.h b/mindspore/ccsrc/optimizer/irpass/special_op_eliminate.h index aa23441bbb..1dc8fbb344 100644 --- a/mindspore/ccsrc/optimizer/irpass/special_op_eliminate.h +++ b/mindspore/ccsrc/optimizer/irpass/special_op_eliminate.h @@ -24,9 +24,11 @@ #include "optimizer/optimizer.h" #include "optimizer/irpass.h" +#include "ir/optimizer_caller.h" #include "optimizer/irpass/prim_eliminate.h" #include "ir/visitor.h" #include "operator/ops.h" +#include "ir/pattern_matcher.h" namespace mindspore { namespace opt { @@ -35,12 +37,14 @@ class SpecialOpEliminater { public: SpecialOpEliminater() : insert_gradient_of_(prim::kPrimInsertGradientOf), + stop_gradient_(prim::kPrimStopGradient), hook_backward_(prim::kPrimHookBackward), print_shape_type_(prim::kPrimPrintShapeType), get_ref_value_(prim::kPrimGetRefValue), mirror_(prim::kPrimMirror), virtual_div_(prim::kPrimVirtualDiv) { eliminaters_.emplace_back(insert_gradient_of_); + eliminaters_.emplace_back(stop_gradient_); eliminaters_.emplace_back(hook_backward_); eliminaters_.emplace_back(print_shape_type_); eliminaters_.emplace_back(get_ref_value_); @@ -61,7 +65,8 @@ class SpecialOpEliminater { } private: - PrimEliminater insert_gradient_of_, hook_backward_, print_shape_type_, get_ref_value_, mirror_, virtual_div_; + 
PrimEliminater insert_gradient_of_, stop_gradient_, hook_backward_, print_shape_type_, get_ref_value_, mirror_, + virtual_div_; std::vector eliminaters_{}; }; @@ -137,13 +142,13 @@ class ResetDeferInline : public AnfVisitor { AnfNodePtr operator()(const OptimizerPtr &, const AnfNodePtr &node) override { if (IsValueNode(node)) { auto fg = GetValueNode(node); - fg->set_flags(FUNC_GRAPH_FLAG_DEFER_INLINE, false); + fg->set_flag(FUNC_GRAPH_FLAG_DEFER_INLINE, false); } return nullptr; } }; -// {PrimZerosLikeTensor, Y} -> +// {PrimZerosLike, Y} -> // {PrimFill, {PrimDType, Y}, {PrimShape, Y}, 0} class ZeroLikeFillZero : public AnfVisitor { public: @@ -155,7 +160,7 @@ class ZeroLikeFillZero : public AnfVisitor { AnfNodePtr operator()(const OptimizerPtr &, const AnfNodePtr &node) override { y_ = nullptr; - AnfVisitor::Match(prim::kPrimZerosLikeTensor, {IsNode})(node); + AnfVisitor::Match(prim::kPrimZerosLike, {IsNode})(node); if (y_ == nullptr || node->func_graph() == nullptr) { return nullptr; } @@ -188,6 +193,17 @@ class ZeroLikeFillZero : public AnfVisitor { AnfNodePtr y_{nullptr}; PrimitivePtr PrimFill_, PrimShape_, PrimDType_; }; + +// {prim::kPrimDepend, X, ValueCond}->X +class DependValueElim : public OptimizerCaller { + public: + AnfNodePtr operator()(const OptimizerPtr &, const AnfNodePtr &node) override { + PatternNode x, cond; + MATCH_REPLACE_IF(node, PPrimitive(prim::kPrimDepend, x, cond), x, IsVNode(cond.GetNode(node))); + return nullptr; + } +}; + } // namespace irpass } // namespace opt } // namespace mindspore diff --git a/mindspore/ccsrc/optimizer/irpass/specialize_transform.h b/mindspore/ccsrc/optimizer/irpass/specialize_transform.h index 905479df77..6ac4e40f5e 100644 --- a/mindspore/ccsrc/optimizer/irpass/specialize_transform.h +++ b/mindspore/ccsrc/optimizer/irpass/specialize_transform.h @@ -22,6 +22,7 @@ #include #include #include +#include #include "optimizer/irpass.h" #include "optimizer/optimizer.h" @@ -41,7 +42,7 @@ class SpecializeTransform { 
~SpecializeTransform() = default; FuncGraphPtr operator()(const FuncGraphPtr &func_graph, std::vector graph_args, - std::vector prim_args) { + std::vector prim_args, std::vector value_args) { if (cache_.count(func_graph) == 0) { cache_[func_graph] = {}; } @@ -69,6 +70,13 @@ class SpecializeTransform { (void)mng->Replace(params[i], arg); continue; } + if (value_args[i] != nullptr) { + auto const_tensor = *value_args[i]; + auto const_tensor_ptr = std::make_shared(const_tensor); + AnfNodePtr arg = NewValueNode(const_tensor_ptr); + (void)mng->Replace(params[i], arg); + continue; + } new_params.push_back(params[i]); } @@ -108,6 +116,7 @@ class SpecializeOnGraphArguments : public AnfVisitor { std::vector graph_args; std::vector prim_args; + std::vector value_node_args; std::vector new_xs; bool hasVNode = false; for (size_t i = 1; i < inputs.size(); i++) { @@ -115,15 +124,24 @@ class SpecializeOnGraphArguments : public AnfVisitor { auto fg_vnode = GetValueNode(inputs[i]); graph_args.push_back(fg_vnode); prim_args.emplace_back(nullptr); + value_node_args.emplace_back(nullptr); hasVNode = true; } else if (IsValueNode(inputs[i])) { auto p_vnode = GetValueNode(inputs[i]); graph_args.emplace_back(nullptr); prim_args.push_back(p_vnode); + value_node_args.emplace_back(nullptr); + hasVNode = true; + } else if (IsValueNode(inputs[i])) { + tensor::TensorPtr t_vnode = GetValueNode(inputs[i]); + graph_args.emplace_back(nullptr); + prim_args.emplace_back(nullptr); + value_node_args.emplace_back(t_vnode); hasVNode = true; } else { graph_args.emplace_back(nullptr); prim_args.emplace_back(nullptr); + value_node_args.emplace_back(nullptr); new_xs.push_back(inputs[i]); } } @@ -132,7 +150,7 @@ class SpecializeOnGraphArguments : public AnfVisitor { return nullptr; } - auto new_fg = specialize_transform_(inp0_fg, graph_args, prim_args); + auto new_fg = specialize_transform_(inp0_fg, graph_args, prim_args, value_node_args); (void)new_xs.insert(new_xs.begin(), NewValueNode(new_fg)); return 
node->func_graph()->NewCNode(new_xs); @@ -141,6 +159,146 @@ class SpecializeOnGraphArguments : public AnfVisitor { private: internal::SpecializeTransform specialize_transform_; }; + +// Eliminate unused parameters. +// {G, Xs} +class UnusedParasEliminater : public AnfVisitor { + public: + AnfNodePtr operator()(const OptimizerPtr &, const AnfNodePtr &node) override { + if (!node->isa() || node->func_graph() == nullptr) { + return nullptr; + } + + auto cnode = node->cast(); + MS_EXCEPTION_IF_NULL(cnode); + auto &inputs = cnode->inputs(); + auto fg = GetValueNode(inputs[0]); + MS_EXCEPTION_IF_NULL(fg); + + std::vector parameters = fg->parameters(); + size_t size = parameters.size(); + if (size != inputs.size() - 1) { + return nullptr; + } + + std::vector new_xs; + std::vector keep_parameters; + auto mng = fg->manager(); + MS_EXCEPTION_IF_NULL(mng); + auto &node_users = mng->node_users(); + bool has_unused_para = false; + for (size_t i = 0; i < size; ++i) { + auto iter = node_users.find(parameters[i]); + if (iter != node_users.end() && !iter->second.empty()) { + keep_parameters.push_back(true); + new_xs.push_back(inputs[i + 1]); + continue; + } + keep_parameters.push_back(false); + has_unused_para = true; + } + + if (!has_unused_para) { + return nullptr; + } + FuncGraphPtr new_fg = TransformableClone(fg, std::make_shared("sp")); + mng->AddFuncGraph(new_fg); + + std::vector new_fg_parameters = new_fg->parameters(); + std::vector new_parameters; + for (size_t i = 0; i < size; i++) { + if (keep_parameters[i]) { + if (parameters[i]->abstract() != nullptr) { + new_fg_parameters[i]->set_abstract(parameters[i]->abstract()); + } + new_parameters.push_back(new_fg_parameters[i]); + } + } + mng->SetParameters(new_fg, new_parameters); + + (void)new_xs.insert(new_xs.begin(), NewValueNode(new_fg)); + return node->func_graph()->NewCNode(new_xs); + } +}; + +// Eliminate unused outputs. 
+// {G, Xs} +class UnusedOutputEliminater : public AnfVisitor { + public: + AnfNodePtr operator()(const OptimizerPtr &, const AnfNodePtr &node) override { + if (!node->isa() || node->func_graph() == nullptr) { + return nullptr; + } + + auto &inputs = node->cast()->inputs(); + auto fg = GetValueNode(inputs[0]); + MS_EXCEPTION_IF_NULL(fg); + auto mng = fg->manager(); + MS_EXCEPTION_IF_NULL(mng); + if (fg->recursive()) { + return nullptr; + } + + auto new_fg = TransformableClone(fg, std::make_shared("fg")); + mng->AddFuncGraph(new_fg); + auto new_fg_output = new_fg->output(); + if (!IsPrimitiveCNode(new_fg_output, prim::kPrimMakeTuple)) { + return nullptr; + } + + auto output_cnode = new_fg_output->cast(); + auto &node_users = mng->node_users(); + if (node_users.count(node) == 0 || node_users[node].empty()) { + return nullptr; + } + std::unordered_set used_output_idx; + std::vector> all_users; + for (auto &node_user : node_users[node]) { + if (!IsPrimitiveCNode(node_user.first, prim::kPrimTupleGetItem)) { + return nullptr; + } + auto user_cnode = node_user.first->cast(); + size_t used_idx = GetValue(user_cnode->input(2)->cast()->value()); + used_output_idx.insert(used_idx); + all_users.push_back(std::make_pair(node_user.first, used_idx)); + } + + if (used_output_idx.size() >= output_cnode->inputs().size() - 1) { + // all output has users. + return nullptr; + } + + if (used_output_idx.empty()) { + // we do not process this case. + return nullptr; + } else if (used_output_idx.size() == 1) { + // after eliminate, only one output left. + new_fg->set_output(output_cnode->input(*used_output_idx.begin() + 1)); + // update users. + for (auto &ret_user : all_users) { + (void)mng->Replace(ret_user.first, node); + } + } else { + // after eliminate, create new multi output. 
+ std::vector new_output_inputs{output_cnode->input(0)}; + std::unordered_map new_idx_map; + for (auto idx : used_output_idx) { + new_idx_map[idx] = SizeToInt(new_output_inputs.size() - 1); + new_output_inputs.push_back(output_cnode->input(idx + 1)); + } + new_fg->set_output(new_fg->NewCNode(new_output_inputs)); + // update users. + for (auto &ret_user : all_users) { + auto ret_user_cnode = ret_user.first->cast(); + ret_user_cnode->set_input(2, NewValueNode(new_idx_map[ret_user.second])); + } + } + + auto new_sx = inputs; + new_sx[0] = NewValueNode(new_fg); + return node->func_graph()->NewCNode(new_sx); + } +}; } // namespace irpass } // namespace opt } // namespace mindspore diff --git a/mindspore/ccsrc/optimizer/opt.cc b/mindspore/ccsrc/optimizer/opt.cc index 96b9d28f9b..82fbcc2036 100644 --- a/mindspore/ccsrc/optimizer/opt.cc +++ b/mindspore/ccsrc/optimizer/opt.cc @@ -44,8 +44,17 @@ SubstitutionPtr MakeSubstitution(const TransformFuncType &transform, const std:: return false; } + auto cnode = node->cast(); + auto inp0 = cnode->input(0); + auto prim0 = GetValueNode(inp0); + if (prim0 == nullptr) { + return false; + } + + auto hash = prim0->Hash(); + auto const &name = prim0->name(); for (auto &prim : prims) { - if (IsPrimitiveCNode(node, prim)) { + if (hash == prim->Hash() && name == prim->name()) { return true; } } @@ -88,7 +97,7 @@ AnfNodePtr Substitution::operator()(const OptimizerPtr &optimizer, const AnfNode return result; } -inline bool isTraversable(const AnfNodePtr &node) { +static bool isTraversable(const AnfNodePtr &node) { if (node == nullptr) { return false; } @@ -110,6 +119,7 @@ bool SubstitutionList::ApplyTransform(const OptimizerPtr &optimizer, const AnfNo auto seen = NewSeenGeneration(); // 1024 is for the initial capacity of deque std::deque todo(1024); + todo.clear(); todo.push_back(root_node); bool changes = false; @@ -171,7 +181,7 @@ bool SubstitutionList::ApplyTransform(const OptimizerPtr &optimizer, const AnfNo } #ifdef ENABLE_PROFILE - 
MsProfile::StatTime("opt.transform", GetTime() - start); + MsProfile::StatTime("opt.transform." + optimizer->name(), GetTime() - start); #endif return changes; } diff --git a/mindspore/ccsrc/optimizer/optimizer.h b/mindspore/ccsrc/optimizer/optimizer.h index d5808b4818..3e77edc1e9 100644 --- a/mindspore/ccsrc/optimizer/optimizer.h +++ b/mindspore/ccsrc/optimizer/optimizer.h @@ -29,6 +29,7 @@ #include "debug/draw.h" #include "debug/anf_ir_dump.h" +#include "debug/anf_ir_utils.h" #include "debug/trace.h" #include "optimizer/opt.h" #include "pipeline/resource.h" @@ -88,7 +89,7 @@ using OptPassGroupMap = std::vector>; class Optimizer : public std::enable_shared_from_this { public: Optimizer(const std::string &name, const pipeline::ResourceBasePtr &resource_ptr) - : name_(name), resource_(resource_ptr), run_only_once_(false), is_watch_renormalize_(false) {} + : name_(name), resource_(resource_ptr), run_only_once_(false), is_watch_renormalize_(false), is_enable_(true) {} virtual ~Optimizer() = default; void Init(const OptPassGroupMap &passes, bool run_only_once) { @@ -131,6 +132,9 @@ class Optimizer : public std::enable_shared_from_this { } FuncGraphPtr step(FuncGraphPtr func_graph, bool use_profile = true) { + if (!is_enable_) { + return func_graph; + } // Optimizer step counter; int counter = -1; bool changes = true; @@ -170,11 +174,12 @@ class Optimizer : public std::enable_shared_from_this { }; use_profile ? 
(WITH(MsProfile::GetProfile()->Step(pass_names_[i])) opt_func) : opt_func(); if (IS_OUTPUT_ON(mindspore::DEBUG) && MsContext::GetInstance()->save_graphs_flag()) { - MS_LOG(DEBUG) << name_ << " round " << counter << " OptPass " << pass_names_[i] << " end."; + MS_LOG(DEBUG) << "The opt " << name_ << " round " << counter << " OptPass " << pass_names_[i] << " end."; auto fg_name = "opt_substep_" + name_ + "_r" + std::to_string(counter) + "_" + std::to_string(i) + "_" + pass_names_[i]; func_graph->DumpFuncGraph(fg_name); DumpIR(fg_name + ".ir", func_graph); + ExportIR(fg_name + ".dat", "", func_graph); MS_LOG(DEBUG) << "Dump " << pass_names_[i] << " func graph."; } } @@ -209,6 +214,7 @@ class Optimizer : public std::enable_shared_from_this { void enable_watch_renormalize() { is_watch_renormalize_ = true; } void disable_watch_renormalize() { is_watch_renormalize_ = false; } bool is_watch_renormalize() { return is_watch_renormalize_; } + void set_enable(bool enable) { is_enable_ = enable; } private: const std::string name_; @@ -218,6 +224,7 @@ class Optimizer : public std::enable_shared_from_this { bool run_only_once_; std::vector untyped_nodes_; bool is_watch_renormalize_; + bool is_enable_; }; } // namespace opt } // namespace mindspore diff --git a/mindspore/ccsrc/parallel/allreduce_fusion/step_allreduce_fusion.cc b/mindspore/ccsrc/parallel/allreduce_fusion/step_allreduce_fusion.cc index 687bc12f05..999c4a85a9 100644 --- a/mindspore/ccsrc/parallel/allreduce_fusion/step_allreduce_fusion.cc +++ b/mindspore/ccsrc/parallel/allreduce_fusion/step_allreduce_fusion.cc @@ -64,7 +64,7 @@ bool StepAllreduceFusion(const FuncGraphPtr &root, const opt::OptimizerPtr &opti DumpGraph(root, std::string(ALLREDUCE_FUSION_END)); // allreduce fusion only run once - root->flags()[ALLREDUCE_FUSION_RUN_ONCE_ONLY] = true; + root->set_flag(ALLREDUCE_FUSION_RUN_ONCE_ONLY, true); res->results()[pipeline::kStepParallelGraph] = root; #if defined(_WIN32) || defined(_WIN64) auto end_time = 
std::chrono::steady_clock::now(); diff --git a/mindspore/ccsrc/parallel/auto_parallel/rec_core/rec_cost.cc b/mindspore/ccsrc/parallel/auto_parallel/rec_core/rec_cost.cc index bb25246608..9fb79ceee4 100644 --- a/mindspore/ccsrc/parallel/auto_parallel/rec_core/rec_cost.cc +++ b/mindspore/ccsrc/parallel/auto_parallel/rec_core/rec_cost.cc @@ -28,7 +28,6 @@ namespace mindspore { namespace parallel { -#define DOUBLE_MAX (std::numeric_limits::max)() // Compute redistributed cost double CostRedis(const Graph::NodeType &node, @@ -621,75 +620,50 @@ StrategyRec CostCommon::ChoseStr(const std::vector &cost_op, StrategyRec break; default: - MS_LOG(EXCEPTION) << "Failure: CostBiasAdd failed."; + MS_LOG(EXCEPTION) << "Failure: Common failed."; } return str; } -// Get weight for BN -double CostBatchNorm::GetMinCostIn(const OperatorRec &op) { - int tensor = static_cast(op.arguments[0].tensor_shape.shape_h * op.arguments[0].tensor_str.str_h) * - static_cast(op.arguments[0].tensor_shape.shape_n * op.arguments[0].tensor_str.str_n) * - static_cast(op.arguments[0].tensor_shape.shape_w * op.arguments[0].tensor_str.str_w) * - static_cast(op.arguments[0].tensor_shape.shape_c * op.arguments[0].tensor_str.str_c); - - std::vector cost_in; - cost_in.push_back(StrDimB(tensor) * 1.2); - cost_in.push_back(DOUBLE_MAX); - cost_in.push_back(StrDimH(tensor) * 1.2); - cost_in.push_back(StrDimW(tensor) * 1.2); - - return *min_element(cost_in.begin(), cost_in.end()); -} - -// Get optimal strategy for BN -StrategyRec CostBatchNorm::GetOptimalStr(const Graph::NodeType &node, - const std::vector> &node_name_to_strategy, - const Graph &graph) { +// Get optimal strategy for BatchParallel OPs +StrategyRec CostBatchParallel::GetOptimalStr(const Graph::NodeType &node) { const OperatorRec &op = node.apply; - - int tensor_filter_n = static_cast(op.arguments[1].tensor_shape.shape_n * op.arguments[1].tensor_str.str_n); - int tensor_filter_c = static_cast(op.arguments[1].tensor_shape.shape_c * 
op.arguments[1].tensor_str.str_c); - int tensor_filter_h = static_cast(op.arguments[1].tensor_shape.shape_h * op.arguments[1].tensor_str.str_h); - int tensor_filter_w = static_cast(op.arguments[1].tensor_shape.shape_w * op.arguments[1].tensor_str.str_w); - - int tensor_filter = tensor_filter_h * tensor_filter_w * tensor_filter_n * tensor_filter_c; - - int output_tensor_h = static_cast(node.tensor_parm.tensor_shape.shape_h * node.tensor_parm.tensor_str.str_h); - int output_tensor_w = static_cast(node.tensor_parm.tensor_shape.shape_w * node.tensor_parm.tensor_str.str_w); - int output_tensor_n = static_cast(node.tensor_parm.tensor_shape.shape_n * node.tensor_parm.tensor_str.str_n); + int tensor_n = static_cast(op.arguments[0].tensor_shape.shape_n * op.arguments[0].tensor_str.str_n); + int tensor_c = static_cast(op.arguments[0].tensor_shape.shape_c * op.arguments[0].tensor_str.str_c); + int tensor_h = static_cast(op.arguments[0].tensor_shape.shape_h * op.arguments[0].tensor_str.str_h); + int tensor_w = static_cast(op.arguments[0].tensor_shape.shape_w * op.arguments[0].tensor_str.str_w); std::vector cost_op; - std::vector> mode; - if (output_tensor_n < 2 || output_tensor_n % 2 != 0) { + if (tensor_n < 2 || tensor_n % 2 != 0) { cost_op.push_back(DOUBLE_MAX); } else { - cost_op.push_back(StrDimB(tensor_filter) + CostRedis(node, node_name_to_strategy, - mode = {{0.5, 1, 1, 1}, {1, 1, 1, 1}, {0.5, 1, 1, 1}}, graph)); + cost_op.push_back(cost_in_); } - cost_op.push_back(DOUBLE_MAX); + if (tensor_c < 2 || tensor_c % 2 != 0) { + cost_op.push_back(DOUBLE_MAX); + } else { + cost_op.push_back(cost_in_); + } - if (output_tensor_h < 2 || output_tensor_h % 2 != 0) { + if (tensor_h < 2 || tensor_h % 2 != 0) { cost_op.push_back(DOUBLE_MAX); } else { - cost_op.push_back(StrDimH(tensor_filter) + CostRedis(node, node_name_to_strategy, - mode = {{1, 1, 0.5, 1}, {1, 1, 1, 1}, {1, 1, 0.5, 1}}, graph)); + cost_op.push_back(cost_in_); } - if (output_tensor_w < 2 || output_tensor_w % 2 != 0) { 
+ if (tensor_w < 2 || tensor_w % 2 != 0) { cost_op.push_back(DOUBLE_MAX); } else { - cost_op.push_back(StrDimW(tensor_filter) + CostRedis(node, node_name_to_strategy, - mode = {{1, 1, 1, 0.5}, {1, 1, 1, 1}, {1, 1, 1, 0.5}}, graph)); + cost_op.push_back(cost_in_); } return ChoseStr(cost_op, node.apply.str); } -// Chose strategy for BatchNorm -StrategyRec CostBatchNorm::ChoseStr(const std::vector &cost_op, StrategyRec str) { +// Chose strategy for BatchParallel op +StrategyRec CostBatchParallel::ChoseStr(const std::vector &cost_op, StrategyRec str) { uint64_t min_position = min_element(cost_op.begin(), cost_op.end()) - cost_op.begin(); if (cost_op[min_position] > (DOUBLE_MAX - 0.1)) { return str; @@ -700,36 +674,75 @@ StrategyRec CostBatchNorm::ChoseStr(const std::vector &cost_op, Strategy str.inputTensor[0].str_n /= 2.0; str.outputTensor.str_n /= 2.0; str.cut_counter += 1; - str.cost = str.cost + cost_in_b_; + str.cost = str.cost + cost_in_; break; case 1: str.inputTensor[0].str_c /= 2.0; - str.inputTensor[1].str_c /= 2.0; - str.inputTensor[2].str_c /= 2.0; - str.inputTensor[3].str_c /= 2.0; - str.inputTensor[4].str_c /= 2.0; str.outputTensor.str_c /= 2.0; str.cut_counter += 1; - str.cost = str.cost + cost_in_c_; + str.cost = str.cost + cost_in_; break; case 2: str.inputTensor[0].str_h /= 2.0; str.outputTensor.str_h /= 2.0; str.cut_counter += 1; - str.cost = str.cost + cost_in_h_; + str.cost = str.cost + cost_in_; break; case 3: str.inputTensor[0].str_w /= 2.0; str.outputTensor.str_w /= 2.0; str.cut_counter += 1; - str.cost = str.cost + cost_in_w_; + str.cost = str.cost + cost_in_; + break; + + default: + MS_LOG(EXCEPTION) << "Failure: CostBatchParallel failed."; + } + return str; +} + +// Chose strategy for CostSoftmaxCrossEntropyWithLogits +StrategyRec CostSoftmaxCrossEntropyWithLogits::ChoseStr(const std::vector &cost_op, StrategyRec str) { + uint64_t min_position = min_element(cost_op.begin(), cost_op.end()) - cost_op.begin(); + if (cost_op[min_position] > 
(DOUBLE_MAX - 0.1)) { + return str; + } + + switch (min_position) { + case 0: + str.inputTensor[0].str_n /= 2.0; + str.inputTensor[1].str_n /= 2.0; + str.cut_counter += 1; + str.cost = str.cost + cost_in_; + break; + + case 1: + str.inputTensor[0].str_c /= 2.0; + str.inputTensor[1].str_c /= 2.0; + str.cut_counter += 1; + str.cost = str.cost + cost_in_; + break; + + case 2: + str.inputTensor[0].str_h /= 2.0; + str.inputTensor[1].str_h /= 2.0; + str.outputTensor.str_w /= 2.0; + str.cut_counter += 1; + str.cost = str.cost + cost_in_; + break; + + case 3: + str.inputTensor[0].str_w /= 2.0; + str.inputTensor[1].str_w /= 2.0; + str.cut_counter += 1; + str.cost = str.cost + cost_in_; break; default: - MS_LOG(EXCEPTION) << "Failure: CostBatchNorm failed."; + MS_LOG(EXCEPTION) << "Failure: CostSoftmax failed."; } return str; } diff --git a/mindspore/ccsrc/parallel/auto_parallel/rec_core/rec_cost.h b/mindspore/ccsrc/parallel/auto_parallel/rec_core/rec_cost.h index c45c81aca0..fb4fc27164 100644 --- a/mindspore/ccsrc/parallel/auto_parallel/rec_core/rec_cost.h +++ b/mindspore/ccsrc/parallel/auto_parallel/rec_core/rec_cost.h @@ -28,6 +28,8 @@ namespace mindspore { namespace parallel { +#define DOUBLE_MAX (std::numeric_limits::max)() + double CostRedis(const Graph::NodeType &node, const std::vector> &node_name_to_strategy, const std::vector> &mode, const Graph &graph); @@ -195,7 +197,6 @@ class CostTensorAdd : public CostCommon { }; // all the following operation are element-wise and have the same cost -class CostOneHot : public CostCommon {}; class CostReLU : public CostCommon {}; class CostLog : public CostCommon {}; class CostExp : public CostCommon {}; @@ -206,50 +207,27 @@ class CostDiv : public CostCommon {}; class CostSqueeze : public CostCommon {}; class CostCast : public CostCommon {}; -// class BatchNorm is used to compute the cost of BatchNorm operator. -class CostBatchNorm { +// class BatchParallel is used to compute the cost of BatchParallel operator. 
+class CostBatchParallel { public: - StrategyRec GetOptimalStr(const Graph::NodeType &node, - const std::vector> &node_name_to_strategy, - const Graph &graph); + virtual StrategyRec GetOptimalStr(const Graph::NodeType &node); - double GetMinCostIn(const OperatorRec &op); - - private: - double StrDimB(int32_t Tensor) { - cost_in_b_ = (static_cast(Tensor) * 4.0) / 2.0; - - return cost_in_b_; - } - - double StrDimC() { - cost_in_c_ = 0.0; - - return cost_in_c_; - } + virtual double GetMaxCostIn() const { return DOUBLE_MAX; } - double StrDimH(int32_t Tensor) { - cost_in_h_ = (static_cast(Tensor) * 4.0) / 2.0; - - return cost_in_h_; - } + protected: + virtual StrategyRec ChoseStr(const std::vector &cost_op, StrategyRec str); - double StrDimW(int32_t Tensor) { - cost_in_w_ = (static_cast(Tensor) * 4.0) / 2.0; + double cost_in_ = 0; +}; // class BatchParallel is used to compute the cost of BatchParallel operator. - return cost_in_w_; - } +class CostBatchNorm : public CostBatchParallel {}; +class CostOneHot : public CostBatchParallel {}; +class CostPRelu : public CostBatchParallel {}; +class CostSoftmax : public CostBatchParallel {}; +class CostSoftmaxCrossEntropyWithLogits : public CostBatchParallel { StrategyRec ChoseStr(const std::vector &cost_op, StrategyRec str); - - double cost_in_b_ = 0; - - double cost_in_c_ = 0; - - double cost_in_h_ = 0; - - double cost_in_w_ = 0; -}; // class BatchNorm is used to compute the cost of BatchNorm operator. 
+}; } // namespace parallel } // namespace mindspore #endif // PARALLEL_AUTO_PARALLEL_REC_COST_H_ diff --git a/mindspore/ccsrc/parallel/auto_parallel/rec_core/rec_generate_strategy.cc b/mindspore/ccsrc/parallel/auto_parallel/rec_core/rec_generate_strategy.cc index 5bccf73fc2..19e07aae02 100644 --- a/mindspore/ccsrc/parallel/auto_parallel/rec_core/rec_generate_strategy.cc +++ b/mindspore/ccsrc/parallel/auto_parallel/rec_core/rec_generate_strategy.cc @@ -37,19 +37,75 @@ void GenerateStrategy(std::shared_ptr graph, const std::vector> no_stra_op_list(new std::vector); - GenerateEliminatedOperatorStrategyForward(graph, ops, eli_list, input_tensor_names, index_list, no_stra_op_list); + for (size_t i = 0; i < eli_list->size(); i++) { + no_stra_op_list->push_back(eli_list->at(i)[0]); + } + GenerateEliminatedOperatorStrategyForward(graph, ops, input_tensor_names, index_list, no_stra_op_list); GenerateEliminatedOperatorStrategyBackward(ops, input_tensor_names, no_stra_op_list); + GenerateRemainingOperatorStrategy(graph, ops, input_tensor_names, index_list, no_stra_op_list); } std::vector> PrepareMatMul(const std::shared_ptr &graph, const std::vector> &ops, const size_t iter_graph, const size_t iter_ops) { std::vector> strategies; + auto attrs = ops[iter_ops]->attrs(); + bool transpose_a = attrs[TRANSPOSE_A]->cast()->value(); + bool transpose_b = attrs[TRANSPOSE_B]->cast()->value(); + + // HCCL does not support multi-dimension partition, and the hardware does not support excessive + // number of EVENT, so we temporarily disable matmul's multi-dimension partition function. 
+ const auto max_cut = 1.0 / g_device_manager->DeviceNum(); + if (graph->nodes[iter_graph].apply.arguments[0].tensor_str.str_h != max_cut && + graph->nodes[iter_graph].apply.arguments[1].tensor_str.str_w != max_cut) { + graph->nodes[iter_graph].apply.arguments[0].tensor_str.str_h = 1.0; + graph->nodes[iter_graph].apply.arguments[0].tensor_str.str_w = 1.0; + graph->nodes[iter_graph].apply.arguments[1].tensor_str.str_h = 1.0; + graph->nodes[iter_graph].apply.arguments[1].tensor_str.str_w = 1.0; + graph->nodes[iter_graph].tensor_parm.tensor_str.str_h = 1.0; + graph->nodes[iter_graph].tensor_parm.tensor_str.str_w = 1.0; + + auto shape_1 = ops[iter_ops]->inputs_tensor_info()[0].shape()[0]; + if (transpose_a) { + shape_1 = ops[iter_ops]->inputs_tensor_info()[0].shape()[1]; + } + auto shape_4 = ops[iter_ops]->inputs_tensor_info()[1].shape()[1]; + if (transpose_b) { + shape_4 = ops[iter_ops]->inputs_tensor_info()[1].shape()[0]; + } + + bool already_cut = false; + if (shape_1 >= shape_4) { + if (shape_1 % g_device_manager->DeviceNum() == 0) { + graph->nodes[iter_graph].apply.arguments[0].tensor_str.str_h = max_cut; + graph->nodes[iter_graph].tensor_parm.tensor_str.str_h = max_cut; + already_cut = true; + } + if (!already_cut && shape_4 % g_device_manager->DeviceNum() == 0) { + graph->nodes[iter_graph].apply.arguments[1].tensor_str.str_w = max_cut; + graph->nodes[iter_graph].tensor_parm.tensor_str.str_w = max_cut; + already_cut = true; + } + } else { + if (shape_4 % g_device_manager->DeviceNum() == 0) { + graph->nodes[iter_graph].apply.arguments[1].tensor_str.str_w = max_cut; + graph->nodes[iter_graph].tensor_parm.tensor_str.str_w = max_cut; + already_cut = true; + } + if (!already_cut && shape_1 % g_device_manager->DeviceNum() == 0) { + graph->nodes[iter_graph].apply.arguments[0].tensor_str.str_h = max_cut; + graph->nodes[iter_graph].tensor_parm.tensor_str.str_h = max_cut; + already_cut = true; + } + } + + if (!already_cut) { + MS_LOG(EXCEPTION) << "Failure: MatMul's shape 
is invalid."; + } + } + for (size_t iter_op_inputs = 0; iter_op_inputs < ops[iter_ops]->inputs_tensor_info().size(); iter_op_inputs++) { std::vector s; - auto attrs = ops[iter_ops]->attrs(); - bool transpose_a = attrs[TRANSPOSE_A]->cast()->value(); - bool transpose_b = attrs[TRANSPOSE_B]->cast()->value(); if (transpose_a && (iter_op_inputs == 0)) { s.push_back( static_cast(1.0 / graph->nodes[iter_graph].apply.arguments[iter_op_inputs].tensor_str.str_w)); @@ -71,52 +127,58 @@ std::vector> PrepareMatMul(const std::shared_ptr &gr return strategies; } -std::vector> PrepareVirtualDataset(const std::vector> &ops, - const size_t iter_ops) { - std::vector> strategies = MakeDataParallelStrategy(ops, iter_ops); - strategies[1][0] = strategies[0][0]; +std::vector> PrepareBiasAdd(const std::shared_ptr> &s) { + std::vector> strategies; + strategies.push_back(*s); + std::vector s_biasadd; + s_biasadd.push_back(s->at(1)); + strategies.push_back(s_biasadd); return strategies; } -std::vector> PrepareScalarInputOperator(const std::vector> &ops, - const size_t iter_ops, std::vector s) { +std::vector> PrepareOneHot(const std::shared_ptr &graph, + const std::vector> &ops, + const size_t iter_graph, const size_t iter_ops) { + std::vector> strategies = MakeRecSearchStrategy(graph, ops, iter_graph, iter_ops); + strategies[0][0] = strategies[0][1]; + strategies[0][1] = 1; + graph->nodes[iter_graph].tensor_parm.tensor_str.str_h = graph->nodes[iter_graph].tensor_parm.tensor_str.str_w; + graph->nodes[iter_graph].tensor_parm.tensor_str.str_w = 1.0; + std::vector s_empty = {}; + strategies.push_back(s_empty); + strategies.push_back(s_empty); + return strategies; +} + +std::vector> PrepareGatherV2(const std::shared_ptr> &s) { std::vector> strategies; + strategies.push_back(*s); + return strategies; +} - auto dev_num = g_device_manager->DeviceNum(); - size_t cut_num = 1; - for (size_t iter_s = 0; iter_s < s.size(); iter_s++) { - cut_num *= s[iter_s]; - } - if (cut_num != dev_num) { - std::vector 
s_max = s; - for (size_t dim = 0; dim < (size_t)ops[iter_ops]->inputs_tensor_info()[0].shape().size(); dim++) { - size_t shape = ops[iter_ops]->inputs_tensor_info()[0].shape()[dim] / s[dim]; - while (cut_num < dev_num && shape % 2 == 0) { - shape = shape / 2; - s_max[dim] = s_max[dim] * 2; - cut_num = cut_num * 2; - } - if (cut_num == dev_num) { - break; - } +std::vector> PrepareL2Normalize(const std::vector> &ops, + const size_t iter_ops, std::vector s) { + int32_t axis = 0; + auto iter = ops[iter_ops]->attrs().find(AXIS); + if (iter != ops[iter_ops]->attrs().end()) { + MS_EXCEPTION_IF_NULL(iter->second); + if (iter->second->isa()) { + axis = iter->second->cast()->value(); + } else { + MS_LOG(EXCEPTION) << ops[iter_ops]->name() << " : The value of axis is not int."; } - s = s_max; } - strategies.push_back(s); - std::vector s_biasadd; - s_biasadd.push_back(s[1]); - strategies.push_back(s_biasadd); + int32_t axis_index = axis; + if (axis < 0) { + size_t input_dim = ops[iter_ops]->inputs_tensor_info()[0].shape().size(); + axis_index = static_cast(input_dim) + axis; + } - return strategies; -} + s[IntToSize(axis_index)] = 1; -std::vector> PrepareOneHot(std::vector s) { std::vector> strategies; - std::vector s_empty = {}; strategies.push_back(s); - strategies.push_back(s_empty); - strategies.push_back(s_empty); return strategies; } @@ -131,16 +193,13 @@ std::vector> MakeRecSearchStrategy(const std::shared_ptrstrategy(); - std::vector> strategies; for (size_t iter_op_inputs = 0; iter_op_inputs < ops[iter_ops]->inputs_tensor_info().size(); iter_op_inputs++) { if (iter_op_inputs >= origin_strategy->GetInputDim().size()) { MS_LOG(EXCEPTION) << "Failure: Strategy's InputDim out of range."; } - // size_t output_size = ops[iter_ops]->outputs_tensor_info()[0].shape().size(); size_t output_size = origin_strategy->GetInputDim()[iter_op_inputs].size(); - std::vector s; if (output_size == 4) { s.push_back( @@ -164,14 +223,14 @@ std::vector> MakeRecSearchStrategy(const 
std::shared_ptr> MakeDataParallelStrategy(const std::vector> &ops, - const size_t iter_ops) { +std::vector> MakeDataParallelStrategy(const std::shared_ptr &graph, + const std::vector> &ops, + const size_t iter_graph, const size_t iter_ops) { if (ops.empty()) { MS_LOG(EXCEPTION) << "Failure: Operators is empty."; } @@ -180,8 +239,9 @@ std::vector> MakeDataParallelStrategy(const std::vectorstrategy(); - std::vector> strategies; + size_t max_device_num = g_device_manager->DeviceNum(); + size_t target_tensor_batch = ops[iter_ops]->outputs_tensor_info()[0].shape()[0]; for (size_t iter_op_inputs = 0; iter_op_inputs < ops[iter_ops]->inputs_tensor_info().size(); iter_op_inputs++) { if (iter_op_inputs >= origin_strategy->GetInputDim().size()) { MS_LOG(EXCEPTION) << "Failure: Strategy's InputDim out of range."; @@ -192,8 +252,6 @@ std::vector> MakeDataParallelStrategy(const std::vectorDeviceNum(); - size_t target_tensor_batch = ops[iter_ops]->outputs_tensor_info()[0].shape()[0]; s.push_back(std::min(max_device_num, target_tensor_batch)); } else { s.push_back(1); @@ -202,9 +260,21 @@ std::vector> MakeDataParallelStrategy(const std::vectornodes[iter_graph].tensor_parm.tensor_str.str_n = 1.0; + graph->nodes[iter_graph].tensor_parm.tensor_str.str_c = 1.0; + graph->nodes[iter_graph].tensor_parm.tensor_str.str_h = 1.0; + graph->nodes[iter_graph].tensor_parm.tensor_str.str_w = 1.0; + if (ops[iter_ops]->outputs_tensor_info()[0].shape().size() == 1) { + graph->nodes[iter_graph].tensor_parm.tensor_str.str_w = 1.0 / std::min(max_device_num, target_tensor_batch); + } else if (ops[iter_ops]->outputs_tensor_info()[0].shape().size() == 2) { + graph->nodes[iter_graph].tensor_parm.tensor_str.str_h = 1.0 / std::min(max_device_num, target_tensor_batch); + } else if (ops[iter_ops]->outputs_tensor_info()[0].shape().size() == 4) { + graph->nodes[iter_graph].tensor_parm.tensor_str.str_n = 1.0 / std::min(max_device_num, target_tensor_batch); + } + return strategies; } @@ -217,20 +287,18 @@ 
std::vector> PrepareStrategy(const std::shared_ptr & if (iter_ops >= ops.size()) { MS_LOG(EXCEPTION) << "Failure: Operators' elements out of range."; } + MS_EXCEPTION_IF_NULL(ops[iter_ops]); auto type = ops[iter_ops]->type(); - if (type == VIRTUAL_DATA_SET) { - return PrepareVirtualDataset(ops, iter_ops); - } auto idx = DictOpType.find(type); if (idx == DictOpType.end()) { - return MakeDataParallelStrategy(ops, iter_ops); + return MakeDataParallelStrategy(graph, ops, iter_graph, iter_ops); } if (type == MATMUL) { return PrepareMatMul(graph, ops, iter_graph, iter_ops); - } else if (type == RESHAPE) { - return MakeDataParallelStrategy(ops, iter_ops); + } else if (type == ONEHOT) { + return PrepareOneHot(graph, ops, iter_graph, iter_ops); } else { return MakeRecSearchStrategy(graph, ops, iter_graph, iter_ops); } @@ -242,28 +310,25 @@ void GeneratePartitionedOperatorStrategy(const std::shared_ptr graph, for (size_t iter_ops = 0; iter_ops < (size_t)index_list->size(); iter_ops++) { std::vector> strategies; size_t iter_graph = index_list->at(iter_ops); - if (iter_graph == SIZE_MAX) { - StrategyPtr sp = std::make_shared(0, strategies); - ops[iter_ops]->SetSelectedStrategyAndCost(sp, ops[iter_ops]->selected_cost()); - continue; + if (iter_graph != SIZE_MAX) { + strategies = PrepareStrategy(graph, ops, iter_graph, iter_ops); } - strategies = PrepareStrategy(graph, ops, iter_graph, iter_ops); StrategyPtr sp = std::make_shared(0, strategies); ops[iter_ops]->SetSelectedStrategyAndCost(sp, ops[iter_ops]->selected_cost()); } } -int FindIndexOfOperatorIncoming(const std::vector> &input_tensor_names, - const size_t iter_ops) { - int incoming_op_index = -1; - for (size_t i = 1; i < (size_t)input_tensor_names[iter_ops].size(); i++) { - for (size_t j = 0; j < (size_t)input_tensor_names.size(); j++) { +size_t FindIndexOfOperatorIncoming(const std::vector> &input_tensor_names, + const size_t iter_ops) { + size_t incoming_op_index = SIZE_MAX; + for (size_t i = 1; i < 
input_tensor_names[iter_ops].size(); i++) { + for (size_t j = 0; j < input_tensor_names.size(); j++) { if (input_tensor_names[iter_ops][i] == input_tensor_names[j][0]) { incoming_op_index = j; break; } } - if (incoming_op_index != -1) { + if (incoming_op_index != SIZE_MAX) { break; } } @@ -298,12 +363,16 @@ std::vector CopyIncomingOperatorOutputStrategy(const std::shared_ptr PrepareIncomingOperatorInputStrategy(const std::vector> &ops, - const int incoming_op_index) { + const size_t incoming_op_index) { std::vector s; + if (ops[incoming_op_index]->type() == RESHAPE || ops[incoming_op_index]->type() == GATHERV2) { + return s; + } auto strategy = ops[incoming_op_index]->selected_strategy(); if (strategy->GetInputNumber() == 0) { return s; } + for (size_t i = 0; i < (size_t)ops[incoming_op_index]->inputs_tensor_info().size(); i++) { if (ops[incoming_op_index]->inputs_tensor_info()[i].shape().size() == 0) { continue; @@ -327,6 +396,7 @@ std::vector GetAxisList(const std::vector } else { MS_LOG(EXCEPTION) << "Failure: Axis type is invalid, neither tuple nor list." << std::endl; } + for (auto &element : elements) { if (!element->isa()) { MS_LOG(EXCEPTION) << "Failure: Dimension indexes is not Int32." 
<< std::endl; @@ -338,12 +408,13 @@ std::vector GetAxisList(const std::vector } std::vector ModifyStrategyIfSqueezeIncoming(const std::vector> &ops, - const int incoming_op_index, std::vector s) { + const size_t incoming_op_index, std::vector s) { std::vector s_Squeeze; std::vector stra_dim_list; for (size_t i = 0; i < s.size(); i++) { stra_dim_list.push_back(i); } + auto axis_list = GetAxisList(ops, incoming_op_index); for (auto axis : axis_list) { auto it = find(stra_dim_list.begin(), stra_dim_list.end(), axis); @@ -355,6 +426,7 @@ std::vector ModifyStrategyIfSqueezeIncoming(const std::vector GetDimList(const std::vector> } std::vector ModifyStrategyIfReduceIncoming(const std::vector> &ops, - const int incoming_op_index, std::vector s) { + const size_t incoming_op_index, std::vector s) { std::vector s_Reduce; std::vector axis_list; for (size_t i = 0; i < s.size(); i++) { axis_list.push_back(i); } + auto dim_list = GetDimList(ops, incoming_op_index); for (auto axis : dim_list) { auto it = find(axis_list.begin(), axis_list.end(), axis); @@ -405,6 +478,7 @@ std::vector ModifyStrategyIfReduceIncoming(const std::vector ModifyStrategyIfReduceIncoming(const std::vector CopyIncomingOperatorInputStrategy(const std::vector> &ops, - const int incoming_op_index, const size_t iter_ops, - const std::shared_ptr> no_stra_op_list) { + const size_t iter_ops, const size_t incoming_op_index) { std::vector s; s = PrepareIncomingOperatorInputStrategy(ops, incoming_op_index); if (s.size() != 0) { @@ -429,27 +502,31 @@ std::vector CopyIncomingOperatorInputStrategy(const std::vector> GenerateStrategiesFromStrategy(const std::vector> &ops, - const size_t iter_ops, std::vector s) { + const size_t iter_ops, + std::vector basic_stra) { std::vector s_empty = {}; std::vector> stra; + MS_EXCEPTION_IF_NULL(ops[iter_ops]); - if (s.size() == 0) { + if (basic_stra.size() == 0) { for (size_t iter_op_inputs = 0; iter_op_inputs < (size_t)ops[iter_ops]->inputs_tensor_info().size(); iter_op_inputs++) { 
- stra.push_back(s); + stra.push_back(basic_stra); } return stra; } - MS_EXCEPTION_IF_NULL(ops[iter_ops]); - if (ops[iter_ops]->type() == BIAS_ADD || ops[iter_ops]->type() == PRELU) { - return PrepareScalarInputOperator(ops, iter_ops, s); + auto s_ptr = std::make_shared>(basic_stra); + if (ops[iter_ops]->type() == BIAS_ADD) { + return PrepareBiasAdd(s_ptr); } - if (ops[iter_ops]->type() == ONEHOT) { - return PrepareOneHot(s); + if (ops[iter_ops]->type() == GATHERV2) { + return PrepareGatherV2(s_ptr); + } + if (ops[iter_ops]->type() == L2_NORMALIZE) { + return PrepareL2Normalize(ops, iter_ops, basic_stra); } - auto dev_num = g_device_manager->DeviceNum(); for (size_t iter_op_inputs = 0; iter_op_inputs < (size_t)ops[iter_ops]->inputs_tensor_info().size(); iter_op_inputs++) { if (ops[iter_ops]->inputs_tensor_info()[iter_op_inputs].shape().size() == 0) { @@ -457,67 +534,49 @@ std::vector> GenerateStrategiesFromStrategy(const std::vect continue; } - size_t cut_num = 1; - for (size_t iter_s = 0; iter_s < s.size(); iter_s++) { - cut_num *= s[iter_s]; - } - if (cut_num == dev_num) { - std::vector s_1 = s; - bool modified = false; - for (size_t j = 0; j < (size_t)ops[iter_ops]->inputs_tensor_info()[iter_op_inputs].shape().size(); j++) { - if (ops[iter_ops]->inputs_tensor_info()[iter_op_inputs].shape()[j] == 1) { - s_1[j] = 1; - modified = true; - } + std::vector tmp_stra = basic_stra; + bool modified = false; + for (size_t j = 0; j < (size_t)ops[iter_ops]->inputs_tensor_info()[iter_op_inputs].shape().size(); j++) { + if (ops[iter_ops]->inputs_tensor_info()[iter_op_inputs].shape()[j] == 1) { + tmp_stra[j] = 1; + modified = true; } - if (modified) { - stra.push_back(s_1); - } else { - stra.push_back(s); - } - continue; } - - std::vector s_max = s; - for (size_t dim = 0; dim < (size_t)ops[iter_ops]->inputs_tensor_info()[iter_op_inputs].shape().size(); dim++) { - size_t shape = ops[iter_ops]->inputs_tensor_info()[iter_op_inputs].shape()[dim] / s[dim]; - while (cut_num < dev_num 
&& shape % 2 == 0) { - shape = shape / 2; - s_max[dim] = s_max[dim] * 2; - cut_num = cut_num * 2; - } - if (cut_num == dev_num) { - break; - } + if (modified) { + stra.push_back(tmp_stra); + } else { + stra.push_back(basic_stra); } - - stra.push_back(s_max); } return stra; } void GenerateEliminatedOperatorStrategyForward(const std::shared_ptr graph, const std::vector> &ops, - const std::shared_ptr>> eli_list, const std::vector> &input_tensor_names, const std::shared_ptr> index_list, const std::shared_ptr> no_stra_op_list) { - for (int eli_index = eli_list->size() - 1; eli_index >= 0; eli_index--) { - size_t iter_ops = eli_list->at(eli_index)[0]; + if (no_stra_op_list->size() == 0) { + return; + } + std::vector no_stra_op_list_bis; + + for (size_t iter_list = no_stra_op_list->size(); iter_list > 0; iter_list--) { + size_t iter_ops = no_stra_op_list->at(iter_list - 1); std::vector> stra; std::vector s; - int incoming_op_index = FindIndexOfOperatorIncoming(input_tensor_names, iter_ops); - if (incoming_op_index != -1) { + size_t incoming_op_index = FindIndexOfOperatorIncoming(input_tensor_names, iter_ops); + if (incoming_op_index != SIZE_MAX) { auto iter_graph = index_list->at(incoming_op_index); if (iter_graph != SIZE_MAX) { s = CopyIncomingOperatorOutputStrategy(graph, ops, iter_ops, iter_graph); } else { - s = CopyIncomingOperatorInputStrategy(ops, incoming_op_index, iter_ops, no_stra_op_list); + s = CopyIncomingOperatorInputStrategy(ops, iter_ops, incoming_op_index); } } if (s.size() == 0) { - no_stra_op_list->push_back(iter_ops); + no_stra_op_list_bis.push_back(iter_ops); } else { stra = GenerateStrategiesFromStrategy(ops, iter_ops, s); } @@ -525,6 +584,11 @@ void GenerateEliminatedOperatorStrategyForward(const std::shared_ptr grap StrategyPtr sp = std::make_shared(0, stra); ops[iter_ops]->SetSelectedStrategyAndCost(sp, ops[iter_ops]->selected_cost()); } + + no_stra_op_list->clear(); + for (size_t i = 0; i < no_stra_op_list_bis.size(); i++) { + 
no_stra_op_list->push_back(no_stra_op_list_bis[i]); + } } std::vector ModifyStrategyIfSqueezeOutgoing(const std::vector> &ops, @@ -534,7 +598,7 @@ std::vector ModifyStrategyIfSqueezeOutgoing(const std::vector ModifyStrategyIfSqueezeOutgoing(const std::vector ModifyStrategyIfReduceOutgoing(const std::vector> &ops, - const size_t iter_ops, std::vector s) { - std::vector dim_list = GetDimList(ops, iter_ops); - if (dim_list.size() == 0) { - return s; + size_t cut = 1; + for (size_t i = 0; i < s_Squeeze.size(); i++) { + cut *= s_Squeeze[i]; } - std::vector s_Reduce; - size_t s_index = 0; - size_t dim_list_index = 0; - for (size_t i = 0; i < (size_t)(s.size() + dim_list.size()); i++) { - if (i == (size_t)dim_list[dim_list_index]) { - s_Reduce.push_back(1); - dim_list_index++; - } else { - s_Reduce.push_back(s[s_index]); - s_index++; - } + if (cut != g_device_manager->DeviceNum()) { + s_Squeeze.clear(); } - return s_Reduce; + + return s_Squeeze; } std::vector CopyOutgoingOperatorInputStrategy(const std::vector> &ops, const std::vector> &input_tensor_names, const size_t iter_ops) { std::vector s; + if (ops[iter_ops]->type() == REDUCE_MAX || ops[iter_ops]->type() == REDUCE_MIN || + ops[iter_ops]->type() == REDUCE_SUM || ops[iter_ops]->type() == REDUCE_MEAN || ops[iter_ops]->type() == RESHAPE || + ops[iter_ops]->type() == GATHERV2) { + return s; + } + bool found = false; - for (size_t i = 0; i < (size_t)input_tensor_names.size(); i++) { - for (size_t j = 1; j < (size_t)input_tensor_names[i].size(); j++) { - if (input_tensor_names[i][j] == input_tensor_names[iter_ops][0]) { - for (size_t k = 0; k < ops[i]->selected_strategy()->GetInputDim()[j - 1].size(); ++k) { - s.push_back(ops[i]->selected_strategy()->GetInputDim()[j - 1][k]); - } + size_t outgoing_op_index = SIZE_MAX; + size_t iter_op_inputs = SIZE_MAX; + for (size_t i = 0; i < input_tensor_names.size(); i++) { + for (size_t j = 1; j < input_tensor_names[i].size(); j++) { + if (input_tensor_names[i][j] == 
input_tensor_names[iter_ops][0] && + ops[i]->selected_strategy()->GetInputNumber() != 0) { + outgoing_op_index = i; + iter_op_inputs = j - 1; found = true; break; } } - if (found) break; + if (found) { + break; + } + } + + if (outgoing_op_index != SIZE_MAX && iter_op_inputs != SIZE_MAX) { + for (size_t k = 0; k < ops[iter_ops]->outputs_tensor_info()[0].shape().size(); ++k) { + s.push_back(ops[outgoing_op_index]->selected_strategy()->GetInputDim()[iter_op_inputs][k]); + } } return s; } @@ -589,23 +657,66 @@ std::vector CopyOutgoingOperatorInputStrategy(const std::vector> &ops, const std::vector> &input_tensor_names, const std::shared_ptr> no_stra_op_list) { - MS_EXCEPTION_IF_NULL(no_stra_op_list); - for (int iter_list = no_stra_op_list->size() - 1; iter_list >= 0; iter_list--) { - auto iter_ops = no_stra_op_list->at(iter_list); + if (no_stra_op_list->size() == 0) { + return; + } + std::vector no_stra_op_list_bis; + + for (size_t iter_list = no_stra_op_list->size(); iter_list > 0; iter_list--) { + auto iter_ops = no_stra_op_list->at(iter_list - 1); std::vector> stra; std::vector s = CopyOutgoingOperatorInputStrategy(ops, input_tensor_names, iter_ops); - if (s.size() == 0) { - for (size_t i = 0; i < ops[iter_ops]->inputs_tensor_info()[0].shape().size(); i++) { - s.push_back(1); - } - } - if (ops[iter_ops]->type() == SQUEEZE) { + + if (s.size() != 0 && ops[iter_ops]->type() == SQUEEZE) { s = ModifyStrategyIfSqueezeOutgoing(ops, iter_ops, s); } - if (ops[iter_ops]->type() == REDUCE_SUM || ops[iter_ops]->type() == REDUCE_MAX || - ops[iter_ops]->type() == REDUCE_MIN || ops[iter_ops]->type() == REDUCE_MEAN) { - s = ModifyStrategyIfReduceOutgoing(ops, iter_ops, s); + if (s.size() != 0) { + stra = GenerateStrategiesFromStrategy(ops, iter_ops, s); + } else { + no_stra_op_list_bis.push_back(iter_ops); + } + + StrategyPtr sp = std::make_shared(0, stra); + ops[iter_ops]->SetSelectedStrategyAndCost(sp, ops[iter_ops]->selected_cost()); + } + + no_stra_op_list->clear(); + for 
(size_t i = 0; i < no_stra_op_list_bis.size(); i++) { + no_stra_op_list->push_back(no_stra_op_list_bis[i]); + } +} + +void GenerateRemainingOperatorStrategy(const std::shared_ptr graph, + const std::vector> &ops, + const std::vector> &input_tensor_names, + const std::shared_ptr> index_list, + const std::shared_ptr> no_stra_op_list) { + if (no_stra_op_list->size() == 0) { + return; + } + + size_t no_stra_op_list_size; + do { + no_stra_op_list_size = no_stra_op_list->size(); + GenerateEliminatedOperatorStrategyForward(graph, ops, input_tensor_names, index_list, no_stra_op_list); + GenerateEliminatedOperatorStrategyBackward(ops, input_tensor_names, no_stra_op_list); + } while (no_stra_op_list_size > no_stra_op_list->size()); + + for (size_t iter_list = 0; iter_list < no_stra_op_list->size(); iter_list++) { + auto iter_ops = no_stra_op_list->at(iter_list); + std::vector> stra; + std::vector s; + + size_t max_dim_num = 0; + for (size_t iter_op_inputs = 0; iter_op_inputs < ops[iter_ops]->inputs_tensor_info().size(); iter_op_inputs++) { + if (ops[iter_ops]->inputs_tensor_info()[iter_op_inputs].shape().size() > max_dim_num) { + max_dim_num = ops[iter_ops]->inputs_tensor_info()[iter_op_inputs].shape().size(); + } + } + for (size_t i = 0; i < max_dim_num; i++) { + s.push_back(1); } + stra = GenerateStrategiesFromStrategy(ops, iter_ops, s); StrategyPtr sp = std::make_shared(0, stra); ops[iter_ops]->SetSelectedStrategyAndCost(sp, ops[iter_ops]->selected_cost()); diff --git a/mindspore/ccsrc/parallel/auto_parallel/rec_core/rec_generate_strategy.h b/mindspore/ccsrc/parallel/auto_parallel/rec_core/rec_generate_strategy.h index db275dda10..c9604b449f 100644 --- a/mindspore/ccsrc/parallel/auto_parallel/rec_core/rec_generate_strategy.h +++ b/mindspore/ccsrc/parallel/auto_parallel/rec_core/rec_generate_strategy.h @@ -34,55 +34,61 @@ void GenerateStrategy(std::shared_ptr graph, const std::vector> PrepareMatMul(const std::shared_ptr &graph, const std::vector> &ops, const size_t 
iter_graph, const size_t iter_ops); -std::vector> PrepareVirtualDataset(const std::vector> &ops, - const size_t iter_ops); -std::vector> PrepareScalarInputOperator(const std::vector> &ops, - const size_t iter_ops, std::vector s); -std::vector> PrepareOneHot(std::vector s); +std::vector> PrepareBiasAdd(const std::shared_ptr> &s); +std::vector> PrepareOneHot(const std::shared_ptr &graph, + const std::vector> &ops, + const size_t iter_graph, const size_t iter_ops); +std::vector> PrepareGatherV2(const std::shared_ptr> &s); +std::vector> PrepareL2Normalize(const std::vector> &ops, + const size_t iter_ops, std::vector s); std::vector> MakeRecSearchStrategy(const std::shared_ptr &graph, const std::vector> &ops, const size_t iter_graph, const size_t iter_ops); -std::vector> MakeDataParallelStrategy(const std::vector> &ops, - const size_t iter_ops); +std::vector> MakeDataParallelStrategy(const std::shared_ptr &graph, + const std::vector> &ops, + const size_t iter_graph, const size_t iter_ops); std::vector> PrepareStrategy(const std::shared_ptr &graph, const std::vector> &ops, const size_t iter_graph, const size_t iter_ops); void GeneratePartitionedOperatorStrategy(const std::shared_ptr graph, const std::vector> &ops, const std::shared_ptr> index_list); -int FindIndexOfOperatorIncoming(const std::vector> &input_tensor_names, const size_t iter_ops); +size_t FindIndexOfOperatorIncoming(const std::vector> &input_tensor_names, + const size_t iter_ops); std::vector CopyIncomingOperatorOutputStrategy(const std::shared_ptr graph, const std::vector> &ops, const size_t iter_ops, const size_t iter_graph); std::vector PrepareIncomingOperatorInputStrategy(const std::vector> &ops, - const int incoming_op_index); + const size_t incoming_op_index); std::vector GetAxisList(const std::vector> &ops, const int iter_ops); std::vector ModifyStrategyIfSqueezeIncoming(const std::vector> &ops, - const int incoming_op_index, std::vector s); + const size_t incoming_op_index, std::vector s); 
std::vector GetDimList(const std::vector> &ops, const size_t iter_ops); std::vector ModifyStrategyIfReduceIncoming(const std::vector> &ops, - const int incoming_op_index, std::vector s); + const size_t incoming_op_index, std::vector s); std::vector CopyIncomingOperatorInputStrategy(const std::vector> &ops, - const int incoming_op_index, const size_t iter_ops, - const std::shared_ptr> no_stra_op_list); + const size_t iter_ops, const size_t incoming_op_index); std::vector> GenerateStrategiesFromStrategy(const std::vector> &ops, - const size_t iter_ops, std::vector s); + const size_t iter_ops, + std::vector basic_stra); void GenerateEliminatedOperatorStrategyForward(std::shared_ptr graph, const std::vector> &ops, - const std::shared_ptr>> eli_list, const std::vector> &input_tensor_names, const std::shared_ptr> index_list, const std::shared_ptr> no_stra_op_list); std::vector ModifyStrategyIfSqueezeOutgoing(const std::vector> &ops, const size_t iter_ops, std::vector s); -std::vector ModifyStrategyIfReduceOutgoing(const std::vector> &ops, - const size_t iter_ops, std::vector s); std::vector CopyOutgoingOperatorInputStrategy(const std::vector> &ops, const std::vector> &input_tensor_names, const size_t iter_ops); void GenerateEliminatedOperatorStrategyBackward(const std::vector> &ops, const std::vector> &input_tensor_names, const std::shared_ptr> no_stra_op_list); +void GenerateRemainingOperatorStrategy(const std::shared_ptr graph, + const std::vector> &ops, + const std::vector> &input_tensor_names, + const std::shared_ptr> index_list, + const std::shared_ptr> no_stra_op_list); } // namespace parallel } // namespace mindspore #endif // PARALLEL_AUTO_PARALLEL_REC_GENERATE_STRATEGY_H_ diff --git a/mindspore/ccsrc/parallel/auto_parallel/rec_core/rec_graph.h b/mindspore/ccsrc/parallel/auto_parallel/rec_core/rec_graph.h index a7bc1ae86f..647b857e16 100644 --- a/mindspore/ccsrc/parallel/auto_parallel/rec_core/rec_graph.h +++ 
b/mindspore/ccsrc/parallel/auto_parallel/rec_core/rec_graph.h @@ -38,6 +38,7 @@ enum OperatorType { kRecBiasAdd, kRecSoftmax, kRecSparseSoftmaxCrossEntropyWithLogits, + kRecSoftmaxCrossEntropyWithLogits, kRecOneHot, kRecLog, kRecExp, @@ -47,7 +48,9 @@ enum OperatorType { kRecDiv, kRecSqueeze, kRecCast, - kRecReduce + kRecReduce, + kRecPReLU, + kRecGatherV2 }; enum InfoType { kApplication, kConstant }; @@ -67,6 +70,7 @@ class Graph { std::vector node_in; // Nodes that point from this node std::vector node_out; + std::vector node_in_aux; // Node Type Info: Application or Constant. Defined in enum . InfoType info; // Operator info. Defined in struct . diff --git a/mindspore/ccsrc/parallel/auto_parallel/rec_core/rec_parse_graph.cc b/mindspore/ccsrc/parallel/auto_parallel/rec_core/rec_parse_graph.cc index 823b1dca08..3e4eafe0a4 100644 --- a/mindspore/ccsrc/parallel/auto_parallel/rec_core/rec_parse_graph.cc +++ b/mindspore/ccsrc/parallel/auto_parallel/rec_core/rec_parse_graph.cc @@ -171,21 +171,41 @@ void Eliminate_Aux(const size_t node_index, const std::shared_ptr graph, eli.push_back(graph->nodes[node_index].node_out[i]); } eli_list->push_back(eli); - for (auto input_index : graph->nodes[node_index].node_in) { - auto it = find(graph->nodes[input_index].node_out.begin(), graph->nodes[input_index].node_out.end(), node_index); - if (it != graph->nodes[input_index].node_out.end()) { - graph->nodes[input_index].node_out.erase(it); - for (auto output_index : graph->nodes[node_index].node_out) { - graph->nodes[input_index].node_out.push_back(output_index); - } + + for (size_t i = 0; i < graph->nodes[node_index].node_in.size(); i++) { + auto *incoming_outputs = &graph->nodes[graph->nodes[node_index].node_in[i]].node_out; + auto it = find(incoming_outputs->begin(), incoming_outputs->end(), node_index); + if (it != incoming_outputs->end()) { + it = incoming_outputs->erase(it); + incoming_outputs->insert(it, graph->nodes[node_index].node_out.begin(), 
graph->nodes[node_index].node_out.end()); + } + } + + for (size_t i = 0; i < graph->nodes[node_index].node_in_aux.size(); i++) { + auto *aux_incoming_outputs = &graph->nodes[graph->nodes[node_index].node_in_aux[i]].node_out; + auto it = find(aux_incoming_outputs->begin(), aux_incoming_outputs->end(), node_index); + if (it != aux_incoming_outputs->end()) { + it = aux_incoming_outputs->erase(it); + aux_incoming_outputs->insert(it, graph->nodes[node_index].node_out.begin(), + graph->nodes[node_index].node_out.end()); } } - for (auto output_index : graph->nodes[node_index].node_out) { - auto it = find(graph->nodes[output_index].node_in.begin(), graph->nodes[output_index].node_in.end(), node_index); - if (it != graph->nodes[output_index].node_in.end()) { - graph->nodes[output_index].node_in.erase(it); - for (auto input_index : graph->nodes[node_index].node_in) { - graph->nodes[output_index].node_in.push_back(input_index); + + for (size_t i = 0; i < graph->nodes[node_index].node_out.size(); i++) { + auto *outgoing_inputs = &graph->nodes[graph->nodes[node_index].node_out[i]].node_in; + auto it = find(outgoing_inputs->begin(), outgoing_inputs->end(), node_index); + if (it != outgoing_inputs->end()) { + if (graph->nodes[node_index].node_in.size() > 0) { + outgoing_inputs->at(std::distance(outgoing_inputs->begin(), it)) = graph->nodes[node_index].node_in[0]; + for (size_t j = 1; j < graph->nodes[node_index].node_in.size(); j++) { + graph->nodes[graph->nodes[node_index].node_out[i]].node_in_aux.push_back(graph->nodes[node_index].node_in[j]); + } + for (size_t j = 1; j < graph->nodes[node_index].node_in_aux.size(); j++) { + graph->nodes[graph->nodes[node_index].node_out[i]].node_in_aux.push_back( + graph->nodes[node_index].node_in_aux[j]); + } + } else { + outgoing_inputs->erase(it); } } } @@ -196,20 +216,22 @@ std::shared_ptr EliminateGraph(const std::shared_ptr graph, const std::shared_ptr> index_list) { MS_EXCEPTION_IF_NULL(graph); const std::set type_list = { - 
OperatorType::kRecOneHot, OperatorType::kRecReLU, OperatorType::kRecLog, OperatorType::kRecExp, - OperatorType::kRecAdd, OperatorType::kRecElmWiseOp, OperatorType::kRecBiasAdd, OperatorType::kRecSub, - OperatorType::kRecMul, OperatorType::kRecDiv, OperatorType::kRecSqueeze, OperatorType::kRecReduce, - OperatorType::kRecCast}; + OperatorType::kRecReLU, OperatorType::kRecLog, OperatorType::kRecExp, OperatorType::kRecAdd, + OperatorType::kRecElmWiseOp, OperatorType::kRecBiasAdd, OperatorType::kRecSub, OperatorType::kRecMul, + OperatorType::kRecDiv, OperatorType::kRecSqueeze, OperatorType::kRecReduce, OperatorType::kRecCast, + OperatorType::kRecReshape, OperatorType::kRecGatherV2}; for (size_t node_index = 0; node_index < (size_t)graph->nodes.size(); node_index++) { auto type = graph->nodes[node_index].apply.op_type; if (type_list.find(type) != type_list.end()) { Eliminate_Aux(node_index, graph, eli_list); } } + index_list->reserve(graph->nodes.size()); for (size_t i = 0; i < (size_t)graph->nodes.size(); i++) { index_list->push_back(i); } + for (size_t i = 0; i < (size_t)eli_list->size(); i++) { if (eli_list->at(i)[0] >= index_list->size()) { MS_LOG(EXCEPTION) << "Failure: Operators' elements out of range."; @@ -219,6 +241,7 @@ std::shared_ptr EliminateGraph(const std::shared_ptr graph, index_list->at(j)--; } } + std::shared_ptr new_graph(new Graph); for (size_t i = 0; i < graph->nodes.size(); i++) { if (index_list->at(i) > SIZE_MAX / 2) { @@ -226,11 +249,23 @@ std::shared_ptr EliminateGraph(const std::shared_ptr graph, } new_graph->nodes.push_back(graph->nodes[i]); - for (size_t j = 0; j < new_graph->nodes[index_list->at(i)].node_in.size(); j++) { - new_graph->nodes[index_list->at(i)].node_in[j] = index_list->at(new_graph->nodes[index_list->at(i)].node_in[j]); + auto *node_in = &new_graph->nodes[index_list->at(i)].node_in; + for (size_t j = node_in->size(); j > 0; j--) { + bool IsEliminated = (index_list->at(node_in->at(j - 1)) == SIZE_MAX); + if (IsEliminated) { + 
node_in->erase(node_in->begin() + j - 1); + } else { + node_in->at(j - 1) = index_list->at(node_in->at(j - 1)); + } } - for (size_t j = 0; j < new_graph->nodes[index_list->at(i)].node_out.size(); j++) { - new_graph->nodes[index_list->at(i)].node_out[j] = index_list->at(new_graph->nodes[index_list->at(i)].node_out[j]); + auto *node_out = &new_graph->nodes[index_list->at(i)].node_out; + for (size_t j = node_out->size(); j > 0; j--) { + bool IsEliminated = (index_list->at(node_out->at(j - 1)) == SIZE_MAX); + if (IsEliminated) { + node_out->erase(node_out->begin() + j - 1); + } else { + node_out->at(j - 1) = index_list->at(node_out->at(j - 1)); + } } } return new_graph; diff --git a/mindspore/ccsrc/parallel/auto_parallel/rec_core/rec_parse_graph.h b/mindspore/ccsrc/parallel/auto_parallel/rec_core/rec_parse_graph.h index e6398b9556..536c04cd9f 100644 --- a/mindspore/ccsrc/parallel/auto_parallel/rec_core/rec_parse_graph.h +++ b/mindspore/ccsrc/parallel/auto_parallel/rec_core/rec_parse_graph.h @@ -46,6 +46,7 @@ const std::map DictOpType{ {REDUCE_MAX, OperatorType::kRecReduce}, {REDUCE_MIN, OperatorType::kRecReduce}, {REDUCE_MEAN, OperatorType::kRecReduce}, + {GATHERV2, OperatorType::kRecGatherV2}, {RELU, OperatorType::kRecReLU}, {"ReLU6", OperatorType::kRecReLU}, @@ -55,16 +56,18 @@ const std::map DictOpType{ {"HSigmoid", OperatorType::kRecReLU}, {GELU, OperatorType::kRecReLU}, {TANH, OperatorType::kRecReLU}, - {PRELU, OperatorType::kRecReLU}, + {PRELU, OperatorType::kRecPReLU}, + + {L2_NORMALIZE, OperatorType::kRecElmWiseOp}, {TENSOR_ADD, OperatorType::kRecElmWiseOp}, {SUB, OperatorType::kRecElmWiseOp}, {MUL, OperatorType::kRecElmWiseOp}, {DIV, OperatorType::kRecElmWiseOp}, {REAL_DIV, OperatorType::kRecElmWiseOp}, - {SOFTMAX, OperatorType::kRecElmWiseOp}, - {LOG_SOFTMAX, OperatorType::kRecElmWiseOp}, - {SOFTMAX_CROSS_ENTROPY_WITH_LOGITS, OperatorType::kRecElmWiseOp}, + {SOFTMAX, OperatorType::kRecSoftmax}, + {LOG_SOFTMAX, OperatorType::kRecSoftmax}, + 
{SOFTMAX_CROSS_ENTROPY_WITH_LOGITS, OperatorType::kRecSoftmaxCrossEntropyWithLogits}, {SQRT, OperatorType::kRecElmWiseOp}, {NEG, OperatorType::kRecElmWiseOp}, {POW, OperatorType::kRecElmWiseOp}, @@ -79,6 +82,7 @@ const std::map DictOpType{ {"Abs", OperatorType::kRecElmWiseOp}, {"Acosh", OperatorType::kRecElmWiseOp}, {"AddN", OperatorType::kRecElmWiseOp}, + {"AccumulateNV2", OperatorType::kRecElmWiseOp}, {"Atan2", OperatorType::kRecElmWiseOp}, {"Erf", OperatorType::kRecElmWiseOp}, {"Floor", OperatorType::kRecElmWiseOp}, diff --git a/mindspore/ccsrc/parallel/auto_parallel/rec_core/rec_partition.cc b/mindspore/ccsrc/parallel/auto_parallel/rec_core/rec_partition.cc index 186987c0dd..0f6e736d52 100644 --- a/mindspore/ccsrc/parallel/auto_parallel/rec_core/rec_partition.cc +++ b/mindspore/ccsrc/parallel/auto_parallel/rec_core/rec_partition.cc @@ -53,9 +53,8 @@ double GetWeights(const Graph::NodeType &node) { auto cost_ptr = std::make_shared(); return cost_ptr->GetMinCostIn(); - } else if (op.op_type == OperatorType::kRecReLU || op.op_type == OperatorType::kRecSoftmax || - op.op_type == OperatorType::kRecSparseSoftmaxCrossEntropyWithLogits) { - // For Activation and Softmax + } else if (op.op_type == OperatorType::kRecReLU) { + // For Activation auto cost_ptr = std::make_shared(); return cost_ptr->GetMinCostIn(); @@ -69,22 +68,24 @@ double GetWeights(const Graph::NodeType &node) { auto cost_ptr = std::make_shared(); return cost_ptr->GetMinCostIn(); - } else if (op.op_type == OperatorType::kRecBatchNorm) { - // For BatchNorm - auto cost_ptr = std::make_shared(); - - return cost_ptr->GetMinCostIn(op); - } else if (op.op_type == OperatorType::kRecOneHot || op.op_type == OperatorType::kRecLog || - op.op_type == OperatorType::kRecExp || op.op_type == OperatorType::kRecAdd || - op.op_type == OperatorType::kRecSub || op.op_type == OperatorType::kRecMul || - op.op_type == OperatorType::kRecDiv || op.op_type == OperatorType::kRecSqueeze || - op.op_type == OperatorType::kRecCast) { 
+ } else if (op.op_type == OperatorType::kRecLog || op.op_type == OperatorType::kRecExp || + op.op_type == OperatorType::kRecAdd || op.op_type == OperatorType::kRecSub || + op.op_type == OperatorType::kRecMul || op.op_type == OperatorType::kRecDiv || + op.op_type == OperatorType::kRecSqueeze || op.op_type == OperatorType::kRecCast) { // For element-wise op auto cost_ptr = std::make_shared(); return cost_ptr->GetMinCostIn(); + } else if (op.op_type == OperatorType::kRecBatchNorm || op.op_type == OperatorType::kRecOneHot || + op.op_type == OperatorType::kRecPReLU || op.op_type == OperatorType::kRecSoftmax || + op.op_type == OperatorType::kRecSparseSoftmaxCrossEntropyWithLogits || + op.op_type == OperatorType::kRecSoftmaxCrossEntropyWithLogits) { + // For BatchParallel op + auto cost_ptr = std::make_shared(); + + return cost_ptr->GetMaxCostIn(); } else if (op.op_type == OperatorType::kRecUnkownType) { - // For unknown type + // For Unkown type return 0.0; } else { MS_LOG(EXCEPTION) << "Failure: GetOperatorWeight failed."; @@ -147,9 +148,8 @@ StrategyRec PartitionNode(const Graph::NodeType &node, auto cost_ptr = std::make_shared(); return cost_ptr->GetOptimalStr(node, node_name_to_strategy, *graph); - } else if (node.apply.op_type == OperatorType::kRecReLU || node.apply.op_type == OperatorType::kRecSoftmax || - node.apply.op_type == OperatorType::kRecSparseSoftmaxCrossEntropyWithLogits) { - // For Softmax & Activation + } else if (node.apply.op_type == OperatorType::kRecReLU) { + // For Activation auto cost_ptr = std::make_shared(); return cost_ptr->GetOptimalStr(node, node_name_to_strategy, *graph); @@ -163,22 +163,26 @@ StrategyRec PartitionNode(const Graph::NodeType &node, auto cost_ptr = std::make_shared(); return cost_ptr->GetOptimalStr(node, node_name_to_strategy, *graph); - } else if (node.apply.op_type == OperatorType::kRecBatchNorm) { - // For BatchNorm - auto cost_ptr = std::make_shared(); - - return cost_ptr->GetOptimalStr(node, node_name_to_strategy, 
*graph); - } else if (node.apply.op_type == OperatorType::kRecOneHot || node.apply.op_type == OperatorType::kRecLog || - node.apply.op_type == OperatorType::kRecExp || node.apply.op_type == OperatorType::kRecAdd || - node.apply.op_type == OperatorType::kRecSub || node.apply.op_type == OperatorType::kRecMul || - node.apply.op_type == OperatorType::kRecDiv || node.apply.op_type == OperatorType::kRecSqueeze || - node.apply.op_type == OperatorType::kRecCast) { + } else if (node.apply.op_type == OperatorType::kRecLog || node.apply.op_type == OperatorType::kRecExp || + node.apply.op_type == OperatorType::kRecAdd || node.apply.op_type == OperatorType::kRecSub || + node.apply.op_type == OperatorType::kRecMul || node.apply.op_type == OperatorType::kRecDiv || + node.apply.op_type == OperatorType::kRecSqueeze || node.apply.op_type == OperatorType::kRecCast) { // For element-wise op auto cost_ptr = std::make_shared(); return cost_ptr->GetOptimalStr(node, node_name_to_strategy, *graph); + } else if (node.apply.op_type == OperatorType::kRecBatchNorm || node.apply.op_type == OperatorType::kRecOneHot || + node.apply.op_type == OperatorType::kRecPReLU || node.apply.op_type == kRecSoftmax || + node.apply.op_type == OperatorType::kRecSparseSoftmaxCrossEntropyWithLogits) { + // For BatchParallel type + auto cost_ptr = std::make_shared(); + return cost_ptr->GetOptimalStr(node); + } else if (node.apply.op_type == OperatorType::kRecSoftmaxCrossEntropyWithLogits) { + // For SoftmaxCrossEntropyWithLogits type + auto cost_ptr = std::make_shared(); + return cost_ptr->GetOptimalStr(node); } else if (node.apply.op_type == OperatorType::kRecUnkownType) { - // For unknown type + // For Unkown type StrategyRec default_strategy; return default_strategy; } else { diff --git a/mindspore/ccsrc/parallel/context.cc b/mindspore/ccsrc/parallel/context.cc index de92bba507..8957dc842c 100644 --- a/mindspore/ccsrc/parallel/context.cc +++ b/mindspore/ccsrc/parallel/context.cc @@ -48,6 +48,7 @@ 
ParallelContext::ParallelContext() { Reset(); } void ParallelContext::Reset() { mirror_mean_ = false; + full_batch_ = false; cast_before_mirror_ = true; loss_repeated_mean_ = true; device_num_ = 1; @@ -75,6 +76,8 @@ void ParallelContext::set_global_rank(int32_t global_rank) { void ParallelContext::set_mirror_mean(bool mirror_mean) { mirror_mean_ = mirror_mean; } +void ParallelContext::set_full_batch(bool full_batch) { full_batch_ = full_batch; } + void ParallelContext::set_cast_before_mirror(bool cast_before_mirror) { cast_before_mirror_ = cast_before_mirror; } void ParallelContext::set_loss_repeated_mean(bool loss_repeated_mean) { loss_repeated_mean_ = loss_repeated_mean; } @@ -155,8 +158,8 @@ void ParallelParameterContextRestoreInNoTraining(const FuncGraphPtr &func_graph, MS_EXCEPTION_IF_NULL(func_graph); MS_EXCEPTION_IF_NULL(param_node); MS_EXCEPTION_IF_NULL(ptr); - if (!func_graph->has_flag(AUTO_PARALLEL) || (func_graph->flags().count(TRAINING) == 0) || - func_graph->flags()[TRAINING]) { + if (!func_graph->has_flag(AUTO_PARALLEL) || (func_graph->attrs().count(TRAINING) == 0) || + func_graph->has_flag(TRAINING)) { return; } diff --git a/mindspore/ccsrc/parallel/context.h b/mindspore/ccsrc/parallel/context.h index 32f9838d6c..efa528d179 100644 --- a/mindspore/ccsrc/parallel/context.h +++ b/mindspore/ccsrc/parallel/context.h @@ -55,6 +55,9 @@ class ParallelContext { void set_mirror_mean(bool mirror_mean); bool mirror_mean() const { return mirror_mean_; } + void set_full_batch(bool full_batch); + bool full_batch() const { return full_batch_; } + void set_cast_before_mirror(bool cast_before_mirror); bool cast_before_mirror() const { return cast_before_mirror_; } @@ -103,6 +106,7 @@ class ParallelContext { ParallelContext(); static std::shared_ptr inst_context_; bool mirror_mean_; + bool full_batch_; bool cast_before_mirror_; bool loss_repeated_mean_; int32_t device_num_; diff --git a/mindspore/ccsrc/parallel/dynamic_creator.h 
b/mindspore/ccsrc/parallel/dynamic_creator.h index 4fd5f34cf2..f8e1d62d0a 100644 --- a/mindspore/ccsrc/parallel/dynamic_creator.h +++ b/mindspore/ccsrc/parallel/dynamic_creator.h @@ -121,6 +121,7 @@ REGISTER(SparseSoftmaxCrossEntropyWithLogitsInfo); REGISTER(AssignSubInfo); REGISTER(ReLUInfo); REGISTER(GatherV2Info); +REGISTER(SparseGatherV2Info); REGISTER(SqrtInfo); REGISTER(SigmoidInfo); REGISTER(GetNextInfo); diff --git a/mindspore/ccsrc/parallel/graph_util/generate_graph.cc b/mindspore/ccsrc/parallel/graph_util/generate_graph.cc index f5f0fe85cb..7bd2fa808d 100644 --- a/mindspore/ccsrc/parallel/graph_util/generate_graph.cc +++ b/mindspore/ccsrc/parallel/graph_util/generate_graph.cc @@ -28,9 +28,14 @@ namespace parallel { std::string GetOpPythonPath(const OperatorName &op_name) { // almost all ops are defined in two main paths const std::string ops_module = OP_PATH; + const std::string inner_ops_module = INNER_OP_PATH; py::module mod = py::module::import(common::SafeCStr(ops_module)); + py::module inner_mod = py::module::import(common::SafeCStr(inner_ops_module)); if (!py::hasattr(mod, common::SafeCStr(op_name))) { - MS_LOG(EXCEPTION) << ops_module << " don't have op:" << op_name; + if (!py::hasattr(inner_mod, common::SafeCStr(op_name))) { + MS_LOG(EXCEPTION) << ops_module << " or " << inner_ops_module << " don't have op:" << op_name; + } + return inner_ops_module; } return ops_module; } diff --git a/mindspore/ccsrc/parallel/node_check.cc b/mindspore/ccsrc/parallel/node_check.cc index 7fecd307c7..6f30a8ec1c 100644 --- a/mindspore/ccsrc/parallel/node_check.cc +++ b/mindspore/ccsrc/parallel/node_check.cc @@ -75,7 +75,7 @@ const std::set BLACK_LIST = {TUPLE_GETITEM, DROPOUT_GEN_MASK, EMBED, CREATINSTANCE, - ZEROSLIKETENSOR, + ZEROSLIKE, ASSIGN, REF_TO_EMBED, STOP_GRADIENT}; diff --git a/mindspore/ccsrc/parallel/ops_info/dropout_do_mask_info.cc b/mindspore/ccsrc/parallel/ops_info/dropout_do_mask_info.cc index 87b8d15cca..e88868c772 100644 --- 
a/mindspore/ccsrc/parallel/ops_info/dropout_do_mask_info.cc +++ b/mindspore/ccsrc/parallel/ops_info/dropout_do_mask_info.cc @@ -204,7 +204,7 @@ Status DropoutDoMaskInfo::InitForCostModel(const StrategyPtr &strategy) { PrimitivePtr GetDropoutGenMaskPrim(const CNodePtr &cnode) { MS_EXCEPTION_IF_NULL(cnode); - if (cnode->inputs().size() != DROPOUT_DO_MASK_CNODE_INPUT_SIZE) { + if (cnode->size() != DROPOUT_DO_MASK_CNODE_INPUT_SIZE) { MS_LOG(EXCEPTION) << "The size of dropout do mask cnode's inputs must be " << DROPOUT_DO_MASK_CNODE_INPUT_SIZE; } @@ -215,8 +215,7 @@ PrimitivePtr GetDropoutGenMaskPrim(const CNodePtr &cnode) { } auto dropout_gen_mask_cnode = dropout_gen_mask->cast(); - MS_EXCEPTION_IF_NULL(dropout_gen_mask_cnode); - if (dropout_gen_mask_cnode->inputs().size() != DROPOUT_GEN_MASK_CNODE_INPUT_SIZE) { + if (dropout_gen_mask_cnode->size() != DROPOUT_GEN_MASK_CNODE_INPUT_SIZE) { MS_LOG(EXCEPTION) << "The size of dropout gen mask cnode's inputs must be " << DROPOUT_GEN_MASK_CNODE_INPUT_SIZE; } if (!IsValueNode(dropout_gen_mask_cnode->input(0))) { @@ -233,11 +232,45 @@ PrimitivePtr GetDropoutGenMaskPrim(const CNodePtr &cnode) { return prim; } +void SetGenMaskShape(const CNodePtr &cnode, const Shape &input_slice_shape) { + MS_EXCEPTION_IF_NULL(cnode); + if (cnode->size() != DROPOUT_DO_MASK_CNODE_INPUT_SIZE) { + MS_LOG(EXCEPTION) << "The size of dropout do mask cnode's inputs must be " << DROPOUT_DO_MASK_CNODE_INPUT_SIZE; + } + + AnfNodePtr dropout_gen_mask = cnode->input(DROPOUT_GEN_MASK_INDEX); + MS_EXCEPTION_IF_NULL(dropout_gen_mask); + if (!dropout_gen_mask->isa()) { + MS_LOG(EXCEPTION) << "The dropout do mask cnode's input[" << DROPOUT_GEN_MASK_INDEX << "] must be a cnode."; + } + + auto dropout_gen_mask_cnode = dropout_gen_mask->cast(); + if (dropout_gen_mask_cnode->size() != DROPOUT_GEN_MASK_CNODE_INPUT_SIZE) { + MS_LOG(EXCEPTION) << "The size of dropout gen mask cnode's inputs must be " << DROPOUT_GEN_MASK_CNODE_INPUT_SIZE; + } + + if 
(!IsValueNode(dropout_gen_mask_cnode->input(1))) { + MS_LOG(EXCEPTION) << "The input[1] of dropout gen mask cnode is not ValueTuple."; + } + + FuncGraphPtr func_graph = cnode->func_graph(); + MS_EXCEPTION_IF_NULL(func_graph); + FuncGraphManagerPtr manager = func_graph->manager(); + if (manager == nullptr) { + MS_LOG(EXCEPTION) << "Failure: AddNode error since manager is nullptr."; + } + + ValuePtr new_shape = MakeValue(input_slice_shape); + AnfNodePtr val = NewValueNode(new_shape); + (void)manager->Replace(dropout_gen_mask_cnode->input(1), val); +} + // DropoutDoMask needs to be used together with DropoutGenMask. Only the first input tensor of DropoutGenMask is // split. Find the DropoutGenMask node in the anf graph according to DropoutDoMask node, and modify the input shape // of DropoutGenMask according to the strategy of DropoutDoMask. When the DropoutDoMask performs repeated calculation // and both seeds of DropoutGenMask are 0, two new seeds are automatically generated for DropoutGenMask. 
-Operator DropoutDoMaskInfo::GetDropoutGenMaskReplaceOp(const CNodePtr &cnode) { +std::vector DropoutDoMaskInfo::GetDropoutGenMaskReplaceOp(const CNodePtr &cnode) { + std::vector replace_ops; MS_EXCEPTION_IF_NULL(cnode); PrimitivePtr prim = GetDropoutGenMaskPrim(cnode); MS_EXCEPTION_IF_NULL(prim); @@ -260,15 +293,20 @@ Operator DropoutDoMaskInfo::GetDropoutGenMaskReplaceOp(const CNodePtr &cnode) { if ((attr.find(SEED0) == attr.end()) || (attr.find(SEED1) == attr.end())) { MS_LOG(EXCEPTION) << "The attrs of dropout gen mask must be have seed0 and seed1"; } + + Shape input_slice_shape = inputs_tensor_info_[0].slice_shape(); int32_t seed_0 = GetValue(attr[SEED0]); int32_t seed_1 = GetValue(attr[SEED1]); if ((seed_0 == 0) && (seed_1 == 0) && (repeated_calc_num_ > 1)) { seed_0 = SEED_NUM; seed_1 = SEED_NUM; SEED_NUM++; + } else { + SetGenMaskShape(cnode, input_slice_shape); + MS_LOG(DEBUG) << "The input slice shape droupout is " << ShapeToString(input_slice_shape); + return replace_ops; } - Shape input_slice_shape = inputs_tensor_info_[0].slice_shape(); ValuePtr new_shape = MakeValue(input_slice_shape); Attr attr_0 = std::make_pair(SEED0, MakeValue(seed_0)); Attr attr_1 = std::make_pair(SEED1, MakeValue(seed_1)); @@ -278,7 +316,8 @@ Operator DropoutDoMaskInfo::GetDropoutGenMaskReplaceOp(const CNodePtr &cnode) { OperatorParams params = {std::make_pair(param_0, 1), std::make_pair(param_1, 2)}; OperatorArgs args = std::make_pair(attrs, params); Operator replace_op = {std::make_pair(DROPOUT_GEN_MASK, args)}; - return replace_op; + replace_ops.push_back(replace_op); + return replace_ops; } } // namespace parallel } // namespace mindspore diff --git a/mindspore/ccsrc/parallel/ops_info/dropout_do_mask_info.h b/mindspore/ccsrc/parallel/ops_info/dropout_do_mask_info.h index c0d112f52d..c51a0a9513 100644 --- a/mindspore/ccsrc/parallel/ops_info/dropout_do_mask_info.h +++ b/mindspore/ccsrc/parallel/ops_info/dropout_do_mask_info.h @@ -41,7 +41,7 @@ class DropoutDoMaskInfo : public 
OperatorInfo { Status SetCostUnderStrategy(const StrategyPtr &strategy) override; Status InitForCostModel(const StrategyPtr &strategy) override; std::shared_ptr>> GenerateBatchStrategies() override; - Operator GetDropoutGenMaskReplaceOp(const CNodePtr &cnode); + std::vector GetDropoutGenMaskReplaceOp(const CNodePtr &cnode); protected: Status CheckStrategy(const StrategyPtr &strategy) override; diff --git a/mindspore/ccsrc/parallel/ops_info/gather_v2_p_info.cc b/mindspore/ccsrc/parallel/ops_info/gather_v2_p_info.cc index e2d01fb779..7a16aeafcb 100644 --- a/mindspore/ccsrc/parallel/ops_info/gather_v2_p_info.cc +++ b/mindspore/ccsrc/parallel/ops_info/gather_v2_p_info.cc @@ -44,6 +44,24 @@ Status GatherV2PInfo::GetAttrs() { } axis_ = axis; + // get target + auto target_iter = attrs_.find(TARGET); + if (target_iter != attrs_.end()) { + MS_EXCEPTION_IF_NULL(target_iter->second); + if (target_iter->second->isa()) { + target_ = target_iter->second->cast()->value(); + } else { + MS_LOG(ERROR) << name_ << " : The value of target is not a string."; + return FAILED; + } + } + + // target=CPU, axis must be 0 + if (target_ == "CPU" && axis_ != 0) { + MS_LOG(ERROR) << name_ << ": target is CPU, axis must be 0, but got " << axis_; + return FAILED; + } + return SUCCESS; } @@ -61,8 +79,8 @@ Status GatherV2PInfo::CheckStrategy(const StrategyPtr &strategy) { auto param_shape = inputs_shape_.at(0); auto param_strategy = strategy->GetInputDim().at(0); auto slice_shape = param_shape.at(param_shape.size() - 1) / param_strategy.at(param_strategy.size() - 1); - if (slice_shape % 8 != 0) { - MS_LOG(ERROR) << name_ << ": Last dim of param slice shape need 32Byte aligned."; + if (slice_shape % 8 != 0 && slice_shape != 1) { + MS_LOG(DEBUG) << name_ << ": Last dim of param slice shape need 32Byte aligned."; return FAILED; } @@ -74,20 +92,20 @@ Status GatherV2PInfo::CheckStrategy(const StrategyPtr &strategy) { // don't support scalar index if (inputs_shape_.at(1).size() == 0) { - MS_LOG(ERROR) << 
name_ << ": Don't support scalar index."; + MS_LOG(DEBUG) << name_ << ": Don't support scalar index."; return FAILED; } // axis=0, index_shape(0)%param_strategy(0) must be 0 Shape index_shape = inputs_shape_.at(1); if ((axis_ == 0) && (index_shape.at(0) % param_strategy.at(0) != 0)) { - MS_LOG(ERROR) << name_ << ": index_shape(0) can't be divided by param_strategy(0)."; + MS_LOG(DEBUG) << name_ << ": index_shape(0) can't be divided by param_strategy(0)."; return FAILED; } // axis != 0, param_shape(0)%(param_strategy(0)*param_strategy(axis)) must be 0 if (axis_ != 0 && param_shape.at(0) % (param_strategy.at(0) * param_strategy.at(IntToSize(axis_))) != 0) { - MS_LOG(ERROR) << name_ << ": index_shape(0) can't be divided by (param_strategy(0)*param_strategy(axis))."; + MS_LOG(DEBUG) << name_ << ": index_shape(0) can't be divided by (param_strategy(0)*param_strategy(axis))."; return FAILED; } @@ -95,7 +113,7 @@ Status GatherV2PInfo::CheckStrategy(const StrategyPtr &strategy) { auto index_strategy = strategy->GetInputDim().at(1); auto product_i = std::accumulate(index_strategy.begin(), index_strategy.end(), 1, std::multiplies()); if ((param_strategy.at(IntToSize(axis_)) != 1) && (product_i != 1)) { - MS_LOG(ERROR) << name_ << ": param is splited at dim (axis)" << axis_ << " ,index can't be splited."; + MS_LOG(DEBUG) << name_ << ": param is splited at dim (axis)" << axis_ << " ,index can't be splited."; return FAILED; } @@ -104,7 +122,7 @@ Status GatherV2PInfo::CheckStrategy(const StrategyPtr &strategy) { size_t dev_num = g_device_manager->GetDeviceListByStageId(0).size(); auto product_p = std::accumulate(param_strategy.begin(), param_strategy.end(), 1, std::multiplies()); if (IntToSize(product_p) != dev_num && param_strategy.at(IntToSize(axis_)) != 1) { - MS_LOG(ERROR) << name_ << ": Invalid strategy. Don't support repeated calc."; + MS_LOG(DEBUG) << name_ << ": Invalid strategy. 
Don't support repeated calc."; return FAILED; } @@ -267,6 +285,11 @@ Status GatherV2PInfo::InferBias() { int32_t rank = g_device_manager->global_rank(); auto input_shape = inputs_shape_.at(0); auto params_strategy = strategy_->GetInputDim().at(0); + // axis don't split + if (params_strategy.at(axis_) == 1) { + bias_ = 0; + return SUCCESS; + } // params_size=1, axis=0 if ((input_shape.size() == 1) && (axis_ == 0)) { slice_size_ = input_shape.at(0) / params_strategy.at(0); @@ -290,18 +313,94 @@ Status GatherV2PInfo::InferBias() { } Status GatherV2PInfo::InferGroup() { - std::vector group_list; auto param_strategy = strategy_->GetInputDim().at(0); size_t dim = IntToSize(axis_); if (param_strategy.at(IntToSize(axis_)) != 1 && inputs_shape_.at(0).size() == 2) { dim = (axis_ + 1) % 2; } - if (CreateGroupByDim(dim, &group_list) != SUCCESS) { + CheckGlobalDeviceManager(); + MS_EXCEPTION_IF_NULL(g_device_manager); + int32_t rank = g_device_manager->global_rank(); + RankList dev_list = g_device_manager->GetDeviceListByStageId(0); + DeviceMatrix dev_matrix(rank, dev_list, dev_matrix_shape_); + RankList group_devices; + if (dev_matrix.GetDevicesAlongDim(SizeToUint(dim), &group_devices) != SUCCESS) { MS_LOG(ERROR) << name_ << ": Create group failed."; return FAILED; } + if (group_devices.size() == 1) { + MS_LOG(INFO) << "the group is empty"; + return SUCCESS; + } + + group_ = g_device_manager->CreateGroup(group_devices); + return SUCCESS; +} + +std::vector GetRankFromGroup(const Group &group) { + std::vector rank_list; + auto device_list = group.GetDevicesList(); + for (auto &device : device_list) { + rank_list.insert(rank_list.end(), device.rank() % 8); + } + return rank_list; +} + +Status GatherV2PInfo::InferForwardCommunication() { + forward_op_.clear(); + if (target_ != CPU) { + return SUCCESS; + } + auto param_strategy = strategy_->GetInputDim().at(0); + // don't split axis, no need forward communication + if (param_strategy.at(IntToSize(axis_)) == 1) { + return SUCCESS; + 
} + // split axis + OperatorName operator_name; + if (InferGroup() != SUCCESS) { + MS_LOG(ERROR) << name_ << ": Infer Group failed."; + return FAILED; + } + auto group_size = group_.GetDevNum(); + Attr attr_group; + if (host_reduce_scatter_) { + // group size <= 8 + std::vector rank_list; + if (group_size <= 8) { + reduce_scatter_flag_ = false; + operator_name = HOST_REDUCE_SCATTER; + rank_list = GetRankFromGroup(group_); + attr_group = std::make_pair(GROUP, MakeValue(rank_list)); + } else { + // group size > 8, don't support host reduce_scatter + reduce_scatter_flag_ = true; + split_num_ = SizeToInt(group_size / 8); + CheckGlobalDeviceManager(); + operator_name = REDUCE_SCATTER; + int32_t rank = g_device_manager->global_rank(); + size_t repeat = group_size / 8; + for (size_t i = 0; i < repeat; ++i) { + rank_list.push_back(rank + SizeToInt(i * 8)); + } + Group g = g_device_manager->CreateGroup(rank_list); + attr_group = std::make_pair(GROUP, MakeValue(g.name())); + } + } else { + operator_name = REDUCE_SCATTER; + if (InferGroup() != SUCCESS) { + MS_LOG(ERROR) << name_ << ": Infer Group failed."; + return FAILED; + } + attr_group = std::make_pair(GROUP, MakeValue(group_.name())); + } + Attr attr_op = std::make_pair(OP, MakeValue(REDUCE_OP_SUM)); + OperatorAttrs attrs = {attr_op, attr_group}; + OperatorParams params; + OperatorArgs args = std::make_pair(attrs, params); + Operator op = std::make_pair(operator_name, args); - group_ = group_list.at(0); + forward_op_.push_back(op); return SUCCESS; } @@ -320,7 +419,7 @@ Status GatherV2PInfo::ComputeReplaceGraph(const CNodePtr &cnode) { auto minimum = gen_g.PushBack({gen_g.NewOpInst(MINIMUM), relu, CreateInt32Tensor(slice_size_ - 1)}); auto equal = gen_g.PushBack({gen_g.NewOpInst(EQUAL), sub, minimum}); auto gather_v2 = - gen_g.PushBack({gen_g.NewOpInst(GATHERV2), gen_g.virtual_input_node(), minimum, CreatInt32Imm(axis_)}); + gen_g.PushBack({gen_g.NewOpInst(replace_op_name_), gen_g.virtual_input_node(), minimum, 
CreatInt32Imm(axis_)}); auto dtype = gen_g.PushBack({gen_g.NewOpInst(DTYPE), gather_v2}); auto cast = gen_g.PushBack({gen_g.NewOpInst(CAST), equal, dtype}); auto expand_dims = gen_g.PushBack({gen_g.NewOpInst(EXPAND_DIMS), cast, CreatInt32Imm(axis_ - 1)}); @@ -346,6 +445,10 @@ Status GatherV2PInfo::ComputeReplaceGraph(const CNodePtr &cnode) { ReplaceGraphPtr GatherV2PInfo::replace_graph(const CNodePtr &cnode) { auto param_strategy = strategy_->GetInputDim().at(0); + // target_ == CPU, no need to raplace graph + if (target_ == CPU) { + return nullptr; + } if (param_strategy.at(IntToSize(axis_)) != 1 && ComputeReplaceGraph(cnode) != SUCCESS) { MS_LOG(ERROR) << name_ << ": ComputeReplaceGraph failed."; return nullptr; @@ -353,11 +456,34 @@ ReplaceGraphPtr GatherV2PInfo::replace_graph(const CNodePtr &cnode) { return replace_graph_; } +Status GatherV2PInfo::ComputeReplaceOp() { + if (InferBias() != SUCCESS) { + MS_LOG(ERROR) << name_ << ": Infer offset failed."; + return FAILED; + } + OperatorName op_name = EMBEDDING_LOOKUP; + OperatorAttrs attrs; + Attr param_offset = std::make_pair("offset", MakeValue(bias_)); + Attr param_flag = std::make_pair("reduce_scatter_flag", MakeValue(reduce_scatter_flag_)); + Attr param_split_num = std::make_pair("split_num", MakeValue(split_num_)); + OperatorParams params = {std::make_pair(param_offset, 3), std::make_pair(param_flag, 4), + std::make_pair(param_split_num, 5)}; + OperatorArgs args = std::make_pair(attrs, params); + Operator op = std::make_pair(op_name, args); + replace_op_.push_back(op); + + return SUCCESS; +} + Status GatherV2PInfo::Init(const StrategyPtr &strategy) { if (InitWithAutoRepeatCalc(strategy) != SUCCESS) { MS_LOG(ERROR) << name_ << ": Init failed."; return FAILED; } + // only target_ == CPU, we need to replace op + if (target_ == CPU && ComputeReplaceOp() != SUCCESS) { + MS_LOG(ERROR) << name_ << ": ComputeReplaceOp failed."; + } MS_LOG(INFO) << name_ << ": Init success."; return SUCCESS; } diff --git 
a/mindspore/ccsrc/parallel/ops_info/gather_v2_p_info.h b/mindspore/ccsrc/parallel/ops_info/gather_v2_p_info.h index a87b9838c9..83868606d1 100644 --- a/mindspore/ccsrc/parallel/ops_info/gather_v2_p_info.h +++ b/mindspore/ccsrc/parallel/ops_info/gather_v2_p_info.h @@ -49,7 +49,7 @@ class GatherV2PInfo : public OperatorInfo { protected: Status CheckStrategy(const StrategyPtr &strategy) override; Status InferMirrorOps() override; - Status InferForwardCommunication() override { return SUCCESS; } + Status InferForwardCommunication() override; Status InferTensorInfo() override; Status InferDevMatrixShape() override; Status InferTensorMap() override; @@ -57,14 +57,31 @@ class GatherV2PInfo : public OperatorInfo { private: Status ComputeReplaceGraph(const CNodePtr &cnode); + Status ComputeReplaceOp(); Status InferBias(); Status InferGroup(); int32_t axis_; + std::string target_; + std::string replace_op_name_ = GATHERV2; int32_t bias_; int32_t slice_size_; Shape out_dev_matrix_shape_; Group group_; + bool reduce_scatter_flag_ = false; + int32_t split_num_ = 1; + bool host_reduce_scatter_ = false; +}; + +class SparseGatherV2Info : public GatherV2PInfo { + public: + SparseGatherV2Info(const std::string &name, const Shapes &inputs_shape, const Shapes &outputs_shape, + const PrimitiveAttrs &attrs) + : GatherV2PInfo(name, inputs_shape, outputs_shape, attrs) {} + ~SparseGatherV2Info() override = default; + + private: + std::string replace_op_name_ = SPARSE_GATHERV2; }; } // namespace parallel } // namespace mindspore diff --git a/mindspore/ccsrc/parallel/ops_info/get_next_info.cc b/mindspore/ccsrc/parallel/ops_info/get_next_info.cc index 29d519fda8..0fb49364f0 100644 --- a/mindspore/ccsrc/parallel/ops_info/get_next_info.cc +++ b/mindspore/ccsrc/parallel/ops_info/get_next_info.cc @@ -24,15 +24,23 @@ #include "ir/value.h" #include "parallel/device_matrix.h" #include "parallel/strategy.h" +#include "parallel/context.h" #include "parallel/tensor_layout/tensor_redistribution.h" 
namespace mindspore { namespace parallel { Status GetNextInfo::InferTensorMap() { + MS_EXCEPTION_IF_NULL(ParallelContext::GetInstance()); + bool full_batch = ParallelContext::GetInstance()->full_batch(); + for (auto shp : shapes_) { TensorMap out_tensor_map; for (size_t i = 0; i < shp.size(); ++i) { - out_tensor_map.push_back(SizeToInt(dev_matrix_shape_.size() - i - 1)); + if (full_batch) { + out_tensor_map.push_back(MAP_NONE); + } else { + out_tensor_map.push_back(SizeToInt(dev_matrix_shape_.size() - i - 1)); + } } outputs_tensor_map_.push_back(out_tensor_map); } @@ -190,6 +198,9 @@ Status GetNextInfo::GetAttrs() { } Status GetNextInfo::InferReplaceOps(const StrategyPtr &) { + MS_EXCEPTION_IF_NULL(ParallelContext::GetInstance()); + bool full_batch = ParallelContext::GetInstance()->full_batch(); + Shapes out_shapes = outputs_shape_; for (size_t i = 0; i < out_shapes.size(); ++i) { if (dev_num_ <= 0) { @@ -200,7 +211,9 @@ Status GetNextInfo::InferReplaceOps(const StrategyPtr &) { MS_LOG(ERROR) << name_ << " : batch num cannot floor div dev num."; return FAILED; } - out_shapes[i][0] = out_shapes[i][0] / dev_num_; + if (!full_batch) { + out_shapes[i][0] = out_shapes[i][0] / dev_num_; + } } ValuePtr new_shapes = MakeValue(out_shapes); Attr attr_types = std::make_pair(TYPES, attrs_[TYPES]); diff --git a/mindspore/ccsrc/parallel/ops_info/ops_utils.h b/mindspore/ccsrc/parallel/ops_info/ops_utils.h index 4da54a358d..4b8f61bb2e 100644 --- a/mindspore/ccsrc/parallel/ops_info/ops_utils.h +++ b/mindspore/ccsrc/parallel/ops_info/ops_utils.h @@ -55,6 +55,7 @@ constexpr char REDUCE_OP_SUM[] = "sum"; constexpr char REDUCE_OP_MAX[] = "max"; constexpr char REDUCE_OP_MIN[] = "min"; constexpr char OP_PATH[] = "mindspore.ops.operations"; +constexpr char INNER_OP_PATH[] = "mindspore.ops.operations._inner_ops"; constexpr char GET_OP_FUNCTION_PATH[] = "mindspore.parallel._utils"; constexpr char GET_OP_FUNCTION[] = "_get_python_op"; constexpr char KEEP_DIMS[] = "keep_dims"; @@ -72,10 
+73,12 @@ constexpr char OP[] = "op"; constexpr char IDENTITY_INFO[] = "identity_info"; constexpr char DIVISOR[] = "divisor"; constexpr char NONE[] = "None"; -constexpr char DEPEND[] = "depend"; +constexpr char DEPEND[] = "Depend"; constexpr char BATCH_PARALLEL[] = "BatchParallel"; constexpr char ACTIVATION_TYPE[] = "activation_type"; +constexpr char TARGET[] = "primitive_target"; +constexpr char CPU[] = "CPU"; constexpr char TRANSPOSE_A[] = "transpose_a"; constexpr char TRANSPOSE_B[] = "transpose_b"; constexpr char SHAPE[] = "shape"; @@ -127,6 +130,7 @@ constexpr char FORWARD_OP[] = "forward_op"; constexpr char REDISTRIBUTION_OP[] = "redistribution_op"; constexpr char DARA_PARALLEL[] = "data_parallel"; constexpr char FORWARD_REDUCE_SCATTER[] = "forward_reduce_scatter"; +constexpr char OPTIMIZER_SUB_STRING[] = "optimizer"; // Operator constexpr char VIRTUAL_DIV[] = "_VirtualDiv"; @@ -141,6 +145,8 @@ constexpr char MIRROR_OPERATOR[] = "_MirrorOperator"; constexpr char STRIDED_SLICE[] = "StridedSlice"; constexpr char ALL_GATHER[] = "AllGather"; constexpr char REDUCE_SCATTER[] = "ReduceScatter"; +constexpr char HOST_REDUCE_SCATTER[] = "HostReduceScatter"; +constexpr char EMBEDDING_LOOKUP[] = "EmbeddingLookup"; constexpr char CONCAT[] = "Concat"; constexpr char SOFTMAX_CROSS_ENTROPY_WITH_LOGITS[] = "SoftmaxCrossEntropyWithLogits"; constexpr char SIGMOID_CROSS_ENTROPY_WITH_LOGITS[] = "SigmoidCrossEntropyWithLogits"; @@ -201,6 +207,7 @@ constexpr char EQUAL[] = "Equal"; constexpr char NOT_EQUAL[] = "NotEqual"; constexpr char LOGICALNOT[] = "LogicalNot"; constexpr char GATHERV2[] = "GatherV2"; +constexpr char SPARSE_GATHERV2[] = "SparseGatherV2"; constexpr char STRIDEDSLICE[] = "StridedSlice"; constexpr char BROADCAST[] = "Broadcast"; constexpr char SQRT[] = "Sqrt"; @@ -211,6 +218,16 @@ constexpr char NEG[] = "Neg"; constexpr char BATCH_MATMUL[] = "BatchMatMul"; constexpr char EXPAND_DIMS[] = "ExpandDims"; constexpr char SQUARE[] = "Square"; +constexpr char BATCHMATMUL[] 
= "BatchMatMul"; +constexpr char TOPK[] = "TopK"; +constexpr char IN_TOPK[] = "InTopK"; +constexpr char PACK[] = "Pack"; +constexpr char GATHER_ND[] = "GatherNd"; +constexpr char UNSORTEF_SEGMENT_MIND[] = "UnsortedSegmentMinD"; +constexpr char UNSORTEF_SEGMENT_PRODD[] = "UnsortedSegmentProdD"; +constexpr char DEPTHWISE_CONV2D_NATIVE[] = "DepthwiseConv2dNative"; +constexpr char DEPTHWISE_CONV2D[] = "DepthwiseConv2D"; +constexpr char ADD[] = "Add"; // Parallel don't care constexpr char TUPLE_GETITEM[] = "tuple_getitem"; @@ -263,7 +280,7 @@ constexpr char COL2IMV1[] = "col2im_v1"; constexpr char RESOLVE[] = "resolve"; constexpr char EMBED[] = "embed"; constexpr char CREATINSTANCE[] = "create_instance"; -constexpr char ZEROSLIKETENSOR[] = "zeros_like_tensor"; +constexpr char ZEROSLIKE[] = "ZerosLike"; constexpr char REF_TO_EMBED[] = "RefToEmbed"; constexpr char STOP_GRADIENT[] = "stop_gradient"; diff --git a/mindspore/ccsrc/parallel/ops_info/virtual_dataset_info.cc b/mindspore/ccsrc/parallel/ops_info/virtual_dataset_info.cc index 4b695ba62d..ce8b04d802 100644 --- a/mindspore/ccsrc/parallel/ops_info/virtual_dataset_info.cc +++ b/mindspore/ccsrc/parallel/ops_info/virtual_dataset_info.cc @@ -23,6 +23,7 @@ #include "parallel/device_manager.h" #include "parallel/device_matrix.h" #include "parallel/step_parallel.h" +#include "parallel/context.h" #include "utils/log_adapter.h" namespace mindspore { @@ -93,59 +94,21 @@ Status VirtualDatasetInfo::InferDevMatrixShape() { return SUCCESS; } -Status VirtualDatasetInfo::InferMirrorOps() { - mirror_ops_.clear(); - - int32_t stage = strategy_->GetInputStage(); - CheckGlobalDeviceManager(); - RankList dev_list = g_device_manager->GetDeviceListByStageId(stage); - if (dev_list.empty()) { - MS_LOG(ERROR) << name_ << ": The current stage is empty!"; - return Status::FAILED; - } - if (dev_list.size() == 1) { - MS_LOG(INFO) << name_ << ": No need mirror ops."; - return Status::SUCCESS; - } - - OperatorName operator_name = BROADCAST; - 
ValuePtr attr0_value = MakeValue(dev_list.front()); - std::vector group_list; - if (CreateGroupByDim(dev_matrix_shape_.size() - 1, &group_list) != SUCCESS) { - MS_LOG(ERROR) << name_ << ": Infer mirror ops, create group failed."; - return FAILED; - } else if (group_list.empty()) { - MS_LOG(INFO) << name_ << ": No need mirror ops."; - return SUCCESS; - } - std::string group = group_list[0].name(); - ValuePtr attr1_value = MakeValue(group); - - Attr attr0 = std::make_pair(SRC, attr0_value); - Attr attr1 = std::make_pair(GROUP, attr1_value); - - OperatorAttrs operator_attrs = {attr0, attr1}; - - OperatorParams operator_param; - OperatorArgs operator_args = std::make_pair(operator_attrs, operator_param); - - Operator op = std::make_pair(operator_name, operator_args); - OperatorVector op_vector = {op}; - - size_t size = inputs_shape_.size(); - for (size_t i = 0; i < size; ++i) { - mirror_ops_.push_back(op_vector); - } - mirror_ops_.clear(); - return SUCCESS; -} +Status VirtualDatasetInfo::InferMirrorOps() { return SUCCESS; } Status VirtualDatasetInfo::InferForwardCommunication() { return SUCCESS; } Status VirtualDatasetInfo::InferTensorMap() { + MS_EXCEPTION_IF_NULL(ParallelContext::GetInstance()); + bool full_batch = ParallelContext::GetInstance()->full_batch(); + for (size_t i = 0; i < strategy_->GetInputNumber(); i++) { std::vector tensor_map_index; - tensor_map_index.push_back((int32_t)(LAST_INDEX(SizeToUint(dev_matrix_shape_.size())))); + if (full_batch) { + tensor_map_index.push_back(MAP_NONE); + } else { + tensor_map_index.push_back((int32_t)(LAST_INDEX(SizeToUint(dev_matrix_shape_.size())))); + } for (size_t j = 1; j < strategy_->GetInputDim()[i].size(); ++j) { tensor_map_index.push_back(MAP_NONE); } @@ -213,6 +176,10 @@ Status VirtualDatasetInfo::SetCostUnderStrategy(const StrategyPtr &strategy) { } Status VirtualDatasetInfo::GenerateStrategies(int32_t stage_id) { + MS_EXCEPTION_IF_NULL(ParallelContext::GetInstance()); + bool full_batch = 
ParallelContext::GetInstance()->full_batch(); + size_t total_dev_num; + if (GetAttrs() != SUCCESS) { MS_LOG(ERROR) << name_ << ": GetAttrs failed"; return FAILED; @@ -220,7 +187,11 @@ Status VirtualDatasetInfo::GenerateStrategies(int32_t stage_id) { CheckGlobalDeviceManager(); is_auto_parallel_ = true; - size_t total_dev_num = g_device_manager->GetDeviceListByStageId(stage_id).size(); + if (full_batch) { + total_dev_num = 1; + } else { + total_dev_num = g_device_manager->GetDeviceListByStageId(stage_id).size(); + } StrategyPtr sp; std::vector strategy; for (auto &shape : inputs_shape_) { @@ -232,10 +203,18 @@ Status VirtualDatasetInfo::GenerateStrategies(int32_t stage_id) { sp = std::make_shared(stage_id, strategy); if (SetCostUnderStrategy(sp) == SUCCESS) { - MS_LOG(INFO) << name_ << ": Successfully generated batch-parallel-strategy."; + if (full_batch) { + MS_LOG(INFO) << name_ << ": Successfully generated full-batch-parallel-strategy."; + } else { + MS_LOG(INFO) << name_ << ": Successfully generated batch-parallel-strategy."; + } PrintStrategy(sp); } else { - MS_LOG(ERROR) << name_ << ": Generating batch-parallel-strategy failed."; + if (full_batch) { + MS_LOG(ERROR) << name_ << ": Generating full-batch-parallel-strategy failed."; + } else { + MS_LOG(ERROR) << name_ << ": Generating batch-parallel-strategy failed."; + } return FAILED; } return SUCCESS; diff --git a/mindspore/ccsrc/parallel/step_auto_parallel.cc b/mindspore/ccsrc/parallel/step_auto_parallel.cc index fe77b6027b..894177df8d 100644 --- a/mindspore/ccsrc/parallel/step_auto_parallel.cc +++ b/mindspore/ccsrc/parallel/step_auto_parallel.cc @@ -107,7 +107,7 @@ bool StepAutoParallel(const FuncGraphPtr &root, const opt::OptimizerPtr &) { time += static_cast(end_time.tv_usec - start_time.tv_usec); MS_LOG(INFO) << "Now leaving step auto parallel, used time: " << time << " us"; - root->flags()[AUTO_PARALLEL_RUN_ONCE_ONLY] = true; + root->set_flag(AUTO_PARALLEL_RUN_ONCE_ONLY, true); return changes; } @@ -261,7 
+261,7 @@ bool IsSplittableOperator(const std::string &op_name) { REDUCE_MAX, REDUCE_MIN, ARGMAXWITHVALUE, ARGMINWITHVALUE, REDUCE_SUM, CONV2D, FUSE_BATCH_NORM, POOLING, MAX_POOL_WITH_ARGMAX, SIMPLE_MEAN, FLATTEN, BATCH_NORM, LAYER_NORM, BIAS_ADD, ASSIGN_SUB, COS, ACOS, EXP, LOG, REDUCE_MEAN, REAL_DIV, SIGMOID, POW, MAXIMUM, MINIMUM, EQUAL, NOT_EQUAL, LOGICALNOT, GATHERV2, SQRT, - STRIDEDSLICE, GET_NEXT, CAST, NEG, SQUARE, BATCH_MATMUL, EXPAND_DIMS, SQUEEZE, + STRIDEDSLICE, GET_NEXT, CAST, NEG, SQUARE, BATCH_MATMUL, EXPAND_DIMS, SQUEEZE, SPARSE_GATHERV2, SOFTMAX_CROSS_ENTROPY_WITH_LOGITS, SIGMOID_CROSS_ENTROPY_WITH_LOGITS, SPARSE_SOFTMAX_CROSS_ENTROPY_WITH_LOGITS}; // clang-format on @@ -283,6 +283,10 @@ bool IsAutoParallelCareNode(const CNodePtr &cnode) { if (bool_result) { MS_LOG(EXCEPTION) << "Should implementing OperatorInfo for: " << prim->name(); } else if (prim->name() == CAST) { + if (cnode->fullname_with_scope().find(OPTIMIZER_SUB_STRING) != std::string::npos) { + // Do not care CASTs from optimizer + return false; + } return true; } return IsParallelCareNode(cnode) && IsSplittableOperator(prim->name()); @@ -409,6 +413,13 @@ Status ConstructCostGraphNodesByUniqueId(const std::vector &all_node } ValueNodePtr prim_anf_node = cnode->input(0)->cast(); if (!IsAutoParallelCareNode(cnode)) { + // Needed by rec_parser + if (ParallelContext::GetInstance()->strategy_search_mode() == RECURSIVE_PROGRAMMING) { + auto prev_cnode = GetInternalOperatorInfo(cnode, prim_anf_node); + if (prev_cnode != nullptr) { + entire_costgraph->add_tuple_getitem(std::make_pair(cnode->UniqueId(), prev_cnode->UniqueId())); + } + } continue; } PrimitivePtr prim = GetValueNode(prim_anf_node); @@ -467,6 +478,13 @@ Status ConstructCostGraphNodesByUniqueIdTC(const std::vector &all_no } ValueNodePtr prim_anf_node = cnode->input(0)->cast(); if (!IsAutoParallelCareNode(cnode)) { + // Needed by rec_parser + if (ParallelContext::GetInstance()->strategy_search_mode() == RECURSIVE_PROGRAMMING) { + 
auto prev_cnode = GetInternalOperatorInfo(cnode, prim_anf_node); + if (prev_cnode != nullptr) { + entire_costgraph->add_tuple_getitem(std::make_pair(cnode->UniqueId(), prev_cnode->UniqueId())); + } + } continue; } PrimitivePtr prim = GetValueNode(prim_anf_node); @@ -1090,14 +1108,44 @@ std::vector> RecInputTensorNames(const std::map(prim_anf_node); + if (prim->name() == TUPLE_GETITEM || prim->name() == DEPEND) { + auto prev_cnode = cnode->input(1)->cast(); + if (prev_cnode == nullptr || !IsValueNode(prev_cnode->input(0))) { + return nullptr; + } + auto prev_prim = prev_cnode->input(0)->cast()->value()->cast(); + while (prev_prim->name() == TUPLE_GETITEM || prev_prim->name() == DEPEND) { + prev_cnode = prev_cnode->input(1)->cast(); + if (prev_cnode == nullptr || !IsValueNode(prev_cnode->input(0))) { + return nullptr; + } + prev_prim = prev_cnode->input(0)->cast()->value()->cast(); + } + return prev_cnode; + } + return nullptr; +} + Status ParallelStrategyRecSearch(const std::vector &all_nodes, const FuncGraphPtr &root) { - if (ConstructCostGraphNodesByUniqueId(all_nodes, root) == SUCCESS) { - MS_LOG(INFO) << "Constructing nodes for cost graph succeeded. There are " << entire_costgraph->GetOperators().size() - << " operators."; + if (CostModelContext::GetInstance()->is_multi_subgraphs()) { + if (ConstructCostGraphNodesByUniqueIdTC(all_nodes, root) == SUCCESS) { + MS_LOG(INFO) << "Constructing nodes for cost graph succeeded. There are " + << entire_costgraph->GetOperators().size() << " operators."; + } else { + MS_LOG(EXCEPTION) << "Constructing nodes for cost graph failed."; + } } else { - MS_LOG(ERROR) << "Constructing nodes for cost graph failed."; - return FAILED; + if (ConstructCostGraphNodesByUniqueId(all_nodes, root) == SUCCESS) { + MS_LOG(INFO) << "Constructing nodes for cost graph succeeded. 
There are " + << entire_costgraph->GetOperators().size() << " operators."; + } else { + MS_LOG(EXCEPTION) << "Constructing nodes for cost graph failed."; + } } + ReshapeCostCompute(all_nodes); + auto ops = entire_costgraph->GetOperators(); std::vector> input_tensor_names = entire_costgraph->get_inputs_tensor_name_list(); auto tuple_getitem_list = entire_costgraph->get_tuple_getitem_list(); diff --git a/mindspore/ccsrc/parallel/step_auto_parallel.h b/mindspore/ccsrc/parallel/step_auto_parallel.h index fff9dfa4c3..c923e5770f 100644 --- a/mindspore/ccsrc/parallel/step_auto_parallel.h +++ b/mindspore/ccsrc/parallel/step_auto_parallel.h @@ -57,6 +57,8 @@ Status ParallelStrategyRecSearch(const std::vector &all_nodes, const std::vector> RecInputTensorNames(const std::map::iterator &it, std::vector> input_tensor_names); + +CNodePtr GetInternalOperatorInfo(const CNodePtr &cnode, const ValueNodePtr &prim_anf_node); } // namespace parallel } // namespace mindspore #endif // PARALLEL_STEP_AUTO_PARALLEL_H_ diff --git a/mindspore/ccsrc/parallel/step_parallel.cc b/mindspore/ccsrc/parallel/step_parallel.cc index fd09b5e0b5..fc7b48d267 100644 --- a/mindspore/ccsrc/parallel/step_parallel.cc +++ b/mindspore/ccsrc/parallel/step_parallel.cc @@ -534,6 +534,10 @@ std::vector ReplaceOpInput(const Operator &replace_op, const std::st MS_LOG(EXCEPTION) << "Failure: " << node->ToString() << " size is smaller than 2"; } std::vector replace_input = {NewValueNode(pyop_instance), node->input(1)}; + auto prim = GetValueNode(node->input(0)); + if (prim->name() == GATHERV2 || prim->name() == SPARSE_GATHERV2) { + replace_input = {NewValueNode(pyop_instance), node->input(1), node->input(2)}; + } if (!params.empty()) { Param param_first = *(params.begin()); int32_t first_position = param_first.second; @@ -1371,11 +1375,19 @@ void SetClonedTensorShapeForOptimizer(const FuncGraphPtr &root) { void SetVirtualDatasetStrategy(const CNodePtr &node) { MS_EXCEPTION_IF_NULL(node); + 
MS_EXCEPTION_IF_NULL(ParallelContext::GetInstance()); + bool full_batch = ParallelContext::GetInstance()->full_batch(); + PrimitivePtr prim = GetValueNode(node->input(0)); MS_EXCEPTION_IF_NULL(prim); if (prim->name() == VIRTUAL_DATA_SET) { CheckGlobalDeviceManager(); - int32_t dev_num = SizeToInt(g_device_manager->GetDeviceListByStageId(0).size()); + int32_t dev_num; + if (full_batch) { + dev_num = 1; + } else { + dev_num = SizeToInt(g_device_manager->GetDeviceListByStageId(0).size()); + } auto attrs_temp = prim->attrs(); std::vector shape_list = ExtractShape(node); if (shape_list.empty()) { @@ -1864,11 +1876,15 @@ void HandleDropoutNode(const OperatorInfoPtr &distribute_operator, const CNodePt DropoutDoMaskInfoPtr dropout_do_mask = std::dynamic_pointer_cast(distribute_operator); MS_EXCEPTION_IF_NULL(dropout_do_mask); - Operator replace_op = dropout_do_mask->GetDropoutGenMaskReplaceOp(cnode); + std::vector replace_op = dropout_do_mask->GetDropoutGenMaskReplaceOp(cnode); + if (replace_op.empty()) { + MS_LOG(DEBUG) << "No need to replace dropout_gen_mask"; + return; + } if (cnode->inputs().size() != DROPOUT_DO_MASK_CNODE_INPUT_SIZE) { MS_LOG(EXCEPTION) << "The size of drop out do mask cnode's input is not " << DROPOUT_DO_MASK_CNODE_INPUT_SIZE; } - ReplaceOneOp(replace_op, cnode->input(DROPOUT_GEN_MASK_INDEX)->cast()); + ReplaceOneOp(replace_op[0], cnode->input(DROPOUT_GEN_MASK_INDEX)->cast()); } void HandleSpecialNode(const OperatorInfoPtr &distribute_operator, const CNodePtr &cnode) { @@ -2254,10 +2270,10 @@ bool StepParallel(const FuncGraphPtr &root, const opt::OptimizerPtr &optimizer) (root->has_flag(SEMI_AUTO_PARALLEL_RUN_ONCE_ONLY))) { if (!root->has_flag(CHECK_SET_STRATEGY_VALID_ONCE_ONLY)) { if (HasStrategy(root)) { - MS_LOG(INFO) << "strategies ignored in " << parallel_mode + MS_LOG(INFO) << "Strategies ignored in " << parallel_mode << ", set_strategy() only valid in [semi_]auto_parallel."; } - root->flags()[CHECK_SET_STRATEGY_VALID_ONCE_ONLY] = true; + 
root->set_flag(CHECK_SET_STRATEGY_VALID_ONCE_ONLY, true); } return changes; @@ -2314,11 +2330,11 @@ bool StepParallel(const FuncGraphPtr &root, const opt::OptimizerPtr &optimizer) DumpGraph(root, std::string(STEP_PARALLEL_END)); // step parallel only run once - root->flags()[SEMI_AUTO_PARALLEL_RUN_ONCE_ONLY] = true; + root->set_flag(SEMI_AUTO_PARALLEL_RUN_ONCE_ONLY, true); res->results()[pipeline::kStepParallelGraph] = root; // in auto parallel mode, no need to check if stategies set - root->flags()[CHECK_SET_STRATEGY_VALID_ONCE_ONLY] = true; + root->set_flag(CHECK_SET_STRATEGY_VALID_ONCE_ONLY, true); (void)gettimeofday(&end_time, nullptr); uint64_t time = kUSecondInSecond * static_cast(end_time.tv_sec - start_time.tv_sec); diff --git a/mindspore/ccsrc/pipeline/action.cc b/mindspore/ccsrc/pipeline/action.cc index 3e87000be7..7d56551ff0 100644 --- a/mindspore/ccsrc/pipeline/action.cc +++ b/mindspore/ccsrc/pipeline/action.cc @@ -38,6 +38,7 @@ #include "pipeline/remove_value_node_dup.h" #include "optimizer/optimizer.h" #include "vm/transform.h" +#include "parse/python_adapter.h" namespace mindspore { namespace pipeline { @@ -228,6 +229,9 @@ bool AbstractSpecializeAction(const ResourcePtr &res) { if (param_node->has_default()) { auto param_value = std::dynamic_pointer_cast(param_node->default_param()); AbstractBasePtr ptr = abstract::FromValue(parse::data_converter::PyDataToValue(param_value->value()), true); + auto sparse_grad = + py::cast(parse::python_adapter::GetPyObjAttr(param_value->value(), "sparse_grad")); + ptr->set_sparse_grad(sparse_grad); parallel::ParallelParameterContextRestoreInNoTraining(func_graph, param_node, ptr); args_spec.push_back(ptr); @@ -276,8 +280,14 @@ bool GeOptimizeAction(const ResourcePtr &res) { return OptimizeAction(res, kGePa bool VmOptimizeAction(const ResourcePtr &res) { return OptimizeAction(res, kVmPasses); } +bool PynativeOptimizeAction(const ResourcePtr &res) { return OptimizeAction(res, kPynativePasses); } + static bool 
IsCtrlSink() { auto ms_ctx = MsContext::GetInstance(); + if (ms_ctx->execution_mode() != kGraphMode) { + return false; + } + std::string device_target = ms_ctx->device_target(); if (device_target != kAscendDevice) { return false; @@ -287,15 +297,9 @@ static bool IsCtrlSink() { return false; } - const char *enable_ctrl_sink = std::getenv("ENABLE_CTRL_SINK"); - if (enable_ctrl_sink == nullptr) { - return false; - } - std::string enable_ctrl_sink_str(enable_ctrl_sink); - if (enable_ctrl_sink_str == "0") { + if (!ms_ctx->is_multi_graph_sink()) { return false; } - return true; } @@ -305,12 +309,24 @@ bool TaskEmitAction(const ResourcePtr &res) { } FuncGraphPtr func_graph = res->func_graph(); auto bc_ptr = res->results()[kBackend].cast(); + auto context_ptr = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(context_ptr); + if (CompileGraphs::ContainMixedTarget(func_graph)) { + bc_ptr->set_is_multi_graph_sink(false); + context_ptr->set_is_multi_graph_sink(false); + context_ptr->set_loop_sink_flag(false); + } else if (context_ptr->execution_mode() != kPynativeMode) { + std::string device_target = context_ptr->device_target(); + if (device_target == kAscendDevice) { + bc_ptr->set_is_multi_graph_sink(true); + context_ptr->set_is_multi_graph_sink(true); + } + } if (IsCtrlSink()) { res->results()[kOutput] = bc_ptr->CompileGraph(NOT_NULL(func_graph)); return true; } - std::vector cut_list = compile::nonlinear_ops; if (bc_ptr->name() == kMsConvert) { cut_list = compile::GetMsNonlinearOps(); @@ -329,7 +345,6 @@ bool ExecuteAction(const ResourcePtr &res) { if (!res->results()[kOutput].is()) { MS_LOG(EXCEPTION) << "Execute args error"; } - auto graph_id = res->results()[kOutput].cast(); std::shared_ptr bc_ptr = res->results()[kBackend].cast>(); std::shared_ptr msbc_ptr = std::dynamic_pointer_cast(bc_ptr); diff --git a/mindspore/ccsrc/pipeline/action.h b/mindspore/ccsrc/pipeline/action.h index 8a651c0038..eed1307872 100644 --- a/mindspore/ccsrc/pipeline/action.h +++ 
b/mindspore/ccsrc/pipeline/action.h @@ -35,6 +35,7 @@ bool SymbolResolveAction(const ResourcePtr &res); bool AbstractSpecializeAction(const ResourcePtr &res); bool GeOptimizeAction(const ResourcePtr &res); bool VmOptimizeAction(const ResourcePtr &res); +bool PynativeOptimizeAction(const ResourcePtr &res); bool TaskEmitAction(const ResourcePtr &res); bool ExecuteAction(const ResourcePtr &res); diff --git a/mindspore/ccsrc/pipeline/init.cc b/mindspore/ccsrc/pipeline/init.cc index 1b9666a400..7025447a29 100644 --- a/mindspore/ccsrc/pipeline/init.cc +++ b/mindspore/ccsrc/pipeline/init.cc @@ -17,6 +17,7 @@ #include #include #include "kernel/oplib/oplib.h" +#include "kernel/oplib/oploader.h" #include "pipeline/pipeline.h" #include "operator/composite/composite.h" #include "ir/signature.h" @@ -26,6 +27,7 @@ #include "pipeline/parse/python_adapter.h" #include "utils/summary/event_writer.h" #include "utils/config_manager.h" +#include "utils/mpi/mpi_config.h" #include "parallel/context.h" #include "parallel/device_manager.h" #include "parallel/costmodel_context.h" @@ -44,6 +46,7 @@ using PrimitivePy = mindspore::PrimitivePy; using MetaFuncGraph = mindspore::MetaFuncGraph; using EventWriter = mindspore::summary::EventWriter; using OpLib = mindspore::kernel::OpLib; +using OpInfoLoaderPy = mindspore::kernel::OpInfoLoaderPy; using ParallelContext = mindspore::parallel::ParallelContext; using CostModelContext = mindspore::parallel::CostModelContext; @@ -76,6 +79,8 @@ PYBIND11_MODULE(_c_expression, m) { "Get CNode Strategy Dictionary.") .def("get_allreduce_fusion", &ExecutorPy::GetAllreduceFusion, py::arg("phase") = py::str("train"), "Get Allreduce Fusion Dictionary.") + .def("fetch_info_for_quant_export", &ExecutorPy::FetchInfoForQuantExport, py::arg("phase") = py::str("train"), + "Fetch the inputs of Conv or Matmul for quant export.") .def("build_data_graph", &ExecutorPy::BuildGraph, py::arg("build_params"), py::arg("phase") = py::str("train"), py::arg("broadcast_params") = 
py::dict(), "Build data graph.") .def("has_compiled", &ExecutorPy::HasCompiled, py::arg("phase") = py::str(""), "get if cell compiled.") @@ -143,7 +148,18 @@ PYBIND11_MODULE(_c_expression, m) { .def("get_profiling_options", &mindspore::MsContext::profiling_options, "Get options to profiling.") .def("set_profiling_options", &mindspore::MsContext::set_profiling_options, "Set options to profiling.") .def("get_check_bprop_flag", &mindspore::MsContext::check_bprop_flag, "Get whether to check bprop.") - .def("set_check_bprop_flag", &mindspore::MsContext::set_check_bprop_flag, "Set whether to check bprop."); + .def("set_check_bprop_flag", &mindspore::MsContext::set_check_bprop_flag, "Set whether to check bprop.") + .def("get_max_device_memory", &mindspore::MsContext::max_device_memory, "Get deivce memory max size.") + .def("set_max_device_memory", &mindspore::MsContext::set_max_device_memory, "Set deivce memory max size.") + .def("set_print_file_path", &mindspore::MsContext::set_print_file_path, "Set path to print.") + .def("set_enable_graph_kernel", &mindspore::MsContext::set_enable_graph_kernel, + "Set the GraphKernel switch to on or off.") + .def("get_enable_graph_kernel", &mindspore::MsContext::enable_graph_kernel, "Get the value of GraphKernel switch."); + + (void)py::class_>(m, "MpiConfig") + .def_static("get_instance", &mindspore::MpiConfig::GetInstance, "Get mpi config instance.") + .def("get_enable_mpi", &mindspore::MpiConfig::enable_mpi, "Get whether enable mpi.") + .def("set_enable_mpi", &mindspore::MpiConfig::set_enable_mpi, "Set whether to enable mpi."); (void)py::class_>(m, "AutoParallelContext") .def_static("get_instance", &ParallelContext::GetInstance, "Get auto parallel context instance.") @@ -187,6 +203,8 @@ PYBIND11_MODULE(_c_expression, m) { "Set strategy checkpoint save file.") .def("get_strategy_ckpt_load_file", &ParallelContext::strategy_ckpt_load_file, "Get strategy checkpoint load file.") .def("get_strategy_ckpt_save_file", 
&ParallelContext::strategy_ckpt_save_file, "Get strategy checkpoint save file.") + .def("set_full_batch", &ParallelContext::set_full_batch, "Set whether load full batch on each device.") + .def("get_full_batch", &ParallelContext::full_batch, "Get whether load full batch on each device.") .def("reset", &ParallelContext::Reset, "Reset auto parallel context."); (void)py::class_>(m, "CostModelContext") @@ -312,4 +330,8 @@ PYBIND11_MODULE(_c_expression, m) { "Finalize gpu collective communication mode."); #endif + + (void)py::class_>(m, "OpInfoLoaderPy") + .def(py::init()) + .def("get_all_ops_info", &OpInfoLoaderPy::GetAllOpsInfo, "get all ops info."); } diff --git a/mindspore/ccsrc/pipeline/parse/data_converter.cc b/mindspore/ccsrc/pipeline/parse/data_converter.cc index 5dbb8bc453..330d03d11c 100644 --- a/mindspore/ccsrc/pipeline/parse/data_converter.cc +++ b/mindspore/ccsrc/pipeline/parse/data_converter.cc @@ -32,6 +32,7 @@ #include "utils/symbolic.h" #include "utils/context/ms_context.h" #include "debug/trace.h" +#include "optimizer/ad/grad.h" namespace mindspore { namespace parse { @@ -40,6 +41,35 @@ using TensorPtr = mindspore::tensor::TensorPtr; using MetaTensor = mindspore::tensor::MetaTensor; using MetaTensorPtr = mindspore::tensor::MetaTensorPtr; +FuncGraphPtr ConvertToBpropCut(const py::object &obj) { + std::vector results = data_converter::GetObjKey(obj); + std::string obj_key = results[0]; + py::function bprop_func = py::getattr(obj, CUSTOM_BPROP_NAME); + + auto bprop_graph = std::make_shared(); + std::vector outputs; + + auto fake_bprop = std::make_shared("bprop_cut", py::object()); + fake_bprop->set_hook(bprop_func); + (void)fake_bprop->AddAttr(CUSTOM_BPROP_NAME, MakeValue(true)); + outputs.push_back(NewValueNode(fake_bprop)); + + py::object code_obj = py::getattr(bprop_func, "__code__"); + size_t inputs_num = py::cast(py::getattr(code_obj, "co_argcount")) - 3; + for (size_t i = 0; i < inputs_num; ++i) { + auto param = bprop_graph->add_parameter(); + 
outputs.push_back(param); + } + auto p1 = bprop_graph->add_parameter(); + auto p2 = bprop_graph->add_parameter(); + outputs.push_back(p1); + outputs.push_back(p2); + + bprop_graph->set_output(bprop_graph->NewCNode(outputs)); + data_converter::SetObjGraphValue(obj_key, bprop_graph); + return bprop_graph; +} + namespace { bool ConvertTuple(const py::object &obj, ValuePtr *const data, bool use_signature) { MS_LOG(DEBUG) << "Converting python tuple"; @@ -208,33 +238,51 @@ bool ConvertTensor(const py::object &obj, ValuePtr *const data) { return true; } -FuncGraphPtr ConvertToBpropCut(py::object obj) { - std::vector results = data_converter::GetObjKey(obj); - std::string obj_key = results[0]; - py::function bprop_func = py::getattr(obj, "bprop"); - - FuncGraphPtr bprop_graph = std::make_shared(); - std::vector outputs; - - auto fake_bprop = std::make_shared("bprop_cut"); - fake_bprop->set_hook(bprop_func); - (void)fake_bprop->AddAttr("bprop", MakeValue(true)); - outputs.push_back(NewValueNode(fake_bprop)); +bool ConvertSlice(const py::object &obj, ValuePtr *const data) { + MS_LOG(DEBUG) << "Converting slice object"; + + py::slice slice_obj = obj.cast(); + auto convert_func = [obj](std::string attr) -> ValuePtr { + auto py_attr = py::getattr(obj, attr.c_str()); + if (py::isinstance(py_attr)) { + return kNone; + } else if (py::isinstance(py_attr)) { + int value = py::cast(py_attr); + return MakeValue(value); + } else { + MS_LOG(EXCEPTION) << "Slice should contain only int or none"; + } + }; + ValuePtr start = convert_func("start"); + ValuePtr stop = convert_func("stop"); + ValuePtr step = convert_func("step"); + *data = std::make_shared(start, stop, step); + return true; +} - py::object code_obj = py::getattr(bprop_func, "__code__"); - size_t inputs_num = py::cast(py::getattr(code_obj, "co_argcount")) - 3; - for (size_t i = 0; i < inputs_num; ++i) { - auto param = bprop_graph->add_parameter(); - outputs.push_back(param); +bool ConvertCellObjToFuncGraph(py::object obj, 
ValuePtr *const data) { + FuncGraphPtr func_graph = ConvertToFuncGraph(obj); + if (func_graph == nullptr) { + MS_LOG(ERROR) << "Parse resolve function error."; + return false; } - auto p1 = bprop_graph->add_parameter(); - auto p2 = bprop_graph->add_parameter(); - outputs.push_back(p1); - outputs.push_back(p2); - - bprop_graph->set_output(bprop_graph->NewCNode(outputs)); - data_converter::SetObjGraphValue(obj_key, bprop_graph); - return bprop_graph; + // if the cell object has specified bprop, it has user-defined bprop function parse and record it + if (py::hasattr(obj, CUSTOM_BPROP_NAME)) { + FuncGraphPtr bprop_graph = nullptr; + bool enable_bprop_debug = py::cast(py::getattr(obj, "bprop_debug")); + if (enable_bprop_debug) { + bprop_graph = ConvertToBpropCut(obj); + } else { + bprop_graph = ConvertToFuncGraph(obj, PYTHON_MOD_GET_BPROP_METHOD); + } + if (bprop_graph != nullptr) { + (void)func_graph->transforms().insert(std::make_pair(CUSTOM_BPROP_NAME, FuncGraphTransform(bprop_graph))); + (void)bprop_graph->transforms().insert(std::make_pair("primal", FuncGraphTransform(func_graph))); + func_graph->set_flag(FUNC_GRAPH_FLAG_DEFER_INLINE, true); + } + } + *data = func_graph; + return true; } bool ConvertOtherObj(py::object obj, ValuePtr *const data) { @@ -261,32 +309,12 @@ bool ConvertOtherObj(py::object obj, ValuePtr *const data) { // Create the namespace for common class instance // When the obj is Cell, default parse the 'construct' if (data_converter::IsCellInstance(obj)) { - FuncGraphPtr func_graph = ConvertToFuncGraph(obj); - if (func_graph == nullptr) { - MS_LOG(ERROR) << "Parse resolve function error."; - return false; - } - // if the cell object has specified bprop, it has user-defined bprop function parse and record it - if (py::hasattr(obj, "bprop")) { - FuncGraphPtr bprop_graph = nullptr; - bool enable_bprop_debug = py::cast(py::getattr(obj, "bprop_debug")); - if (enable_bprop_debug) { - bprop_graph = ConvertToBpropCut(obj); - } else { - bprop_graph = 
ConvertToFuncGraph(obj, PYTHON_MOD_GET_BPROP_METHOD); - } - if (bprop_graph != nullptr) { - (void)func_graph->transforms().insert(std::make_pair("bprop", FuncGraphTransform(bprop_graph))); - (void)bprop_graph->transforms().insert(std::make_pair("primal", FuncGraphTransform(func_graph))); - func_graph->set_flags(FUNC_GRAPH_FLAG_DEFER_INLINE, true); - } - } - *data = func_graph; - } else { - py::module mod = python_adapter::GetPyModule(PYTHON_MOD_PARSE_MODULE); - py::object namespace_var = python_adapter::CallPyModFn(mod, PYTHON_MOD_GET_MEMBER_NAMESPACE_SYMBOL, obj); - *data = std::make_shared(RESOLVE_NAMESPACE_NAME_CLASS_MEMBER, namespace_var); + return ConvertCellObjToFuncGraph(obj, data); } + + py::module mod = python_adapter::GetPyModule(PYTHON_MOD_PARSE_MODULE); + py::object namespace_var = python_adapter::CallPyModFn(mod, PYTHON_MOD_GET_MEMBER_NAMESPACE_SYMBOL, obj); + *data = std::make_shared(RESOLVE_NAMESPACE_NAME_CLASS_MEMBER, namespace_var); return true; } MS_LOG(ERROR) << "Resolve type is invalid " << ((std::string)py::str(obj)); @@ -315,6 +343,10 @@ bool ConvertData(const py::object &obj, ValuePtr *const data, bool use_signature converted = std::make_shared(py::cast(obj)); } else if (py::isinstance(obj)) { ret = ConvertDict(obj, &converted, use_signature); + } else if (py::isinstance(obj)) { + ret = ConvertSlice(obj, &converted); + } else if (py::isinstance(obj)) { + converted = kEllipsis; } else if (py::isinstance(obj)) { ret = ConvertTuple(obj, &converted, use_signature); } else if (py::hasattr(obj, PYTHON_CELL_AS_LIST)) { @@ -338,6 +370,9 @@ bool ConvertData(const py::object &obj, ValuePtr *const data, bool use_signature } else if (py::hasattr(obj, PYTHON_ENVINSTANCE_FLAG)) { std::shared_ptr env = obj.cast>(); converted = env; + } else if (py::hasattr(obj, "__parameter__")) { + auto to_convert = py::cast(python_adapter::GetPyObjAttr(obj, "default_input")); + ret = ConvertData(to_convert, &converted); } else { ret = ConvertOtherObj(obj, &converted); } 
diff --git a/mindspore/ccsrc/pipeline/parse/data_converter.h b/mindspore/ccsrc/pipeline/parse/data_converter.h index a8918fa60c..0165b55363 100644 --- a/mindspore/ccsrc/pipeline/parse/data_converter.h +++ b/mindspore/ccsrc/pipeline/parse/data_converter.h @@ -51,6 +51,7 @@ void ClearObjectCache(); } // namespace data_converter ClassPtr ParseDataClass(const py::object &cls_obj); +FuncGraphPtr ConvertToBpropCut(const py::object &obj); void CleanDataClassToClassMap(); diff --git a/mindspore/ccsrc/pipeline/parse/function_block.cc b/mindspore/ccsrc/pipeline/parse/function_block.cc index 66534390a0..fbeeba94a1 100644 --- a/mindspore/ccsrc/pipeline/parse/function_block.cc +++ b/mindspore/ccsrc/pipeline/parse/function_block.cc @@ -265,6 +265,13 @@ CNodePtr FunctionBlock::ForceToBoolNode(const AnfNodePtr &cond) { return op_apply_node; } +CNodePtr FunctionBlock::ForceToWhileCond(const AnfNodePtr &cond) { + TraceManager::DebugTrace(std::make_shared(cond->debug_info())); + CNodePtr op_apply_node = func_graph()->NewCNode({MakeResolveOperation("while_cond"), cond}); + TraceManager::EndTrace(); + return op_apply_node; +} + // Perform a jump from this block to target block void FunctionBlock::Jump(const FunctionBlockPtr &target_block, AnfNodePtr node) { if (func_graph()->get_return() != nullptr) { @@ -315,12 +322,10 @@ void FunctionBlock::InsertDependItemsBeforeReturn() { ValueNodePtr make_tuple_op = NewValueNode(prim::kPrimMakeTuple); ValueNodePtr depend_op = NewValueNode(prim::kPrimDepend); - ValueNodePtr get_ref_origin_op = NewValueNode(prim::kPrimGetRefOrigin); ValueNodePtr stop_gradient_op = NewValueNode(prim::kPrimStopGradient); const std::string primitive_name("assign"); const std::string module_name("mindspore.ops.functional"); - ValueNodePtr assign_op = NewValueNode(prim::GetPythonOps(primitive_name, module_name)); - + ValueNodePtr assign_op = NewValueNode(prim::GetPythonOps(primitive_name, module_name, true)); if (state_assign_.size() == 0 && auto_depends_.size() == 0) { 
return; } @@ -329,8 +334,7 @@ void FunctionBlock::InsertDependItemsBeforeReturn() { vec_states.emplace_back(make_tuple_op); for (auto &item : state_assign_) { auto source = ReadVariable(item.second); - auto origin = func_graph()->NewCNode({get_ref_origin_op, item.first}); - auto assign = func_graph()->NewCNode({assign_op, origin, source}); + auto assign = func_graph()->NewCNode({assign_op, item.first, source}); MS_LOG(INFO) << "SetState read " << item.first->ToString() << ", " << item.second; vec_states.emplace_back(assign); } diff --git a/mindspore/ccsrc/pipeline/parse/function_block.h b/mindspore/ccsrc/pipeline/parse/function_block.h index e7842903ee..346061430d 100644 --- a/mindspore/ccsrc/pipeline/parse/function_block.h +++ b/mindspore/ccsrc/pipeline/parse/function_block.h @@ -28,6 +28,7 @@ #include #include "pipeline/parse/parse_base.h" #include "utils/log_adapter.h" +#include "utils/ordered_map.h" namespace mindspore { namespace parse { @@ -55,6 +56,7 @@ class FunctionBlock : public std::enable_shared_from_this { // A block is matured if all its predecessors is generated void Mature(); CNodePtr ForceToBoolNode(const AnfNodePtr &cond); + CNodePtr ForceToWhileCond(const AnfNodePtr &cond); void Jump(const FunctionBlockPtr &block, AnfNodePtr node); AnfNodePtr SearchReplaceNode(const std::string &var, const ParameterPtr &phi); void ConditionalJump(AnfNodePtr condNode, const FunctionBlockPtr &trueBlock, const FunctionBlockPtr &falseBlock); @@ -99,7 +101,7 @@ class FunctionBlock : public std::enable_shared_from_this { std::unordered_map removable_phis_; // set state nodes need to insert before function return nodes. 
- std::unordered_map state_assign_; + OrderedMap state_assign_; // hold declared global variables in function std::set global_vars_; diff --git a/mindspore/ccsrc/pipeline/parse/parse.cc b/mindspore/ccsrc/pipeline/parse/parse.cc index c6e5d3713a..6d5c28c98c 100644 --- a/mindspore/ccsrc/pipeline/parse/parse.cc +++ b/mindspore/ccsrc/pipeline/parse/parse.cc @@ -67,7 +67,7 @@ AnfNodePtr GetMixedPrecisionCastHelp(const FuncGraphPtr &func_graph, const AnfNo } else { return param; } - auto cast_helper = prim::GetPythonOps("_mp_cast_helper", "mindspore.ops.composite.base"); + auto cast_helper = prim::kPrimMixedPrecisionCast; auto cast = func_graph->NewCNode({NewValueNode(cast_helper), NewValueNode(dst_type), param}); return cast; } @@ -967,6 +967,7 @@ FunctionBlockPtr Parser::ParseWhile(const FunctionBlockPtr &block, const py::obj py::object test_node = python_adapter::GetPyObjAttr(node, "test"); AnfNodePtr condition_node = ParseExprNode(header_block, test_node); + condition_node = header_block->ForceToWhileCond(condition_node); body_block->Mature(); header_block->ConditionalJump(condition_node, body_block, after_block); @@ -1175,11 +1176,11 @@ void Parser::HandleAssignClassMember(const FunctionBlockPtr &block, const py::ob auto filename = location[0].cast(); auto line_no = location[1].cast(); // Now only support the self.xxx = yyy, where self.xxx must be a defined Parameter type - if (!py::hasattr(ast()->obj(), attr_name.c_str())) { + if (!py::hasattr(ast()->obj(), common::SafeCStr(attr_name))) { MS_EXCEPTION(TypeError) << "'" << var_name << "' should be a Parameter, but not defined, at " << filename << ":" << line_no; } - auto obj = ast()->obj().attr(attr_name.c_str()); + auto obj = ast()->obj().attr(common::SafeCStr(attr_name)); auto obj_type = obj.attr("__class__").attr("__name__"); if (!py::hasattr(obj, "__parameter__")) { MS_EXCEPTION(TypeError) << "'" << var_name << "' should be a Parameter, but got '" @@ -1205,8 +1206,18 @@ void Parser::HandleAssignSubscript(const 
FunctionBlockPtr &block, const py::obje // getitem apply should return the sequence data structure itself std::string var_name = ""; if (ast_->IsClassMember(value_obj)) { - var_name = "self."; - (void)var_name.append(value_obj.attr("attr").cast()); + std::string attr_name = value_obj.attr("attr").cast(); + var_name = "self." + attr_name; + if (!py::hasattr(ast()->obj(), common::SafeCStr(attr_name))) { + MS_EXCEPTION(TypeError) << "'" << var_name << "' was not defined in the class '__init__' function."; + } + auto obj = ast()->obj().attr(common::SafeCStr(attr_name)); + auto obj_type = obj.attr("__class__").attr("__name__"); + if (!py::hasattr(obj, "__parameter__")) { + MS_EXCEPTION(TypeError) << "'" << var_name << "' should be a Parameter, but got '" + << py::str(obj).cast() << "' with type '" + << py::str(obj_type).cast() << "'."; + } } else { var_name = value_obj.attr("id").cast(); } @@ -1231,7 +1242,7 @@ void Parser::WriteAssignVars(const FunctionBlockPtr &block, const py::object &ta } } -// process a assign statement , such as a =b, a,b = tup +// process a assign statement, such as a =b, a,b = tup FunctionBlockPtr Parser::ParseAssign(const FunctionBlockPtr &block, const py::object &node) { MS_LOG(DEBUG) << "Process ast assgin"; py::object value_object = python_adapter::GetPyObjAttr(node, "value"); @@ -1437,15 +1448,23 @@ bool ParseAst::UpdateFuncGraphFlags(const FuncGraphPtr &func_graph) { } py::dict flags = python_adapter::GetPyObjAttr(obj_, PYTHON_EXTERN_MINDSPORE_FLAG); for (auto &item : flags) { - if (!py::isinstance(item.first) || !py::isinstance(item.second)) { + if (!py::isinstance(item.first)) { MS_LOG(ERROR) << "Type error in flags dict convert"; return false; } auto name = py::cast(item.first); - auto value = py::cast(item.second); - MS_LOG(DEBUG) << "Flag name: " << name << ". 
Value: " << value; - - func_graph->set_flags(name, value); + if (py::isinstance(item.second)) { + auto value = py::cast(item.second); + MS_LOG(DEBUG) << "Flag name: " << name << ". Value: " << value; + func_graph->set_flag(name, value); + } else if (py::isinstance(item.second)) { + auto value = py::cast(item.second); + MS_LOG(DEBUG) << "Flag name: " << name << ". Value: " << value; + func_graph->set_attr(name, MakeValue(value)); + } else { + MS_LOG(ERROR) << "Type error in flags/attrs dict convert"; + return false; + } } return true; diff --git a/mindspore/ccsrc/pipeline/parse/parse.h b/mindspore/ccsrc/pipeline/parse/parse.h index 969effbd18..0a56ccaed9 100644 --- a/mindspore/ccsrc/pipeline/parse/parse.h +++ b/mindspore/ccsrc/pipeline/parse/parse.h @@ -223,8 +223,8 @@ class Parser { FunctionBlockPtr block = std::make_shared(parse); // In order to keep effect order in the sub-graphs which generated by control flow. // We copy the flags from the top graph to the sub-graphs. - if (func_graph_ && !func_graph_->flags().empty()) { - block->func_graph()->set_flags(func_graph_->flags()); + if (func_graph_ && !func_graph_->attrs().empty()) { + block->func_graph()->set_attrs(func_graph_->attrs()); } func_block_list_.push_back(block); return block; diff --git a/mindspore/ccsrc/pipeline/parse/parse_base.h b/mindspore/ccsrc/pipeline/parse/parse_base.h index ef1aeef55c..4961ab78c0 100644 --- a/mindspore/ccsrc/pipeline/parse/parse_base.h +++ b/mindspore/ccsrc/pipeline/parse/parse_base.h @@ -60,6 +60,7 @@ const char PYTHON_MOD_RESOLVE_FUNCTION[] = "resolve_symbol"; const char PYTHON_MOD_RESOLVE_GET_OBJ_KEY[] = "get_object_key"; const char PYTHON_MOD_PARSE_CHECK_IS_CLASS_MEMBER[] = "is_class_member"; const char PYTHON_MOD_RESOLVE_GET_OBJ_TYPE[] = "get_obj_type"; +const char PYTHON_MOD_GET_OBJ_ID[] = "get_obj_id"; const char PYTHON_MOD_GET_CLASS_INSTANCE_TYPE[] = "get_class_instance_type"; const char PYTHON_MOD_CREATE_OBJ_INSTANCE[] = "create_obj_instance"; const char 
PYTHON_MOD_GET_DATACLASS_ATTRS[] = "get_dataclass_attributes"; @@ -83,6 +84,7 @@ const char PYTHON_PARSE_GET_SCOPE_NAME[] = "get_scope_name"; const char PYTHON_PARSE_CLASS_SLICE[] = "create_slice_obj"; const char PYTHON_PARSE_CLASS_ELLIPSIS[] = "create_ellipsis_obj"; +const char PYTHON_MOD_GET_DEFAULT_INPUT[] = "get_default_input"; // define the common name const char NAMED_PRIMITIVE_ITER[] = "iter"; @@ -107,6 +109,7 @@ const char PYTHON_EXTERN_MINDSPORE_FLAG[] = "_mindspore_flags"; // define the parse constant const int MAX_COMPARISON_OPS_SUPPORTED = 1; +const char CUSTOM_BPROP_NAME[] = "bprop"; // define the Namespace name const char RESOLVE_NAMESPACE_NAME_AST[] = "Ast"; // for ast type namespace diff --git a/mindspore/ccsrc/pipeline/pass.cc b/mindspore/ccsrc/pipeline/pass.cc index 0a5af9e3df..94063fb780 100644 --- a/mindspore/ccsrc/pipeline/pass.cc +++ b/mindspore/ccsrc/pipeline/pass.cc @@ -25,12 +25,14 @@ #include #include "ir/func_graph_cloner.h" +#include "debug/anf_ir_utils.h" #include "pipeline/parse/parse_base.h" #include "pipeline/parse/data_converter.h" #include "pipeline/resource.h" #include "pipeline/validator.h" #include "optimizer/optimizer.h" #include "optimizer/cse.h" +#include "optimizer/graph_kernel_reuse.h" #include "optimizer/clean.h" #include "optimizer/irpass.h" #include "optimizer/control_depend.h" @@ -38,6 +40,7 @@ #include "parallel/step_auto_parallel.h" #include "parallel/allreduce_fusion/step_allreduce_fusion.h" #include "utils/any.h" +#include "utils/log_adapter.h" namespace mindspore { namespace pipeline { @@ -79,15 +82,9 @@ OptPassGroupMap GetOptPassesA(const opt::irpass::OptimizeIRPassLib &irpass) { // Specialization irpass.specialize_transform_, - // Arithmetic simplifications - irpass.arithmetic_simplify_, - irpass.addn_zero_filter_, - // Miscellaneous irpass.item_tuple_eliminate_, - irpass.env_get_set_item_, - irpass.new_env_get_item_, - irpass.add_env_get_item_, + irpass.env_get_item_eliminate_, irpass.cast_eliminate_, 
irpass.reshape_eliminate_, irpass.reduce_eliminate_, @@ -95,18 +92,26 @@ OptPassGroupMap GetOptPassesA(const opt::irpass::OptimizeIRPassLib &irpass) { irpass.transpose_eliminate_, irpass.minmaximum_grad_, irpass.get_make_ref_eliminate_, + + // Arithmetic simplifications + irpass.arithmetic_simplify_, + irpass.addn_zero_filter_, + irpass.adjust_all_reduce_mul_add_, + + // Safe inlining + irpass.inline_, }); opt::OptPassConfig a_2 = opt::OptPassConfig({ irpass.merge_addn_, irpass.float_tuple_getitem_switch_, irpass.float_env_getitem_switch_, - irpass.incorporate_getitem_, - irpass.incorporate_getitem_switch_, + irpass.incorporate_getitem_set_, irpass.incorporate_call_, irpass.incorporate_call_switch_, irpass.incorporate_env_getitem_, irpass.incorporate_env_getitem_switch_, irpass.new_env_get_item_, + irpass.depend_value_elim_, }); opt::OptPassConfig a_3 = opt::OptPassConfig({ irpass.same_eliminate_, @@ -144,12 +149,12 @@ OptPassGroupMap GetOptPassesB(const opt::irpass::OptimizeIRPassLib &irpass) { irpass.reset_defer_inline_, irpass.inline_, irpass.special_op_eliminate_, - irpass.stop_gradient_eliminate_, irpass.get_make_ref_eliminate_, }); opt::OptPassConfig b_2 = opt::OptPassConfig({ irpass.replace_refkey_by_param_, irpass.make_ref_eliminate_, + irpass.get_ref_param_eliminate_, }); OptPassGroupMap map({ {"b_1", b_1}, @@ -160,6 +165,40 @@ OptPassGroupMap GetOptPassesB(const opt::irpass::OptimizeIRPassLib &irpass) { return map; } +OptPassGroupMap GetOptPassesGraphKernelA(const opt::irpass::OptimizeIRPassLib &irpass) { + opt::OptPassConfig interface_fusion = opt::OptPassConfig({ + irpass.mark_interface_fusion_, + }); + OptPassGroupMap map({ + {"graph_kernel_reuse", opt::OptPassConfig(opt::GraphKernelReuse())}, + {"interface_fusion", interface_fusion}, + {"renormalize", opt::OptPassConfig::Renormalize()}, + {"cse", opt::OptPassConfig(opt::CSE(false))}, + }); + return map; +} + +OptPassGroupMap GetOptPassesGraphKernelB(const opt::irpass::OptimizeIRPassLib &irpass) { + 
opt::OptPassConfig elim_1 = opt::OptPassConfig({ + irpass.addn_eliminate_, + irpass.incorporate_getitem_from_param_, + }); + opt::OptPassConfig elim_2 = opt::OptPassConfig({ + irpass.unused_parameter_eliminate_, + irpass.unused_output_eliminate_, + }); + OptPassGroupMap map({ + {"elim_1", elim_1}, + {"renormalize", opt::OptPassConfig::Renormalize()}, + {"elim_2", elim_2}, + }); + return map; +} + +OptPassGroupMap GetOptPassesC(const opt::irpass::OptimizeIRPassLib &irpass) { + return OptPassGroupMap({{"renormalize", opt::OptPassConfig::Renormalize()}}); +} + OptPassGroupMap GetControlPhases(const opt::irpass::OptimizeIRPassLib &irpass) { opt::OptPassConfig control_group = opt::OptPassConfig({irpass.convert_switch_replacement_}, true); OptPassGroupMap map({ @@ -189,8 +228,19 @@ void InitOpt(const ResourcePtr &res) { opt::irpass::OptimizeIRPassLib irpass; g_pass_opts["opt_a"] = Optimizer::MakeOptimizer("opt_a", res, GetOptPassesA(irpass)); g_pass_opts["opt_b"] = Optimizer::MakeOptimizer("opt_b", res, GetOptPassesB(irpass), false, true); + g_pass_opts["opt_graph_kernel_a"] = + Optimizer::MakeOptimizer("opt_graph_kernel_a", res, GetOptPassesGraphKernelA(irpass), true); + g_pass_opts["opt_graph_kernel_b"] = + Optimizer::MakeOptimizer("opt_graph_kernel_b", res, GetOptPassesGraphKernelB(irpass), false); + g_pass_opts["renormal"] = Optimizer::MakeOptimizer("renormal", res, GetOptPassesC(irpass)); g_pass_opts["opt_control"] = Optimizer::MakeOptimizer("opt_control", res, GetControlPhases(irpass), false, true); g_pass_opts["opt_prepare"] = Optimizer::MakeOptimizer("opt_prepare", res, GetPreparePhases(irpass)); + auto context_ptr = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(context_ptr); + if (!(context_ptr->enable_graph_kernel())) { + g_pass_opts["opt_graph_kernel_a"]->set_enable(false); + g_pass_opts["opt_graph_kernel_b"]->set_enable(false); + } } } } // namespace @@ -222,9 +272,13 @@ bool OptPassGroup(const ResourcePtr &res, const std::string &name) { bool 
OptPassAGroup(const ResourcePtr &res) { return OptPassGroup(res, "opt_a"); } bool OptPassBGroup(const ResourcePtr &res) { return OptPassGroup(res, "opt_b"); } +bool OptPassGraphKernelGroupA(const ResourcePtr &res) { return OptPassGroup(res, "opt_graph_kernel_a"); } +bool OptPassGraphKernelGroupB(const ResourcePtr &res) { return OptPassGroup(res, "opt_graph_kernel_b"); } bool ControlGroup(const ResourcePtr &res) { return OptPassGroup(res, "opt_control"); } bool PrepareGroup(const ResourcePtr &res) { return OptPassGroup(res, "opt_prepare"); } +bool OptPassRNGroup(const ResourcePtr &res) { return OptPassGroup(res, "renormal"); } + bool AddControlDependPass(const ResourcePtr &res) { FuncGraphPtr func_graph = res->func_graph(); MS_EXCEPTION_IF_NULL(func_graph); @@ -268,8 +322,10 @@ bool InferenceOptPreparePass(const ResourcePtr &res) { std::vector kVmPasses = {{"simplify_data_structures", SimplifyDataStructuresPass}, {"opt_a", OptPassAGroup}, {"opt_b", OptPassBGroup}, - {"add_control_depend", AddControlDependPass}, - {"cconv", CconvPass}}; + {"cconv", CconvPass}, + {"opt_graph_kernel_a", OptPassGraphKernelGroupA}, + {"opt_graph_kernel_b", OptPassGraphKernelGroupB}, + {"add_control_depend", AddControlDependPass}}; std::vector kGePasses = {{"simplify_data_structures", SimplifyDataStructuresPass}, {"opt_a", OptPassAGroup}, @@ -278,5 +334,7 @@ std::vector kGePasses = {{"simplify_data_structures", SimplifyDataStru {"opt_control", ControlGroup}, {"opt_prepare", PrepareGroup}, {"cconv", CconvPass}}; + +std::vector kPynativePasses = {{"opt_a", OptPassAGroup}, {"opt_b", OptPassBGroup}, {"cconv", CconvPass}}; } // namespace pipeline } // namespace mindspore diff --git a/mindspore/ccsrc/pipeline/pass.h b/mindspore/ccsrc/pipeline/pass.h index 2636879d01..9064df52ee 100644 --- a/mindspore/ccsrc/pipeline/pass.h +++ b/mindspore/ccsrc/pipeline/pass.h @@ -29,6 +29,7 @@ using PassItem = std::pair>; extern std::vector kGePasses; extern std::vector kVmPasses; +extern std::vector 
kPynativePasses; bool CconvPass(const ResourcePtr &res); bool ValidatePass(const ResourcePtr &res); diff --git a/mindspore/ccsrc/pipeline/pipeline.cc b/mindspore/ccsrc/pipeline/pipeline.cc index 103477363f..517d4cc518 100644 --- a/mindspore/ccsrc/pipeline/pipeline.cc +++ b/mindspore/ccsrc/pipeline/pipeline.cc @@ -59,6 +59,7 @@ using mindspore::abstract::AbstractTuplePtr; const char IR_TYPE_ANF[] = "anf_ir"; const char IR_TYPE_ONNX[] = "onnx_ir"; +const char IR_TYPE_BINARY[] = "binary_ir"; ExecutorPyPtr ExecutorPy::executor_ = nullptr; std::mutex ExecutorPy::instance_lock_; @@ -212,6 +213,14 @@ py::bytes ExecutorPy::GetFuncGraphProto(const std::string &phase, const std::str return proto_str; } + if (ir_type == IR_TYPE_BINARY) { + std::string proto_str = GetBinaryProtoString(fg_ptr); + if (proto_str.empty()) { + MS_LOG(EXCEPTION) << "Graph proto is empty."; + } + return proto_str; + } + MS_LOG(EXCEPTION) << "Unknown ir type: " << ir_type; } @@ -236,9 +245,7 @@ py::dict ExecutorPy::GetAllreduceFusion(const std::string &phase) { } void ExecutorPy::DelNetRes(const std::string &id) { -#ifdef ENABLE_GE FinalizeBackend(); -#endif if (executor_ != nullptr) { bool flag = false; auto tmp_info = info_; @@ -272,6 +279,75 @@ ExecutorPy::~ExecutorPy() { ConfigManager::GetInstance().ResetConfig(); } +std::map> ExecutorPy::FetchInfoForQuantExport( + const std::string &phase_s) { + FuncGraphPtr func_graph = info_[phase_s]->resource->func_graph(); + MS_EXCEPTION_IF_NULL(func_graph); + MS_LOG(DEBUG) << "FetchInfoForQuantExport func graph(" << func_graph->ToString() << ") phase(" << phase_s << ")!"; + std::map> fake_quant_table; + auto filter = [](AnfNodePtr node) { + return !(IsPrimitiveCNode(node, prim::kPrimConv2D) || IsPrimitiveCNode(node, prim::kPrimMatMul)); + }; + std::vector nodes = DeepScopedGraphSearchWithFilter(func_graph->get_return(), AlwaysInclude, filter); + auto is_quant_cnode = [](AnfNodePtr node) { + return IsPrimitiveCNode(node, prim::kPrimFakeQuantPerLayer) || + 
IsPrimitiveCNode(node, prim::kPrimFakeQuantPerChannel); + }; + for (auto node : nodes) { + auto cnode = node->cast(); + if (cnode == nullptr || cnode->size() != 3) { + continue; + } + auto x = cnode->input(1); + auto weight = cnode->input(2); + if (!is_quant_cnode(weight)) { + continue; + } + // get parameter weight's name + cnode = weight->cast(); + auto weight_node = cnode->input(2); + if (!weight_node->isa()) { + continue; + } + auto weight_name = weight_node->cast()->name(); + // find the fakequant from input + int count = 0; + int max_depth = 5; + while (!is_quant_cnode(x)) { + if (count >= max_depth) { + break; + } + cnode = x->cast(); + if (cnode == nullptr || cnode->size() <= 1) { + break; + } + x = cnode->input(1); + count += 1; + } + // get the fakequant parameter minq's name + if (!is_quant_cnode(x)) { + continue; + } + cnode = x->cast(); + if (cnode == nullptr || cnode->size() != 4) { + continue; + } + auto fakequant_min_node = cnode->input(2); + if (!fakequant_min_node->isa()) { + continue; + } + auto fakequant_min_node_name = fakequant_min_node->cast()->name(); + auto quant_op_value = cnode->input(0)->cast()->value(); + if (!quant_op_value->isa()) { + continue; + } + auto quant_op = quant_op_value->cast(); + fake_quant_table[weight_name] = std::make_pair(quant_op, fakequant_min_node_name); + } + + return fake_quant_table; +} + void ExecutorPy::SaveCompiledGraph(const std::string &phase_s) { // save the graph to ExecutorPy FuncGraphPtr func_graph = info_[phase_s]->resource->func_graph(); @@ -462,6 +538,9 @@ bool ExecutorPy::Compile(const py::object &obj, const py::tuple &args, const py: } catch (const py::value_error &ex) { ReleaseResource(phase); throw py::value_error(ex); + } catch (const py::index_error &ex) { + ReleaseResource(phase); + throw py::index_error(ex); } catch (const std::exception &ex) { ReleaseResource(phase); // re-throw this exception to Python interpreter to handle it @@ -506,7 +585,6 @@ void RunPipelineAction(const ActionItem 
&action, pipeline::ResourcePtr resource, // when in loading anf ir mode, action `parse` do nothing if (action.first == "parse") { - parse::PythonAdapter::SetPythonEnvFlag(true); return; } @@ -566,6 +644,7 @@ void Pipeline::Run() { draw::Draw(base_name + ".dot", graph); // generate IR file in human readable format DumpIR(base_name + ".ir", graph); + // generate IR file in a heavily commented format, which can also be reloaded if (action.first != "parse") { ExportIR(base_name + ".dat", std::to_string(i), graph); @@ -608,24 +687,27 @@ void Pipeline::Run() { MS_LOG(INFO) << "End"; } -void ExecutorPy::ProcessVmArg(const py::tuple &args, const std::string &phase, VectorRef *arg_list) { +void ProcessVmArgInner(const py::tuple &args, const ResourcePtr &res, VectorRef *const arg_list) { std::size_t size = args.size(); for (std::size_t i = 0; i < size; i++) { py::object arg = args[i]; auto ms_context = MsContext::GetInstance(); if (ms_context->backend_policy() == kMsConvert && py::isinstance(arg)) { - MS_LOG(EXCEPTION) << "Args[" << i << "] is numpy array, not tensor"; + MS_LOG(EXCEPTION) << "The " << i << "th arg is numpy array, not tensor."; } ValuePtr converted = nullptr; bool succ = parse::ConvertData(arg, &converted); if (!succ) { - MS_LOG(EXCEPTION) << "Args convert error"; + MS_LOG(EXCEPTION) << "The " << i << "th arg convert failed."; + } + if (MsContext::GetInstance()->execution_mode() == 0 && !converted->isa()) { + MS_EXCEPTION(TypeError) << "For 'graph mode', the " << i << "th arg: " << converted->ToString() + << " is not tensor."; } arg_list->push_back(converted); } - ResourcePtr res = GetResource(phase); MS_EXCEPTION_IF_NULL(res); auto graph = res->func_graph(); MS_EXCEPTION_IF_NULL(graph); @@ -647,6 +729,10 @@ void ExecutorPy::ProcessVmArg(const py::tuple &args, const std::string &phase, V } } +void ExecutorPy::ProcessVmArg(const py::tuple &args, const std::string &phase, VectorRef *const arg_list) { + ProcessVmArgInner(args, GetResource(phase), arg_list); +} + 
py::object ExecutorPy::Run(const py::tuple &args, const py::object &phase) { std::size_t size = args.size(); if (!py::isinstance(phase)) { @@ -775,7 +861,7 @@ bool InitExecDatasetVm(const std::string &queue_name, int64_t size, int64_t batc MS_EXCEPTION_IF_NULL(convert_fn); // Convert CNodeList to LinConvertResult. ConfigManager::GetInstance().set_iter_num(1); - auto runner = convert_fn({app_init}); + auto runner = convert_fn({app_init}, ""); if (MsContext::GetInstance()->execution_mode() != kPynativeMode) { backend->Link(runner.graph_id); } @@ -874,6 +960,8 @@ void ClearResAtexit() { compile::ClearConvertCache(); pipeline::GetMethodMap().clear(); pipeline::ExecutorPy::ClearRes(); + pipeline::ReclaimOptimizer(); + pynative::PynativeExecutor::GetInstance()->ClearRes(); #ifdef ENABLE_GE transform::DfGraphManager::GetInstance().ClearGraph(); transform::DfGraphConvertor::get_adpt_map().clear(); diff --git a/mindspore/ccsrc/pipeline/pipeline.h b/mindspore/ccsrc/pipeline/pipeline.h index 81d0e1a9f4..3f1274c417 100644 --- a/mindspore/ccsrc/pipeline/pipeline.h +++ b/mindspore/ccsrc/pipeline/pipeline.h @@ -97,6 +97,8 @@ class ExecutorPy : public std::enable_shared_from_this { void ReleaseResource(const py::object &phase); static void ClearRes(); + std::map> FetchInfoForQuantExport(const std::string &phase_s); + private: ExecutorPy(); void ConvertObjectToTensors(const py::dict &dict, std::map *tensors); @@ -139,6 +141,8 @@ bool InitExecDatasetVm(const std::string &queue_name, int64_t size, int64_t batc const std::vector &types, const std::vector> &shapes, const std::vector &input_indexes, bool need_run); +void ProcessVmArgInner(const py::tuple &args, const ResourcePtr &res, VectorRef *const arg_list); + } // namespace pipeline } // namespace mindspore diff --git a/mindspore/ccsrc/pipeline/pipeline_ge.cc b/mindspore/ccsrc/pipeline/pipeline_ge.cc index 309b482d62..ea0ca14c7a 100644 --- a/mindspore/ccsrc/pipeline/pipeline_ge.cc +++ b/mindspore/ccsrc/pipeline/pipeline_ge.cc @@ 
-460,12 +460,12 @@ void ProcessGeArg(const std::map &info, const py:: ValuePtr converted = nullptr; bool succ = parse::ConvertData(args[i], &converted); if (!succ) { - MS_LOG(EXCEPTION) << "Args convert error"; + MS_LOG(EXCEPTION) << "The " << i << "th arg convert failed."; } if (converted->isa()) { inputs->push_back(converted->cast()); } else { - MS_EXCEPTION(TypeError) << "Args " << converted->ToString() << " is not tensor"; + MS_EXCEPTION(TypeError) << "The " << i << "th arg: " << converted->ToString() << " is not tensor."; } } } @@ -488,7 +488,7 @@ py::object ExecDFGraph(const std::map &info, const #ifdef ENABLE_INFER // Now don't use the graph because the exec ge function don't take effect MS_EXCEPTION_IF_NULL(info.at(phase)->func_graph); - if (ENABLE_TRAIN != info.at(phase)->func_graph->flags()["training"]) { + if (ENABLE_TRAIN != info.at(phase)->func_graph->has_flag("training")) { MS_LOG(ERROR) << "Graph training mode mismatch mode of libraries"; ConfigManager::GetInstance().ResetConfig(); return py::none(); diff --git a/mindspore/ccsrc/pipeline/static_analysis/abstract_value.cc b/mindspore/ccsrc/pipeline/static_analysis/abstract_value.cc index d4f0c6f8d4..f23c6e31c4 100644 --- a/mindspore/ccsrc/pipeline/static_analysis/abstract_value.cc +++ b/mindspore/ccsrc/pipeline/static_analysis/abstract_value.cc @@ -51,6 +51,7 @@ ValuePtr AbstractBase::BuildValue() const { AbstractBasePtr AbstractBase::Broaden() const { AbstractBasePtr clone = Clone(); clone->set_value(kAnyValue); + clone->set_sparse_grad(sparse_grad_); return clone; } @@ -63,7 +64,8 @@ std::string AbstractBase::ToString() const { MS_EXCEPTION_IF_NULL(type_); MS_EXCEPTION_IF_NULL(shape_); buffer << type_name() << "(" - << "Type: " << type_->ToString() << " Value: " << value << " Shape: " << shape_->ToString() << ")"; + << "Type: " << type_->ToString() << " Value: " << value << " Shape: " << shape_->ToString() + << " sparse_grad: " << sparse_grad_ << ")"; return buffer.str(); } @@ -72,16 +74,22 @@ 
AbstractBasePtr AbstractScalar::Broaden() const { return AbstractBase::Broaden() AbstractBasePtr AbstractScalar::Join(const AbstractBasePtr &other) { MS_EXCEPTION_IF_NULL(other); if (*this == *other) { - return shared_from_base(); + auto ret = shared_from_base(); + ret->set_sparse_grad(sparse_grad()); + return ret; } auto value_self = GetValueTrack(); MS_EXCEPTION_IF_NULL(value_self); ValuePtr res_value = ValueJoin(value_self, other->GetValueTrack()); TypePtr res_type = TypeJoin(GetTypeTrack(), other->GetTypeTrack()); if (res_value == value_self) { - return shared_from_base(); + auto ret = shared_from_base(); + ret->set_sparse_grad(sparse_grad()); + return ret; } - return std::make_shared(res_value, res_type); + auto ret = std::make_shared(res_value, res_type); + ret->set_sparse_grad(sparse_grad()); + return ret; } AbstractBasePtr AbstractType::Clone() const { @@ -423,7 +431,9 @@ AbstractBasePtr AbstractTensor::Join(const AbstractBasePtr &other) { } auto element = element_->Join(other_tensor->element_); auto shape = ShapeJoin(this->shape(), other_tensor->shape()); - return std::make_shared(element, shape); + auto ret = std::make_shared(element, shape); + ret->set_sparse_grad(sparse_grad()); + return ret; } bool AbstractTensor::operator==(const AbstractTensor &other) const { @@ -463,6 +473,7 @@ AbstractBasePtr AbstractTensor::Clone() const { ShapePtr shp = shape(); clone->set_shape(shp->Clone()); clone->set_value(GetValueTrack()); + clone->set_sparse_grad(sparse_grad()); return clone; } @@ -472,6 +483,7 @@ AbstractBasePtr AbstractTensor::Broaden() const { auto shp = shape(); broaden->set_shape(shp->Clone()); broaden->set_value(kAnyValue); + broaden->set_sparse_grad(sparse_grad()); return broaden; } @@ -482,6 +494,7 @@ AbstractBasePtr AbstractTensor::BroadenWithShape() const { shp->Broaden(); broaden->set_shape(shp); broaden->set_value(kAnyValue); + broaden->set_sparse_grad(sparse_grad()); return broaden; } @@ -502,7 +515,8 @@ std::string AbstractTensor::ToString() 
const { MS_EXCEPTION_IF_NULL(value_track); buffer << type_name() << "(" << "shape: " << shape_track->ToString() << ", element: " << element_->ToString() - << ", value_ptr: " << value_track << ", value: " << value_track->ToString() << ")"; + << ", value_ptr: " << value_track << ", value: " << value_track->ToString() << " sparse_grad " << sparse_grad() + << ")"; return buffer.str(); } diff --git a/mindspore/ccsrc/pipeline/static_analysis/abstract_value.h b/mindspore/ccsrc/pipeline/static_analysis/abstract_value.h index 939976bb95..f3375d22d6 100644 --- a/mindspore/ccsrc/pipeline/static_analysis/abstract_value.h +++ b/mindspore/ccsrc/pipeline/static_analysis/abstract_value.h @@ -44,7 +44,7 @@ class AbstractBase : public Base { public: explicit AbstractBase(const ValuePtr &value = nullptr, const TypePtr &type = kAnyType, const BaseShapePtr &shape = kNoShape) - : value_(value), type_(type), shape_(shape) {} + : value_(value), type_(type), shape_(shape), sparse_grad_("") {} ~AbstractBase() override = default; MS_DECLARE_PARENT(AbstractBase, Base) @@ -53,11 +53,13 @@ class AbstractBase : public Base { virtual bool operator==(const AbstractBase &other) const; void set_value(const ValuePtr &value) { value_ = value; } + void set_sparse_grad(const std::string &sparse_grad) { sparse_grad_ = sparse_grad; } void set_type(const TypePtr &type) { type_ = type; } void set_shape(const BaseShapePtr &shape) { shape_ = shape; } void set_value_desc(const std::string &desc) { value_desc_ = desc; } const std::string &value_desc() const { return value_desc_; } ValuePtr GetValueTrack() const { return value_; } + const std::string &sparse_grad() const { return sparse_grad_; } TypePtr GetTypeTrack() const { return type_; } BaseShapePtr GetShapeTrack() const { return shape_; } @@ -85,6 +87,7 @@ class AbstractBase : public Base { TypePtr type_; BaseShapePtr shape_; std::string value_desc_; // store initial value description for error report + std::string sparse_grad_; }; class AbstractScalar : 
public AbstractBase { diff --git a/mindspore/ccsrc/pipeline/static_analysis/evaluator.cc b/mindspore/ccsrc/pipeline/static_analysis/evaluator.cc index 254fd43c0b..c9b1ce4f93 100644 --- a/mindspore/ccsrc/pipeline/static_analysis/evaluator.cc +++ b/mindspore/ccsrc/pipeline/static_analysis/evaluator.cc @@ -165,7 +165,7 @@ AbstractBasePtrList FuncGraphEvaluator::BroadenUndeterminedArgs(const AbstractBa MS_LOG(DEBUG) << "Joined args: " << ::mindspore::ToString(joined_args_spec_list); // If there is loop variant, all arguments need to be broaden to avoid wrong constant propagation. if (!(joined_args_spec_list == args_spec_list)) { - func_graph_->set_flags(FUNC_GRAPH_FLAG_IGNORE_VALUES, true); + func_graph_->set_flag(FUNC_GRAPH_FLAG_IGNORE_VALUES, true); } return joined_args_spec_list; } @@ -178,7 +178,7 @@ AbstractBasePtrList FuncGraphEvaluator::BroadenUndeterminedArgs(const AbstractBa // If there is loop variant, all arguments need to be broaden to avoid wrong constant propagation. if (!(joined_args_spec_list == args_spec_list)) { trace_.push_back(joined_args_spec_list); - func_graph_->set_flags(FUNC_GRAPH_FLAG_IGNORE_VALUES, true); + func_graph_->set_flag(FUNC_GRAPH_FLAG_IGNORE_VALUES, true); } MS_LOG(DEBUG) << "Joined eval args: " << ::mindspore::ToString(joined_args_spec_list); return joined_args_spec_list; diff --git a/mindspore/ccsrc/pipeline/static_analysis/prim.cc b/mindspore/ccsrc/pipeline/static_analysis/prim.cc index f2f85df430..82b8395933 100644 --- a/mindspore/ccsrc/pipeline/static_analysis/prim.cc +++ b/mindspore/ccsrc/pipeline/static_analysis/prim.cc @@ -55,6 +55,7 @@ PrimitiveEvalImplMap &GetPrimitiveToEvalImplMap() { {prim::kPrimIsNot, {InferImplIsNot, true}}, {prim::kPrimInDict, {InferImplInDict, true}}, {prim::kPrimNotInDict, {InferImplNotInDict, true}}, + {prim::kPrimIsConsant, {InferImplIsConstant, true}}, // Maths {prim::kPrimMaximumGrad, {InferImplMinOrMaxGrad, true}}, {prim::kPrimMinimumGrad, {InferImplMinOrMaxGrad, true}}, @@ -106,8 +107,8 @@ 
PrimitiveEvalImplMap &GetPrimitiveToEvalImplMap() { {prim::kPrimConv2DBackpropFilter, {InferImplConv2DBackpropFilter, true}}, {prim::kPrimBiasAddGrad, {InferImplBiasAddGrad, true}}, {prim::kPrimRelu, {InferImplRelu, true}}, - {prim::kPrimZerosLikeTensor, {InferImplZerosLikeTensor, true}}, {prim::kPrimFakeBprop, {InferImplFakeBprop, false}}, + {prim::kPrimZerosLike, {InferImplZerosLike, true}}, {prim::kPrimBpropCut, {InferImplBpropCut, true}}, {prim::kPrimLayerNorm, {InferImplLayerNorm, true}}, {prim::kPrimLayerNormGrad, {InferImplLayerNormGrad, true}}, @@ -147,9 +148,6 @@ EvalResultPtr StandardPrimEvaluator::EvalPrim(const AnalysisEnginePtr &engine, c EvalResultPtr DoSignatureEvaluator::Run(AnalysisEnginePtr engine, const ConfigPtrList &args_conf_list, AnfNodeConfigPtr out_conf) { AbstractBasePtrList args_spec_list; - if (!prim_->isa()) { - MS_LOG(EXCEPTION) << "Primitive should be DoSignature, but " << prim_->ToString(); - } if (out_conf->node() == nullptr || !out_conf->node()->isa()) { MS_LOG(EXCEPTION) << "Node of out_conf should be CNode"; } @@ -221,9 +219,6 @@ EvalResultPtr UnpackGraphEvaluator::Run(AnalysisEnginePtr engine, const ConfigPt if (out_conf->node() == nullptr || !out_conf->node()->isa()) { MS_LOG(EXCEPTION) << "Node of out_conf should be CNode"; } - if (!prim_->isa()) { - MS_LOG(EXCEPTION) << "Primitive should be UnpackGraphPrimitive, but got " << prim_->ToString(); - } auto unpack_graph = prim_->cast(); auto out_node = out_conf->node()->cast(); @@ -267,6 +262,80 @@ EvalResultPtr UnpackGraphEvaluator::Run(AnalysisEnginePtr engine, const ConfigPt return engine->ForwardConfig(out_conf, fn_conf); } +AnfNodePtr MixedPrecisionCastHelper(AnfNodePtr source_node, AbstractBasePtr node_type, AnfNodePtr target_type, + FuncGraphPtr func_graph) { + AnfNodePtr target_node = source_node; + if (node_type->isa()) { + auto x = node_type->cast(); + if (x->element()->BuildType()->isa()) { + auto cast = prim::GetPythonOps("cast", "mindspore.ops.functional"); + 
MS_EXCEPTION_IF_NULL(cast); + target_node = func_graph->NewCNode({NewValueNode(cast), source_node, target_type}); + } + } else if (node_type->isa()) { + auto x = node_type->cast(); + auto &items = x->elements(); + std::vector nodes; + nodes.emplace_back(NewValueNode(prim::kPrimMakeTuple)); + int idx = 0; + for (const auto &item : items) { + AnfNodePtr tuple_node = + func_graph->NewCNode({NewValueNode(prim::kPrimTupleGetItem), source_node, NewValueNode(idx)}); + AnfNodePtr node = MixedPrecisionCastHelper(tuple_node, item, target_type, func_graph); + nodes.emplace_back(node); + ++idx; + } + target_node = func_graph->NewCNode(nodes); + } else if (node_type->isa()) { + auto x = node_type->cast(); + auto &items = x->elements(); + std::vector dict_key_nodes; + std::vector dict_value_nodes; + dict_key_nodes.emplace_back(NewValueNode(prim::kPrimMakeTuple)); + dict_value_nodes.emplace_back(NewValueNode(prim::kPrimMakeTuple)); + for (const auto &item : items) { + AnfNodePtr dict_value_node = + func_graph->NewCNode({NewValueNode(prim::kPrimDictGetItem), source_node, NewValueNode(item.first)}); + AnfNodePtr node = MixedPrecisionCastHelper(dict_value_node, item.second, target_type, func_graph); + dict_key_nodes.emplace_back(NewValueNode(item.first)); + dict_value_nodes.emplace_back(node); + } + target_node = func_graph->NewCNode({NewValueNode(prim::kPrimMakeDict), func_graph->NewCNode(dict_key_nodes), + func_graph->NewCNode(dict_value_nodes)}); + } + return target_node; +} + +EvalResultPtr MixedPrecisionCastEvaluator::Run(AnalysisEnginePtr engine, const ConfigPtrList &args_conf_list, + AnfNodeConfigPtr out_conf) { + AbstractBasePtrList args_spec_list; + if (out_conf->node() == nullptr || !out_conf->node()->isa()) { + MS_LOG(EXCEPTION) << "Node of out_conf should be CNode"; + } + auto out_node = out_conf->node()->cast(); + const auto &out_node_inputs = out_node->inputs(); + if (out_node->inputs().size() == 0 || (out_node_inputs.size() - 1) != args_conf_list.size()) { + 
MS_LOG(EXCEPTION) << "MixedPrecisionCast" + << " args size should equal to inputs size minus 1, but args size " << args_conf_list.size() + << ", inputs size " << out_node_inputs.size(); + } + AnfNodePtrList args_inputs{out_node_inputs.begin() + 1, out_node_inputs.end()}; + (void)std::transform(args_conf_list.begin(), args_conf_list.end(), std::back_inserter(args_spec_list), + [](const ConfigPtr &ref) -> AbstractBasePtr { return ref->GetEvaluatedValue()->abstract(); }); + + ScopePtr scope = kDefaultScope; + if (out_conf != nullptr) { + scope = out_conf->node()->scope(); + } + ScopeGuard scope_guard(scope); + + FuncGraphPtr func_graph = out_conf->node()->func_graph(); + AnfNodePtr new_node = MixedPrecisionCastHelper(out_node_inputs[2], args_spec_list[1], out_node_inputs[1], func_graph); + AnfNodeConfigPtr fn_conf = engine->MakeConfig(new_node, out_conf->context()); + + return engine->ForwardConfig(out_conf, fn_conf); +} + namespace { py::object BuildValue(const ValuePtr &value_ptr) { if (value_ptr == nullptr) { @@ -300,11 +369,9 @@ py::dict ConvertAbstractToPython(const AbstractBasePtr &abs_base) { auto value = abs_base->cast()->ref(); dic = ConvertAbstractToPython(value); } else if (abs_base->isa()) { - auto arg_slice = dyn_cast(abs_base); - std::vector shape; - dic["shape"] = shape; - dic["dtype"] = arg_slice->BuildType(); - dic["value"] = BuildValue(arg_slice->BuildValue()); + dic["shape"] = py::none(); + dic["dtype"] = py::ellipsis(); + dic["value"] = py::ellipsis(); } else if (abs_base->isa()) { auto arg_tuple = dyn_cast(abs_base); size_t len = arg_tuple->size(); @@ -798,7 +865,11 @@ class RefToEmbedEvaluator : public SymbolicPrimEvaluator { } auto refkey = key_value->cast(); if (refkey == nullptr) { - return std::make_shared(std::make_shared(type), std::make_shared()); + auto ret = std::make_shared(type); + auto ref_value = ref_abs->ref(); + MS_EXCEPTION_IF_NULL(ref_value); + ret->set_sparse_grad(ref_value->sparse_grad()); + return std::make_shared(ret, 
std::make_shared()); } std::string name = refkey->tag(); @@ -812,6 +883,7 @@ class RefToEmbedEvaluator : public SymbolicPrimEvaluator { x = SensitivityTransform(x); std::shared_ptr key = std::make_shared(node, x); std::shared_ptr abs_scalar = std::make_shared(key, type); + abs_scalar->set_sparse_grad(x->sparse_grad()); return std::make_shared(abs_scalar, std::make_shared()); } }; diff --git a/mindspore/ccsrc/pipeline/static_analysis/prim.h b/mindspore/ccsrc/pipeline/static_analysis/prim.h index 22418180f7..5b910f8194 100644 --- a/mindspore/ccsrc/pipeline/static_analysis/prim.h +++ b/mindspore/ccsrc/pipeline/static_analysis/prim.h @@ -102,6 +102,22 @@ class UnpackGraphEvaluator : public Evaluator { PrimitivePtr prim_; }; +class MixedPrecisionCastEvaluator : public Evaluator { + public: + explicit MixedPrecisionCastEvaluator(const PrimitivePtr primitive) + : Evaluator("MixedPrecisionCastEvaluator"), prim_(primitive) {} + ~MixedPrecisionCastEvaluator() override = default; + EvalResultPtr Run(AnalysisEnginePtr engine, const ConfigPtrList &argrefs, + AnfNodeConfigPtr out_config = nullptr) override; + + EvalResultPtr Eval(AnalysisEnginePtr, const AbstractBasePtrList &) override { + MS_LOG(EXCEPTION) << "Eval() should not be called, Run() method should be called"; + } + + private: + PrimitivePtr prim_; +}; + bool IsInWhiteList(PrimitivePtr primitive); StandardPrimitiveEvalImpl GetPrimitiveInferImpl(const PrimitivePtr &primitive); @@ -184,6 +200,8 @@ AbstractBasePtr InferImplInDict(const AnalysisEnginePtr &, const PrimitivePtr &, const AbstractBasePtrList &args_spec_list); AbstractBasePtr InferImplNotInDict(const AnalysisEnginePtr &, const PrimitivePtr &, const AbstractBasePtrList &args_spec_list); +AbstractBasePtr InferImplIsConstant(const AnalysisEnginePtr &, const PrimitivePtr &, + const AbstractBasePtrList &args_spec_list); AbstractBasePtr InferImplPooling(const AnalysisEnginePtr &, const PrimitivePtr &primitive, const AbstractBasePtrList &args_spec_list); 
AbstractBasePtr InferImplPoolingGrad(const AnalysisEnginePtr &, const PrimitivePtr &primitive, @@ -206,10 +224,10 @@ AbstractBasePtr InferImplGeluGrad(const AnalysisEnginePtr &, const PrimitivePtr const AbstractBasePtrList &args_spec_list); AbstractBasePtr InferImplRelu(const AnalysisEnginePtr &, const PrimitivePtr &primitive, const AbstractBasePtrList &args_spec_list); -AbstractBasePtr InferImplZerosLikeTensor(const AnalysisEnginePtr &, const PrimitivePtr &primitive, - const AbstractBasePtrList &args_spec_list); AbstractBasePtr InferImplFakeBprop(const AnalysisEnginePtr &, const PrimitivePtr &primitive, const AbstractBasePtrList &args_spec_list); +AbstractBasePtr InferImplZerosLike(const AnalysisEnginePtr &, const PrimitivePtr &primitive, + const AbstractBasePtrList &args_spec_list); AbstractBasePtr InferImplBpropCut(const AnalysisEnginePtr &, const PrimitivePtr &primitive, const AbstractBasePtrList &args_spec_list); AbstractBasePtr InferImplLayerNorm(const AnalysisEnginePtr &, const PrimitivePtr &primitive, diff --git a/mindspore/ccsrc/pipeline/static_analysis/program_specialize.cc b/mindspore/ccsrc/pipeline/static_analysis/program_specialize.cc index 2a03eb6d5c..e01b98841b 100644 --- a/mindspore/ccsrc/pipeline/static_analysis/program_specialize.cc +++ b/mindspore/ccsrc/pipeline/static_analysis/program_specialize.cc @@ -378,11 +378,7 @@ AnfNodePtr FuncGraphSpecializer::BuildSpecializedNodeInner(const AbstractBasePtr } auto real_eval = dyn_cast(eval); - if (func->context() != nullptr) { - if (!IsVisible(func_graph_, func->context()->func_graph())) { - MS_LOG(EXCEPTION) << "Func is not visible NodeInfo: " << trace::GetDebugInfo(func_graph_->debug_info()); - } - } else { + if (func->context() == nullptr) { MS_LOG(EXCEPTION) << "Func context is nullptr NodeInfo: " << trace::GetDebugInfo(func_graph_->debug_info()); } AnalysisContextPtr context = real_eval->MakeContext(engine_, argvals); @@ -507,9 +503,9 @@ void FuncGraphSpecializer::ProcessCNode(const CNodePtr 
&new_node) { // First element is partial, second is func so arg is start from 2 (void)args.insert(args.begin(), inputs.begin() + 2, inputs.end()); func = inputs[1]; - new_inputs = args; - (void)new_inputs.insert(new_inputs.begin(), func); } + new_inputs = args; + (void)new_inputs.insert(new_inputs.begin(), func); AbstractBasePtrList argvals; MS_EXCEPTION_IF_NULL(new_inputs[0]); @@ -524,9 +520,23 @@ void FuncGraphSpecializer::ProcessCNode(const CNodePtr &new_node) { << new_inputs[i]->DebugString() << ", abstract: " << new_inputs[i]->abstract()->ToString(); } - if (func->isa() && func->func_graph()->has_flag(FUNC_GRAPH_FLAG_SPECIALIZE_PARAMETER)) { - auto wrapped_node = BuildSpecializedParameterNode(new_node); - new_inputs[0] = wrapped_node; + if (!func->isa()) { + MS_LOG(DEBUG) << func->abstract()->type_name() << " | " << func->abstract()->ToString(); + if (func->abstract()->isa() && !func->abstract()->isa()) { + auto func_abs = func->abstract()->cast(); + EvaluatorPtr eval = engine_->GetEvaluatorFor(func_abs); + std::pair result; + AbstractBasePtrList empty_args; + auto status = FindUniqueArgvals(func_abs, eval, empty_args, &result); + MS_LOG(DEBUG) << "FindUniqueArgvals return status: " << status; + // if a node is a poly node, or an input parameter is a PartialAbstractClosure, expand it early + if (status == kSpecializeFindUniqueArgvalPoly || + (func->isa() && (func->func_graph()->has_flag(FUNC_GRAPH_FLAG_SPECIALIZE_PARAMETER) || + func->abstract()->isa()))) { + auto wrapped_node = BuildSpecializedParameterNode(new_node); + new_inputs[0] = wrapped_node; + } + } } if (CanSpecializeNode(func)) { diff --git a/mindspore/ccsrc/pipeline/static_analysis/static_analysis.cc b/mindspore/ccsrc/pipeline/static_analysis/static_analysis.cc index b7520176ec..9da148d2a7 100644 --- a/mindspore/ccsrc/pipeline/static_analysis/static_analysis.cc +++ b/mindspore/ccsrc/pipeline/static_analysis/static_analysis.cc @@ -308,6 +308,10 @@ EvaluatorPtr GetPrimEvaluator(const PrimitivePtr 
&prim, const AnalysisEnginePtr evaluator = std::make_shared(prim); return evaluator; } + if (prim->Hash() == prim::kPrimMixedPrecisionCast->Hash() && prim->name() == prim::kPrimMixedPrecisionCast->name()) { + evaluator = std::make_shared(prim); + return evaluator; + } if (prim->HasPyEvaluator()) { auto prim_py = dyn_cast(prim); if (prim_py != nullptr) { @@ -464,6 +468,85 @@ EvalResultPtr AnalysisEngine::ExecuteEvaluators(const std::vector return ExecuteMultipleEvaluators(evaluators, out_conf, args_conf_list); } +void AnalysisEngine::SetUndeterminedFlag(const EvaluatorPtr &evaluator) { + auto fg_eval = evaluator->cast(); + if (fg_eval == nullptr) { + return; + } + auto fg = fg_eval->func_graph(); + MS_EXCEPTION_IF_NULL(fg); + auto undetermined_fgs = fg->recursive_graphs(); + if (undetermined_fgs) { + auto fg_parent = fg->parent(); + MS_EXCEPTION_IF_NULL(fg_parent); + fg_parent->set_flag(kFuncGraphFlagUndetermined, true); + MS_LOG(DEBUG) << "Set graph undetermined: " << fg_parent->ToString(); + } +} + +EvaluatorPtr AnalysisEngine::HandleNestedRecursion(const std::vector &evaluators, + const EvaluatorPtr &eval, const AbstractBasePtrList &args_spec_list, + const EvalTraceRevIter &it, bool *continue_flag) { + *continue_flag = false; + // Find latest entry function to handle nested recursion. + EvaluatorPtr latest_entry = eval; + auto latest_entry_iter = eval_trace_.rbegin(); + for (auto r_it = eval_trace_.rbegin(); *r_it != *it;) { + auto it_temp = std::find(evaluators.begin(), evaluators.end(), r_it->first); + if (it_temp != evaluators.end()) { + latest_entry = *it_temp; + latest_entry_iter = r_it; + break; + } + latest_entry_iter = ++r_it; + } + if (latest_entry != eval) { + MS_LOG(DEBUG) << "Continue Evaluator " << eval->ToString(); + *continue_flag = true; + return latest_entry; + } + + bool has_undetermined = false; + // Check whether sub loop has untraced undetermined evaluator. 
+ std::set> undetermined_evals; + for (auto r_it = eval_trace_.rbegin(); r_it != latest_entry_iter; r_it++) { + undetermined_evals.insert(*r_it); + } + MS_LOG(DEBUG) << "undetermined_evals size(): " << undetermined_evals.size(); + + for (auto u_eval : undetermined_evals) { + MS_LOG(DEBUG) << u_eval.first->ToString() << " check undetermined."; + if (!undetermined_evals.count(std::make_pair(multi_poss_[u_eval.first], args_spec_list))) { + MS_LOG(DEBUG) << u_eval.first->ToString() << " has undetermined."; + has_undetermined = true; + break; + } + } + if (has_undetermined == false) { + MS_LOG(DEBUG) << eval->ToString() << " has no undetermined."; + *continue_flag = true; + return latest_entry; + } + + return latest_entry; +} + +EvalResultPtr AnalysisEngine::ProcessEvalResults(const AbstractBasePtrList &out_specs) { + if (out_specs.size() == 0) { + MS_LOG(EXCEPTION) << "There is an endless loop for evaluator."; + } + + if (out_specs.size() == 1) { + MS_EXCEPTION_IF_NULL(out_specs[0]); + // If only one result derived, then broaden it to avoid wrong constant propagation. 
+ return std::make_shared(out_specs[0]->Broaden(), std::make_shared()); + } + auto joined_spec = AbstractJoin(out_specs); + MS_EXCEPTION_IF_NULL(joined_spec); + MS_LOG(DEBUG) << "Multiple evaluators joined: " << joined_spec->ToString(); + return std::make_shared(joined_spec, std::make_shared()); +} + EvalResultPtr AnalysisEngine::ExecuteMultipleEvaluators(const std::vector &evaluators, const AnfNodeConfigPtr &out_conf, const ConfigPtrList &args_conf_list) { @@ -479,18 +562,7 @@ EvalResultPtr AnalysisEngine::ExecuteMultipleEvaluators(const std::vectorGetEvaluatedValue()->abstract(); }); for (auto eval : evaluators) { - auto fg_eval = eval->cast(); - if (fg_eval) { - auto fg = fg_eval->func_graph(); - MS_EXCEPTION_IF_NULL(fg); - auto undetermined_fgs = fg->recursive_graphs(); - if (undetermined_fgs) { - auto fg_parent = fg->parent(); - MS_EXCEPTION_IF_NULL(fg_parent); - fg_parent->set_flags(kFuncGraphFlagUndetermined, true); - MS_LOG(DEBUG) << "Set graph undetermined: " << fg_parent->ToString(); - } - } + SetUndeterminedFlag(eval); auto current_inf = std::make_pair(eval, args_spec_list); MS_LOG(DEBUG) << "Check Evaluator " << eval->ToString(); @@ -510,40 +582,9 @@ EvalResultPtr AnalysisEngine::ExecuteMultipleEvaluators(const std::vectorfirst); - if (it_temp != evaluators.end()) { - latest_entry = *it_temp; - latest_entry_iter = r_it; - break; - } - latest_entry_iter = ++r_it; - } - if (latest_entry != eval) { - MS_LOG(DEBUG) << "Continue Evaluator " << eval->ToString(); - continue; - } - - bool has_undetermined = false; - // Check whether sub loop has untraced undetermined evaluator. 
- std::set> undetermined_evals; - for (auto r_it = eval_trace_.rbegin(); r_it != latest_entry_iter; r_it++) { - undetermined_evals.insert(*r_it); - } - MS_LOG(DEBUG) << "undetermined_evals size(): " << undetermined_evals.size(); - for (auto u_eval : undetermined_evals) { - MS_LOG(DEBUG) << u_eval.first->ToString() << " check undetermined."; - if (!undetermined_evals.count(std::make_pair(multi_poss_[u_eval.first], args_spec_list))) { - MS_LOG(DEBUG) << u_eval.first->ToString() << " has undetermined."; - has_undetermined = true; - break; - } - } - if (has_undetermined == false) { - MS_LOG(DEBUG) << eval->ToString() << " has no undetermined."; + bool continue_flag = false; + auto latest_entry = HandleNestedRecursion(evaluators, eval, args_spec_list, it, &continue_flag); + if (continue_flag) { continue; } @@ -558,19 +599,8 @@ EvalResultPtr AnalysisEngine::ExecuteMultipleEvaluators(const std::vector(out_specs[0]->Broaden(), std::make_shared()); - } - auto joined_spec = AbstractJoin(out_specs); - MS_EXCEPTION_IF_NULL(joined_spec); - MS_LOG(DEBUG) << "Multiple evaluators joined: " << joined_spec->ToString(); - return std::make_shared(joined_spec, std::make_shared()); + return ProcessEvalResults(out_specs); } EvalResultPtr AnfNodeConfig::GetEvaluatedValue() { diff --git a/mindspore/ccsrc/pipeline/static_analysis/static_analysis.h b/mindspore/ccsrc/pipeline/static_analysis/static_analysis.h index 1e7a52fda9..a0b7ee5478 100644 --- a/mindspore/ccsrc/pipeline/static_analysis/static_analysis.h +++ b/mindspore/ccsrc/pipeline/static_analysis/static_analysis.h @@ -172,6 +172,8 @@ struct AnalysisResult { AnalysisContextPtr context; }; +using EvalTraceRevIter = std::list>::reverse_iterator; + class AnalysisEngine : public std::enable_shared_from_this { public: AnalysisEngine(const PrimEvaluatorMap &prim_evaluator_map, const FuncGraphManagerPtr &func_graph_manager) @@ -222,6 +224,12 @@ class AnalysisEngine : public std::enable_shared_from_this { std::unordered_map 
prim_py_evaluators_; private: + void SetUndeterminedFlag(const EvaluatorPtr &evaluator); + EvaluatorPtr HandleNestedRecursion(const std::vector &evaluators, const EvaluatorPtr &eval, + const AbstractBasePtrList &args_spec_list, const EvalTraceRevIter &it, + bool *continue_flag); + EvalResultPtr ProcessEvalResults(const AbstractBasePtrList &out_specs); + const PrimEvaluatorMap &prim_constructors_; FuncGraphManagerPtr func_graph_manager_; std::unordered_map constructors_; diff --git a/mindspore/ccsrc/pre_activate/ascend/ascend_backend_optimization.cc b/mindspore/ccsrc/pre_activate/ascend/ascend_backend_optimization.cc index f01dd95f06..981e2255f3 100644 --- a/mindspore/ccsrc/pre_activate/ascend/ascend_backend_optimization.cc +++ b/mindspore/ccsrc/pre_activate/ascend/ascend_backend_optimization.cc @@ -16,11 +16,13 @@ #include "pre_activate/ascend/ascend_backend_optimization.h" #include #include +#include #include "pre_activate/common/optimizer.h" #include "pre_activate/ascend/ir_fission/bn_split.h" #include "pre_activate/ascend/ir_fission/bn_grad_split.h" #include "pre_activate/ascend/ir_fission/batch_norm_grad_split.h" #include "pre_activate/ascend/ir_fission/batch_norm_bert_fission.h" +#include "pre_activate/ascend/ir_fission/single_batch_norm_fission.h" #include "pre_activate/ascend/ir_fusion/fused_batch_norm_fusion.h" #include "pre_activate/ascend/ir_fission/layer_norm_grad_split.h" #include "pre_activate/pass/communication_op_fusion.h" @@ -54,6 +56,7 @@ #include "pre_activate/ascend/ir_fusion/confusion_mul_grad_fusion.h" #include "pre_activate/ascend/ir_fusion/softmax_grad_ext_fusion.h" #include "pre_activate/ascend/format_type/insert_trans_op.h" +#include "pre_activate/ascend/format_type/rectify_do_mask_kernel_info.h" #include "pre_activate/pass/getitem_tuple.h" #include "pre_activate/pass/optimize_dependence.h" #include "pre_activate/pass/erase_visit_attr.h" @@ -61,10 +64,14 @@ #include "pre_activate/ascend/format_type/convert_unsupported_transnode_to_aicpu.h" 
#include "pre_activate/pass/eliminate_redundant_op.h" #include "pre_activate/pass/common_subexpression_elimination.h" +#include "pre_activate/pass/fuse_graph_kernel.h" +#include "pre_activate/pass/fuse_basic.h" +#include "pre_activate/pass/add_atomic_clean.h" #include "pre_activate/ascend/format_type/merge_cast_to_op.h" #include "pre_activate/ascend/format_type/check_consistency.h" #include "pre_activate/ascend/buffer_fusion/ub_pattern_fusion.h" #include "pre_activate/ascend/buffer_fusion/eltwise_fusion_pass.h" +#include "pre_activate/ascend/buffer_fusion/multi_output_fusion_pass.h" #include "pre_activate/ascend/buffer_fusion/conv2dbackprop_eltwise_eltwise_fusion_pass.h" #include "pre_activate/ascend/buffer_fusion/conv2dbackprop_eltwise_fusion_pass.h" #include "pre_activate/ascend/buffer_fusion/conv_single_in_fusion_pass.h" @@ -77,14 +84,16 @@ #include "pre_activate/ascend/buffer_fusion/reduce_eltwise_fusion_pass.h" #include "pre_activate/ascend/buffer_fusion/segment_eltwise_fusion_pass.h" #include "pre_activate/ascend/format_type/deal_ref_trans_and_cast.h" -#include "pre_activate/ascend/enhancer/add_memcpy_async.h" +#include "pre_activate/ascend/enhancer/insert_memcpy_async_for_hccl_op.h" #include "pre_activate/ascend/enhancer/insert_pad_for_nms_with_mask.h" -#include "pre_activate/ascend/format_type/insert_cast_for_runop.h" #include "pre_activate/ascend/format_type/insert_transdata_for_runop.h" #include "pre_activate/ascend/enhancer/getnext_memcpy_elimination.h" #include "pre_activate/ascend/ir_fission/addn_fission.h" #include "pre_activate/ascend/enhancer/insert_memcpy_async_for_getnext.h" #include "pre_activate/ascend/ir_fission/batch_norm_grad_infer_fission.h" +#include "pre_activate/ascend/ir_fission/split_fission.h" +#include "pre_activate/ascend/format_type/modify_ops_attrs.h" +#include "pre_activate/ascend/format_type/remove_no_use_reshape_op.h" #include "utils/context/ms_context.h" #include "utils/config_manager.h" #include "debug/anf_ir_dump.h" @@ -96,10 
+105,13 @@ namespace { void AddAscendBackendOptionalIRFusion(PassManager *ir_fusion_pm) { MS_EXCEPTION_IF_NULL(ir_fusion_pm); ir_fusion_pm->AddPass(std::make_shared()); + ir_fusion_pm->AddPass(std::make_shared()); ir_fusion_pm->AddPass(std::make_shared()); ir_fusion_pm->AddPass(std::make_shared()); ir_fusion_pm->AddPass(std::make_shared()); ir_fusion_pm->AddPass(std::make_shared()); + ir_fusion_pm->AddPass(std::make_shared()); + ir_fusion_pm->AddPass(std::make_shared()); ir_fusion_pm->AddPass(std::make_shared()); ir_fusion_pm->AddPass(std::make_shared()); ir_fusion_pm->AddPass(std::make_shared()); @@ -136,6 +148,8 @@ void AddAscendBackendOptionalIRFusion(PassManager *ir_fusion_pm) { ir_fusion_pm->AddPass(std::make_shared()); ir_fusion_pm->AddPass(std::make_shared()); ir_fusion_pm->AddPass(std::make_shared()); + ir_fusion_pm->AddPass(std::make_shared()); + ir_fusion_pm->AddPass(std::make_shared()); } } // namespace @@ -143,7 +157,7 @@ void RunOpAscendDataLayout(const std::shared_ptr &kernel_g MS_EXCEPTION_IF_NULL(kernel_graph); auto optimizer = std::make_shared(); auto data_layout_pm = std::make_shared("pynative_transop_pm"); - data_layout_pm->AddPass(std::make_shared()); + data_layout_pm->AddPass(std::make_shared()); data_layout_pm->AddPass(std::make_shared()); data_layout_pm->AddPass(std::make_shared()); data_layout_pm->AddPass(std::make_shared()); @@ -156,22 +170,15 @@ void RunOpAscendDataLayout(const std::shared_ptr &kernel_g kernel_graph->SetExecOrderByDefault(); } -void RunOpAscendMixPrecision(const std::shared_ptr &kernel_graph) { +void AscendGraphKernelCommonProcess(const std::shared_ptr &kernel_graph) { MS_EXCEPTION_IF_NULL(kernel_graph); auto optimizer = std::make_shared(); - auto mixed_precision_pm = std::make_shared("pynative_transop_pm"); - mixed_precision_pm->AddPass(std::make_shared()); - mixed_precision_pm->AddPass(std::make_shared()); - mixed_precision_pm->AddPass(std::make_shared()); - mixed_precision_pm->AddPass(std::make_shared()); - 
mixed_precision_pm->AddPass(std::make_shared()); - mixed_precision_pm->AddPass(std::make_shared()); - mixed_precision_pm->AddPass(std::make_shared()); - mixed_precision_pm->AddPass(std::make_shared()); - mixed_precision_pm->AddPass(std::make_shared()); - mixed_precision_pm->AddPass(std::make_shared()); - mixed_precision_pm->AddPass(std::make_shared()); - optimizer->AddPassManager(mixed_precision_pm); + MS_EXCEPTION_IF_NULL(optimizer); + auto common_process = std::make_shared("graph_kernel_common_process"); + MS_EXCEPTION_IF_NULL(common_process); + common_process->AddPass(std::make_shared()); + common_process->AddPass(std::make_shared()); + optimizer->AddPassManager(common_process); (void)optimizer->Optimize(kernel_graph); kernel_graph->SetExecOrderByDefault(); } @@ -180,7 +187,7 @@ void AscendDataLayout(const std::shared_ptr &kernel_graph) MS_EXCEPTION_IF_NULL(kernel_graph); auto optimizer = std::make_shared(); auto data_layout_pm = std::make_shared("transop_pm"); - data_layout_pm->AddPass(std::make_shared()); + data_layout_pm->AddPass(std::make_shared()); data_layout_pm->AddPass(std::make_shared()); data_layout_pm->AddPass(std::make_shared()); data_layout_pm->AddPass(std::make_shared()); @@ -236,10 +243,11 @@ void AscendBackendIRFusionOptimization(const std::shared_ptrAddPass(std::make_shared()); } else { ir_fusion_pm->AddPass(std::make_shared()); + ir_fusion_pm->AddPass(std::make_shared()); ir_fusion_pm->AddPass(std::make_shared()); - ir_fusion_pm->AddPass(std::make_shared()); + ir_fusion_pm->AddPass(std::make_shared()); + ir_fusion_pm->AddPass(std::make_shared()); } - ir_fusion_pm->AddPass(std::make_shared()); ir_fusion_pm->AddPass(std::make_shared()); if (context_ptr->ir_fusion_flag()) { AddAscendBackendOptionalIRFusion(ir_fusion_pm.get()); @@ -250,6 +258,7 @@ void AscendBackendIRFusionOptimization(const std::shared_ptrAddPass(std::make_shared()); ir_fusion_pm->AddPass(std::make_shared()); } + ir_fusion_pm->AddPass(std::make_shared()); 
optimizer->AddPassManager(ir_fusion_pm); (void)optimizer->Optimize(kernel_graph); kernel_graph->SetExecOrderByDefault(); @@ -279,6 +288,7 @@ void RunOpAscendBackendIRFusionOptimization(const std::shared_ptr(); auto ir_fusion_pm = std::make_shared("ir_fusion_pm"); ir_fusion_pm->AddPass(std::make_shared()); + ir_fusion_pm->AddPass(std::make_shared()); ir_fusion_pm->AddPass(std::make_shared()); ir_fusion_pm->AddPass(std::make_shared()); ir_fusion_pm->AddPass(std::make_shared()); @@ -318,22 +328,117 @@ void AscendBackendOptimization(const std::shared_ptr &kern other_pm->AddPass(std::make_shared()); other_pm->AddPass(std::make_shared()); other_pm->AddPass(std::make_shared()); - other_pm->AddPass(std::make_shared()); - other_pm->AddPass(std::make_shared()); - if (context_ptr->enable_task_sink() && context_ptr->loop_sink_flag() && ConfigManager::GetInstance().iter_num() > 1) { - other_pm->AddPass(std::make_shared()); - } - other_pm->AddPass(std::make_shared()); optimizer->AddPassManager(other_pm); (void)optimizer->Optimize(kernel_graph); kernel_graph->SetExecOrderByDefault(); // buffer fusion AscendBackendUBFusionOptimization(kernel_graph); + + // other2 optimization + auto optimizer2 = std::make_shared(); + auto other2_pm = std::make_shared("other2_pm"); + other2_pm->AddPass(std::make_shared()); + other2_pm->AddPass(std::make_shared()); + if (context_ptr->enable_task_sink() && context_ptr->loop_sink_flag() && ConfigManager::GetInstance().iter_num() > 1) { + other2_pm->AddPass(std::make_shared()); + } + other2_pm->AddPass(std::make_shared()); + optimizer2->AddPassManager(other2_pm); + (void)optimizer2->Optimize(kernel_graph); + kernel_graph->SetExecOrderByDefault(); + + if (save_graphs) { + std::string file_path = + save_graphs_path + "/" + "hwopt_d_end" + "_graph_" + std::to_string(kernel_graph->graph_id()) + ".ir"; + DumpIR(file_path, kernel_graph, true); + DumpIRProto(kernel_graph, "after_hwopt"); + kernel_graph->DumpFuncGraph("hwopt_d_end"); + } +} + +void 
AscendBackendGraphKernelOpt(const std::shared_ptr &kernel_graph, + bool is_before_kernel_select) { + auto context_ptr = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(context_ptr); + if (!(context_ptr->enable_graph_kernel())) { + return; + } + bool save_graphs = context_ptr->save_graphs_flag(); + auto save_graphs_path = context_ptr->save_graphs_path(); + if (save_graphs_path.empty()) { + save_graphs_path = "."; + } + if (save_graphs) { + std::string file_path = save_graphs_path + "/" + "hwopt_d_graph_kernel_opt_before_graph_" + + std::to_string(!is_before_kernel_select) + "_" + std::to_string(kernel_graph->graph_id()) + + ".ir"; + DumpIR(file_path, kernel_graph); + } + + // Fuse graph kernels with basic ops + FuseGraphKernel(kernel_graph, is_before_kernel_select); + + if (save_graphs) { + std::string file_path = save_graphs_path + "/" + "hwopt_d_graph_kernel_opt_end_graph_" + + std::to_string(!is_before_kernel_select) + "_" + std::to_string(kernel_graph->graph_id()) + + ".ir"; + DumpIR(file_path, kernel_graph, true); + } +} + +void AscendBackendFuseBasicOpt(const std::shared_ptr &kernel_graph, + bool is_before_kernel_select) { + auto context_ptr = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(context_ptr); + if (!(context_ptr->enable_graph_kernel())) { + return; + } + bool save_graphs = context_ptr->save_graphs_flag(); + auto save_graphs_path = context_ptr->save_graphs_path(); + if (save_graphs_path.empty()) { + save_graphs_path = "."; + } + if (save_graphs) { + std::string file_path = save_graphs_path + "/" + "hwopt_d_fuse_basic_opt_before_graph_" + + std::to_string(!is_before_kernel_select) + "_" + std::to_string(kernel_graph->graph_id()) + + ".ir"; + DumpIR(file_path, kernel_graph, true); + } + + // Fuse basic ops with basic ops + FuseBasic(kernel_graph, is_before_kernel_select); + + if (save_graphs) { + std::string file_path = save_graphs_path + "/" + "hwopt_d_fuse_basic_opt_end_graph_" + + std::to_string(!is_before_kernel_select) + "_" + 
std::to_string(kernel_graph->graph_id()) + + ".ir"; + DumpIR(file_path, kernel_graph, true); + } +} + +void AscendBackendAddAtomicClean(const std::shared_ptr &kernel_graph) { + auto context_ptr = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(context_ptr); + if (!(context_ptr->enable_graph_kernel())) { + return; + } + bool save_graphs = context_ptr->save_graphs_flag(); + auto save_graphs_path = context_ptr->save_graphs_path(); + if (save_graphs_path.empty()) { + save_graphs_path = "."; + } + if (save_graphs) { + std::string file_path = save_graphs_path + "/" + "hwopt_d_add_atomic_clean_before" + "_graph_" + + std::to_string(kernel_graph->graph_id()) + ".ir"; + DumpIR(file_path, kernel_graph); + } + + AddAtomicClean(kernel_graph); + if (save_graphs) { std::string file_path = save_graphs_path + "/" + "hwopt_d_end" + "_graph_" + std::to_string(kernel_graph->graph_id()) + ".ir"; DumpIR(file_path, kernel_graph, true); - DumpIRProto(kernel_graph, "after_hwopt_" + std::to_string(kernel_graph->graph_id())); } } @@ -350,7 +455,8 @@ void AscendBackendUBFusionOptimization(const std::shared_ptrgraph_id()) + ".ir"; DumpIR(file_path, kernel_graph); } auto fusion_id_allocator = std::make_shared(); @@ -368,6 +474,7 @@ void AscendBackendUBFusionOptimization(const std::shared_ptrAddPass(std::make_shared(fusion_id_allocator)); ub_fusion_pm->AddPass(std::make_shared(fusion_id_allocator)); ub_fusion_pm->AddPass(std::make_shared(fusion_id_allocator)); + ub_fusion_pm->AddPass(std::make_shared(fusion_id_allocator)); ub_fusion_pm->AddPass(std::make_shared(fusion_id_allocator)); ub_fusion_pm->AddPass(std::make_shared(fusion_id_allocator)); ub_fusion_pm->AddPass(std::make_shared()); @@ -375,7 +482,8 @@ void AscendBackendUBFusionOptimization(const std::shared_ptrOptimize(kernel_graph); kernel_graph->SetExecOrderByDefault(); if (save_graphs) { - std::string file_path = save_graphs_path + "/" + "hwopt_d_ub_fusion_after.ir"; + std::string file_path = + save_graphs_path + 
"/hwopt_d_ub_fusion_after_graph_" + std::to_string(kernel_graph->graph_id()) + ".ir"; DumpIR(file_path, kernel_graph); } } diff --git a/mindspore/ccsrc/pre_activate/ascend/ascend_backend_optimization.h b/mindspore/ccsrc/pre_activate/ascend/ascend_backend_optimization.h index 914b4c053a..222c4b90b5 100644 --- a/mindspore/ccsrc/pre_activate/ascend/ascend_backend_optimization.h +++ b/mindspore/ccsrc/pre_activate/ascend/ascend_backend_optimization.h @@ -20,11 +20,16 @@ namespace mindspore { namespace opt { void RunOpAscendDataLayout(const std::shared_ptr &kernel_graph); -void RunOpAscendMixPrecision(const std::shared_ptr &kernel_graph); void RunOpAscendBackendIRFusionOptimization(const std::shared_ptr &kernel_graph); void AscendDataLayout(const std::shared_ptr &kernel_graph); void AscendMixPrecision(const std::shared_ptr &kernel_graph); void AscendBackendOptimization(const std::shared_ptr &kernel_graph); +void AscendGraphKernelCommonProcess(const std::shared_ptr &kernel_graph); +void AscendBackendGraphKernelOpt(const std::shared_ptr &kernel_graph, + bool is_before_kernel_select = false); +void AscendBackendFuseBasicOpt(const std::shared_ptr &kernel_graph, + bool is_before_kernel_select = false); +void AscendBackendAddAtomicClean(const std::shared_ptr &kernel_graph); void AscendBackendIRFusionOptimization(const std::shared_ptr &kernel_graph); void AscendBackendUBFusionOptimization(const std::shared_ptr &kernel_graph); } // namespace opt diff --git a/mindspore/ccsrc/pre_activate/ascend/ascend_helper.cc b/mindspore/ccsrc/pre_activate/ascend/ascend_helper.cc index b573cb33bb..9c498bd736 100644 --- a/mindspore/ccsrc/pre_activate/ascend/ascend_helper.cc +++ b/mindspore/ccsrc/pre_activate/ascend/ascend_helper.cc @@ -22,6 +22,7 @@ #include "utils/utils.h" #include "device/kernel_info.h" #include "kernel/oplib/oplib.h" +#include "kernel/common_utils.h" #include "operator/ops.h" #include "session/anf_runtime_algorithm.h" #include "session/kernel_graph.h" @@ -31,6 +32,7 @@ 
namespace mindspore { namespace opt { using KernelBuildInfoBuilder = kernel::KernelBuildInfo::KernelBuildInfoBuilder; namespace { +const std::set kCommonFormatSet = {kOpFormat_DEFAULT, kOpFormat_ND, kOpFormat_NCHW}; AnfNodePtr CreateReshapeNode(const FuncGraphPtr &func_graph, const AnfNodePtr &input_node, const KernelSelectPtr &kernel_select, const std::vector &dst_shape) { std::vector trans_inputs; @@ -53,7 +55,6 @@ AnfNodePtr AddTransOpNodeToGraph(const FuncGraphPtr &func_graph, const AnfNodePt CNodePtr trans_data = nullptr; std::string input_format = is_insert_input ? kOpFormat_DEFAULT : AnfAlgo::GetOutputFormat(node, 0); std::string dst_format = is_insert_input ? AnfAlgo::GetInputFormat(node, 0) : kOpFormat_DEFAULT; - TypeId dtype = AnfAlgo::GetOutputDeviceDataType(node, 0); std::vector padding_axis = AnfAlgo::GetOutputReshapeType(node, 0); MS_EXCEPTION_IF_NULL(node); // if insert transdata for input we need to change the input @@ -62,10 +63,9 @@ AnfNodePtr AddTransOpNodeToGraph(const FuncGraphPtr &func_graph, const AnfNodePt MS_LOG(EXCEPTION) << "cannot insert a transdata node to a node's input which the node is not a cnode"; } auto cnode = node->cast(); - dtype = AnfAlgo::GetInputDeviceDataType(cnode, insert_index); dst_format = AnfAlgo::GetInputFormat(cnode, insert_index); input_node = AnfAlgo::GetInputNode(cnode, insert_index); - padding_axis = AnfAlgo::GetInputReshapeType(node, 0); + padding_axis = AnfAlgo::GetInputReshapeType(node, insert_index); } bool need_padding = false; if (is_insert_input) { @@ -94,7 +94,7 @@ AnfNodePtr AddTransOpNodeToGraph(const FuncGraphPtr &func_graph, const AnfNodePt trans_node = reshape_node; } // refresh the transdata's format to ori format & dst format - RefreshKernelBuildInfo(input_format, dst_format, dtype, trans_data, padding_axis); + RefreshKernelBuildInfo(input_format, dst_format, trans_data, padding_axis); return trans_node; } @@ -110,13 +110,9 @@ AnfNodePtr GetTransInputNodePtr(const FuncGraphPtr &func_graph, const 
CNodePtr & MS_EXCEPTION_IF_NULL(input_node); AnfAlgo::SetNodeInput(node, input_node, index); } - if (AnfAlgo::GetInputFormat(node, index) == kOpFormat_NC1KHKWHWC0) { - MS_LOG(EXCEPTION) << "got the format " << AnfAlgo::GetInputFormat(node, index) - << "when inserting the transdata node " << node->DebugString(); - } std::vector origin_shape = AnfAlgo::GetPrevNodeOutputInferShape(node, index); std::string dest_format = AnfAlgo::GetInputFormat(node, index); - if (kNeedTransFormatSet.find(dest_format) != kNeedTransFormatSet.end() && origin_shape.size() > 1) { + if (kCommonFormatSet.find(dest_format) == kCommonFormatSet.end() && origin_shape.size() > 1) { MS_LOG(DEBUG) << node->DebugString() << "Insert transdata " << AnfAlgo::GetInputFormat(node, index) << " To DefaultFormat , index: " << index; return AddTransOpNodeToGraph(func_graph, node, kernel_select, index, true); @@ -133,7 +129,7 @@ AnfNodePtr InsertTransOpForSingleOutput(const FuncGraphPtr &func_graph, const An MS_LOG(EXCEPTION) << "got the hw format " << output_format << "when insert the transdata node " << node->DebugString(); } - if (kNeedTransFormatSet.find(output_format) != kNeedTransFormatSet.end() && origin_shape.size() > 1) { + if (kCommonFormatSet.find(output_format) == kCommonFormatSet.end() && origin_shape.size() > 1) { MS_LOG(DEBUG) << "Inserted Transdata " << output_format << " To default , index :0"; return AddTransOpNodeToGraph(func_graph, node, kernel_select, 0, false); } @@ -154,7 +150,7 @@ AnfNodePtr InsertTransOpForMultipleOutput(const FuncGraphPtr &func_graph, const } auto tuple_getitem = CreatTupleGetItemNode(func_graph, node, output_idx); std::vector origin_shape = AnfAlgo::GetOutputInferShape(node, output_idx); - if (kNeedTransFormatSet.find(output_format) != kNeedTransFormatSet.end() && origin_shape.size() > 1) { + if (kCommonFormatSet.find(output_format) == kCommonFormatSet.end() && origin_shape.size() > 1) { make_tuple_inputs.emplace_back(AddTransOpNodeToGraph(func_graph, tuple_getitem, 
kernel_select, 0, false)); } else { // No need insert trans op. @@ -165,22 +161,17 @@ AnfNodePtr InsertTransOpForMultipleOutput(const FuncGraphPtr &func_graph, const return make_tuple; } } // namespace -void RefreshKernelBuildInfo(const std::string &input_format, const std::string &output_format, const TypeId device_type, +void RefreshKernelBuildInfo(const std::string &input_format, const std::string &output_format, const AnfNodePtr &trans_data, const std::vector &reshape_type) { MS_EXCEPTION_IF_NULL(trans_data); - MS_EXCEPTION_IF_NULL(trans_data->kernel_info()); - auto ori_build_info = trans_data->kernel_info()->select_kernel_build_info(); - KernelBuildInfoBuilder builder; - builder.SetInputsFormat({input_format}); - builder.SetInputReshapeType({reshape_type}); - builder.SetInputReshapeType({reshape_type}); - builder.SetOutputsFormat({output_format}); - builder.SetInputsDeviceType({device_type}); - builder.SetOutputsDeviceType({device_type}); - builder.SetKernelType(ori_build_info->kernel_type()); - builder.SetFusionType(ori_build_info->fusion_type()); - builder.SetProcessor(ori_build_info->processor()); - AnfAlgo::SetSelectKernelBuildInfo(builder.Build(), trans_data.get()); + auto ori_build_info = AnfAlgo::GetSelectKernelBuildInfo(trans_data); + MS_EXCEPTION_IF_NULL(ori_build_info); + auto builder = std::make_shared(ori_build_info); + builder->SetInputsFormat({input_format}); + builder->SetInputReshapeType({reshape_type}); + builder->SetOutputReshapeType({reshape_type}); + builder->SetOutputsFormat({output_format}); + AnfAlgo::SetSelectKernelBuildInfo(builder->Build(), trans_data.get()); } CNodePtr NewTransOpNode(const FuncGraphPtr &func_graph, const AnfNodePtr &input, const KernelSelectPtr &kernel_select, @@ -239,7 +230,7 @@ AnfNodePtr AddCastOpNodeToGraph(const FuncGraphPtr &func_graph, const AnfNodePtr if (kernel::OpLib::FindOp(prim::kPrimCast->name(), kernel::kTBE) != nullptr) { builder.SetKernelType(KernelType::TBE_KERNEL); } else { - 
builder.SetKernelType(KernelType::AUTO_DIFF_KERNEL); + builder.SetKernelType(KernelType::AKG_KERNEL); } // if kernel info is null , it remarks this function is running ut if (cast->kernel_info() == nullptr) { @@ -294,19 +285,17 @@ CNodePtr InsertCastForInput(const FuncGraphPtr &func_graph, const CNodePtr &cnod MS_EXCEPTION_IF_NULL(cnode); std::vector new_inputs = {AnfAlgo::GetCNodePrimitiveNode(cnode)}; for (size_t input_index = 0; input_index < AnfAlgo::GetInputTensorNum(cnode); ++input_index) { - TypeId origin_type; + const auto infer_type = AnfAlgo::GetPrevNodeOutputInferDataType(cnode, input_index); + TypeId origin_type(kTypeUnknown); auto cur_input = AnfAlgo::GetInputNode(cnode, input_index); auto kernel_with_index = AnfAlgo::VisitKernel(cur_input, 0); - auto is_weight_boundary = [](const AnfNodePtr &node) -> bool { - if (node->isa() || node->isa()) { - return true; - } - return false; - }; auto real_input_node = kernel_with_index.first; - if (is_weight_boundary(real_input_node)) { + if (kernel::IsWeightBoundary(real_input_node) || func_graph->has_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL)) { // weight - origin_type = AnfAlgo::GetPrevNodeOutputDeviceDataType(cnode, input_index); + origin_type = AnfAlgo::GetPrevNodeOutputPrecision(cnode, input_index); + if (origin_type == kTypeUnknown) { + origin_type = AnfAlgo::GetPrevNodeOutputDeviceDataType(cnode, input_index); + } } else { // feature map origin_type = AnfAlgo::GetPrevNodeOutputInferDataType(cnode, input_index); @@ -314,9 +303,13 @@ CNodePtr InsertCastForInput(const FuncGraphPtr &func_graph, const CNodePtr &cnod const std::string dev_fmt = AnfAlgo::GetInputFormat(cnode, input_index); const std::vector origin_shape = AnfAlgo::GetPrevNodeOutputInferShape(cnode, input_index); const TypeId device_type = AnfAlgo::GetInputDeviceDataType(cnode, input_index); - if (origin_type != device_type) { + // In graph kernel, we check parameter, + // the eliminate pass will not eliminate this case, so we just do not insert the noused 
cast. + if (func_graph->has_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL) && IsValueNode(cur_input)) { + new_inputs.push_back(cur_input); + } else if (origin_type != device_type) { auto cast = - AddCastOpNodeToGraph(func_graph, cur_input, dev_fmt, origin_type, device_type, origin_shape, origin_type); + AddCastOpNodeToGraph(func_graph, cur_input, dev_fmt, origin_type, device_type, origin_shape, infer_type); MS_EXCEPTION_IF_NULL(cast); cast->set_scope(cnode->scope()); AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), cast); diff --git a/mindspore/ccsrc/pre_activate/ascend/ascend_helper.h b/mindspore/ccsrc/pre_activate/ascend/ascend_helper.h index 66e3f2ad33..ad48ca5291 100644 --- a/mindspore/ccsrc/pre_activate/ascend/ascend_helper.h +++ b/mindspore/ccsrc/pre_activate/ascend/ascend_helper.h @@ -21,7 +21,8 @@ #include #include "device/ascend/kernel_select_ascend.h" #include "kernel/kernel_query.h" -#include "kernel/tbe/tbe_kernel_select.h" +#include "kernel/oplib/oplib.h" +#include "session/anf_runtime_algorithm.h" namespace mindspore { namespace opt { @@ -37,11 +38,11 @@ class SupportedChecker { public: SupportedChecker() = default; virtual ~SupportedChecker() = default; - virtual bool CheckAiCoreSupported(const AnfNodePtr &anf_node, + virtual bool CheckAICoreSupported(const AnfNodePtr &anf_node, const kernel::KernelBuildInfoPtr &select_kernel_build_info) { return kernel::IsSupportedByAICore(anf_node, select_kernel_build_info); } - virtual bool CheckAiCpuSupported(const AnfNodePtr &anf_node, + virtual bool CheckAICPUSupported(const AnfNodePtr &anf_node, const kernel::KernelBuildInfoPtr &select_kernel_build_info) { return kernel::IsSupportedByAICPU(anf_node, select_kernel_build_info); } @@ -56,9 +57,20 @@ class KernelQuery { std::vector> *kernel_info_list) { kernel::KernelQuery(kernel_node, kernel_info_list); } + virtual bool IsTbeRef(const AnfNodePtr &node) { + MS_EXCEPTION_IF_NULL(node); + if (!node->isa()) { + return false; + } + auto op_info = 
mindspore::kernel::OpLib::FindOp(AnfAlgo::GetCNodeName(node), kernel::kTBE); + if (op_info != nullptr) { + return op_info->is_ref(); + } + return false; + } }; using KernelQueryPtr = std::shared_ptr; -void RefreshKernelBuildInfo(const std::string &input_format, const std::string &output_format, const TypeId device_type, +void RefreshKernelBuildInfo(const std::string &input_format, const std::string &output_format, const AnfNodePtr &trans_data, const std::vector &reshape_type = {}); CNodePtr NewTransOpNode(const FuncGraphPtr &func_graph, const AnfNodePtr &input, const KernelSelectPtr &kernel_select, diff --git a/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/bnupdate_eltwise_eltwise_fusion_pass.cc b/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/bnupdate_eltwise_eltwise_fusion_pass.cc index 8c4b1dcc63..94318d63ca 100644 --- a/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/bnupdate_eltwise_eltwise_fusion_pass.cc +++ b/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/bnupdate_eltwise_eltwise_fusion_pass.cc @@ -34,16 +34,22 @@ void BnupdateEltwiseEltwiseFusionPass::MatchBnupdateAddRelu(const CNodePtr &cnod MS_EXCEPTION_IF_NULL(candidate_fusion); auto manager = kernel_graph.manager(); MS_EXCEPTION_IF_NULL(manager); + MS_EXCEPTION_IF_NULL(relu_input); auto add = relu_input->cast(); MS_EXCEPTION_IF_NULL(add); auto tuple_getitem = add->input(1); + MS_EXCEPTION_IF_NULL(tuple_getitem); if (tuple_getitem->isa() && AnfAlgo::GetCNodeName(tuple_getitem) == prim::kPrimTupleGetItem->name()) { auto getitem = tuple_getitem->cast(); + MS_EXCEPTION_IF_NULL(getitem); auto bnupdate = getitem->input(1); + MS_EXCEPTION_IF_NULL(bnupdate); if (bnupdate->isa() && AnfAlgo::GetCNodeName(bnupdate) == kBNTrainingUpdateOpName) { std::vector output_used_num(AnfAlgo::GetOutputTensorNum(bnupdate), 0); for (auto out_getitem : manager->node_users()[bnupdate]) { + MS_EXCEPTION_IF_NULL(out_getitem.first); auto out_getitem_ptr = out_getitem.first->cast(); + 
MS_EXCEPTION_IF_NULL(out_getitem_ptr); auto input2 = out_getitem_ptr->input(2); auto output_idx = GetValue(GetValueNode(input2)); output_used_num[output_idx] = SizeToInt(manager->node_users()[out_getitem.first].size()); @@ -70,10 +76,8 @@ void BnupdateEltwiseEltwiseFusionPass::MatchSingleFusionPattern(const session::K if (AnfAlgo::GetKernelType(cnode) == KernelType::TBE_KERNEL && AnfAlgo::GetFusionType(cnode) == kernel::FusionType::ELEMWISE) { auto eltwise_input = cnode->input(1); - if (AnfAlgo::GetCNodeName(cnode) == kReluV2OpName || AnfAlgo::CheckPrimitiveType(cnode, prim::kPrimRelu)) { - if (eltwise_input->isa() && AnfAlgo::CheckPrimitiveType(eltwise_input, prim::kPrimTensorAdd)) { - MatchBnupdateAddRelu(cnode, eltwise_input, kernel_graph, candidate_fusion); - } + if (eltwise_input->isa() && AnfAlgo::CheckPrimitiveType(eltwise_input, prim::kPrimTensorAdd)) { + MatchBnupdateAddRelu(cnode, eltwise_input, kernel_graph, candidate_fusion); } } } diff --git a/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/bnupdate_eltwise_fusion_pass.cc b/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/bnupdate_eltwise_fusion_pass.cc index 348504345a..1f7fef9e62 100644 --- a/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/bnupdate_eltwise_fusion_pass.cc +++ b/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/bnupdate_eltwise_fusion_pass.cc @@ -34,12 +34,17 @@ void BnupdateEltwiseFusionPass::MatchBnupdateRelu(const CNodePtr &cnode, const A MS_EXCEPTION_IF_NULL(candidate_fusion); auto manager = kernel_graph.manager(); MS_EXCEPTION_IF_NULL(manager); + MS_EXCEPTION_IF_NULL(relu_input); auto getitem = relu_input->cast(); + MS_EXCEPTION_IF_NULL(getitem); auto bnupdate = getitem->input(1); + MS_EXCEPTION_IF_NULL(bnupdate); if (bnupdate->isa() && AnfAlgo::GetCNodeName(bnupdate) == kBNTrainingUpdateOpName) { std::vector output_used_num(AnfAlgo::GetOutputTensorNum(bnupdate), 0); for (auto out_getitem : manager->node_users()[bnupdate]) { + MS_EXCEPTION_IF_NULL(out_getitem.first); auto 
out_getitem_ptr = out_getitem.first->cast(); + MS_EXCEPTION_IF_NULL(out_getitem_ptr); auto input2 = out_getitem_ptr->input(2); auto output_idx = GetValue(GetValueNode(input2)); output_used_num[output_idx] = SizeToInt(manager->node_users()[out_getitem.first].size()); @@ -65,10 +70,8 @@ void BnupdateEltwiseFusionPass::MatchSingleFusionPattern(const session::KernelGr if (AnfAlgo::GetKernelType(cnode) == KernelType::TBE_KERNEL && AnfAlgo::GetFusionType(cnode) == kernel::FusionType::ELEMWISE) { auto eltwise_input = cnode->input(1); - if (AnfAlgo::GetCNodeName(cnode) == kReluV2OpName || AnfAlgo::CheckPrimitiveType(cnode, prim::kPrimRelu)) { - if (eltwise_input->isa() && AnfAlgo::CheckPrimitiveType(eltwise_input, prim::kPrimTupleGetItem)) { - MatchBnupdateRelu(cnode, eltwise_input, kernel_graph, candidate_fusion); - } + if (eltwise_input->isa() && AnfAlgo::CheckPrimitiveType(eltwise_input, prim::kPrimTupleGetItem)) { + MatchBnupdateRelu(cnode, eltwise_input, kernel_graph, candidate_fusion); } } } diff --git a/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/conv2dbackprop_eltwise_eltwise_fusion_pass.cc b/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/conv2dbackprop_eltwise_eltwise_fusion_pass.cc index c90d2a17cd..6091eb572d 100644 --- a/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/conv2dbackprop_eltwise_eltwise_fusion_pass.cc +++ b/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/conv2dbackprop_eltwise_eltwise_fusion_pass.cc @@ -35,6 +35,7 @@ void Conv2DBackpropEltwiseEltwiseFusionPass::MatchConv2DBackpropInputEltwiseEltw MS_EXCEPTION_IF_NULL(manager); std::unordered_set record{cnode}; auto eltwise_input = cnode->input(1); + MS_EXCEPTION_IF_NULL(eltwise_input); if (CheckDoubleInEltWiseNode(manager.get(), eltwise_input)) { (void)record.insert(eltwise_input); } else { @@ -43,6 +44,7 @@ void Conv2DBackpropEltwiseEltwiseFusionPass::MatchConv2DBackpropInputEltwiseEltw auto input_cnode = eltwise_input->cast(); MS_EXCEPTION_IF_NULL(input_cnode); auto 
double_in_eltwise_input = input_cnode->input(1); + MS_EXCEPTION_IF_NULL(double_in_eltwise_input); if (!double_in_eltwise_input->isa() || !AnfAlgo::IsRealCNodeKernel(double_in_eltwise_input) || fusion_id_allocator->HasFusionIdAttr(double_in_eltwise_input)) { return; diff --git a/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/conv2dbackprop_eltwise_fusion_pass.cc b/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/conv2dbackprop_eltwise_fusion_pass.cc index a18d578f7f..963f1885fe 100644 --- a/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/conv2dbackprop_eltwise_fusion_pass.cc +++ b/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/conv2dbackprop_eltwise_fusion_pass.cc @@ -36,6 +36,7 @@ void Conv2DBackpropEltwiseFusionPass::MatchConv2DBackpropInputEltwise(const CNod MS_EXCEPTION_IF_NULL(manager); std::unordered_set record{cnode}; auto eltwise_input = cnode->input(1); + MS_EXCEPTION_IF_NULL(eltwise_input); if (!eltwise_input->isa() || !AnfAlgo::IsRealCNodeKernel(eltwise_input) || fusion_id_allocator->HasFusionIdAttr(eltwise_input)) { return; diff --git a/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/conv_bnreduce_fusion_pass.cc b/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/conv_bnreduce_fusion_pass.cc index 2b243dbdac..63e7dcf6b8 100644 --- a/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/conv_bnreduce_fusion_pass.cc +++ b/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/conv_bnreduce_fusion_pass.cc @@ -35,6 +35,7 @@ void ConvBnReduceFusionPass::MatchConvBnreduce(const CNodePtr &cnode, const sess auto manager = kernel_graph.manager(); MS_EXCEPTION_IF_NULL(manager); auto conv = cnode->input(1); + MS_EXCEPTION_IF_NULL(conv); if (conv->isa() && AnfAlgo::GetCNodeName(conv) == prim::kPrimConv2D->name()) { std::vector output_used_num{SizeToInt(manager->node_users()[conv].size())}; AnfAlgo::SetNodeAttr(kAttrOutputUsedNum, MakeValue(output_used_num), conv); diff --git a/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/conv_double_in_fusion_pass.cc 
b/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/conv_double_in_fusion_pass.cc index c4bfb96109..a126143811 100644 --- a/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/conv_double_in_fusion_pass.cc +++ b/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/conv_double_in_fusion_pass.cc @@ -35,6 +35,7 @@ void ConvDoubleInFusionPass::MatchConvDoubleInEltwise(const CNodePtr &cnode, con MS_EXCEPTION_IF_NULL(manager); std::unordered_set record{cnode}; auto eltwise_input = cnode->input(1); + MS_EXCEPTION_IF_NULL(eltwise_input); if (CheckDoubleInEltWiseNode(manager.get(), eltwise_input)) { (void)record.insert(eltwise_input); } else { @@ -43,6 +44,7 @@ void ConvDoubleInFusionPass::MatchConvDoubleInEltwise(const CNodePtr &cnode, con auto input_cnode = eltwise_input->cast(); MS_EXCEPTION_IF_NULL(input_cnode); auto double_in_eltwise_input = input_cnode->input(1); + MS_EXCEPTION_IF_NULL(double_in_eltwise_input); if (!double_in_eltwise_input->isa() || !AnfAlgo::IsRealCNodeKernel(double_in_eltwise_input) || fusion_id_allocator->HasFusionIdAttr(double_in_eltwise_input)) { return; diff --git a/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/conv_single_in_fusion_pass.cc b/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/conv_single_in_fusion_pass.cc index c07c30f11c..d83b32a888 100644 --- a/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/conv_single_in_fusion_pass.cc +++ b/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/conv_single_in_fusion_pass.cc @@ -44,6 +44,7 @@ void ConvSingleInFusionPass::MatchConvSingleInEltwise(const CNodePtr &cnode, con break; } } + MS_EXCEPTION_IF_NULL(eltwise_input); if (!eltwise_input->isa() || !AnfAlgo::IsRealCNodeKernel(eltwise_input) || fusion_id_allocator->HasFusionIdAttr(eltwise_input)) { return; diff --git a/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/depthwiseconv_eltwise_fusion_pass.cc b/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/depthwiseconv_eltwise_fusion_pass.cc index f485e901d8..98a6838bed 100644 --- 
a/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/depthwiseconv_eltwise_fusion_pass.cc +++ b/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/depthwiseconv_eltwise_fusion_pass.cc @@ -74,11 +74,8 @@ void DepthwiseConvEltwiseFusionPass::MatchSingleFusionPattern(const session::Ker if (AnfAlgo::GetKernelType(cnode) == KernelType::TBE_KERNEL && AnfAlgo::GetFusionType(cnode) == kernel::FusionType::ELEMWISE) { auto eltwise_input = cnode->input(1); - if (AnfAlgo::GetCNodeName(cnode) == kReluV2OpName || AnfAlgo::CheckPrimitiveType(cnode, prim::kPrimRelu)) { - if (eltwise_input->isa() && - AnfAlgo::CheckPrimitiveType(eltwise_input, prim::kPrimDepthwiseConv2dNative)) { - MatchDepthwiseConvRelu(cnode, kernel_graph, candidate_fusion, true); - } + if (eltwise_input->isa() && AnfAlgo::CheckPrimitiveType(eltwise_input, prim::kPrimDepthwiseConv2dNative)) { + MatchDepthwiseConvRelu(cnode, kernel_graph, candidate_fusion, true); } } else if (AnfAlgo::GetCNodeName(cnode) == prim::kPrimDepthwiseConv2dNative->name()) { MatchDepthwiseConvRelu(cnode, kernel_graph, candidate_fusion, false); diff --git a/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/eltwise_fusion_pass.cc b/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/eltwise_fusion_pass.cc index 42860de700..2f04e16692 100644 --- a/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/eltwise_fusion_pass.cc +++ b/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/eltwise_fusion_pass.cc @@ -35,6 +35,7 @@ void EltwiseFusionPass::MatchEltwise(const CNodePtr &cnode, const session::Kerne MS_EXCEPTION_IF_NULL(manager); std::unordered_set record{cnode}; auto eltwise_input = cnode->input(1); + MS_EXCEPTION_IF_NULL(eltwise_input); while (CheckEltWiseNode(manager.get(), eltwise_input)) { (void)record.insert(eltwise_input); if (record.size() == MAX_ELTWISE_SIZE) { @@ -55,7 +56,9 @@ void EltwiseFusionPass::MatchSingleFusionPattern(const session::KernelGraph &ker FusedNodeRecord *candidate_fusion) { MS_EXCEPTION_IF_NULL(candidate_fusion); 
std::vector node_list = TopoSort(kernel_graph.get_return()); + std::reverse(node_list.begin(), node_list.end()); for (auto &node : node_list) { + MS_EXCEPTION_IF_NULL(node); if (!AnfAlgo::IsRealCNodeKernel(node) || fusion_id_allocator->HasFusionIdAttr(node) || AnfAlgo::CheckPrimitiveType(node, prim::kPrimReturn)) { continue; diff --git a/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/fusion_base_pass.cc b/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/fusion_base_pass.cc index 3f5dd98112..a516f04442 100644 --- a/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/fusion_base_pass.cc +++ b/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/fusion_base_pass.cc @@ -25,6 +25,7 @@ namespace mindspore { namespace opt { bool FusionBasePass::CheckEltWiseNode(FuncGraphManager *manager, const AnfNodePtr &node) { MS_EXCEPTION_IF_NULL(manager); + MS_EXCEPTION_IF_NULL(node); if (!node->isa() || !AnfAlgo::IsRealCNodeKernel(node) || fusion_id_allocator->HasFusionIdAttr(node)) { return false; } @@ -38,6 +39,7 @@ bool FusionBasePass::CheckEltWiseNode(FuncGraphManager *manager, const AnfNodePt bool FusionBasePass::CheckDoubleInEltWiseNode(FuncGraphManager *manager, const AnfNodePtr &node) { MS_EXCEPTION_IF_NULL(manager); + MS_EXCEPTION_IF_NULL(node); if (!node->isa() || !AnfAlgo::IsRealCNodeKernel(node) || fusion_id_allocator->HasFusionIdAttr(node)) { return false; } @@ -49,6 +51,20 @@ bool FusionBasePass::CheckDoubleInEltWiseNode(FuncGraphManager *manager, const A cnode->inputs().size() == ELTWISE_DOUBLE_IN_INPUT_SIZE; } +bool FusionBasePass::CheckMultiOutputEltWiseNode(FuncGraphManager *manager, const AnfNodePtr &node) { + MS_EXCEPTION_IF_NULL(manager); + MS_EXCEPTION_IF_NULL(node); + if (!node->isa() || !AnfAlgo::IsRealCNodeKernel(node) || fusion_id_allocator->HasFusionIdAttr(node)) { + return false; + } + auto cnode = node->cast(); + MS_EXCEPTION_IF_NULL(cnode); + auto user_nodes = manager->node_users()[node]; + return AnfAlgo::GetKernelType(node) == KernelType::TBE_KERNEL && 
+ AnfAlgo::GetFusionType(node) == kernel::FusionType::ELEMWISE && user_nodes.size() == ELTWISE_MULTI_USE && + cnode->inputs().size() == ELTWISE_INPUT_SIZE; +} + void FusionBasePass::SetRecordFusionId(const std::unordered_set &record) { auto id = fusion_id_allocator->AllocateFusionId(); for (auto node : record) { diff --git a/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/fusion_base_pass.h b/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/fusion_base_pass.h index 421efa9716..8d6eca774c 100644 --- a/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/fusion_base_pass.h +++ b/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/fusion_base_pass.h @@ -33,8 +33,12 @@ const int8_t MAX_ELTWISE_NUM = 3; const int8_t MIN_ELTWISE_SIZE = 2; const int8_t ELTWISE_INPUT_SIZE = 2; const int8_t ELTWISE_DOUBLE_IN_INPUT_SIZE = 3; +const int8_t CONV_DOUBLE_IN_INPUT_SIZE = 3; +const int8_t CONV_QUART_IN_INPUT_SIZE = 5; const int8_t ELTWISE_USE = 1; +const int8_t ELTWISE_MULTI_USE = 2; const int8_t MAX_ELTWISE_SIZE = 6; +const int8_t MULTI_ELTWISE_SIZE = 4; using FusedNodeRecord = std::vector>; struct BufferFusionInfo_t { @@ -58,6 +62,7 @@ class FusionBasePass : public Pass { void SetRecordFusionId(const std::unordered_set &record); bool CheckEltWiseNode(FuncGraphManager *manager, const AnfNodePtr &node); bool CheckDoubleInEltWiseNode(FuncGraphManager *manager, const AnfNodePtr &node); + bool CheckMultiOutputEltWiseNode(FuncGraphManager *manager, const AnfNodePtr &node); FusionIdAllocatorPtr fusion_id_allocator; }; } // namespace opt diff --git a/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/matmul_eltwise_fusion_pass.cc b/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/matmul_eltwise_fusion_pass.cc index 41b17eba04..d1ef5dc83b 100644 --- a/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/matmul_eltwise_fusion_pass.cc +++ b/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/matmul_eltwise_fusion_pass.cc @@ -55,6 +55,7 @@ void 
MatmulEltwiseFusionPass::MatchSingleFusionPattern(const session::KernelGrap if (AnfAlgo::GetKernelType(cnode) == KernelType::TBE_KERNEL && AnfAlgo::GetFusionType(cnode) == kernel::FusionType::ELEMWISE) { auto eltwise_input = cnode->input(1); + MS_EXCEPTION_IF_NULL(eltwise_input); if (eltwise_input->isa() && AnfAlgo::CheckPrimitiveType(eltwise_input, prim::kPrimMatMul)) { MatchMatmulEltwise(cnode, eltwise_input, kernel_graph, candidate_fusion); } diff --git a/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/multi_output_fusion_pass.cc b/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/multi_output_fusion_pass.cc new file mode 100644 index 0000000000..be4d2af1cb --- /dev/null +++ b/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/multi_output_fusion_pass.cc @@ -0,0 +1,84 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "pre_activate/ascend/buffer_fusion/multi_output_fusion_pass.h" +#include +#include +#include +#include +#include "kernel/kernel_fusion.h" +#include "debug/anf_ir_dump.h" +#include "session/anf_runtime_algorithm.h" +#include "operator/ops.h" +#include "utils/context/ms_context.h" +#include "pre_activate/common/fusion_id_allocator.h" + +namespace mindspore { +namespace opt { +void MultiOutputFusionPass::MatchMultiOutputEltwise(const CNodePtr &cnode, const session::KernelGraph &kernel_graph, + FusedNodeRecord *candidate_fusion) { + MS_EXCEPTION_IF_NULL(cnode); + MS_EXCEPTION_IF_NULL(candidate_fusion); + auto manager = kernel_graph.manager(); + MS_EXCEPTION_IF_NULL(manager); + std::unordered_set record{cnode}; + auto eltwise_input = cnode->input(1); + MS_EXCEPTION_IF_NULL(eltwise_input); + if (CheckMultiOutputEltWiseNode(manager.get(), eltwise_input)) { + std::vector output_used_num{SizeToInt(manager->node_users()[eltwise_input].size())}; + AnfAlgo::SetNodeAttr(kAttrOutputUsedNum, MakeValue(output_used_num), eltwise_input); + (void)record.insert(eltwise_input); + auto input_cnode = eltwise_input->cast(); + MS_EXCEPTION_IF_NULL(input_cnode); + eltwise_input = input_cnode->input(1); + } else { + return; + } + while (CheckEltWiseNode(manager.get(), eltwise_input)) { + (void)record.insert(eltwise_input); + if (record.size() == MULTI_ELTWISE_SIZE) { + break; + } + auto input_cnode = eltwise_input->cast(); + MS_EXCEPTION_IF_NULL(input_cnode); + eltwise_input = input_cnode->input(1); + } + if (record.size() != MULTI_ELTWISE_SIZE) { + return; + } + candidate_fusion->push_back(record); + SetRecordFusionId(record); +} + +void MultiOutputFusionPass::MatchSingleFusionPattern(const session::KernelGraph &kernel_graph, + FusedNodeRecord *candidate_fusion) { + MS_EXCEPTION_IF_NULL(candidate_fusion); + std::vector node_list = TopoSort(kernel_graph.get_return()); + std::reverse(node_list.begin(), node_list.end()); + for (auto &node : node_list) { + if 
(!AnfAlgo::IsRealCNodeKernel(node) || fusion_id_allocator->HasFusionIdAttr(node) || + AnfAlgo::CheckPrimitiveType(node, prim::kPrimReturn)) { + continue; + } + auto cnode = node->cast(); + MS_EXCEPTION_IF_NULL(cnode); + if (AnfAlgo::GetKernelType(cnode) == KernelType::TBE_KERNEL && + AnfAlgo::GetFusionType(cnode) == kernel::FusionType::ELEMWISE && cnode->inputs().size() == ELTWISE_INPUT_SIZE) { + MatchMultiOutputEltwise(cnode, kernel_graph, candidate_fusion); + } + } +} +} // namespace opt +} // namespace mindspore diff --git a/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/multi_output_fusion_pass.h b/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/multi_output_fusion_pass.h new file mode 100644 index 0000000000..0e2510128a --- /dev/null +++ b/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/multi_output_fusion_pass.h @@ -0,0 +1,48 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_BUFFER_FUSION_PASS_MULTI_OUTPUT_FUSION_PASS_H_ +#define MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_BUFFER_FUSION_PASS_MULTI_OUTPUT_FUSION_PASS_H_ + +#include +#include + +#include "pre_activate/ascend/buffer_fusion/fusion_base_pass.h" +#include "ir/anf.h" +#include "pre_activate/common/pass.h" +#include "pre_activate/common/fusion_id_allocator.h" +#include "device/kernel_info.h" +#include "kernel/kernel.h" +#include "session/kernel_graph.h" + +namespace mindspore { +namespace opt { +using FusedNodeRecord = std::vector>; + +class MultiOutputFusionPass : public FusionBasePass { + public: + explicit MultiOutputFusionPass(FusionIdAllocatorPtr idAllocator) + : FusionBasePass("MultiOutputFusionPass", idAllocator) {} + ~MultiOutputFusionPass() override = default; + void MatchSingleFusionPattern(const session::KernelGraph &kernel_graph, FusedNodeRecord *candidate_fusion) override; + + private: + void MatchMultiOutputEltwise(const CNodePtr &cnode, const session::KernelGraph &kernel_graph, + FusedNodeRecord *candidate_fusion); +}; +} // namespace opt +} // namespace mindspore + +#endif // MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_BUFFER_FUSION_PASS_MULTI_OUTPUT_FUSION_PASS_H_ diff --git a/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/reduce_eltwise_fusion_pass.cc b/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/reduce_eltwise_fusion_pass.cc index 2293754106..623f0e3426 100644 --- a/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/reduce_eltwise_fusion_pass.cc +++ b/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/reduce_eltwise_fusion_pass.cc @@ -45,6 +45,7 @@ void ReduceEltwiseFusionPass::MatchReduceEltwise(const CNodePtr &cnode, const se break; } } + MS_EXCEPTION_IF_NULL(eltwise_input); if (!eltwise_input->isa() || !AnfAlgo::IsRealCNodeKernel(eltwise_input) || fusion_id_allocator->HasFusionIdAttr(eltwise_input)) { return; diff --git a/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/segment_eltwise_fusion_pass.cc 
b/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/segment_eltwise_fusion_pass.cc index 1926d64c61..0dcf2362bc 100644 --- a/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/segment_eltwise_fusion_pass.cc +++ b/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/segment_eltwise_fusion_pass.cc @@ -44,6 +44,7 @@ void SegmentEltwiseFusionPass::MatchSegmentEltwise(const CNodePtr &cnode, const break; } } + MS_EXCEPTION_IF_NULL(eltwise_input); if (!eltwise_input->isa() || !AnfAlgo::IsRealCNodeKernel(eltwise_input) || fusion_id_allocator->HasFusionIdAttr(eltwise_input)) { return; @@ -73,6 +74,7 @@ void SegmentEltwiseFusionPass::MatchSingleFusionPattern(const session::KernelGra FusedNodeRecord *candidate_fusion) { MS_EXCEPTION_IF_NULL(candidate_fusion); std::vector node_list = TopoSort(kernel_graph.get_return()); + std::reverse(node_list.begin(), node_list.end()); for (auto &node : node_list) { if (!AnfAlgo::IsRealCNodeKernel(node) || fusion_id_allocator->HasFusionIdAttr(node) || AnfAlgo::CheckPrimitiveType(node, prim::kPrimReturn)) { diff --git a/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/stridedread_conv_stridedwrite_fusion_pass.cc b/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/stridedread_conv_stridedwrite_fusion_pass.cc new file mode 100644 index 0000000000..5bc0fdced7 --- /dev/null +++ b/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/stridedread_conv_stridedwrite_fusion_pass.cc @@ -0,0 +1,89 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "pre_activate/ascend/buffer_fusion/stridedread_conv_stridedwrite_fusion_pass.h" + +#include +#include +#include +#include +#include "kernel/kernel_fusion.h" +#include "debug/anf_ir_dump.h" +#include "session/anf_runtime_algorithm.h" +#include "operator/ops.h" +#include "utils/context/ms_context.h" +#include "pre_activate/common/fusion_id_allocator.h" + +namespace mindspore { +namespace opt { +void StridedReadConvStridedWriteFusionPass::MatchStridedReadConvStridedWrite(const CNodePtr &cnode, + const session::KernelGraph &kernel_graph, + FusedNodeRecord *candidate_fusion) { + MS_EXCEPTION_IF_NULL(cnode); + MS_EXCEPTION_IF_NULL(candidate_fusion); + auto manager = kernel_graph.manager(); + MS_EXCEPTION_IF_NULL(manager); + std::unordered_set record{cnode}; + auto write_input = cnode->input(1); + if (CheckEltWiseNode(manager.get(), write_input)) { + (void)record.insert(write_input); + auto input_cnode = write_input->cast(); + MS_EXCEPTION_IF_NULL(input_cnode); + write_input = input_cnode->input(1); + } + MS_EXCEPTION_IF_NULL(write_input); + if (!write_input->isa() || !AnfAlgo::IsRealCNodeKernel(write_input) || + fusion_id_allocator->HasFusionIdAttr(write_input)) { + return; + } + auto conv_cnode = write_input->cast(); + MS_EXCEPTION_IF_NULL(conv_cnode); + if (AnfAlgo::GetKernelType(conv_cnode) == KernelType::TBE_KERNEL && + AnfAlgo::GetFusionType(conv_cnode) == kernel::FusionType::CONVLUTION && + conv_cnode->inputs().size() >= CONV_DOUBLE_IN_INPUT_SIZE && + conv_cnode->inputs().size() <= CONV_QUART_IN_INPUT_SIZE) { + (void)record.insert(write_input); + auto conv_input = conv_cnode->input(1); + MS_EXCEPTION_IF_NULL(conv_input); + if (!conv_input->isa() || !AnfAlgo::IsRealCNodeKernel(conv_input) || + fusion_id_allocator->HasFusionIdAttr(conv_input)) { + return; + } + if (AnfAlgo::GetCNodeName(conv_input) == kStridedReadOpName) { + 
(void)record.insert(conv_input); + candidate_fusion->push_back(record); + SetRecordFusionId(record); + } + } +} + +void StridedReadConvStridedWriteFusionPass::MatchSingleFusionPattern(const session::KernelGraph &kernel_graph, + FusedNodeRecord *candidate_fusion) { + MS_EXCEPTION_IF_NULL(candidate_fusion); + std::vector node_list = TopoSort(kernel_graph.get_return()); + for (auto &node : node_list) { + if (!AnfAlgo::IsRealCNodeKernel(node) || fusion_id_allocator->HasFusionIdAttr(node) || + AnfAlgo::CheckPrimitiveType(node, prim::kPrimReturn)) { + continue; + } + auto cnode = node->cast(); + MS_EXCEPTION_IF_NULL(cnode); + if (AnfAlgo::GetCNodeName(cnode) == kStridedWriteOpName) { + MatchStridedReadConvStridedWrite(cnode, kernel_graph, candidate_fusion); + } + } +} +} // namespace opt +} // namespace mindspore diff --git a/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/stridedread_conv_stridedwrite_fusion_pass.h b/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/stridedread_conv_stridedwrite_fusion_pass.h new file mode 100644 index 0000000000..c6c5fe88dc --- /dev/null +++ b/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/stridedread_conv_stridedwrite_fusion_pass.h @@ -0,0 +1,48 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_BUFFER_FUSION_STRIDEDREAD_CONV_STRIDEDWRITE_FUSION_PASS_H_ +#define MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_BUFFER_FUSION_STRIDEDREAD_CONV_STRIDEDWRITE_FUSION_PASS_H_ + +#include +#include + +#include "pre_activate/ascend/buffer_fusion/fusion_base_pass.h" +#include "ir/anf.h" +#include "pre_activate/common/pass.h" +#include "pre_activate/common/fusion_id_allocator.h" +#include "device/kernel_info.h" +#include "kernel/kernel.h" +#include "session/kernel_graph.h" + +namespace mindspore { +namespace opt { +using FusedNodeRecord = std::vector>; + +class StridedReadConvStridedWriteFusionPass : public FusionBasePass { + public: + explicit StridedReadConvStridedWriteFusionPass(FusionIdAllocatorPtr idAllocator) + : FusionBasePass("StridedReadConvStridedWriteFusionPass", idAllocator) {} + ~StridedReadConvStridedWriteFusionPass() override = default; + void MatchSingleFusionPattern(const session::KernelGraph &kernel_graph, FusedNodeRecord *candidate_fusion) override; + + private: + void MatchStridedReadConvStridedWrite(const CNodePtr &cnode, const session::KernelGraph &kernel_graph, + FusedNodeRecord *candidate_fusion); +}; +} // namespace opt +} // namespace mindspore + +#endif // MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_BUFFER_FUSION_STRIDEDREAD_CONV_STRIDEDWRITE_FUSION_PASS_H_ diff --git a/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/ub_pattern_fusion.cc b/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/ub_pattern_fusion.cc index af20c47996..faa5169c40 100644 --- a/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/ub_pattern_fusion.cc +++ b/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/ub_pattern_fusion.cc @@ -206,6 +206,7 @@ void ReplaceOldNode(std::unordered_map *buffer_fusi void GetFusionScopeComputeNodeList(session::KernelGraph *kernel_graph, std::unordered_map *buffer_fusion_infos) { MS_EXCEPTION_IF_NULL(buffer_fusion_infos); + MS_EXCEPTION_IF_NULL(kernel_graph); auto nodes = TopoSort(kernel_graph->get_return()); for 
(auto &node : nodes) { MS_EXCEPTION_IF_NULL(node); @@ -231,6 +232,7 @@ void GetFusionScopeInputNodeList(const session::KernelGraph &kernel_graph, auto fusion_info = buffer_fusion_info.second; for (const auto &node : fusion_info.anf_nodes) { auto cnode = node->cast(); + MS_EXCEPTION_IF_NULL(cnode); for (size_t idx = 1; idx < cnode->inputs().size(); ++idx) { auto real_input = AnfAlgo::VisitKernel(cnode->input(idx), 0); if (std::find(fusion_info.anf_nodes.begin(), fusion_info.anf_nodes.end(), real_input.first) == @@ -253,6 +255,14 @@ bool TupleGetitemNodeCompare(const AnfNodePtr &node1, const AnfNodePtr &node2) { auto getitem2 = node2->cast(); MS_EXCEPTION_IF_NULL(getitem1); MS_EXCEPTION_IF_NULL(getitem2); + if (getitem1->size() < kTupleGetItemInputSize) { + MS_LOG(EXCEPTION) << "node's input size less than " << kTupleGetItemInputSize << ", getitem1[" + << getitem1->DebugString() << "]"; + } + if (getitem2->size() < kTupleGetItemInputSize) { + MS_LOG(EXCEPTION) << "node's input size less than " << kTupleGetItemInputSize << ", getitem2[" + << getitem2->DebugString() << "]"; + } auto output_idx1 = GetValue(GetValueNode(getitem1->input(2))); auto output_idx2 = GetValue(GetValueNode(getitem2->input(2))); return output_idx1 < output_idx2; @@ -285,6 +295,7 @@ void GetFusionScopeOutputNodeList(session::KernelGraph *kernel_graph, [](const std::pair &use_node) { return use_node.first; }); std::sort(tuple_getitem_nodes.begin(), tuple_getitem_nodes.end(), TupleGetitemNodeCompare); for (auto getitem : tuple_getitem_nodes) { + MS_EXCEPTION_IF_NULL(getitem); auto getitem_ptr = getitem->cast(); auto input2 = getitem_ptr->input(2); auto output_idx = GetValue(GetValueNode(input2)); @@ -313,6 +324,7 @@ void SetFusionOpRefInfos(session::KernelGraph *kernel_graph, const std::vectorisa() && AnfAlgo::GetCNodeName(output) == prim::kPrimTupleGetItem->name()) { auto real_output = AnfAlgo::VisitKernel(output, 0); auto output_cnode = output->cast(); @@ -393,6 +405,7 @@ bool 
UbPatternFusion::FuseBufferFusionPattern(session::KernelGraph *kernel_graph bool UbPatternFusion::ReplaceFusionOp(std::unordered_map *buffer_fusion_infos, int32_t fusion_id, const kernel::KernelModPtr &kernel_ptr, session::KernelGraph *kernel_graph) const { + MS_EXCEPTION_IF_NULL(buffer_fusion_infos); auto buffer_fusion_info = (*buffer_fusion_infos)[fusion_id]; auto buffer_fusion = CreateFusionOp(buffer_fusion_info.inputs_list, buffer_fusion_info.outputs_list, buffer_fusion_info.anf_nodes, kernel_graph); diff --git a/mindspore/ccsrc/pre_activate/ascend/enhancer/add_memcpy_async.cc b/mindspore/ccsrc/pre_activate/ascend/enhancer/add_memcpy_async.cc deleted file mode 100644 index 51f6732c66..0000000000 --- a/mindspore/ccsrc/pre_activate/ascend/enhancer/add_memcpy_async.cc +++ /dev/null @@ -1,75 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include "pre_activate/ascend/enhancer/add_memcpy_async.h" -#include -#include "utils/utils.h" -#include "session/anf_runtime_algorithm.h" -#include "optimizer/opt.h" -#include "pre_activate/ascend/ascend_helper.h" - -namespace mindspore { -namespace opt { -namespace { -bool InputIsParameterOrValueNode(const AnfNodePtr &node) { - MS_EXCEPTION_IF_NULL(node); - auto kernel_with_index = AnfAlgo::VisitKernelWithReturnType(node, 0, true); - return kernel_with_index.first->isa() || kernel_with_index.first->isa(); -} - -const AnfNodePtr AddMemcpyAsyncIfInputIsUsedByOthers(const FuncGraphPtr &graph, const CNodePtr &node) { - MS_EXCEPTION_IF_NULL(graph); - MS_EXCEPTION_IF_NULL(node); - auto manager = graph->manager(); - MS_EXCEPTION_IF_NULL(manager); - const std::vector &inputs = node->inputs(); - bool replace = false; - if (inputs.empty()) { - MS_LOG(EXCEPTION) << "node[" + AnfAlgo::GetCNodeName(node) + "]'s inputs is empty"; - } - std::vector new_inputs = {inputs[0]}; - for (size_t i = 1; i < inputs.size(); ++i) { - auto input = node->input(i); - if (manager->node_users().find(input) == manager->node_users().end()) { - MS_LOG(EXCEPTION) << "node has no output in manager"; - } - // when input is used by others or is a parameter or is a value node, insert a memcpy_async - if (manager->node_users()[input].size() > 1 || InputIsParameterOrValueNode(input)) { - replace = true; - new_inputs.push_back(CreateMemcpyAsyncOp(graph, input)); - } else { - new_inputs.push_back(input); - } - } - - CNodePtr new_node = std::make_shared(*node); - new_node->set_inputs(new_inputs); - return replace ? 
new_node : nullptr; -} -} // namespace - -const AnfNodePtr AddMemcpyAsync::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node, - const EquivPtr &) const { - if (func_graph == nullptr || node == nullptr || !node->isa()) { - return nullptr; - } - auto cnode = node->cast(); - if (!AnfAlgo::IsCommunicationOp(node)) { - return nullptr; - } - return AddMemcpyAsyncIfInputIsUsedByOthers(func_graph, cnode); -} -} // namespace opt -} // namespace mindspore diff --git a/mindspore/ccsrc/pre_activate/ascend/enhancer/insert_memcpy_async_for_hccl_op.cc b/mindspore/ccsrc/pre_activate/ascend/enhancer/insert_memcpy_async_for_hccl_op.cc new file mode 100644 index 0000000000..63ea59d744 --- /dev/null +++ b/mindspore/ccsrc/pre_activate/ascend/enhancer/insert_memcpy_async_for_hccl_op.cc @@ -0,0 +1,144 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "pre_activate/ascend/enhancer/insert_memcpy_async_for_hccl_op.h" +#include +#include +#include +#include "utils/utils.h" +#include "session/anf_runtime_algorithm.h" +#include "optimizer/opt.h" +#include "pre_activate/ascend/ascend_helper.h" + +namespace mindspore { +namespace opt { +namespace { +// insert memcpy for some cnode even if not a Ref cnode +const std::set kNeedInsertMemcpyOpSet = {kLambNextMVOpName, kLambNextMVWithDecayOpName, + kLambUpdateWithLROpName}; + +bool IsParameterOrValueNode(const AnfNodePtr &node) { + MS_EXCEPTION_IF_NULL(node); + auto kernel_with_index = AnfAlgo::VisitKernelWithReturnType(node, 0, true); + return kernel_with_index.first->isa() || kernel_with_index.first->isa(); +} + +void TransferControl(const CNodePtr &hccl_node, const AnfNodePtr &memcpy_async, const FuncGraphPtr &graph) { + MS_EXCEPTION_IF_NULL(hccl_node); + MS_EXCEPTION_IF_NULL(memcpy_async); + MS_EXCEPTION_IF_NULL(graph); + auto manager = graph->manager(); + MS_EXCEPTION_IF_NULL(manager); + auto &node_users = manager->node_users(); + auto iter = node_users.find(hccl_node); + if (iter == node_users.end()) { + MS_LOG(EXCEPTION) << "node has no output in manager"; + } + // find hccl_node's output which is a control depend + for (const auto &node_index : iter->second) { + AnfNodePtr output = node_index.first; + int output_index = node_index.second; + if (AnfAlgo::CheckPrimitiveType(output, prim::kPrimControlDepend)) { + CNodePtr control_depend = output->cast(); + MS_EXCEPTION_IF_NULL(control_depend); + std::vector new_inputs; + for (size_t i = 0; i < control_depend->size(); ++i) { + if (i == IntToSize(output_index)) { + new_inputs.push_back(memcpy_async); + } else { + new_inputs.push_back(control_depend->input(i)); + } + } + control_depend->set_inputs(new_inputs); + } + } +} +} // namespace + +bool InsertMemcpyAsyncForHcclOp::NeedInsertMemcpy(const FuncGraphPtr &graph, const AnfNodePtr &input) const { + MS_EXCEPTION_IF_NULL(graph); + 
 MS_EXCEPTION_IF_NULL(input); + // when input is a parameter or is a value node + if (IsParameterOrValueNode(input)) { + return true; + } + + // when input is a Ref or some special cnodes + if (kernel_query_->IsTbeRef(input) || + kNeedInsertMemcpyOpSet.find(AnfAlgo::GetCNodeName(input)) != kNeedInsertMemcpyOpSet.end()) { + return true; + } + + auto manager = graph->manager(); + MS_EXCEPTION_IF_NULL(manager); + auto &node_users = manager->node_users(); + auto iter = node_users.find(input); + if (iter == node_users.end()) { + MS_LOG(EXCEPTION) << "node has no output in manager"; + } + // when input is used by others + if (iter->second.size() > 1) { + return true; + } + return false; +} + +void InsertMemcpyAsyncForHcclOp::InsertMemcpyAsync(const FuncGraphPtr &graph, const CNodePtr &hccl_node) const { + MS_EXCEPTION_IF_NULL(graph); + MS_EXCEPTION_IF_NULL(hccl_node); + bool has_insert_memcpy = false; + AnfNodePtr memcpy_async = nullptr; + std::vector new_inputs = {hccl_node->input(0)}; + for (size_t i = 1; i < hccl_node->size(); ++i) { + auto input = hccl_node->input(i); + if (NeedInsertMemcpy(graph, input)) { + memcpy_async = CreateMemcpyAsyncOp(graph, input); + has_insert_memcpy = true; + new_inputs.push_back(memcpy_async); + } else { + new_inputs.push_back(input); + } + } + + if (has_insert_memcpy) { + CNodePtr new_hccl_node = std::make_shared(*hccl_node); + new_hccl_node->set_inputs(new_inputs); + auto manager = graph->manager(); + MS_EXCEPTION_IF_NULL(manager); + MS_LOG(DEBUG) << "start replace new_hccl_node to old hccl_node"; + (void)manager->Replace(hccl_node, new_hccl_node); + MS_LOG(DEBUG) << "end replace"; + + // transfer hccl op's control to the memcpy_async + if (hccl_node->size() == 2) { + TransferControl(new_hccl_node, memcpy_async, graph); + } + } +} + +const AnfNodePtr InsertMemcpyAsyncForHcclOp::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node, + const EquivPtr &) const { + if (func_graph == nullptr || node == nullptr || !node->isa()) { 
return nullptr; + } + auto cnode = node->cast(); + if (!AnfAlgo::IsCommunicationOp(node)) { + return nullptr; + } + InsertMemcpyAsync(func_graph, cnode); + return nullptr; +} +} // namespace opt +} // namespace mindspore diff --git a/mindspore/ccsrc/pre_activate/ascend/enhancer/insert_memcpy_async_for_hccl_op.h b/mindspore/ccsrc/pre_activate/ascend/enhancer/insert_memcpy_async_for_hccl_op.h new file mode 100644 index 0000000000..e2f3b781ed --- /dev/null +++ b/mindspore/ccsrc/pre_activate/ascend/enhancer/insert_memcpy_async_for_hccl_op.h @@ -0,0 +1,40 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_ENHANCER_INSERT_MEMCPY_ASYNC_FOR_HCCL_OP_H_ +#define MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_ENHANCER_INSERT_MEMCPY_ASYNC_FOR_HCCL_OP_H_ + +#include +#include "pre_activate/common/optimizer.h" +#include "pre_activate/ascend/ascend_helper.h" + +namespace mindspore { +namespace opt { +class InsertMemcpyAsyncForHcclOp : public PatternProcessPass { + public: + explicit InsertMemcpyAsyncForHcclOp(bool multigraph = true) + : PatternProcessPass("insert_memcpy_async_for_hccl_op", multigraph), + kernel_query_(std::make_shared()) {} + ~InsertMemcpyAsyncForHcclOp() override = default; + const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override; + + private: + void InsertMemcpyAsync(const FuncGraphPtr &graph, const CNodePtr &hccl_node) const; + bool NeedInsertMemcpy(const FuncGraphPtr &graph, const AnfNodePtr &input) const; + KernelQueryPtr kernel_query_; +}; +} // namespace opt +} // namespace mindspore +#endif // MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_ENHANCER_INSERT_MEMCPY_ASYNC_FOR_HCCL_OP_H_ diff --git a/mindspore/ccsrc/pre_activate/ascend/format_type/check_consistency.cc b/mindspore/ccsrc/pre_activate/ascend/format_type/check_consistency.cc index d2557a4bb7..7c8fb70fda 100644 --- a/mindspore/ccsrc/pre_activate/ascend/format_type/check_consistency.cc +++ b/mindspore/ccsrc/pre_activate/ascend/format_type/check_consistency.cc @@ -17,9 +17,12 @@ #include #include +#include #include "utils/utils.h" #include "session/anf_runtime_algorithm.h" +#include "common/utils.h" +#include "kernel/common_utils.h" namespace mindspore { namespace opt { @@ -74,11 +77,21 @@ const AnfNodePtr CheckConsistency::Process(const FuncGraphPtr &, const AnfNodePt if (node == nullptr || !node->isa() || !AnfAlgo::IsRealKernel(node)) { return nullptr; } - CNodePtr cnode = node->cast(); - for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(cnode); i++) { - if (!CheckFormatForConsistency(cnode, i) || 
!CheckDataTypeForConsistency(cnode, i)) { - MS_LOG(EXCEPTION) << "Found inconsistent format or data type! Op: " << AnfAlgo::GetCNodeName(node) << "[" - << node->DebugString() << "]"; + + std::vector todos = {node}; + if (AnfAlgo::IsGraphKernel(node)) { + auto sub_graph = AnfAlgo::GetCNodeFuncGraphPtr(node); + MS_EXCEPTION_IF_NULL(sub_graph); + kernel::GetValidKernelNodes(sub_graph, &todos); + } + + for (auto &t : todos) { + CNodePtr cnode = t->cast(); + for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(cnode); i++) { + if (!CheckFormatForConsistency(cnode, i) || !CheckDataTypeForConsistency(cnode, i)) { + MS_LOG(EXCEPTION) << "Found inconsistent format or data type! Op: " << AnfAlgo::GetCNodeName(cnode) << "[" + << cnode->DebugString() << "]"; + } } } return nullptr; diff --git a/mindspore/ccsrc/pre_activate/ascend/format_type/convert_unsupported_transnode_to_aicpu.cc b/mindspore/ccsrc/pre_activate/ascend/format_type/convert_unsupported_transnode_to_aicpu.cc index 5b5bf7e4fc..c0f99ed415 100644 --- a/mindspore/ccsrc/pre_activate/ascend/format_type/convert_unsupported_transnode_to_aicpu.cc +++ b/mindspore/ccsrc/pre_activate/ascend/format_type/convert_unsupported_transnode_to_aicpu.cc @@ -34,13 +34,13 @@ const AnfNodePtr ConvertUnSupportNodeToAICPU::Process(const mindspore::FuncGraph return nullptr; } auto node_name = AnfAlgo::GetCNodeName(node); - if (node_name != prim::KPrimTransData->name() || node_name != prim::kPrimCast->name()) { + if (node_name != prim::KPrimTransData->name() && node_name != prim::kPrimCast->name()) { return nullptr; } auto kernel_builder_info = AnfAlgo::GetSelectKernelBuildInfo(node); - if (supported_checker_->CheckAiCoreSupported(node, kernel_builder_info)) { - return node; - } else if (supported_checker_->CheckAiCpuSupported(node, kernel_builder_info)) { + if (supported_checker_->CheckAICoreSupported(node, kernel_builder_info)) { + return nullptr; + } else if (supported_checker_->CheckAICPUSupported(node, kernel_builder_info)) { auto builder 
= std::make_shared(kernel_builder_info); builder->SetKernelType(AICPU_KERNEL); AnfAlgo::SetSelectKernelBuildInfo(builder->Build(), node.get()); @@ -49,7 +49,7 @@ const AnfNodePtr ConvertUnSupportNodeToAICPU::Process(const mindspore::FuncGraph MS_LOG(EXCEPTION) << " kernel " << kernel_builder_info->ToString() << "is not supported in AiCPU & AiCore : node [" << node->DebugString() << "]"; } - return node; + return nullptr; } } // namespace opt } // namespace mindspore diff --git a/mindspore/ccsrc/pre_activate/ascend/format_type/deal_ref_trans_and_cast.cc b/mindspore/ccsrc/pre_activate/ascend/format_type/deal_ref_trans_and_cast.cc index 43857dddfd..f909dae9e4 100644 --- a/mindspore/ccsrc/pre_activate/ascend/format_type/deal_ref_trans_and_cast.cc +++ b/mindspore/ccsrc/pre_activate/ascend/format_type/deal_ref_trans_and_cast.cc @@ -31,13 +31,14 @@ session::KernelWithIndex FindRefOriginNode(const AnfNodePtr &node) { session::KernelWithIndex kernel_with_index = AnfAlgo::VisitKernel(node, 0); AnfNodePtr cur_node = kernel_with_index.first; size_t cur_out_index = kernel_with_index.second; + MS_EXCEPTION_IF_NULL(cur_node); if (cur_node->isa()) { - auto cnode = node->cast(); + auto cnode = cur_node->cast(); MS_EXCEPTION_IF_NULL(cnode); std::string op_name = AnfAlgo::GetCNodeName(cnode); auto op_info = mindspore::kernel::OpLib::FindOp(op_name, kernel::kTBE); // deal ref op - if (op_info->is_ref()) { + if (op_info != nullptr && op_info->is_ref()) { auto ref_infos = op_info->ref_infos(); if (ref_infos.count(cur_out_index) != 0) { auto in_index = ref_infos.at(cur_out_index); @@ -88,7 +89,7 @@ AnfNodePtr AddAdditionalToRefOutput(const FuncGraphPtr &func_graph, const CNodeP size_t input_index, const AnfNodePtr &get_item) { AnfNodePtr final_node = (get_item == nullptr ? 
cnode : get_item); size_t final_index = output_index; - AnfNodePtr input_node = cnode->input(input_index + 1); + AnfNodePtr input_node = AnfAlgo::GetInputNode(cnode, input_index); session::KernelWithIndex origin_pair; origin_pair = FindRefOriginNode(input_node); MS_EXCEPTION_IF_NULL(origin_pair.first); @@ -106,7 +107,7 @@ AnfNodePtr AddAdditionalToRefOutput(const FuncGraphPtr &func_graph, const CNodeP if (origin_format != cur_format && cur_shape.size() > 1) { auto kernel_select = std::make_shared(); final_node = NewTransOpNode(func_graph, final_node, kernel_select, false, prim::KPrimTransData->name()); - RefreshKernelBuildInfo(cur_format, origin_format, origin_type, final_node); + RefreshKernelBuildInfo(cur_format, origin_format, final_node); final_index = 0; MS_EXCEPTION_IF_NULL(final_node); MS_LOG(INFO) << "DealRefTransAndCast add trans op, op debug info is " << final_node->DebugString(); @@ -133,6 +134,7 @@ AnfNodePtr AddAdditionalToRefOutput(const FuncGraphPtr &func_graph, const CNodeP } AnfNodePtr DealRefForMultipleOutput(const FuncGraphPtr &func_graph, const CNodePtr &cnode, const std::shared_ptr &op_info) { + MS_EXCEPTION_IF_NULL(op_info); auto ref_infos = op_info->ref_infos(); std::vector make_tuple_inputs; AbstractBasePtrList abstract_list; @@ -144,9 +146,11 @@ AnfNodePtr DealRefForMultipleOutput(const FuncGraphPtr &func_graph, const CNodeP auto input_index = ref_infos.at(output_index); final_node = AddAdditionalToRefOutput(func_graph, cnode, output_index, input_index, final_node); } + MS_EXCEPTION_IF_NULL(final_node); abstract_list.push_back(final_node->abstract()); make_tuple_inputs.push_back(final_node); } + MS_EXCEPTION_IF_NULL(func_graph); AnfNodePtr make_tuple = func_graph->NewCNode(make_tuple_inputs); MS_EXCEPTION_IF_NULL(make_tuple); make_tuple->set_abstract(std::make_shared(abstract_list)); @@ -155,6 +159,8 @@ AnfNodePtr DealRefForMultipleOutput(const FuncGraphPtr &func_graph, const CNodeP AnfNodePtr DealRefSigleOutput(const FuncGraphPtr 
&func_graph, const CNodePtr &cnode, const std::shared_ptr &op_info) { + MS_EXCEPTION_IF_NULL(cnode); + MS_EXCEPTION_IF_NULL(op_info); auto ref_infos = op_info->ref_infos(); for (const auto &ref_info : ref_infos) { if (ref_info.second > cnode->inputs().size()) { @@ -206,7 +212,9 @@ const AnfNodePtr DealRefTransAndCast::Process(const FuncGraphPtr &graph, const A return nullptr; } if (op_info->is_ref()) { - if (!cnode->Type()->isa()) { + auto type = cnode->Type(); + MS_EXCEPTION_IF_NULL(type); + if (!type->isa()) { return DealRefSigleOutput(graph, cnode, op_info); } else { return DealRefForMultipleOutput(graph, cnode, op_info); diff --git a/mindspore/ccsrc/pre_activate/ascend/format_type/insert_cast.cc b/mindspore/ccsrc/pre_activate/ascend/format_type/insert_cast.cc index 0fefab10d0..3d09233d99 100644 --- a/mindspore/ccsrc/pre_activate/ascend/format_type/insert_cast.cc +++ b/mindspore/ccsrc/pre_activate/ascend/format_type/insert_cast.cc @@ -18,6 +18,7 @@ #include #include #include +#include #include "device/kernel_info.h" #include "pre_activate/ascend/ascend_helper.h" @@ -27,34 +28,45 @@ #include "session/anf_runtime_algorithm.h" #include "session/kernel_graph.h" #include "utils/utils.h" +#include "kernel/common_utils.h" namespace mindspore { namespace opt { namespace { -AnfNodePtr InsertCastForMultipleOutput(const FuncGraphPtr &func_graph, const CNodePtr &cnode) { +AnfNodePtr InsertCastForMultipleOutput(const FuncGraphPtr &func_graph, const CNodePtr &cnode, + const std::vector &need_insert_cast) { MS_EXCEPTION_IF_NULL(func_graph); MS_EXCEPTION_IF_NULL(cnode); std::vector make_tuple_inputs; AbstractBasePtrList abstract_list; make_tuple_inputs.push_back(NewValueNode(prim::kPrimMakeTuple)); for (size_t output_idx = 0; output_idx < AnfAlgo::GetOutputTensorNum(cnode); ++output_idx) { - const std::string dev_fmt = AnfAlgo::GetOutputFormat(cnode, output_idx); - const std::vector origin_shape = AnfAlgo::GetOutputInferShape(cnode, output_idx); - const TypeId origin_type = 
AnfAlgo::GetOutputInferDataType(cnode, output_idx); - const TypeId device_type = AnfAlgo::GetOutputDeviceDataType(cnode, output_idx); + AnfNodePtr replace_node = nullptr; + const auto origin_shape = AnfAlgo::GetOutputInferShape(cnode, output_idx); + const auto infer_type = AnfAlgo::GetOutputInferDataType(cnode, output_idx); auto idx = NewValueNode(SizeToInt(output_idx)); MS_EXCEPTION_IF_NULL(idx); auto imm = std::make_shared(output_idx); idx->set_abstract(std::make_shared(imm)); auto getitem = func_graph->NewCNode({NewValueNode(prim::kPrimTupleGetItem), cnode, idx}); - AnfAlgo::SetOutputInferTypeAndShape({origin_type}, {origin_shape}, getitem.get()); - AnfNodePtr replace_node = nullptr; - if (origin_type != device_type) { - replace_node = - AddCastOpNodeToGraph(func_graph, getitem, dev_fmt, device_type, origin_type, origin_shape, origin_type); - MS_EXCEPTION_IF_NULL(replace_node); - replace_node->set_scope(cnode->scope()); - AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), replace_node); + AnfAlgo::SetOutputInferTypeAndShape({infer_type}, {origin_shape}, getitem.get()); + if (need_insert_cast[output_idx]) { + const auto dev_fmt = AnfAlgo::GetOutputFormat(cnode, output_idx); + TypeId origin_type(kTypeUnknown); + if (func_graph->has_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL)) { + origin_type = AnfAlgo::GetCNodeOutputPrecision(cnode); + } + origin_type = origin_type == kTypeUnknown ? 
infer_type : origin_type; + const auto device_type = AnfAlgo::GetOutputDeviceDataType(cnode, output_idx); + if (origin_type != device_type) { + replace_node = + AddCastOpNodeToGraph(func_graph, getitem, dev_fmt, device_type, origin_type, origin_shape, infer_type); + MS_EXCEPTION_IF_NULL(replace_node); + replace_node->set_scope(cnode->scope()); + AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), replace_node); + } else { + replace_node = getitem; + } } else { replace_node = getitem; } @@ -65,9 +77,10 @@ AnfNodePtr InsertCastForMultipleOutput(const FuncGraphPtr &func_graph, const CNo MS_EXCEPTION_IF_NULL(make_tuple); make_tuple->set_abstract(std::make_shared(abstract_list)); return make_tuple; -} +} // namespace -AnfNodePtr InsertCastForOutput(const FuncGraphPtr &func_graph, const CNodePtr &cnode) { +AnfNodePtr InsertCastForOutput(const FuncGraphPtr &func_graph, const CNodePtr &cnode, + const std::vector &need_insert_cast) { MS_EXCEPTION_IF_NULL(func_graph); MS_EXCEPTION_IF_NULL(cnode); if (AnfAlgo::GetOutputTensorNum(cnode) == 0) { @@ -76,14 +89,23 @@ AnfNodePtr InsertCastForOutput(const FuncGraphPtr &func_graph, const CNodePtr &c MS_EXCEPTION_IF_NULL(cnode->Type()); // Single output if (!cnode->Type()->isa()) { + if (!need_insert_cast[0]) { + return cnode; + } + const std::string dev_fmt = AnfAlgo::GetOutputFormat(cnode, 0); std::vector origin_shape = AnfAlgo::GetOutputInferShape(cnode, 0); - const TypeId origin_type = AnfAlgo::GetOutputInferDataType(cnode, 0); + const auto infer_type = AnfAlgo::GetOutputInferDataType(cnode, 0); + TypeId origin_type(kTypeUnknown); + if (func_graph->has_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL)) { + origin_type = AnfAlgo::GetCNodeOutputPrecision(cnode); + } + origin_type = origin_type == kTypeUnknown ? 
infer_type : origin_type; const TypeId device_type = AnfAlgo::GetOutputDeviceDataType(cnode, 0); AnfNodePtr replace_node = cnode; if (origin_type != device_type) { replace_node = - AddCastOpNodeToGraph(func_graph, cnode, dev_fmt, device_type, origin_type, origin_shape, origin_type); + AddCastOpNodeToGraph(func_graph, cnode, dev_fmt, device_type, origin_type, origin_shape, infer_type); MS_EXCEPTION_IF_NULL(replace_node); replace_node->set_scope(cnode->scope()); AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), replace_node); @@ -91,7 +113,57 @@ AnfNodePtr InsertCastForOutput(const FuncGraphPtr &func_graph, const CNodePtr &c return replace_node; } // Multiple output - return InsertCastForMultipleOutput(func_graph, cnode); + return InsertCastForMultipleOutput(func_graph, cnode, need_insert_cast); +} + +AnfNodePtr ProcessGraphKernelOp(const FuncGraphPtr &func_graph, const AnfNodePtr &node) { + // insert cast for ops in graph kernel. + auto sub_graph = AnfAlgo::GetCNodeFuncGraphPtr(node); + MS_EXCEPTION_IF_NULL(sub_graph); + auto mng = sub_graph->manager(); + MS_EXCEPTION_IF_NULL(mng); + std::vector todo; + std::vector> graph_rets; + kernel::GetValidKernelNodes(sub_graph, &todo); + kernel::GetGraphRealOutput(sub_graph, &graph_rets); + for (auto &t : todo) { + AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), t); + // process input + CNodePtr t_cnode = t->cast(); + MS_EXCEPTION_IF_NULL(t_cnode); + auto t_new_node = InsertCastForInput(sub_graph, t_cnode); + AnfNodePtr t_new_node_1 = nullptr; + std::vector need_insert_cast(AnfAlgo::GetOutputTensorNum(t), true); + // process output + auto iter = std::find_if(graph_rets.begin(), graph_rets.end(), + [&t](const std::pair &ret) { return ret.first == t; }); + if (iter != graph_rets.end()) { + auto t_fix_output_type = AnfAlgo::GetCNodeOutputPrecision(t); + auto t_output_type = AnfAlgo::GetOutputDeviceDataType(t, iter->second); + auto graph_output_type = AnfAlgo::GetOutputDeviceDataType(node, iter - graph_rets.begin()); + if 
(t_fix_output_type == kTypeUnknown && t_output_type == graph_output_type) { + need_insert_cast[iter->second] = false; + } else if (t_fix_output_type == t_output_type && t_output_type == graph_output_type) { + need_insert_cast[iter->second] = false; + } + t_new_node_1 = InsertCastForOutput(sub_graph, t_new_node, need_insert_cast); + } else { + t_new_node_1 = InsertCastForOutput(sub_graph, t_new_node, need_insert_cast); + } + + if (t_new_node_1 != nullptr && t_new_node_1 != t) { + (void)mng->Replace(t, t_new_node_1); + } + } + + // insert cast for graph kernel. + AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), node); + // process input + CNodePtr cnode = node->cast(); + MS_EXCEPTION_IF_NULL(cnode); + auto new_node = InsertCastForInput(func_graph, cnode); + // process output + return InsertCastForOutput(func_graph, new_node, std::vector(AnfAlgo::GetOutputTensorNum(new_node), true)); } } // namespace @@ -106,13 +178,27 @@ const AnfNodePtr InsertCast::Process(const FuncGraphPtr &func_graph, const AnfNo if (!AnfAlgo::IsRealCNodeKernel(node) || func_graph == nullptr) { return nullptr; } + + if (AnfAlgo::IsGraphKernel(node)) { + return ProcessGraphKernelOp(func_graph, node); + } else { + // insert cast for single op. + AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), node); + // process input + CNodePtr cnode = node->cast(); + MS_EXCEPTION_IF_NULL(cnode); + auto new_node = InsertCastForInput(func_graph, cnode); + // process output + return InsertCastForOutput(func_graph, new_node, std::vector(AnfAlgo::GetOutputTensorNum(new_node), true)); + } + // insert cast for single op. 
AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), node); // process input CNodePtr cnode = node->cast(); MS_EXCEPTION_IF_NULL(cnode); auto new_node = InsertCastForInput(func_graph, cnode); // process output - return InsertCastForOutput(func_graph, new_node); + return InsertCastForOutput(func_graph, new_node, std::vector(AnfAlgo::GetOutputTensorNum(new_node), true)); } } // namespace opt } // namespace mindspore diff --git a/mindspore/ccsrc/pre_activate/ascend/format_type/insert_cast_for_runop.cc b/mindspore/ccsrc/pre_activate/ascend/format_type/insert_cast_for_runop.cc deleted file mode 100644 index 7647b86c17..0000000000 --- a/mindspore/ccsrc/pre_activate/ascend/format_type/insert_cast_for_runop.cc +++ /dev/null @@ -1,48 +0,0 @@ -/** - * Copyright 2019 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include "pre_activate/ascend/format_type/insert_cast_for_runop.h" - -#include - -#include "device/kernel_info.h" -#include "pre_activate/ascend/ascend_helper.h" -#include "pre_activate/common/helper.h" -#include "kernel/oplib/oplib.h" -#include "session/anf_runtime_algorithm.h" -#include "utils/utils.h" - -namespace mindspore { -namespace opt { -const BaseRef RunOpInsertCast::DefinePattern() const { - VarPtr V = std::make_shared(UnVisited); - VarPtr Xs = std::make_shared(); - return VectorRef({V, Xs}); -} - -const AnfNodePtr RunOpInsertCast::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node, - const EquivPtr &) const { - MS_EXCEPTION_IF_NULL(node); - if (!AnfAlgo::IsRealCNodeKernel(node) || func_graph == nullptr) { - return nullptr; - } - AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), node); - // process input - CNodePtr cnode = node->cast(); - MS_EXCEPTION_IF_NULL(cnode); - return InsertCastForInput(func_graph, cnode); -} -} // namespace opt -} // namespace mindspore diff --git a/mindspore/ccsrc/pre_activate/ascend/format_type/insert_cast_for_runop.h b/mindspore/ccsrc/pre_activate/ascend/format_type/insert_cast_for_runop.h deleted file mode 100644 index 4467cc5198..0000000000 --- a/mindspore/ccsrc/pre_activate/ascend/format_type/insert_cast_for_runop.h +++ /dev/null @@ -1,35 +0,0 @@ -/** - * Copyright 2019 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_FORMAT_TYPE_INSERT_CAST_FOR_RUNOP_H_ -#define MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_FORMAT_TYPE_INSERT_CAST_FOR_RUNOP_H_ -#include - -#include "pre_activate/common/optimizer.h" -#include "pre_activate/common/pattern_engine.h" -#include "ir/anf.h" -namespace mindspore { -namespace opt { -class RunOpInsertCast : public PatternProcessPass { - public: - explicit RunOpInsertCast(bool multigraph = true) : PatternProcessPass("insert_cast_for_runop", multigraph) {} - ~RunOpInsertCast() override = default; - const BaseRef DefinePattern() const override; - const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override; -}; -} // namespace opt -} // namespace mindspore - -#endif // MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_FORMAT_TYPE_INSERT_CAST_FOR_RUNOP_H_ diff --git a/mindspore/ccsrc/pre_activate/ascend/format_type/insert_trans_op.cc b/mindspore/ccsrc/pre_activate/ascend/format_type/insert_trans_op.cc index 97244e40c6..953f464431 100644 --- a/mindspore/ccsrc/pre_activate/ascend/format_type/insert_trans_op.cc +++ b/mindspore/ccsrc/pre_activate/ascend/format_type/insert_trans_op.cc @@ -16,11 +16,13 @@ #include "pre_activate/ascend/format_type/insert_trans_op.h" #include +#include #include "utils/utils.h" #include "pre_activate/ascend/ascend_helper.h" #include "session/anf_runtime_algorithm.h" #include "device/kernel_info.h" #include "kernel/oplib/oplib.h" +#include "utils/context/ms_context.h" namespace mindspore { namespace opt { @@ -30,6 +32,15 @@ const BaseRef InsertTransOp::DefinePattern() const { return VectorRef({V, Xs}); } +bool IsGraphOutput(const AnfNodePtr &node, const std::vector &outputs) { + auto iter = std::find(outputs.begin(), outputs.end(), node); + if (iter != outputs.end()) { + return true; + } + + return false; +} + const AnfNodePtr InsertTransOp::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node, const EquivPtr &) const { if (node == nullptr || 
!AnfAlgo::IsRealKernel(node)) { @@ -38,6 +49,13 @@ const AnfNodePtr InsertTransOp::Process(const FuncGraphPtr &func_graph, const An AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), node); MS_LOG(DEBUG) << "====process op: " << node->DebugString(); AnfNodePtr new_node = InsertTransOpForInput(func_graph, node, kernel_select_); + auto ms_context = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(ms_context); + if (ms_context->execution_mode() == kPynativeMode && !ms_context->enable_pynative_hook()) { + if (IsGraphOutput(node, AnfAlgo::GetAllOutput(func_graph->output(), {prim::kPrimTupleGetItem}))) { + return new_node; + } + } return InsertTransOpForOutput(func_graph, new_node, kernel_select_); } } // namespace opt diff --git a/mindspore/ccsrc/pre_activate/ascend/format_type/merge_cast_to_op.cc b/mindspore/ccsrc/pre_activate/ascend/format_type/merge_cast_to_op.cc index dc47757e5d..b1817cec3d 100644 --- a/mindspore/ccsrc/pre_activate/ascend/format_type/merge_cast_to_op.cc +++ b/mindspore/ccsrc/pre_activate/ascend/format_type/merge_cast_to_op.cc @@ -61,16 +61,14 @@ bool AlternativeKernelInfoForInput(const CNodePtr &node, const TypeId dst_type, bool GetNextNodeAndCastIndex(const FuncGraphPtr &graph, const AnfNodePtr &node, AnfNodePtr *next_node, size_t *cast_index) { - MS_EXCEPTION_IF_NULL(graph); - MS_EXCEPTION_IF_NULL(node); - // Check whether the cast node is used for input by only one another node. 
- auto manager = graph->manager(); - MS_EXCEPTION_IF_NULL(manager); - if (manager->node_users().find(node) == manager->node_users().end() || manager->node_users()[node].size() != 1) { + auto output_node_list = GetRealNodeUsedList(graph, node); + MS_EXCEPTION_IF_NULL(output_node_list); + if (output_node_list->size() != 1) { return false; } - *next_node = manager->node_users()[node].begin()->first; - *cast_index = IntToSize(manager->node_users()[node].begin()->second - 1); + auto node_pair = output_node_list->at(0); + *next_node = node_pair.first; + *cast_index = node_pair.second - 1; return true; } @@ -122,6 +120,24 @@ bool CheckIndexOutput(const CNodePtr &node, const std::shared_ptrGetOutputFormat(index); } +void ChangeNodeInferInfo(const CNodePtr &cnode, const CNodePtr &cast, const size_t cast_index) { + using Shape = std::vector; + auto cast_dtype = AnfAlgo::GetOutputInferDataType(cast, 0); + auto cast_shape = AnfAlgo::GetOutputInferShape(cast, 0); + std::vector shapes; + std::vector types; + for (size_t index = 0; index < AnfAlgo::GetOutputTensorNum(cnode); ++index) { + if (cast_index == index) { + shapes.emplace_back(cast_shape); + types.emplace_back(cast_dtype); + continue; + } + shapes.emplace_back(AnfAlgo::GetOutputInferShape(cnode, index)); + types.emplace_back(AnfAlgo::GetOutputInferDataType(cnode, index)); + } + AnfAlgo::SetOutputInferTypeAndShape(types, shapes, cnode.get()); +} + AnfNodePtr MergeCastToNextOp(const FuncGraphPtr &graph, const CNodePtr &node, const KernelQueryPtr kernel_query) { MS_EXCEPTION_IF_NULL(node); MS_EXCEPTION_IF_NULL(kernel_query); @@ -135,6 +151,9 @@ AnfNodePtr MergeCastToNextOp(const FuncGraphPtr &graph, const CNodePtr &node, co return nullptr; } auto next_cnode = next_node->cast(); + if (AnfAlgo::IsGraphKernel(next_node)) { + return nullptr; + } auto next_op_name = AnfAlgo::GetCNodeName(next_node); std::vector> kernel_info_list; kernel_query->Query(next_cnode, &kernel_info_list); @@ -148,11 +167,14 @@ AnfNodePtr 
MergeCastToNextOp(const FuncGraphPtr &graph, const CNodePtr &node, co if (alternative_kernel_info == kernel_info_list.end()) { return nullptr; } - MS_LOG(INFO) << "Found alternative kernel info for current anf kernel " << next_op_name; + auto ori_kernel_info = AnfAlgo::GetSelectKernelBuildInfo(next_node); + MS_LOG(INFO) << "Found alternative kernel info for current anf kernel " << next_cnode->DebugString() + << "ori kernel info" << ori_kernel_info->ToString() << "alternative kernel info" + << (*alternative_kernel_info)->ToString(); AnfAlgo::SetSelectKernelBuildInfo(*alternative_kernel_info, next_cnode.get()); + ChangeNodeInferInfo(next_cnode, node, cast_index); if (node->inputs().size() < kCastInputNum) { - auto op_name = AnfAlgo::GetCNodeName(node); - MS_LOG(EXCEPTION) << "op[" << op_name << "] has wrong input num:"; + MS_LOG(EXCEPTION) << "Op[" << node->DebugString() << "] has wrong input num:"; } return node->input(1); } @@ -205,6 +227,9 @@ AnfNodePtr MergeCastToPriorOp(const FuncGraphPtr &graph, const CNodePtr &cur_nod return nullptr; } MS_EXCEPTION_IF_NULL(prior_op); + if (AnfAlgo::IsGraphKernel(prior_op)) { + return nullptr; + } std::vector> kernel_info_list; kernel_query->Query(prior_op, &kernel_info_list); @@ -217,8 +242,16 @@ AnfNodePtr MergeCastToPriorOp(const FuncGraphPtr &graph, const CNodePtr &cur_nod if (kernel_info_it == kernel_info_list.end()) { return nullptr; } + auto ori_kernel_info = AnfAlgo::GetSelectKernelBuildInfo(prior_op); + MS_LOG(INFO) << "Found alternative kernel info for current anf kernel " << prior_op->DebugString() + << "ori kernel info" << ori_kernel_info->ToString() << "alternative kernel info" + << (*kernel_info_it)->ToString(); AnfAlgo::SetSelectKernelBuildInfo(*kernel_info_it, prior_op.get()); - + ChangeNodeInferInfo(prior_op, cur_node, output_idx); + if (!single_output) { + MS_EXCEPTION_IF_NULL(x_node); + ChangeNodeInferInfo(x_node->cast(), cur_node, 0); + } auto prior_name = AnfAlgo::GetCNodeName(prior_op); if (prior_name == 
kFive2FourOpName) { AnfAlgo::CopyNodeAttr("dst_type", "dstType", cur_node, prior_op); diff --git a/mindspore/ccsrc/pre_activate/ascend/format_type/modify_ops_attrs.cc b/mindspore/ccsrc/pre_activate/ascend/format_type/modify_ops_attrs.cc new file mode 100644 index 0000000000..42061957b9 --- /dev/null +++ b/mindspore/ccsrc/pre_activate/ascend/format_type/modify_ops_attrs.cc @@ -0,0 +1,99 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "pre_activate/ascend/format_type/modify_ops_attrs.h" +#include +#include +#include "utils/utils.h" +#include "pre_activate/common/helper.h" +#include "kernel/common_utils.h" +#include "session/anf_runtime_algorithm.h" +#include "operator/ops.h" + +namespace mindspore { +namespace opt { +namespace { +AnfNodePtr ModifyReduceOpsAttrs(const CNodePtr &cnode) { + auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(cnode, 0); + auto input_format = AnfAlgo::GetInputFormat(cnode, 0); + if (input_shape.size() == 5 || input_format != kOpFormat_NC1HWC0) { + return nullptr; + } + if (!AnfAlgo::HasNodeAttr(kAttrKeepDims, cnode)) { + return nullptr; + } + + AnfAlgo::SetNodeAttr(kAttrKeepDims, MakeValue(true), cnode); + return cnode; +} + +AnfNodePtr ModifyTileOpAttrs(const CNodePtr &cnode) { + auto input_shape = AnfAlgo::GetInputDeviceShape(cnode, 0); + if (input_shape.size() != 5) { + return nullptr; + } + if (!AnfAlgo::HasNodeAttr(kAttrMultiples, cnode)) { + return nullptr; + } + + auto multiples = AnfAlgo::GetNodeAttr>(cnode, kAttrMultiples); + if (multiples.size() == 4 && multiples[1] == 1) { + multiples.push_back(1); + AnfAlgo::SetNodeAttr(kAttrMultiples, MakeValue(multiples), cnode); + } + + return cnode; +} + +AnfNodePtr ModifyAttrs(const CNodePtr &cnode) { + MS_EXCEPTION_IF_NULL(cnode); + auto op_name = AnfAlgo::GetCNodeName(cnode); + if (op_name == prim::kPrimTile->name()) { + return ModifyTileOpAttrs(cnode); + } else if (op_name == prim::kPrimReduceSum->name()) { + // kPrimReduceMean + // kPrimReduceSum + // kPrimReduceAll + // kPrimReduceMax + // kPrimReduceMin + return ModifyReduceOpsAttrs(cnode); + } + return nullptr; +} +} // namespace + +const AnfNodePtr ModifyOpAttrs::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node, + const EquivPtr &) const { + if (node == nullptr || !node->isa() || !AnfAlgo::IsGraphKernel(node)) { + return nullptr; + } + MS_LOG(DEBUG) << "====Process op: " << AnfAlgo::GetCNodeName(node); + auto fg = 
AnfAlgo::GetCNodeFuncGraphPtr(node); + MS_EXCEPTION_IF_NULL(fg); + auto manager = fg->manager(); + MS_EXCEPTION_IF_NULL(manager); + std::vector todos; + kernel::GetValidKernelNodes(fg, &todos); + for (auto &t : todos) { + auto new_node = ModifyAttrs(t->cast()); + if (new_node != nullptr && new_node != t) { + (void)manager->Replace(t, new_node); + } + } + return node; +} +} // namespace opt +} // namespace mindspore diff --git a/mindspore/ccsrc/pre_activate/ascend/enhancer/add_memcpy_async.h b/mindspore/ccsrc/pre_activate/ascend/format_type/modify_ops_attrs.h similarity index 66% rename from mindspore/ccsrc/pre_activate/ascend/enhancer/add_memcpy_async.h rename to mindspore/ccsrc/pre_activate/ascend/format_type/modify_ops_attrs.h index 900b0fb46a..25ec94b6b4 100644 --- a/mindspore/ccsrc/pre_activate/ascend/enhancer/add_memcpy_async.h +++ b/mindspore/ccsrc/pre_activate/ascend/format_type/modify_ops_attrs.h @@ -13,19 +13,21 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_ENHANCER_ADD_MEMCPY_ASYNC_H_ -#define MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_ENHANCER_ADD_MEMCPY_ASYNC_H_ -#include +#ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_FORMAT_TYPE_MODIFY_OPS_ATTRS_H +#define MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_FORMAT_TYPE_MODIFY_OPS_ATTRS_H + #include "pre_activate/common/optimizer.h" + namespace mindspore { namespace opt { -class AddMemcpyAsync : public PatternProcessPass { +class ModifyOpAttrs : public PatternProcessPass { public: - explicit AddMemcpyAsync(bool multigraph = true) : PatternProcessPass("add_memcpy_async", multigraph) {} - ~AddMemcpyAsync() override = default; + explicit ModifyOpAttrs(bool multigraph = true) : PatternProcessPass("modify_ops_attrs", multigraph) {} + ~ModifyOpAttrs() override = default; const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override; }; } // namespace opt } // namespace mindspore -#endif // MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_ENHANCER_ADD_MEMCPY_ASYNC_H_ + +#endif // MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_FORMAT_TYPE_MODIFY_OPS_ATTRS_H diff --git a/mindspore/ccsrc/pre_activate/ascend/format_type/rectify_do_mask_kernel_info.cc b/mindspore/ccsrc/pre_activate/ascend/format_type/rectify_do_mask_kernel_info.cc new file mode 100644 index 0000000000..d81a8c90ce --- /dev/null +++ b/mindspore/ccsrc/pre_activate/ascend/format_type/rectify_do_mask_kernel_info.cc @@ -0,0 +1,163 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "pre_activate/ascend/format_type/rectify_do_mask_kernel_info.h" + +#include +#include +#include +#include + +#include "session/anf_runtime_algorithm.h" +#include "kernel/kernel_build_info.h" +#include "utils/utils.h" +#include "kernel/common_utils.h" +#include "utils/context/ms_context.h" + +namespace mindspore { +namespace opt { +const BaseRef RectifyDoMaskKernelInfo::DefinePattern() const { + VarPtr X = std::make_shared(); + VarPtr Xs = std::make_shared(); + return VectorRef({X, Xs}); +} + +const AnfNodePtr RectifyDoMaskKernelInfo::Process(const FuncGraphPtr &graph, const AnfNodePtr &node, + const EquivPtr &) const { + if (node == nullptr || !node->isa()) { + return nullptr; + } + auto cnode = node->cast(); + auto ms_context = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(ms_context); + if (ms_context->execution_mode() == kPynativeMode) { + return RectifyKernelInfoInPynativeProcess(node); + } + if (AnfAlgo::GetCNodeName(cnode) != prim::kPrimDropoutGenMask->name()) { + return nullptr; + } + std::vector do_mask_node_list; + auto manager = graph->manager(); + MS_EXCEPTION_IF_NULL(manager); + auto node_map = manager->node_users(); + auto iter = node_map.find(node); + if (iter == node_map.end()) { + MS_LOG(EXCEPTION) << "Cannot find the node " << node->DebugString() << " in the graph manager!"; + } + auto gen_mask_output_nodes = iter->second; + for (const auto &output_node : gen_mask_output_nodes) { + if (AnfAlgo::GetCNodeName(output_node.first) == prim::kPrimDropoutDoMask->name()) { + auto output_cnode = output_node.first->cast(); + do_mask_node_list.push_back(output_cnode); + } + } + std::vector input_shape; + for (const auto &output_node : do_mask_node_list) { + if (input_shape.empty()) { + input_shape = AnfAlgo::GetPrevNodeOutputInferShape(output_node, 0); + continue; + } + auto shape = AnfAlgo::GetPrevNodeOutputInferShape(output_node, 0); 
+ if (!kernel::IsSameShape(shape, input_shape)) { + MS_LOG(EXCEPTION) << "The DropOutGenMask connected with same genmask's shape must be equal!" + << " GenMask " << node->DebugString(); + } + } + RectifyKernelInfo(do_mask_node_list); + return nullptr; +} + +void RectifyDoMaskKernelInfo::RectifyKernelInfo(const std::vector &do_mask_node_list) const { + std::map format_counter; + std::string special_format; + std::string convert_format; + for (const auto &do_mask : do_mask_node_list) { + auto do_mask_data_format = AnfAlgo::GetInputFormat(do_mask, 0); + if (special_format.empty() && kHWSpecialFormatSet.find(do_mask_data_format) != kHWSpecialFormatSet.end()) { + special_format = do_mask_data_format; + } + if (format_counter.find(do_mask_data_format) == format_counter.end()) { + format_counter[do_mask_data_format] = 1; + } else { + format_counter[do_mask_data_format] = format_counter[do_mask_data_format] + 1; + } + // if has two or more special format we need change all domask's format to default that can avoid insert more + // transdata + if (format_counter.size() > 2) { + convert_format = kOpFormat_DEFAULT; + break; + } + if (kHWSpecialFormatSet.find(do_mask_data_format) != kHWSpecialFormatSet.end() && + special_format != do_mask_data_format) { + convert_format = kOpFormat_DEFAULT; + break; + } + } + if (format_counter.size() == 1) { + return; + } + if (convert_format.empty()) { + convert_format = GetConvertFormat(format_counter); + } + RectifyDropOutDoMaskKernelInfo(do_mask_node_list, convert_format); +} + +std::string RectifyDoMaskKernelInfo::GetConvertFormat(const std::map &format_counter) const { + std::string convert_format; + const size_t counter = 0; + for (const auto &iter : format_counter) { + if (counter < iter.second) { + convert_format = iter.first; + } + if (counter == iter.second && kHWSpecialFormatSet.find(convert_format) == kHWSpecialFormatSet.end()) { + convert_format = iter.first; + } + } + return convert_format; +} + +void 
RectifyDoMaskKernelInfo::RectifyDropOutDoMaskKernelInfo(const std::vector &do_mask_node_list, + const std::string &format) const { + for (const auto &do_mask : do_mask_node_list) { + auto builder = + std::make_shared(AnfAlgo::GetSelectKernelBuildInfo(do_mask)); + builder->SetInputFormat(format, 0); + builder->SetOutputFormat(format, 0); + AnfAlgo::SetSelectKernelBuildInfo(builder->Build(), do_mask.get()); + } +} + +AnfNodePtr RectifyDoMaskKernelInfo::RectifyKernelInfoInPynativeProcess(const AnfNodePtr &node) const { + MS_EXCEPTION_IF_NULL(node); + auto cnode = node->cast(); + if (cnode == nullptr) { + return nullptr; + } + if (AnfAlgo::GetCNodeName(cnode) != prim::kPrimDropoutDoMask->name()) { + return nullptr; + } + auto do_mask_input_format = AnfAlgo::GetInputFormat(node, 0); + if (do_mask_input_format != kOpFormat_DEFAULT) { + auto builder = + std::make_shared(AnfAlgo::GetSelectKernelBuildInfo(node)); + builder->SetInputFormat(kOpFormat_DEFAULT, 0); + builder->SetOutputFormat(kOpFormat_DEFAULT, 0); + AnfAlgo::SetSelectKernelBuildInfo(builder->Build(), node.get()); + } + return nullptr; +} +} // namespace opt +} // namespace mindspore diff --git a/mindspore/ccsrc/pre_activate/ascend/format_type/rectify_do_mask_kernel_info.h b/mindspore/ccsrc/pre_activate/ascend/format_type/rectify_do_mask_kernel_info.h new file mode 100644 index 0000000000..81bad4d8f8 --- /dev/null +++ b/mindspore/ccsrc/pre_activate/ascend/format_type/rectify_do_mask_kernel_info.h @@ -0,0 +1,42 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_FORMAT_TYPE_RECTIFY_DO_MASK_KERNEL_INFO_H +#define MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_FORMAT_TYPE_RECTIFY_DO_MASK_KERNEL_INFO_H +#include +#include +#include + +#include "pre_activate/common/optimizer.h" +namespace mindspore { +namespace opt { +class RectifyDoMaskKernelInfo : public PatternProcessPass { + public: + explicit RectifyDoMaskKernelInfo(bool multigraph = true) + : PatternProcessPass("batch_norm_bert_fission", multigraph) {} + ~RectifyDoMaskKernelInfo() override = default; + const BaseRef DefinePattern() const override; + const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override; + + private: + void RectifyKernelInfo(const std::vector &do_mask_node_list) const; + AnfNodePtr RectifyKernelInfoInPynativeProcess(const AnfNodePtr &node) const; + std::string GetConvertFormat(const std::map &format_counter) const; + void RectifyDropOutDoMaskKernelInfo(const std::vector &do_mask_node_list, const std::string &format) const; +}; +} // namespace opt +} // namespace mindspore +#endif // MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_FORMAT_TYPE_RECTIFY_DO_MASK_KERNEL_INFO_H diff --git a/mindspore/ccsrc/pre_activate/ascend/format_type/remove_no_use_reshape_op.cc b/mindspore/ccsrc/pre_activate/ascend/format_type/remove_no_use_reshape_op.cc new file mode 100644 index 0000000000..dde40a5090 --- /dev/null +++ b/mindspore/ccsrc/pre_activate/ascend/format_type/remove_no_use_reshape_op.cc @@ -0,0 +1,66 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed 
under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "pre_activate/ascend/format_type/remove_no_use_reshape_op.h" +#include +#include +#include "pre_activate/common/helper.h" +#include "kernel/common_utils.h" +#include "session/anf_runtime_algorithm.h" +#include "operator/ops.h" + +namespace mindspore { +namespace opt { +namespace { +AnfNodePtr RemoveReshapeOp(const CNodePtr &cnode) { + MS_EXCEPTION_IF_NULL(cnode); + auto op_name = AnfAlgo::GetCNodeName(cnode); + if (op_name != prim::kPrimReshape->name()) { + return nullptr; + } + + auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(cnode, 0); + auto input_format = AnfAlgo::GetPrevNodeOutputFormat(cnode, 0); + if (input_shape.size() != 1 || input_format != kOpFormat_NC1HWC0) { + return nullptr; + } + + return cnode->input(1); +} +} // namespace + +const AnfNodePtr RemoveNoUseReshapeOp::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node, + const EquivPtr &) const { + if (node == nullptr || !node->isa() || !AnfAlgo::IsGraphKernel(node)) { + return nullptr; + } + MS_LOG(DEBUG) << "====process op: " << AnfAlgo::GetCNodeName(node); + auto fg = AnfAlgo::GetCNodeFuncGraphPtr(node); + MS_EXCEPTION_IF_NULL(fg); + auto manager = fg->manager(); + MS_EXCEPTION_IF_NULL(manager); + std::vector todos; + kernel::GetValidKernelNodes(fg, &todos); + for (auto &t : todos) { + auto new_node = RemoveReshapeOp(t->cast()); + if (new_node != nullptr && new_node != t) { + (void)manager->Replace(t, new_node); + } + } + 
return node; +} +} // namespace opt +} // namespace mindspore diff --git a/mindspore/ccsrc/pre_activate/ascend/ir_fusion/conv_bn_add_relu_fusion.h b/mindspore/ccsrc/pre_activate/ascend/format_type/remove_no_use_reshape_op.h similarity index 57% rename from mindspore/ccsrc/pre_activate/ascend/ir_fusion/conv_bn_add_relu_fusion.h rename to mindspore/ccsrc/pre_activate/ascend/format_type/remove_no_use_reshape_op.h index eb7cc730b5..4942c2fc08 100644 --- a/mindspore/ccsrc/pre_activate/ascend/ir_fusion/conv_bn_add_relu_fusion.h +++ b/mindspore/ccsrc/pre_activate/ascend/format_type/remove_no_use_reshape_op.h @@ -1,5 +1,5 @@ /** - * Copyright 2019 Huawei Technologies Co., Ltd + * Copyright 2020 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,21 +14,20 @@ * limitations under the License. */ -#ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_IR_FUSION_CONV_BN_ADD_RELU_FUSION_H_ -#define MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_IR_FUSION_CONV_BN_ADD_RELU_FUSION_H_ +#ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_FORMAT_TYPE_REMOVE_NO_USE_RESHAPE_OP_H +#define MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_FORMAT_TYPE_REMOVE_NO_USE_RESHAPE_OP_H #include "pre_activate/common/optimizer.h" -#include "pre_activate/common/helper.h" namespace mindspore { namespace opt { -class ConvBnAddReluFusion : public PatternProcessPass { +class RemoveNoUseReshapeOp : public PatternProcessPass { public: - explicit ConvBnAddReluFusion(bool multigraph = true) : PatternProcessPass("conv_bn_add_relu_fusion", multigraph) {} - ~ConvBnAddReluFusion() override = default; - const BaseRef DefinePattern() const override; + explicit RemoveNoUseReshapeOp(bool multigraph = true) : PatternProcessPass("remove_no_use_reshape_op", multigraph) {} + ~RemoveNoUseReshapeOp() override = default; const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override; }; } // namespace opt } // 
namespace mindspore -#endif // MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_IR_FUSION_CONV_BN_ADD_RELU_FUSION_H_ + +#endif // MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_FORMAT_TYPE_REMOVE_NO_USE_RESHAPE_OP_H diff --git a/mindspore/ccsrc/pre_activate/ascend/ir_fission/batch_norm_bert_fission.cc b/mindspore/ccsrc/pre_activate/ascend/ir_fission/batch_norm_bert_fission.cc index 640f84aa44..e6a8864e46 100644 --- a/mindspore/ccsrc/pre_activate/ascend/ir_fission/batch_norm_bert_fission.cc +++ b/mindspore/ccsrc/pre_activate/ascend/ir_fission/batch_norm_bert_fission.cc @@ -27,24 +27,6 @@ const std::vector kOutputIndex{0, 3, 4, 5}; constexpr size_t kBatchNormRealOutputNum = 3; constexpr size_t kBatchNormRealInputNum = 3; -bool CompareTupleGetitem(const AnfNodePtr &n1, const AnfNodePtr &n2) { - MS_EXCEPTION_IF_NULL(n1); - MS_EXCEPTION_IF_NULL(n2); - auto n1_cnode = n1->cast(); - auto n2_cnode = n2->cast(); - MS_EXCEPTION_IF_NULL(n1_cnode); - MS_EXCEPTION_IF_NULL(n2_cnode); - auto index_input1 = n1_cnode->input(kInputNodeOutputIndexInTupleGetItem); - MS_EXCEPTION_IF_NULL(index_input1); - auto value_node1 = index_input1->cast(); - MS_EXCEPTION_IF_NULL(value_node1); - auto index_input2 = n2_cnode->input(kInputNodeOutputIndexInTupleGetItem); - MS_EXCEPTION_IF_NULL(index_input2); - auto value_node2 = index_input2->cast(); - MS_EXCEPTION_IF_NULL(value_node2); - return GetValue(value_node1->value()) < GetValue(value_node2->value()); -} - bool GetBatchNormOutputs(const FuncGraphPtr &func_graph, const AnfNodePtr &bn, std::vector *bn_outputs) { MS_EXCEPTION_IF_NULL(func_graph); MS_EXCEPTION_IF_NULL(bn_outputs); diff --git a/mindspore/ccsrc/pre_activate/ascend/ir_fission/bn_split.cc b/mindspore/ccsrc/pre_activate/ascend/ir_fission/bn_split.cc index c8d92f7200..66ffa24bf1 100644 --- a/mindspore/ccsrc/pre_activate/ascend/ir_fission/bn_split.cc +++ b/mindspore/ccsrc/pre_activate/ascend/ir_fission/bn_split.cc @@ -28,14 +28,14 @@ namespace mindspore { namespace opt { namespace { -void 
CreateOutputsOfBNTrainingReduce(const FuncGraphPtr &graph, const CNodePtr &bn_cnode, +bool CreateOutputsOfBNTrainingReduce(const FuncGraphPtr &graph, const CNodePtr &bn_cnode, std::vector *bn_training_reduce_outputs) { MS_EXCEPTION_IF_NULL(graph); MS_EXCEPTION_IF_NULL(bn_cnode); if (bn_cnode->inputs().size() != kBnInputNum) { - MS_LOG(EXCEPTION) << "BN node has wrong input size"; + MS_LOG(INFO) << "FusedbatchNorm's input size less than " << kBnInputNum << ". " << bn_cnode->DebugString(); + return false; } - // All the inputs of BNTrainingReduce are from the inputs of BN std::vector bn_training_reduce_inputs = { NewValueNode(std::make_shared(kBNTrainingReduceOpName))}; bn_training_reduce_inputs.push_back(bn_cnode->input(1)); @@ -45,8 +45,9 @@ void CreateOutputsOfBNTrainingReduce(const FuncGraphPtr &graph, const CNodePtr & MS_EXCEPTION_IF_NULL(kernel_info); bn_training_reduce->set_kernel_info(kernel_info); std::vector bn_shape_i0 = AnfAlgo::GetPrevNodeOutputInferShape(bn_cnode, 0); - if (bn_shape_i0.size() != kShape4dDims) { - MS_LOG(EXCEPTION) << "Get shape of FusedBatchNorm fail"; + if (bn_shape_i0.size() < kShape2dDims) { + MS_LOG(INFO) << "The FusedBatchNorm's first input's shape dims less than " << kShape2dDims; + return false; } std::vector bn_training_reduce_shape = {bn_shape_i0[1]}; auto types = {kNumberTypeFloat32, kNumberTypeFloat32}; @@ -56,6 +57,7 @@ void CreateOutputsOfBNTrainingReduce(const FuncGraphPtr &graph, const CNodePtr & AnfAlgo::CopyNodeAttrs(bn_cnode, bn_training_reduce); CreateMultipleOutputsOfAnfNode(graph, bn_training_reduce, kBNTrainingReduceOutputNum, bn_training_reduce_outputs); + return true; } AnfNodePtr CreateOutputsOfBNTrainingUpdate(const FuncGraphPtr &graph, const CNodePtr &bn_cnode, @@ -99,11 +101,15 @@ AnfNodePtr SplitFusedBatchNormForTBE(const FuncGraphPtr &func_graph, const AnfNo auto cnode = node->cast(); MS_EXCEPTION_IF_NULL(cnode); if (cnode->inputs().size() < kBnInputNum) { - MS_LOG(EXCEPTION) << "op[FusedBatchNorm] has less 
than " << kBnInputNum << " inputs."; + MS_LOG(INFO) << "op[FusedBatchNorm] has less than " << kBnInputNum << " inputs."; + return nullptr; } // Create BNTrainingReduce node and get outputs of BNTrainingReduce std::vector bn_training_reduce_outputs; - CreateOutputsOfBNTrainingReduce(func_graph, cnode, &bn_training_reduce_outputs); + if (!CreateOutputsOfBNTrainingReduce(func_graph, cnode, &bn_training_reduce_outputs)) { + MS_LOG(WARNING) << "Create BNTrainingReduce fail, quit split"; + return nullptr; + } if (bn_training_reduce_outputs.size() != kBN1OutputNum) { MS_LOG(EXCEPTION) << "make outputs of op BNTrainingReduce fail"; } diff --git a/mindspore/ccsrc/pre_activate/ascend/ir_fission/layer_norm_grad_split.cc b/mindspore/ccsrc/pre_activate/ascend/ir_fission/layer_norm_grad_split.cc index cc1356c724..1a25d83650 100644 --- a/mindspore/ccsrc/pre_activate/ascend/ir_fission/layer_norm_grad_split.cc +++ b/mindspore/ccsrc/pre_activate/ascend/ir_fission/layer_norm_grad_split.cc @@ -32,7 +32,6 @@ void LayerNormGradSplit::CreateOutputsOfLayerNormXBackprop( std::vector *layer_norm_x_backprop_outputs) const { MS_EXCEPTION_IF_NULL(graph); MS_EXCEPTION_IF_NULL(layer_norm_grad); - MS_EXCEPTION_IF_NULL(kernel_select_); auto prim = std::make_shared(kLayerNormXBackpropOpName); std::vector layer_norm_x_backprop_inputs = {NewValueNode(prim)}; for (size_t i = 1; i < layer_norm_grad->inputs().size(); ++i) { @@ -46,7 +45,6 @@ void LayerNormGradSplit::CreateOutputsOfLayerNormXBackprop( auto shapes = {AnfAlgo::GetOutputInferShape(layer_norm_grad, 0)}; AnfAlgo::SetOutputInferTypeAndShape(types, shapes, layer_norm_x_backprop.get()); - kernel_select_->SelectKernel(layer_norm_x_backprop); (*layer_norm_x_backprop_outputs).push_back(layer_norm_x_backprop); } @@ -55,7 +53,6 @@ void LayerNormGradSplit::CreateOutputsOfLayerNormBetaGammaBackprop( std::vector *layer_norm_beta_gamma_backprop_outputs) const { MS_EXCEPTION_IF_NULL(graph); MS_EXCEPTION_IF_NULL(layer_norm_grad); - 
MS_EXCEPTION_IF_NULL(kernel_select_); auto prim = std::make_shared(kLayerNormBetaGammaBackpropOpName); std::vector layer_norm_beta_gamma_backprop_inputs = {NewValueNode(prim)}; for (size_t i = 1; i < layer_norm_grad->inputs().size() - 1; ++i) { @@ -73,10 +70,9 @@ void LayerNormGradSplit::CreateOutputsOfLayerNormBetaGammaBackprop( AnfAlgo::SetOutputInferTypeAndShape(types, shapes, layer_norm_beta_gamma_backprop.get()); // get device shape of LayerNormGrad's 5th Input, and convert it to attr - std::vector shape_gamma = AnfAlgo::GetInputDeviceShape(layer_norm_grad, 4); + std::vector shape_gamma = AnfAlgo::GetPrevNodeOutputInferShape(layer_norm_grad, 4); AnfAlgo::SetNodeAttr(kAttrShapeGamma, MakeValue(opt::Convert2Int(shape_gamma)), layer_norm_beta_gamma_backprop); - kernel_select_->SelectKernel(layer_norm_beta_gamma_backprop); CreateMultipleOutputsOfAnfNode(graph, layer_norm_beta_gamma_backprop, kLayerNormBetaGammaBackpropOutputNum, layer_norm_beta_gamma_backprop_outputs); } diff --git a/mindspore/ccsrc/pre_activate/ascend/ir_fission/layer_norm_grad_split.h b/mindspore/ccsrc/pre_activate/ascend/ir_fission/layer_norm_grad_split.h index f25c2e9838..f442446b01 100644 --- a/mindspore/ccsrc/pre_activate/ascend/ir_fission/layer_norm_grad_split.h +++ b/mindspore/ccsrc/pre_activate/ascend/ir_fission/layer_norm_grad_split.h @@ -26,8 +26,7 @@ namespace mindspore { namespace opt { class LayerNormGradSplit : public PatternProcessPass { public: - explicit LayerNormGradSplit(bool multigraph = true) - : PatternProcessPass("layer_norm_grad_split", multigraph), kernel_select_(std::make_shared()) {} + explicit LayerNormGradSplit(bool multigraph = true) : PatternProcessPass("layer_norm_grad_split", multigraph) {} ~LayerNormGradSplit() override = default; const BaseRef DefinePattern() const override; const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override; @@ -37,7 +36,6 @@ class LayerNormGradSplit : public PatternProcessPass { std::vector 
*layer_norm_grad_outputs) const; void CreateOutputsOfLayerNormBetaGammaBackprop(const FuncGraphPtr &graph, const CNodePtr &layer_norm_grad, std::vector *layer_norm_beta_gamma_outputs) const; - KernelSelectPtr kernel_select_; }; } // namespace opt } // namespace mindspore diff --git a/mindspore/ccsrc/pre_activate/ascend/ir_fission/single_batch_norm_fission.cc b/mindspore/ccsrc/pre_activate/ascend/ir_fission/single_batch_norm_fission.cc new file mode 100644 index 0000000000..159be2ac3b --- /dev/null +++ b/mindspore/ccsrc/pre_activate/ascend/ir_fission/single_batch_norm_fission.cc @@ -0,0 +1,117 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "pre_activate/ascend/ir_fission/single_batch_norm_fission.h" +#include +#include +#include +#include "session/anf_runtime_algorithm.h" +#include "pre_activate/common/helper.h" + +namespace mindspore { +namespace opt { +namespace { +constexpr size_t kBatchNormRealInputNum = 3; + +AnfNodePtr CreateBNTrainingReduce(const FuncGraphPtr &func_graph, const AnfNodePtr &bn) { + MS_EXCEPTION_IF_NULL(func_graph); + MS_EXCEPTION_IF_NULL(bn); + auto bn_cnode = bn->cast(); + MS_EXCEPTION_IF_NULL(bn_cnode); + if (bn_cnode->inputs().size() < kBatchNormRealInputNum + 1) { + MS_LOG(EXCEPTION) << "The input size of node " + bn_cnode->DebugString() + " is less than " + << kBatchNormRealInputNum + 1; + } + std::vector bn_training_reduce_inputs = { + NewValueNode(std::make_shared(kBNTrainingReduceOpName)), bn_cnode->input(1)}; + auto bn_training_reduce = func_graph->NewCNode(bn_training_reduce_inputs); + MS_EXCEPTION_IF_NULL(bn_training_reduce); + + // set abstract + auto bn_input1 = bn_cnode->input(2); + MS_EXCEPTION_IF_NULL(bn_input1); + AbstractBasePtrList abstract_list{bn_input1->abstract(), bn_input1->abstract()}; + auto abstract_tuple = std::make_shared(abstract_list); + bn_training_reduce->set_abstract(abstract_tuple); + bn_training_reduce->set_scope(bn->scope()); + return bn_training_reduce; +} + +AnfNodePtr CreateBNTrainingUpdateV3(const FuncGraphPtr &func_graph, const AnfNodePtr &bn, + const std::vector &bn_training_reduce_outputs) { + MS_EXCEPTION_IF_NULL(func_graph); + MS_EXCEPTION_IF_NULL(bn); + auto bn_cnode = bn->cast(); + MS_EXCEPTION_IF_NULL(bn_cnode); + if (bn_cnode->inputs().size() < kBatchNormRealInputNum + 1) { + MS_LOG(EXCEPTION) << "The input size of node " + bn_cnode->DebugString() + " is less than " + << kBatchNormRealInputNum + 1; + } + if (bn_training_reduce_outputs.size() != kBNTrainingReduceOutputNum) { + MS_LOG(EXCEPTION) << "The output size of node bn_training_reduce must be " << kBNTrainingReduceOutputNum + << ", but it is " << 
bn_training_reduce_outputs.size(); + } + std::vector bn_training_update_v3_inputs = { + NewValueNode(std::make_shared(kBNTrainingUpdateV3OpName)), + bn_cnode->input(1), + bn_training_reduce_outputs[0], + bn_training_reduce_outputs[1], + bn_cnode->input(2), + bn_cnode->input(3)}; + auto bn_training_update_v3 = func_graph->NewCNode(bn_training_update_v3_inputs); + MS_EXCEPTION_IF_NULL(bn_training_update_v3); + + auto bn_abstract_tuple = dyn_cast(bn->abstract()); + MS_EXCEPTION_IF_NULL(bn_abstract_tuple); + if (bn_abstract_tuple->elements().size() != kBatchNormOutputNum) { + MS_LOG(EXCEPTION) << "The abstract size of node bn must be " << kBatchNormOutputNum << ", but it is " + << bn_abstract_tuple->elements().size(); + } + bn_training_update_v3->set_abstract(bn->abstract()); + bn_training_update_v3->set_scope(bn->scope()); + AnfAlgo::CopyNodeAttr(kAttrEpsilon, bn_cnode, bn_training_update_v3); + return bn_training_update_v3; +} +} // namespace + +const BaseRef SingleBatchNormFission::DefinePattern() const { + VarPtr Xs = std::make_shared(); + return VectorRef({prim::kPrimBatchNorm, Xs}); +} + +const AnfNodePtr SingleBatchNormFission::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node, + const EquivPtr &) const { + MS_EXCEPTION_IF_NULL(func_graph); + MS_EXCEPTION_IF_NULL(node); + auto cnode = node->cast(); + MS_EXCEPTION_IF_NULL(cnode); + if (cnode->size() < kBatchNormRealInputNum + 1) { + MS_LOG(INFO) << "The input num of BatchNorm less than" << kBatchNormRealInputNum + << ". 
The node should not be changed"; + return nullptr; + } + if (!GetBoolAttr(cnode, kAttrIsTraining)) { + MS_LOG(INFO) << "is training should be true if do fusion"; + return nullptr; + } + AnfNodePtr bn_training_reduce = CreateBNTrainingReduce(func_graph, node); + std::vector bn_training_reduce_outputs; + CreateMultipleOutputsOfAnfNode(func_graph, bn_training_reduce, kBNTrainingReduceOutputNum, + &bn_training_reduce_outputs); + + return CreateBNTrainingUpdateV3(func_graph, node, bn_training_reduce_outputs); +} +} // namespace opt +} // namespace mindspore diff --git a/mindspore/ccsrc/pre_activate/ascend/ir_fusion/conv_bn_relu_fusion.h b/mindspore/ccsrc/pre_activate/ascend/ir_fission/single_batch_norm_fission.h similarity index 61% rename from mindspore/ccsrc/pre_activate/ascend/ir_fusion/conv_bn_relu_fusion.h rename to mindspore/ccsrc/pre_activate/ascend/ir_fission/single_batch_norm_fission.h index ea415564ae..145603132b 100644 --- a/mindspore/ccsrc/pre_activate/ascend/ir_fusion/conv_bn_relu_fusion.h +++ b/mindspore/ccsrc/pre_activate/ascend/ir_fission/single_batch_norm_fission.h @@ -1,5 +1,5 @@ /** - * Copyright 2019 Huawei Technologies Co., Ltd + * Copyright 2020 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,21 +13,21 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_IR_FUSION_CONV_BN_RELU_FUSION_H_ -#define MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_IR_FUSION_CONV_BN_RELU_FUSION_H_ +#ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_IR_FISSION_SINGLE_BATCH_NORM_FISSION_H_ +#define MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_IR_FISSION_SINGLE_BATCH_NORM_FISSION_H_ #include "pre_activate/common/optimizer.h" -#include "pre_activate/common/helper.h" namespace mindspore { namespace opt { -class ConvBnReluFusion : public PatternProcessPass { +class SingleBatchNormFission : public PatternProcessPass { public: - explicit ConvBnReluFusion(bool multigraph = true) : PatternProcessPass("conv_bn_relu_fusion", multigraph) {} - ~ConvBnReluFusion() override = default; + explicit SingleBatchNormFission(bool multigraph = true) + : PatternProcessPass("single_batch_norm_fission", multigraph) {} + ~SingleBatchNormFission() override = default; const BaseRef DefinePattern() const override; const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override; }; } // namespace opt } // namespace mindspore -#endif // MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_IR_FUSION_CONV_BN_RELU_FUSION_H_ +#endif // MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_IR_FISSION_SINGLE_BATCH_NORM_FISSION_H_ diff --git a/mindspore/ccsrc/pre_activate/ascend/ir_fission/split_fission.cc b/mindspore/ccsrc/pre_activate/ascend/ir_fission/split_fission.cc new file mode 100644 index 0000000000..c39a5e01e6 --- /dev/null +++ b/mindspore/ccsrc/pre_activate/ascend/ir_fission/split_fission.cc @@ -0,0 +1,191 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "pre_activate/ascend/ir_fission/split_fission.h" +#include +#include +#include "session/anf_runtime_algorithm.h" + +namespace mindspore { +namespace opt { +namespace { +CNodePtr CreateSplitVNode(const FuncGraphPtr &func_graph, const AnfNodePtr &input_node) { + MS_EXCEPTION_IF_NULL(func_graph); + MS_EXCEPTION_IF_NULL(input_node); + std::vector splitv_inputs{NewValueNode(std::make_shared(kSplitVOpName)), input_node}; + CNodePtr splitv = func_graph->NewCNode(splitv_inputs); + MS_EXCEPTION_IF_NULL(splitv); + splitv->set_scope(input_node->scope()); + return splitv; +} + +CNodePtr CreateBaseSplitVNode(const FuncGraphPtr &func_graph, const CNodePtr &origin_cnode) { + MS_EXCEPTION_IF_NULL(origin_cnode); + if (origin_cnode->inputs().size() < kSplitInputNum) { + MS_LOG(EXCEPTION) << "The input number of split: " << origin_cnode->DebugString() << " should be " + << kSplitInputNum - 1; + } + return CreateSplitVNode(func_graph, origin_cnode->input(1)); +} + +void SetAttrForSplitVNode(const AnfNodePtr &splitv, const std::vector &size_splits, int split_dim, int num_split) { + AnfAlgo::SetNodeAttr(kAttrSizeSplits, MakeValue(size_splits), splitv); + AnfAlgo::SetNodeAttr(kAttrSplitDim, MakeValue(split_dim), splitv); + AnfAlgo::SetNodeAttr(kAttrNumSplit, MakeValue(num_split), splitv); +} + +size_t GetSmallSplitSize(const AnfNodePtr &split_node, int split_dim, int num_split) { + auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(split_node, 0); + if (split_dim < 0) { + split_dim += input_shape.size(); + } + if (IntToSize(split_dim) >= 
input_shape.size()) { + MS_LOG(EXCEPTION) << "The split_dim value should be less than the shape size of input 0"; + } + return input_shape[split_dim] / num_split; +} + +void AddNewOutputs(const FuncGraphPtr &func_graph, const AnfNodePtr &new_splitv, int outputs_num, + std::vector *inputs) { + MS_EXCEPTION_IF_NULL(inputs); + std::vector new_splitv_output; + CreateMultipleOutputsOfAnfNode(func_graph, new_splitv, outputs_num, &new_splitv_output); + inputs->insert(inputs->end(), new_splitv_output.begin(), new_splitv_output.end()); +} + +AnfNodePtr CreateTupleGetItem(const FuncGraphPtr &func_graph, const AnfNodePtr &input, size_t index) { + MS_EXCEPTION_IF_NULL(func_graph); + auto idx = NewValueNode(SizeToInt(index)); + MS_EXCEPTION_IF_NULL(idx); + auto imm = std::make_shared(SizeToInt(index)); + auto abstract_scalar = std::make_shared(imm); + idx->set_abstract(abstract_scalar); + auto tuple_getitem = func_graph->NewCNode({NewValueNode(prim::kPrimTupleGetItem), input, idx}); + return tuple_getitem; +} + +void CreateOutputShapeAndTypeId(const CNodePtr &origin_cnode, int split_dim, int split_size, int num_split, + std::vector *new_type_ids, + std::vector> *new_output_shapes) { + MS_EXCEPTION_IF_NULL(new_type_ids); + MS_EXCEPTION_IF_NULL(new_output_shapes); + auto output_shape = AnfAlgo::GetOutputInferShape(origin_cnode, 0); + output_shape[split_dim] = split_size; + TypeId type_id = AnfAlgo::GetOutputInferDataType(origin_cnode, 0); + for (int i = 0; i < num_split; ++i) { + new_type_ids->emplace_back(type_id); + new_output_shapes->emplace_back(output_shape); + } +} + +void SetAttrAndAbstractForBaseSplitv(const CNodePtr &origin_cnode, const CNodePtr &base_splitv, + const std::vector &size_splits_base, int split_dim, int num_split) { + SetAttrForSplitVNode(base_splitv, size_splits_base, split_dim, num_split); + std::vector base_type_ids; + std::vector> base_output_shapes_base; + auto output_shape = AnfAlgo::GetOutputInferShape(origin_cnode, 0); + TypeId type_id = 
AnfAlgo::GetOutputInferDataType(origin_cnode, 0); + for (int i = 0; i < num_split; ++i) { + output_shape[split_dim] = size_splits_base[i]; + base_output_shapes_base.emplace_back(output_shape); + base_type_ids.emplace_back(type_id); + } + AnfAlgo::SetOutputInferTypeAndShape(base_type_ids, base_output_shapes_base, base_splitv.get()); +} + +AnfNodePtr DoFission(const FuncGraphPtr &func_graph, const CNodePtr &cnode, int num_split, int divisor) { + MS_EXCEPTION_IF_NULL(func_graph); + auto split_dim = AnfAlgo::GetNodeAttr(cnode, kAttrAxis); + CNodePtr base_splitv = CreateBaseSplitVNode(func_graph, cnode); + + // Create new size_splits for "size_splits" attr of each new Splitv node which has full inputs. + auto small_split_size = SizeToInt(GetSmallSplitSize(cnode, split_dim, num_split)); + std::vector size_splits_new; + for (int i = 0; i < divisor; ++i) { + size_splits_new.emplace_back(small_split_size); + } + // Create new output shape and new output type id for each new Splitv node which has full inputs. + std::vector new_type_ids; + std::vector> new_output_shapes; + CreateOutputShapeAndTypeId(cnode, split_dim, small_split_size, divisor, &new_type_ids, &new_output_shapes); + + // Create make_tuple input to create a make_tuple for replacing the old Split node. + std::vector make_tuple_inputs{NewValueNode(prim::kPrimMakeTuple)}; + // Start to divide the outputs of Split. 
+ std::vector size_splits_base; + const auto base_split_size = divisor * small_split_size; + int nodes_num = 0; + int cur_output_index = 0; + while (num_split - cur_output_index > divisor) { + CNodePtr new_splitv = CreateSplitVNode(func_graph, CreateTupleGetItem(func_graph, base_splitv, nodes_num)); + SetAttrForSplitVNode(new_splitv, size_splits_new, split_dim, divisor); + AnfAlgo::SetOutputInferTypeAndShape(new_type_ids, new_output_shapes, new_splitv.get()); + AddNewOutputs(func_graph, new_splitv, divisor, &make_tuple_inputs); + cur_output_index += divisor; + size_splits_base.emplace_back(base_split_size); + nodes_num++; + } + if (cur_output_index < num_split) { + auto last_node_num_split = num_split - cur_output_index; + if (last_node_num_split > 1) { + CNodePtr new_splitv = CreateSplitVNode(func_graph, CreateTupleGetItem(func_graph, base_splitv, nodes_num)); + std::vector size_splits_new_last; + for (int i = 0; i < last_node_num_split; ++i) { + size_splits_new_last.emplace_back(small_split_size); + } + SetAttrForSplitVNode(new_splitv, size_splits_new_last, split_dim, last_node_num_split); + // Create new output shape and new output type id for the last Splitv node + std::vector last_new_type_ids; + std::vector> last_new_output_shapes; + CreateOutputShapeAndTypeId(cnode, split_dim, small_split_size, last_node_num_split, &last_new_type_ids, + &last_new_output_shapes); + AnfAlgo::SetOutputInferTypeAndShape(last_new_type_ids, last_new_output_shapes, new_splitv.get()); + AddNewOutputs(func_graph, new_splitv, last_node_num_split, &make_tuple_inputs); + size_splits_base.emplace_back(last_node_num_split * small_split_size); + } else { + make_tuple_inputs.emplace_back(CreateTupleGetItem(func_graph, base_splitv, nodes_num)); + size_splits_base.emplace_back(small_split_size); + } + nodes_num++; + } + // Set Attr and abstract for the base splitv + SetAttrAndAbstractForBaseSplitv(cnode, base_splitv, size_splits_base, split_dim, nodes_num); + AnfNodePtr make_tuple = 
func_graph->NewCNode(make_tuple_inputs); + return make_tuple; +} +} // namespace + +const BaseRef SplitFission::DefinePattern() const { + VarPtr Xs = std::make_shared(); + auto split_prim = std::make_shared(kSplitOpName); + return VectorRef({split_prim, Xs}); +} + +const AnfNodePtr SplitFission::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node, const EquivPtr &) const { + MS_EXCEPTION_IF_NULL(node); + auto cnode = node->cast(); + MS_EXCEPTION_IF_NULL(cnode); + // Check output num + if (!AnfAlgo::HasNodeAttr(kAttrOutputNum, cnode)) { + return nullptr; + } + auto num_split = AnfAlgo::GetNodeAttr(cnode, kAttrOutputNum); + if (num_split <= outputs_divisor_) { + return nullptr; + } + return DoFission(func_graph, cnode, num_split, outputs_divisor_); +} +} // namespace opt +} // namespace mindspore diff --git a/mindspore/ccsrc/pre_activate/ascend/ir_fusion/conv_bn_fusion.h b/mindspore/ccsrc/pre_activate/ascend/ir_fission/split_fission.h similarity index 60% rename from mindspore/ccsrc/pre_activate/ascend/ir_fusion/conv_bn_fusion.h rename to mindspore/ccsrc/pre_activate/ascend/ir_fission/split_fission.h index 892e6053cf..c2763bb714 100644 --- a/mindspore/ccsrc/pre_activate/ascend/ir_fusion/conv_bn_fusion.h +++ b/mindspore/ccsrc/pre_activate/ascend/ir_fission/split_fission.h @@ -1,5 +1,5 @@ /** - * Copyright 2019 Huawei Technologies Co., Ltd + * Copyright 2020 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,22 +13,25 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - -#ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_IR_FUSION_CONV_BN_FUSION_H_ -#define MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_IR_FUSION_CONV_BN_FUSION_H_ +#ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_IR_FISSION_SPLIT_FISSION_H_ +#define MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_IR_FISSION_SPLIT_FISSION_H_ #include "pre_activate/common/optimizer.h" -#include "pre_activate/common/helper.h" namespace mindspore { namespace opt { -class ConvBnFusion : public PatternProcessPass { +constexpr int kSplitOutputsDivisor = 63; +class SplitFission : public PatternProcessPass { public: - explicit ConvBnFusion(bool multigraph = true) : PatternProcessPass("conv_bn_fusion", multigraph) {} - ~ConvBnFusion() override = default; + explicit SplitFission(bool multigraph = true) + : PatternProcessPass("split_fission", multigraph), outputs_divisor_(kSplitOutputsDivisor) {} + ~SplitFission() override = default; const BaseRef DefinePattern() const override; const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override; + + private: + int outputs_divisor_; }; } // namespace opt } // namespace mindspore -#endif // MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_IR_FUSION_CONV_BN_FUSION_H_ +#endif // MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_IR_FISSION_SPLIT_FISSION_H_ diff --git a/mindspore/ccsrc/pre_activate/ascend/ir_fission/topk_split.cc b/mindspore/ccsrc/pre_activate/ascend/ir_fission/topk_split.cc index 9abef8fa70..1cace41fc4 100644 --- a/mindspore/ccsrc/pre_activate/ascend/ir_fission/topk_split.cc +++ b/mindspore/ccsrc/pre_activate/ascend/ir_fission/topk_split.cc @@ -91,6 +91,30 @@ kernel::KernelBuildInfoPtr CreateKernelBuildInfo() { builder.SetOutputsDeviceType({kNumberTypeFloat16, kNumberTypeInt32}); return builder.Build(); } + +bool CheckInputNamesSize(const CNodePtr &cnode) { + auto input_names_vec = AnfAlgo::GetNodeAttr>(cnode, kAttrInputNames); + if (input_names_vec.size() < kTopkIndexK + 1) { + MS_LOG(INFO) << "The input k of topk has been converted to attr"; + return 
false; + } + return true; +} + +bool CheckOutputShape(const AnfNodePtr &node) { + auto shape = AnfAlgo::GetPrevNodeOutputInferShape(node, 0); + if (shape.empty()) { + MS_LOG(INFO) << "The output shape of topk to split must not be empty"; + return false; + } + auto last_dim = shape[shape.size() - 1]; + const size_t kMaxFloat16 = 65500; + if (last_dim > kMaxFloat16) { + MS_LOG(INFO) << "The last dim is more than " << kMaxFloat16 << ", switch to aicpu ops."; + return false; + } + return true; +} } // namespace const BaseRef TopKSplit::DefinePattern() const { @@ -107,16 +131,10 @@ const AnfNodePtr TopKSplit::Process(const FuncGraphPtr &func_graph, const AnfNod // set value node as topk's input auto cnode = node->cast(); MS_EXCEPTION_IF_NULL(cnode); - auto input_names_vec = AnfAlgo::GetNodeAttr>(cnode, kAttrInputNames); - if (input_names_vec.size() < kTopkIndexK + 1) { - MS_LOG(INFO) << "The input k of topk has been converted to attr"; + if (!CheckInputNamesSize(cnode)) { return nullptr; } - auto shape = AnfAlgo::GetPrevNodeOutputInferShape(node, 0); - auto last_dim = shape[shape.size() - 1]; - const size_t kMaxFloat16 = 65500; - if (last_dim > kMaxFloat16) { - MS_LOG(INFO) << "The last dim is more than 65500, switch to aicpu ops."; + if (!CheckOutputShape(cnode)) { return nullptr; } // Copy a new node to check supported. 
@@ -148,7 +166,7 @@ const AnfNodePtr TopKSplit::Process(const FuncGraphPtr &func_graph, const AnfNod auto indices_const = CreateValueNode(new_cnode); new_cnode->add_input(indices_const); MS_EXCEPTION_IF_NULL(supported_checker_); - if (!supported_checker_->CheckAiCoreSupported(new_cnode, CreateKernelBuildInfo())) { + if (!supported_checker_->CheckAICoreSupported(new_cnode, CreateKernelBuildInfo())) { MS_LOG(INFO) << "split topk failed, check to aicpu."; return nullptr; } diff --git a/mindspore/ccsrc/pre_activate/ascend/ir_fission/transdata_split.cc b/mindspore/ccsrc/pre_activate/ascend/ir_fission/transdata_split.cc index 0305104f5b..bfb7e50486 100644 --- a/mindspore/ccsrc/pre_activate/ascend/ir_fission/transdata_split.cc +++ b/mindspore/ccsrc/pre_activate/ascend/ir_fission/transdata_split.cc @@ -69,13 +69,11 @@ bool TransDataSplit::DoSplit(const FuncGraphPtr &func_graph, const AnfNodePtr &n // trans input_format to hwcn new_transdata_node = NewTransOpNode(func_graph, AnfAlgo::GetInputNode(node->cast(), 0), kernel_select_, false, prim::KPrimTransData->name()); - RefreshKernelBuildInfo(input_format, kOpFormat_HWCN, AnfAlgo::GetOutputDeviceDataType(new_transdata_node, 0), - new_transdata_node); + RefreshKernelBuildInfo(input_format, kOpFormat_HWCN, new_transdata_node); // trans hwcn to default_format new_transpose_node = NewTransOpNode(func_graph, new_transdata_node, kernel_select_, false, prim::kPrimTranspose->name()); - RefreshKernelBuildInfo(kOpFormat_HWCN, output_format, AnfAlgo::GetOutputDeviceDataType(new_transpose_node, 0), - new_transpose_node); + RefreshKernelBuildInfo(kOpFormat_HWCN, output_format, new_transpose_node); AnfAlgo::SetNodeAttr(kAttrPerm, MakeValue(std::vector{3, 2, 0, 1}), new_transpose_node); new_replace_node = new_transpose_node; } else { @@ -83,14 +81,12 @@ bool TransDataSplit::DoSplit(const FuncGraphPtr &func_graph, const AnfNodePtr &n new_transpose_node = NewTransOpNode(func_graph, AnfAlgo::GetInputNode(node->cast(), 0), kernel_select_, 
false, prim::kPrimTranspose->name()); AnfAlgo::SetNodeAttr(kAttrPerm, MakeValue(std::vector{2, 3, 1, 0}), new_transpose_node); - RefreshKernelBuildInfo(input_format, kOpFormat_HWCN, AnfAlgo::GetOutputDeviceDataType(new_transpose_node, 0), - new_transpose_node); + RefreshKernelBuildInfo(input_format, kOpFormat_HWCN, new_transpose_node); // trans hwcn to output_format new_transdata_node = NewTransOpNode(func_graph, new_transpose_node, kernel_select_, false, prim::KPrimTransData->name()); - RefreshKernelBuildInfo(kOpFormat_HWCN, output_format, AnfAlgo::GetOutputDeviceDataType(new_transdata_node, 0), - new_transdata_node); + RefreshKernelBuildInfo(kOpFormat_HWCN, output_format, new_transdata_node); new_replace_node = new_transdata_node; } FuncGraphManagerPtr manager = func_graph->manager(); diff --git a/mindspore/ccsrc/pre_activate/ascend/ir_fusion/adam_apply_one_fusion.cc b/mindspore/ccsrc/pre_activate/ascend/ir_fusion/adam_apply_one_fusion.cc index 4645167191..59be003b15 100644 --- a/mindspore/ccsrc/pre_activate/ascend/ir_fusion/adam_apply_one_fusion.cc +++ b/mindspore/ccsrc/pre_activate/ascend/ir_fusion/adam_apply_one_fusion.cc @@ -109,6 +109,9 @@ const AnfNodePtr AdamApplyOneFusion::Process(const FuncGraphPtr &func_graph, con const EquivPtr &equiv) const { MS_EXCEPTION_IF_NULL(func_graph); MS_EXCEPTION_IF_NULL(node); + if (!CheckSupportDataType(node, kFloatDataTypeSet)) { + return nullptr; + } auto new_node = CreateAdamApplyOneNode(func_graph, equiv); MS_EXCEPTION_IF_NULL(new_node); new_node->set_scope(node->scope()); diff --git a/mindspore/ccsrc/pre_activate/ascend/ir_fusion/adam_apply_one_with_decay_rule.cc b/mindspore/ccsrc/pre_activate/ascend/ir_fusion/adam_apply_one_with_decay_rule.cc index 7dc13ee7a7..f6077c95f2 100644 --- a/mindspore/ccsrc/pre_activate/ascend/ir_fusion/adam_apply_one_with_decay_rule.cc +++ b/mindspore/ccsrc/pre_activate/ascend/ir_fusion/adam_apply_one_with_decay_rule.cc @@ -146,7 +146,9 @@ const AnfNodePtr 
AdamApplyOneWithDecayRule::Process(const FuncGraphPtr &graph, c if (graph == nullptr || node == nullptr || equiv == nullptr) { return nullptr; } - + if (!CheckSupportDataType(node, kFloatDataTypeSet)) { + return nullptr; + } std::vector inputs = GetFusionNodeInputs(equiv); auto fusion_node = graph->NewCNode(inputs); MS_EXCEPTION_IF_NULL(fusion_node); diff --git a/mindspore/ccsrc/pre_activate/ascend/ir_fusion/confusion_softmax_grad_rule.cc b/mindspore/ccsrc/pre_activate/ascend/ir_fusion/confusion_softmax_grad_rule.cc index a524d694e6..9e2c6374ce 100644 --- a/mindspore/ccsrc/pre_activate/ascend/ir_fusion/confusion_softmax_grad_rule.cc +++ b/mindspore/ccsrc/pre_activate/ascend/ir_fusion/confusion_softmax_grad_rule.cc @@ -25,29 +25,8 @@ namespace mindspore { namespace opt { -namespace { -void SetAttrsForFusionNode(const AnfNodePtr &sub_anf, const AnfNodePtr &fusion_node) { - MS_EXCEPTION_IF_NULL(sub_anf); - MS_EXCEPTION_IF_NULL(fusion_node); - auto sub = sub_anf->cast(); - MS_EXCEPTION_IF_NULL(sub); - if (sub->size() != kSubInputNum) { - MS_LOG(EXCEPTION) << "Sub's size is not equal with 3"; - } - auto reduce_sum_anf = sub->input(2); - MS_EXCEPTION_IF_NULL(reduce_sum_anf); - auto reduce_sum = reduce_sum_anf->cast(); - if (reduce_sum == nullptr) { - MS_LOG(EXCEPTION) << "Sub's second input is not a cnode"; - } - AnfAlgo::CopyNodeAttr(kAttrAxis, reduce_sum, fusion_node); - AnfAlgo::CopyNodeAttr(kAttrKeepDims, reduce_sum, fusion_node); -} -} // namespace - const BaseRef ConfusionSoftmaxGradRule::DefinePattern() const { - return VectorRef( - {prim::kPrimSub, input0_, VectorRef({prim::kPrimReduceSum, VectorRef({prim::kPrimMul, input1_, input0_})})}); + return VectorRef({prim::kPrimSub, input0_, VectorRef({reduce_sum_, VectorRef({prim::kPrimMul, input1_, input0_})})}); } const AnfNodePtr ConfusionSoftmaxGradRule::Process(const FuncGraphPtr &graph, const AnfNodePtr &node, @@ -55,22 +34,28 @@ const AnfNodePtr ConfusionSoftmaxGradRule::Process(const FuncGraphPtr &graph, co 
MS_EXCEPTION_IF_NULL(graph); MS_EXCEPTION_IF_NULL(node); MS_EXCEPTION_IF_NULL(equiv); - auto input0 = utils::cast((*equiv)[input0_]); - auto input1 = utils::cast((*equiv)[input1_]); - MS_EXCEPTION_IF_NULL(input0); - MS_EXCEPTION_IF_NULL(input1); + AnfNodePtr input0 = GetAnfNodeByVar(equiv, input0_); + AnfNodePtr input1 = GetAnfNodeByVar(equiv, input1_); + AnfNodePtr sum_anf = GetAnfNodeByVar(equiv, reduce_sum_); + if (sum_anf == nullptr || !sum_anf->isa()) { + MS_LOG(WARNING) << "Matched ReduceSum is not a CNode!"; + return nullptr; + } + if (!GetBoolAttr(sum_anf, kAttrKeepDims)) { + MS_LOG(INFO) << "ReduceSum's attr keep_dims should be true if do fusion. Otherwise the calculation will be wrong"; + return nullptr; + } auto prim = std::make_shared(kConfusionSoftmaxGradOpName); MS_EXCEPTION_IF_NULL(prim); std::vector inputs = {NewValueNode(prim), input0, input1}; - auto confusion_softmax_grad = graph->NewCNode(inputs); - MS_EXCEPTION_IF_NULL(confusion_softmax_grad); - auto types = {AnfAlgo::GetOutputInferDataType(node, 0)}; - auto shapes = {AnfAlgo::GetOutputInferShape(node, 0)}; - AnfAlgo::SetOutputInferTypeAndShape(types, shapes, confusion_softmax_grad.get()); - confusion_softmax_grad->set_scope(node->scope()); - SetAttrsForFusionNode(node, confusion_softmax_grad); - return confusion_softmax_grad; + auto fusion_node = graph->NewCNode(inputs); + MS_EXCEPTION_IF_NULL(fusion_node); + fusion_node->set_abstract(node->abstract()); + fusion_node->set_scope(node->scope()); + AnfAlgo::CopyNodeAttr(kAttrAxis, sum_anf, fusion_node); + AnfAlgo::CopyNodeAttr(kAttrKeepDims, sum_anf, fusion_node); + return fusion_node; } } // namespace opt } // namespace mindspore diff --git a/mindspore/ccsrc/pre_activate/ascend/ir_fusion/confusion_softmax_grad_rule.h b/mindspore/ccsrc/pre_activate/ascend/ir_fusion/confusion_softmax_grad_rule.h index 58722e586f..a4d0d1ce7a 100644 --- a/mindspore/ccsrc/pre_activate/ascend/ir_fusion/confusion_softmax_grad_rule.h +++ 
b/mindspore/ccsrc/pre_activate/ascend/ir_fusion/confusion_softmax_grad_rule.h @@ -24,9 +24,11 @@ namespace opt { class ConfusionSoftmaxGradRule : public PatternProcessPass { public: explicit ConfusionSoftmaxGradRule(bool multigraph = true) - : PatternProcessPass("confusion_softmax_grad_rule", multigraph), - input0_(std::make_shared()), - input1_(std::make_shared()) {} + : PatternProcessPass("confusion_softmax_grad_rule", multigraph) { + input0_ = std::make_shared(); + input1_ = std::make_shared(); + reduce_sum_ = std::make_shared(std::make_shared(prim::kPrimReduceSum->name())); + } ~ConfusionSoftmaxGradRule() override = default; const BaseRef DefinePattern() const override; const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override; @@ -34,6 +36,7 @@ class ConfusionSoftmaxGradRule : public PatternProcessPass { private: VarPtr input0_; VarPtr input1_; + VarPtr reduce_sum_; }; } // namespace opt } // namespace mindspore diff --git a/mindspore/ccsrc/pre_activate/ascend/ir_fusion/conv_bn_add_relu_fusion.cc b/mindspore/ccsrc/pre_activate/ascend/ir_fusion/conv_bn_add_relu_fusion.cc deleted file mode 100644 index efee8c0eff..0000000000 --- a/mindspore/ccsrc/pre_activate/ascend/ir_fusion/conv_bn_add_relu_fusion.cc +++ /dev/null @@ -1,157 +0,0 @@ -/** - * Copyright 2019 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "pre_activate/ascend/ir_fusion/conv_bn_add_relu_fusion.h" -#include -#include -#include -#include -#include -#include "session/anf_runtime_algorithm.h" -#include "device/kernel_info.h" - -namespace mindspore { -namespace opt { -namespace { -constexpr size_t kBn2AddReluOutputNum = 4; -enum Bn2AddReluOutput { - kBn2AddReluOutput = 0, - kBn2AddReluRunningMean, - kBn2AddReluRunningVariance, - kBn2AddReluSaveInvVariance, -}; - -std::tuple GetUsedCNode(const AnfNodePtr &node) { - auto relu_cnode = CheckAnfNodeIfCNodeAndInputSize(node, kReluInputNum); - MS_EXCEPTION_IF_NULL(relu_cnode); - auto add_cnode = CheckAnfNodeIfCNodeAndInputSize(relu_cnode->input(1), kAddInputNum); - MS_EXCEPTION_IF_NULL(add_cnode); - auto add_input1_cnode = CheckAnfNodeIfCNodeAndInputSize(add_cnode->input(1), kTupleGetitemInputNum); - MS_EXCEPTION_IF_NULL(add_input1_cnode); - auto bn_cnode = CheckAnfNodeIfCNodeAndInputSize(add_input1_cnode->input(1), kBnInputNum); - MS_EXCEPTION_IF_NULL(bn_cnode); - auto conv_cnode = CheckAnfNodeIfCNodeAndInputSize(bn_cnode->input(kX), kConvInputNum); - - return std::make_tuple(conv_cnode, bn_cnode, add_cnode, relu_cnode); -} - -void CreateOutputsOfBn2AddRelu(const FuncGraphPtr &func_graph, const std::vector &conv_bn1_outputs, - const CNodePtr &bn_node, const CNodePtr &add_node, const CNodePtr &relu_node, - std::vector *bn2_add_relu_outputs) { - MS_EXCEPTION_IF_NULL(func_graph); - MS_EXCEPTION_IF_NULL(add_node); - MS_EXCEPTION_IF_NULL(relu_node); - MS_EXCEPTION_IF_NULL(bn_node); - auto prim = std::make_shared(kBN2AddReluOpName); - std::vector bn2_add_relu_inputs = {NewValueNode(prim)}; - // The inputs of bn2_add_relu are from the outputs of conv_bn1, the 2nd input of add, and the 2nd to 5th inputs of bn - (void)std::copy(conv_bn1_outputs.begin(), conv_bn1_outputs.end(), std::back_inserter(bn2_add_relu_inputs)); - bn2_add_relu_inputs.push_back(add_node->input(2)); - for (size_t i = kX + 1; i <= kVariance; i++) { - 
bn2_add_relu_inputs.push_back(bn_node->input(i)); - } - auto bn2_add_relu_cnode = func_graph->NewCNode(bn2_add_relu_inputs); - MS_EXCEPTION_IF_NULL(bn2_add_relu_cnode); - auto kernel_info = std::make_shared(); - MS_EXCEPTION_IF_NULL(kernel_info); - bn2_add_relu_cnode->set_kernel_info(kernel_info); - - // Set attr for bn2_add_relu - AnfAlgo::CopyNodeAttrs(bn_node, bn2_add_relu_cnode); - AnfAlgo::CopyNodeAttr("epsilon", "eps", bn_node, bn2_add_relu_cnode); - - // Set abstract of bn2_add_relu - auto bn_abstract_tuple = dyn_cast(bn_node->abstract()); - MS_EXCEPTION_IF_NULL(bn_abstract_tuple); - if (bn_abstract_tuple->elements().size() != kBnOutputNum) { - MS_LOG(EXCEPTION) << "Abstract tuple size of FusedBatchNorm must be " << kBnOutputNum << ", but it is " - << bn_abstract_tuple->elements().size(); - } - auto relu_abstract = relu_node->abstract(); - MS_EXCEPTION_IF_NULL(relu_abstract); - // The abstracts of node bn2_add_relu are from the some abstracts of bn and relu nodes. - AbstractBasePtrList bn2_add_relu_abstract_list{relu_abstract, bn_abstract_tuple->elements()[kRunningMean], - bn_abstract_tuple->elements()[kRunningVariance], - bn_abstract_tuple->elements()[kSaveInvVariance]}; - auto abstract_tuple = std::make_shared(bn2_add_relu_abstract_list); - MS_EXCEPTION_IF_NULL(abstract_tuple); - bn2_add_relu_cnode->set_abstract(abstract_tuple); - - CreateMultipleOutputsOfAnfNode(func_graph, bn2_add_relu_cnode, kBn2AddReluOutputNum, bn2_add_relu_outputs); -} -} // namespace - -const BaseRef ConvBnAddReluFusion::DefinePattern() const { - VarPtr X = std::make_shared(); - MS_EXCEPTION_IF_NULL(X); - VarPtr W = std::make_shared(); - MS_EXCEPTION_IF_NULL(W); - VarPtr Ys = std::make_shared(); - MS_EXCEPTION_IF_NULL(Ys); - VarPtr Zs = std::make_shared(); - MS_EXCEPTION_IF_NULL(Zs); - - return VectorRef( - {prim::kPrimRelu, - PatternListType( - {prim::kPrimTensorAdd, - PatternListType({prim::kPrimTupleGetItem, - PatternListType({prim::kPrimFusedBatchNorm, 
PatternListType({prim::kPrimConv2D, Ys}), Zs}), - W}), - X})}); -} - -const AnfNodePtr ConvBnAddReluFusion::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node, - const EquivPtr &) const { - MS_EXCEPTION_IF_NULL(func_graph); - auto manager = func_graph->manager(); - MS_EXCEPTION_IF_NULL(manager); - CNodePtr conv_cnode = nullptr; - CNodePtr bn_cnode = nullptr; - CNodePtr add_cnode = nullptr; - CNodePtr relu_cnode = nullptr; - std::tie(conv_cnode, bn_cnode, add_cnode, relu_cnode) = GetUsedCNode(node); - // Create conv_bn1 node and get outputs of conv_bn1 - std::vector conv_bn1_outputs; - CreateOutputsOfConvBn1(func_graph, conv_cnode, bn_cnode, &conv_bn1_outputs); - if (conv_bn1_outputs.size() != kConvBn1OutputNum) { - MS_LOG(EXCEPTION) << "The output size of node conv_bn1 must be " << kConvBn1OutputNum << ", but it is " - << conv_bn1_outputs.size(); - } - // Replace conv_node with the output 0 of conv_bn1 directly because the conv node may be used as input by others - (void)manager->Replace(conv_cnode, conv_bn1_outputs[kData]); - - // Create bn2_add_relu node and get outputs of bn2_add_relu - std::vector bn2_add_relu_outputs; - CreateOutputsOfBn2AddRelu(func_graph, conv_bn1_outputs, bn_cnode, add_cnode, relu_cnode, &bn2_add_relu_outputs); - if (bn2_add_relu_outputs.size() != kBn2AddReluOutputNum) { - MS_LOG(EXCEPTION) << "The output size of node bn2_add_relu must be " << kBn2AddReluOutputNum << ", but it is " - << bn2_add_relu_outputs.size(); - } - - // Create a make_tuple to replace the bn node here, the outputs are from node bn2_add_relu and conv_bn1. 
- std::vector make_tuple_inputs{NewValueNode(prim::kPrimMakeTuple), - bn2_add_relu_outputs[kBn2AddReluOutput], - bn2_add_relu_outputs[kBn2AddReluRunningMean], - bn2_add_relu_outputs[kBn2AddReluRunningVariance], - conv_bn1_outputs[kMean], - bn2_add_relu_outputs[kBn2AddReluSaveInvVariance]}; - auto make_tuple = func_graph->NewCNode(make_tuple_inputs); - (void)manager->Replace(bn_cnode, make_tuple); - return bn2_add_relu_outputs[kBn2AddReluOutput]; -} -} // namespace opt -} // namespace mindspore diff --git a/mindspore/ccsrc/pre_activate/ascend/ir_fusion/conv_bn_fusion.cc b/mindspore/ccsrc/pre_activate/ascend/ir_fusion/conv_bn_fusion.cc deleted file mode 100644 index 70a7b53809..0000000000 --- a/mindspore/ccsrc/pre_activate/ascend/ir_fusion/conv_bn_fusion.cc +++ /dev/null @@ -1,93 +0,0 @@ -/** - * Copyright 2019 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "pre_activate/ascend/ir_fusion/conv_bn_fusion.h" -#include -#include -#include "session/anf_runtime_algorithm.h" -#include "device/kernel_info.h" - -namespace mindspore { -namespace opt { -const BaseRef ConvBnFusion::DefinePattern() const { - VarPtr Xs = std::make_shared(); - MS_EXCEPTION_IF_NULL(Xs); - VarPtr Ys = std::make_shared(); - MS_EXCEPTION_IF_NULL(Ys); - return VectorRef({prim::kPrimFusedBatchNorm, PatternListType({prim::kPrimConv2D, Xs}), Ys}); -} - -const AnfNodePtr ConvBnFusion::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node, const EquivPtr &) const { - MS_EXCEPTION_IF_NULL(func_graph); - MS_EXCEPTION_IF_NULL(node); - if (!node->isa()) { - MS_LOG(EXCEPTION) << "The bn node is expected to be a cnode"; - } - auto bn_cnode = node->cast(); - MS_EXCEPTION_IF_NULL(bn_cnode); - if (bn_cnode->inputs().size() < kVariance + 1) { - auto op_name = AnfAlgo::GetCNodeName(bn_cnode); - MS_LOG(EXCEPTION) << "op[" << op_name << "] has less than " << kVariance + 1 << " inputs."; - } - AnfNodePtr conv_node = bn_cnode->input(kX); - MS_EXCEPTION_IF_NULL(conv_node); - if (!conv_node->isa()) { - MS_LOG(EXCEPTION) << "The conv node is expected to be a cnode"; - } - auto conv_cnode = conv_node->cast(); - MS_EXCEPTION_IF_NULL(conv_cnode); - auto manager = func_graph->manager(); - MS_EXCEPTION_IF_NULL(manager); - // Create conv_bn1 node and get outputs of conv_bn1 - std::vector conv_bn1_outputs; - CreateOutputsOfConvBn1(func_graph, conv_cnode, bn_cnode, &conv_bn1_outputs); - if (conv_bn1_outputs.size() != kConvBn1OutputNum) { - MS_LOG(EXCEPTION) << "The output size of node conv_bn1 must be " << kConvBn1OutputNum << ", but it is " - << conv_bn1_outputs.size(); - } - // Replace conv_node with the output 0 of conv_bn1 directly because the conv node may be used as input by other - (void)manager->Replace(conv_node, conv_bn1_outputs[kData]); - - // Create bn2 node and get outputs of bn2 - std::vector bn2_outputs; - std::vector bn1_outputs = 
{conv_bn1_outputs[2], conv_bn1_outputs[1]}; - CreateOutputsOfFusedBn2(func_graph, bn1_outputs, bn_cnode, &bn2_outputs); - if (bn2_outputs.size() != kBN2OutputNum) { - MS_LOG(EXCEPTION) << "The output size of node fusedbn2 must be " << kBN2OutputNum << ", but it is " - << bn2_outputs.size(); - } - - // Create bn3 node and get outputs of bn3 - std::vector bn3_outputs; - CreateOutputsOfFusedBn3(func_graph, conv_bn1_outputs[0], bn1_outputs, bn2_outputs, bn_cnode, &bn3_outputs); - - if (bn3_outputs.size() != kBN3OutputNum) { - MS_LOG(EXCEPTION) << "The output size of node fusedbn3 must be " << kBN3OutputNum << ", but it is " - << bn3_outputs.size(); - } - - // Return a make_tuple to replace the bn node here, the outputs are from node bn2 and conv_bn1. - std::vector make_tuple_inputs{NewValueNode(prim::kPrimMakeTuple), - bn3_outputs[0], - bn2_outputs[1], - bn2_outputs[2], - conv_bn1_outputs[2], - bn2_outputs[0]}; - - return func_graph->NewCNode(make_tuple_inputs); -} -} // namespace opt -} // namespace mindspore diff --git a/mindspore/ccsrc/pre_activate/ascend/ir_fusion/conv_bn_relu_fusion.cc b/mindspore/ccsrc/pre_activate/ascend/ir_fusion/conv_bn_relu_fusion.cc deleted file mode 100644 index c5cea86b7f..0000000000 --- a/mindspore/ccsrc/pre_activate/ascend/ir_fusion/conv_bn_relu_fusion.cc +++ /dev/null @@ -1,140 +0,0 @@ -/** - * Copyright 2019 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include "pre_activate/ascend/ir_fusion/conv_bn_relu_fusion.h" - -#include -#include -#include -#include -#include - -#include "utils/utils.h" -#include "session/anf_runtime_algorithm.h" -#include "common/utils.h" -#include "device/kernel_info.h" - -namespace mindspore { -namespace opt { -namespace { -std::tuple GetPrevNodes(const AnfNodePtr &node) { - MS_EXCEPTION_IF_NULL(node); - auto relu_node = node->cast(); - MS_EXCEPTION_IF_NULL(relu_node); - if (relu_node->inputs().size() < kReluInputNum) { - MS_LOG(EXCEPTION) << "relu has wrong input size"; - } - auto tuple_getitem_anf = relu_node->input(1); - MS_EXCEPTION_IF_NULL(tuple_getitem_anf); - auto tuple_getitem = tuple_getitem_anf->cast(); - MS_EXCEPTION_IF_NULL(tuple_getitem); - if (tuple_getitem->inputs().size() < kTupleGetitemInputNum) { - MS_LOG(EXCEPTION) << "tuple getitem has wrong input size"; - } - auto bn_node_anf = tuple_getitem->input(1); - MS_EXCEPTION_IF_NULL(bn_node_anf); - auto bn_node = bn_node_anf->cast(); - MS_EXCEPTION_IF_NULL(bn_node); - if (bn_node->inputs().size() < kBnInputNum) { - MS_LOG(EXCEPTION) << "bn_node has wrong input size"; - } - auto conv_node_anf = bn_node->input(1); - MS_EXCEPTION_IF_NULL(conv_node_anf); - CNodePtr conv_node = conv_node_anf->cast(); - MS_EXCEPTION_IF_NULL(conv_node); - return std::make_tuple(bn_node, bn_node, conv_node); -} - -void CreateOutputsOfBn2Relu(const FuncGraphPtr &func_graph, const std::vector &conv_bn1_outputs, - const CNodePtr &bn_node, const CNodePtr &relu_node, - std::vector *bn2_relu_outputs) { - MS_EXCEPTION_IF_NULL(func_graph); - MS_EXCEPTION_IF_NULL(bn_node); - MS_EXCEPTION_IF_NULL(relu_node); - // The inputs of bn2_relu are from the outputs of conv_bn1 and the 2nd to 5th inputs of bn - std::vector bn2_relu_inputs = {NewValueNode(std::make_shared(kBN2ReLUOpName))}; - (void)std::copy(conv_bn1_outputs.begin(), conv_bn1_outputs.end(), std::back_inserter(bn2_relu_inputs)); - for (size_t i = 2; i <= 5; i++) { - 
bn2_relu_inputs.push_back(bn_node->input(i)); - } - auto bn2_relu = func_graph->NewCNode(bn2_relu_inputs); - MS_EXCEPTION_IF_NULL(bn2_relu); - auto kernel_info = std::make_shared(); - MS_EXCEPTION_IF_NULL(kernel_info); - bn2_relu->set_kernel_info(kernel_info); - auto types = {AnfAlgo::GetOutputInferDataType(relu_node, 0), AnfAlgo::GetOutputInferDataType(bn_node, 1), - AnfAlgo::GetOutputInferDataType(bn_node, 2), AnfAlgo::GetOutputInferDataType(bn_node, 4)}; - auto shapes = {AnfAlgo::GetOutputInferShape(relu_node, 0), AnfAlgo::GetOutputInferShape(bn_node, 1), - AnfAlgo::GetOutputInferShape(bn_node, 2), AnfAlgo::GetOutputInferShape(bn_node, 4)}; - AnfAlgo::SetOutputInferTypeAndShape(types, shapes, bn2_relu.get()); - // Set attr for bn2_add_relu - AnfAlgo::CopyNodeAttrs(bn_node, bn2_relu); - AnfAlgo::CopyNodeAttr("epsilon", "eps", bn_node, bn2_relu); - - CreateMultipleOutputsOfAnfNode(func_graph, bn2_relu, kBn2ReluOutputNum, bn2_relu_outputs); -} -} // namespace - -const BaseRef ConvBnReluFusion::DefinePattern() const { - VarPtr Xs = std::make_shared(); - VarPtr Ys = std::make_shared(); - VarPtr Z = std::make_shared(); - MS_EXCEPTION_IF_NULL(Xs); - MS_EXCEPTION_IF_NULL(Ys); - MS_EXCEPTION_IF_NULL(Z); - return VectorRef( - {prim::kPrimRelu, - PatternListType({prim::kPrimTupleGetItem, - PatternListType({prim::kPrimFusedBatchNorm, PatternListType({prim::kPrimConv2D, Xs}), Ys}), Z})}); -} - -const AnfNodePtr ConvBnReluFusion::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node, - const EquivPtr &) const { - MS_EXCEPTION_IF_NULL(func_graph); - MS_EXCEPTION_IF_NULL(node); - - CNodePtr relu_node = nullptr; - CNodePtr bn_node = nullptr; - CNodePtr conv_node = nullptr; - std::tie(relu_node, bn_node, conv_node) = GetPrevNodes(node); - - auto manager = func_graph->manager(); - MS_EXCEPTION_IF_NULL(manager); - - std::vector conv_bn1_outputs; - CreateOutputsOfConvBn1(func_graph, conv_node, bn_node, &conv_bn1_outputs); - if (conv_bn1_outputs.size() != kConvBn1OutputNum) 
{ - MS_LOG(EXCEPTION) << "conv_bn1 outputs has wrong size: " << conv_bn1_outputs.size(); - } - (void)manager->Replace(conv_node, conv_bn1_outputs[0]); - - std::vector bn2_relu_outputs; - CreateOutputsOfBn2Relu(func_graph, conv_bn1_outputs, bn_node, relu_node, &bn2_relu_outputs); - if (bn2_relu_outputs.size() != kBn2ReluOutputNum) { - MS_LOG(EXCEPTION) << "bn2_relu outputs has wrong size: " << bn2_relu_outputs.size(); - } - std::vector make_tuple_inputs{NewValueNode(prim::kPrimMakeTuple), - bn2_relu_outputs[0], - bn2_relu_outputs[1], - bn2_relu_outputs[2], - conv_bn1_outputs[2], - bn2_relu_outputs[3]}; - auto make_tuple = func_graph->NewCNode(make_tuple_inputs); - MS_EXCEPTION_IF_NULL(make_tuple); - (void)manager->Replace(bn_node, make_tuple); - return bn2_relu_outputs[0]; -} -} // namespace opt -} // namespace mindspore diff --git a/mindspore/ccsrc/pre_activate/ascend/ir_fusion/fused_batch_norm_fusion.cc b/mindspore/ccsrc/pre_activate/ascend/ir_fusion/fused_batch_norm_fusion.cc index 03428e6357..efc9ee7934 100644 --- a/mindspore/ccsrc/pre_activate/ascend/ir_fusion/fused_batch_norm_fusion.cc +++ b/mindspore/ccsrc/pre_activate/ascend/ir_fusion/fused_batch_norm_fusion.cc @@ -291,7 +291,7 @@ const AnfNodePtr FusedBatchNormFusion::Process(const FuncGraphPtr &func_graph, c return bn_training_update_outputs[0]; } -const BaseRef FusedBatchNormMixPrecisionFusion::DefinePattern() const { +const BaseRef FusedBatchNormMixPrecisionFusion0::DefinePattern() const { std::shared_ptr Xs = std::make_shared(); VarPtr index0 = std::make_shared(IsC); VarPtr index1 = std::make_shared(IsC); @@ -313,5 +313,28 @@ const BaseRef FusedBatchNormMixPrecisionFusion::DefinePattern() const { VectorRef depend0 = VectorRef({prim::kPrimDepend, tuple_getitem0, assign_sub0}); return VectorRef({prim::kPrimDepend, depend0, assign_sub1}); } + +const BaseRef FusedBatchNormMixPrecisionFusion1::DefinePattern() const { + std::shared_ptr Xs = std::make_shared(); + VarPtr index0 = std::make_shared(IsC); + VarPtr 
index1 = std::make_shared(IsC); + VarPtr index2 = std::make_shared(IsC); + VectorRef batch_norm = VectorRef({batch_norm_var_, data_input0_var_, data_input1_var_, data_input2_var_, Xs}); + VectorRef tuple_getitem0 = VectorRef({prim::kPrimTupleGetItem, batch_norm, index0}); + VectorRef tuple_getitem1 = VectorRef({prim::kPrimTupleGetItem, batch_norm, index1}); + VectorRef tuple_getitem2 = VectorRef({prim::kPrimTupleGetItem, batch_norm, index2}); + VectorRef cast_variable_input0 = VectorRef({prim::kPrimCast, variable_input0_var_}); + VectorRef cast_variable_input1 = VectorRef({prim::kPrimCast, variable_input1_var_}); + VectorRef sub0 = VectorRef({prim::kPrimSub, cast_variable_input0, tuple_getitem1}); + VectorRef sub1 = VectorRef({prim::kPrimSub, cast_variable_input1, tuple_getitem2}); + VectorRef cast0 = VectorRef({prim::kPrimCast, sub0}); + VectorRef cast1 = VectorRef({prim::kPrimCast, sub1}); + VectorRef mul0 = VectorRef({prim::kPrimMul, cast0, constant_input0_var_}); + VectorRef mul1 = VectorRef({prim::kPrimMul, cast1, constant_input1_var_}); + VectorRef assign_sub0 = VectorRef({prim::kPrimAssignSub, variable_input0_var_, mul0}); + VectorRef assign_sub1 = VectorRef({prim::kPrimAssignSub, variable_input1_var_, mul1}); + VectorRef depend0 = VectorRef({prim::kPrimDepend, tuple_getitem0, assign_sub0}); + return VectorRef({prim::kPrimDepend, depend0, assign_sub1}); +} } // namespace opt } // namespace mindspore diff --git a/mindspore/ccsrc/pre_activate/ascend/ir_fusion/fused_batch_norm_fusion.h b/mindspore/ccsrc/pre_activate/ascend/ir_fusion/fused_batch_norm_fusion.h index e4b31ca5f4..f476e96062 100644 --- a/mindspore/ccsrc/pre_activate/ascend/ir_fusion/fused_batch_norm_fusion.h +++ b/mindspore/ccsrc/pre_activate/ascend/ir_fusion/fused_batch_norm_fusion.h @@ -61,12 +61,21 @@ class FusedBatchNormFusion : public PatternProcessPass { VarPtr batch_norm_var_; }; -class FusedBatchNormMixPrecisionFusion : public FusedBatchNormFusion { +class FusedBatchNormMixPrecisionFusion0 : 
public FusedBatchNormFusion { public: - explicit FusedBatchNormMixPrecisionFusion(bool multigraph = true) + explicit FusedBatchNormMixPrecisionFusion0(bool multigraph = true) : FusedBatchNormFusion("fused_batch_norm_mix_precision_fusion", multigraph) {} - ~FusedBatchNormMixPrecisionFusion() override = default; + ~FusedBatchNormMixPrecisionFusion0() override = default; + const BaseRef DefinePattern() const override; +}; + +class FusedBatchNormMixPrecisionFusion1 : public FusedBatchNormFusion { + public: + explicit FusedBatchNormMixPrecisionFusion1(bool multigraph = true) + : FusedBatchNormFusion("fused_batch_norm_mix_precision_fusion", multigraph) {} + + ~FusedBatchNormMixPrecisionFusion1() override = default; const BaseRef DefinePattern() const override; }; } // namespace opt diff --git a/mindspore/ccsrc/pre_activate/ascend/ir_fusion/lamb_next_mv_rule.cc b/mindspore/ccsrc/pre_activate/ascend/ir_fusion/lamb_next_mv_rule.cc index 5f0b869644..42e37df3e4 100644 --- a/mindspore/ccsrc/pre_activate/ascend/ir_fusion/lamb_next_mv_rule.cc +++ b/mindspore/ccsrc/pre_activate/ascend/ir_fusion/lamb_next_mv_rule.cc @@ -108,6 +108,9 @@ bool LambNextMVRule::IsShareNodes(const EquivPtr &equiv1, const EquivPtr &equiv2 const AnfNodePtr LambNextMVRule::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node, const EquivPtr &equiv) const { + if (!CheckSupportDataType(node, kFloatDataTypeSet)) { + return nullptr; + } std::vector old_pattern_outputs; if (!IsRuleMatched(func_graph, node, equiv, &old_pattern_outputs)) { return nullptr; diff --git a/mindspore/ccsrc/pre_activate/ascend/ir_fusion/lamb_next_mv_with_decay_rule.cc b/mindspore/ccsrc/pre_activate/ascend/ir_fusion/lamb_next_mv_with_decay_rule.cc index e0389309a1..0e3cd28a66 100644 --- a/mindspore/ccsrc/pre_activate/ascend/ir_fusion/lamb_next_mv_with_decay_rule.cc +++ b/mindspore/ccsrc/pre_activate/ascend/ir_fusion/lamb_next_mv_with_decay_rule.cc @@ -88,6 +88,9 @@ const AnfNodePtr LambNextMVWithDecayRule::Process(const 
FuncGraphPtr &func_graph const EquivPtr &equiv) const { MS_EXCEPTION_IF_NULL(func_graph); MS_EXCEPTION_IF_NULL(node); + if (!CheckSupportDataType(node, kFloatDataTypeSet)) { + return nullptr; + } AnfNodePtr mul4 = GetAnfNodeByVar(equiv, mul4_var_); MS_EXCEPTION_IF_NULL(mul4); // Get add3 and match the add3 pattern diff --git a/mindspore/ccsrc/pre_activate/ascend/ir_fusion/lamb_next_mv_with_decay_v1_rule.cc b/mindspore/ccsrc/pre_activate/ascend/ir_fusion/lamb_next_mv_with_decay_v1_rule.cc index 9efd503363..26828f2137 100644 --- a/mindspore/ccsrc/pre_activate/ascend/ir_fusion/lamb_next_mv_with_decay_v1_rule.cc +++ b/mindspore/ccsrc/pre_activate/ascend/ir_fusion/lamb_next_mv_with_decay_v1_rule.cc @@ -153,6 +153,9 @@ const AnfNodePtr LambNextMVWithDecayV1Rule::Process(const FuncGraphPtr &func_gra if (func_graph == nullptr || node == nullptr || equiv == nullptr) { return nullptr; } + if (!CheckSupportDataType(node, kFloatDataTypeSet)) { + return nullptr; + } AnfNodePtr mul4 = nullptr; AnfNodePtr real_div0 = nullptr; AnfNodePtr real_div1 = nullptr; diff --git a/mindspore/ccsrc/pre_activate/ascend/ir_fusion/lamb_next_right_rule.cc b/mindspore/ccsrc/pre_activate/ascend/ir_fusion/lamb_next_right_rule.cc index 68baeeed99..5065c4c5ba 100644 --- a/mindspore/ccsrc/pre_activate/ascend/ir_fusion/lamb_next_right_rule.cc +++ b/mindspore/ccsrc/pre_activate/ascend/ir_fusion/lamb_next_right_rule.cc @@ -61,6 +61,9 @@ const AnfNodePtr LambNextRightRule::Process(const FuncGraphPtr &func_graph, cons const EquivPtr &equiv) const { MS_EXCEPTION_IF_NULL(func_graph); MS_EXCEPTION_IF_NULL(node); + if (!CheckSupportDataType(node, kFloatDataTypeSet)) { + return nullptr; + } auto new_node = CreateLambNextRightNode(func_graph, equiv); MS_EXCEPTION_IF_NULL(new_node); // Set abstract of new node diff --git a/mindspore/ccsrc/pre_activate/ascend/ir_fusion/lamb_update_with_lr_rule_fusion.cc b/mindspore/ccsrc/pre_activate/ascend/ir_fusion/lamb_update_with_lr_rule_fusion.cc index 16a43e2072..b5b6d2bb08 
100644 --- a/mindspore/ccsrc/pre_activate/ascend/ir_fusion/lamb_update_with_lr_rule_fusion.cc +++ b/mindspore/ccsrc/pre_activate/ascend/ir_fusion/lamb_update_with_lr_rule_fusion.cc @@ -50,6 +50,9 @@ const AnfNodePtr LambUpdateWithLRRuleFusion::Process(const FuncGraphPtr &graph, MS_EXCEPTION_IF_NULL(graph); MS_EXCEPTION_IF_NULL(node); MS_EXCEPTION_IF_NULL(equiv); + if (!CheckSupportDataType(node, kFloatDataTypeSet)) { + return nullptr; + } auto input0 = utils::cast((*equiv)[input0_]); auto input1 = utils::cast((*equiv)[input1_]); auto input2 = utils::cast((*equiv)[input2_]); diff --git a/mindspore/ccsrc/pre_activate/ascend/ir_fusion/lamb_update_with_lr_v2.cc b/mindspore/ccsrc/pre_activate/ascend/ir_fusion/lamb_update_with_lr_v2.cc index 069581b6e4..43e1872163 100644 --- a/mindspore/ccsrc/pre_activate/ascend/ir_fusion/lamb_update_with_lr_v2.cc +++ b/mindspore/ccsrc/pre_activate/ascend/ir_fusion/lamb_update_with_lr_v2.cc @@ -42,6 +42,9 @@ const AnfNodePtr LambUpdateWithLrV2::Process(const FuncGraphPtr &func_graph, con const EquivPtr &equiv) const { MS_EXCEPTION_IF_NULL(func_graph); MS_EXCEPTION_IF_NULL(equiv); + if (!CheckSupportDataType(node, kFloatDataTypeSet)) { + return nullptr; + } auto prim = std::make_shared(kLambUpdateWithLrV2OpName); std::vector inputs = {NewValueNode(prim)}; (void)std::transform(input_varptr_.begin(), input_varptr_.end(), std::back_inserter(inputs), diff --git a/mindspore/ccsrc/pre_activate/ascend/ir_fusion/layer_norm_beta_gamma_backprop_fusion.cc b/mindspore/ccsrc/pre_activate/ascend/ir_fusion/layer_norm_beta_gamma_backprop_fusion.cc index fba1ab40af..b16387d8f1 100644 --- a/mindspore/ccsrc/pre_activate/ascend/ir_fusion/layer_norm_beta_gamma_backprop_fusion.cc +++ b/mindspore/ccsrc/pre_activate/ascend/ir_fusion/layer_norm_beta_gamma_backprop_fusion.cc @@ -121,6 +121,9 @@ const AnfNodePtr LayerNormBetaGammaBackpropFusion::Process(const FuncGraphPtr &f if (node == nullptr || !node->isa()) { return nullptr; } + if 
(AnfAlgo::IsGraphKernel(node)) { + return nullptr; + } auto cnode = node->cast(); MS_EXCEPTION_IF_NULL(cnode); std::vector cast_nodes; diff --git a/mindspore/ccsrc/pre_activate/ascend/ir_fusion/momentum_lossscale_fusion.cc b/mindspore/ccsrc/pre_activate/ascend/ir_fusion/momentum_lossscale_fusion.cc index 6b751873d6..e7a73a9c7f 100644 --- a/mindspore/ccsrc/pre_activate/ascend/ir_fusion/momentum_lossscale_fusion.cc +++ b/mindspore/ccsrc/pre_activate/ascend/ir_fusion/momentum_lossscale_fusion.cc @@ -32,19 +32,6 @@ bool CheckValueNodeInputOfMul(const AnfNodePtr &node) { std::vector mul_input_shape = AnfAlgo::GetOutputInferShape(node, 0); return mul_input_shape.empty() || (mul_input_shape.size() == 1 && mul_input_shape[0] == 1); } -void AddInputToOutput(const FuncGraphPtr &func_graph, const CNodePtr &old_cnode, const AnfNodePtr &new_node, - std::vector *new_outputs) { - MS_EXCEPTION_IF_NULL(old_cnode); - MS_EXCEPTION_IF_NULL(new_node); - MS_EXCEPTION_IF_NULL(new_outputs); - auto node_to_output = old_cnode->input(kAccumIndex + 1); - MS_EXCEPTION_IF_NULL(node_to_output); - AbstractBasePtrList abstract_list{old_cnode->abstract(), node_to_output->abstract()}; - auto abstract_tuple = std::make_shared(abstract_list); - new_node->set_abstract(abstract_tuple); - // Create Output - CreateMultipleOutputsOfAnfNode(func_graph, new_node, kFusedMulApplyMomentumOutputNum, new_outputs); -} } // namespace const BaseRef MomentumLossscaleFusion::DefinePattern() const { @@ -94,14 +81,9 @@ const AnfNodePtr MomentumLossscaleFusion::Process(const FuncGraphPtr &func_graph input_names_value[3] = "x1"; input_names_value.emplace_back("x2"); AnfAlgo::SetNodeAttr(kAttrInputNames, MakeValue(input_names_value), new_node); + new_node->set_abstract(node->abstract()); new_node->set_scope(node->scope()); - // Create Outputs - std::vector new_outputs; - AddInputToOutput(func_graph, cnode, new_node, &new_outputs); - if (new_outputs.size() != kFusedMulApplyMomentumOutputNum) { - MS_LOG(EXCEPTION) << "Failed 
to create outputs of " << new_node->DebugString(); - } - return new_outputs[0]; + return new_node; } } // namespace opt } // namespace mindspore diff --git a/mindspore/ccsrc/pre_activate/ascend/ir_fusion/remove_reshape_pair.cc b/mindspore/ccsrc/pre_activate/ascend/ir_fusion/remove_reshape_pair.cc index 5e265f2cf1..fa2815ff62 100644 --- a/mindspore/ccsrc/pre_activate/ascend/ir_fusion/remove_reshape_pair.cc +++ b/mindspore/ccsrc/pre_activate/ascend/ir_fusion/remove_reshape_pair.cc @@ -23,33 +23,33 @@ namespace mindspore { namespace opt { const BaseRef RemoveReshapePair::DefinePattern() const { - const auto prim_reshape = std::make_shared(prim::kPrimReshape->name()); - VectorRef reshape({prim_reshape, input_varptr_}); - - return VectorRef({prim::kPrimReshape, reshape}); + VarPtr X = std::make_shared(); + MS_EXCEPTION_IF_NULL(X); + return VectorRef({prim::kPrimReshape, VectorRef({prim::kPrimReshape, X})}); } const AnfNodePtr RemoveReshapePair::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node, const EquivPtr &equiv) const { MS_EXCEPTION_IF_NULL(func_graph); MS_EXCEPTION_IF_NULL(equiv); - auto manager = func_graph->manager(); - MS_EXCEPTION_IF_NULL(manager); auto reshape_op_1 = CheckAnfNodeIfCNodeAndInputSize(node, kBackendReshapeInputNum); MS_EXCEPTION_IF_NULL(reshape_op_1); // If reshape operator used by more than one other operators, reshape operator cant not be deleted directly - auto users = manager->node_users()[reshape_op_1]; - if (users.size() > 1) { + if (IsUsedByOthers(func_graph, reshape_op_1)) { return nullptr; } auto reshape_op_2 = CheckAnfNodeIfCNodeAndInputSize(reshape_op_1->input(1), kBackendReshapeInputNum); MS_EXCEPTION_IF_NULL(reshape_op_2); - users = manager->node_users()[reshape_op_2]; - if (users.size() > 1) { + if (IsUsedByOthers(func_graph, reshape_op_2)) { return nullptr; } - auto input_node = reshape_op_2->input(1); - return input_node; + auto output_shape = AnfAlgo::GetOutputDeviceShape(reshape_op_2, 0); + auto input_shape = 
AnfAlgo::GetInputDeviceShape(reshape_op_1, 0); + if (input_shape == output_shape) { + auto input_node = reshape_op_2->input(1); + return input_node; + } + return nullptr; } } // namespace opt } // namespace mindspore diff --git a/mindspore/ccsrc/pre_activate/ascend/ir_fusion/remove_reshape_pair.h b/mindspore/ccsrc/pre_activate/ascend/ir_fusion/remove_reshape_pair.h index a284f4eaa9..ddb25df70c 100644 --- a/mindspore/ccsrc/pre_activate/ascend/ir_fusion/remove_reshape_pair.h +++ b/mindspore/ccsrc/pre_activate/ascend/ir_fusion/remove_reshape_pair.h @@ -28,15 +28,10 @@ namespace mindspore { namespace opt { class RemoveReshapePair : public PatternProcessPass { public: - explicit RemoveReshapePair(bool multigraph = true) : PatternProcessPass("remove_reshape_pair", multigraph) { - input_varptr_ = std::make_shared(); - } + explicit RemoveReshapePair(bool multigraph = true) : PatternProcessPass("remove_reshape_pair", multigraph) {} ~RemoveReshapePair() override = default; const BaseRef DefinePattern() const override; const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override; - - private: - VarPtr input_varptr_; }; } // namespace opt } // namespace mindspore diff --git a/mindspore/ccsrc/pre_activate/ascend/ir_fusion/softmax_grad_ext_fusion.cc b/mindspore/ccsrc/pre_activate/ascend/ir_fusion/softmax_grad_ext_fusion.cc index ccb0cbfcb8..f95406e5e1 100644 --- a/mindspore/ccsrc/pre_activate/ascend/ir_fusion/softmax_grad_ext_fusion.cc +++ b/mindspore/ccsrc/pre_activate/ascend/ir_fusion/softmax_grad_ext_fusion.cc @@ -31,6 +31,24 @@ const BaseRef SoftmaxGradExtFusion::DefinePattern() const { return mul_grad; } +const BaseRef SoftmaxGradExtFusionV2::DefinePattern() const { + VectorRef mul({prim::kPrimMul, input1_, input0_}); + VectorRef sum({sum_var_, mul}); + VectorRef sub({prim::kPrimSub, input0_, sum}); + VectorRef mul1({prim::kPrimMul, input1_, sub}); + VectorRef mul_grad({prim::kPrimMul, input2_, mul1}); + return mul_grad; +} + +const 
BaseRef SoftmaxGradExtFusionV3::DefinePattern() const { + VectorRef mul({prim::kPrimMul, input1_, input0_}); + VectorRef sum({sum_var_, mul}); + VectorRef sub({prim::kPrimSub, input0_, sum}); + VectorRef mul1({prim::kPrimMul, input1_, sub}); + VectorRef mul_grad({prim::kPrimMul, mul1, input2_}); + return mul_grad; +} + const AnfNodePtr SoftmaxGradExtFusion::Process(const FuncGraphPtr &graph, const AnfNodePtr &node, const EquivPtr &equiv) const { MS_EXCEPTION_IF_NULL(graph); @@ -40,13 +58,17 @@ const AnfNodePtr SoftmaxGradExtFusion::Process(const FuncGraphPtr &graph, const auto input1 = GetAnfNodeByVar(equiv, input1_); auto input2 = GetAnfNodeByVar(equiv, input2_); auto sum = GetAnfNodeByVar(equiv, sum_var_); + if (!GetBoolAttr(sum, kAttrKeepDims)) { + MS_LOG(INFO) << "sum's attr keep_dims should be true if do fusion"; + return nullptr; + } auto prim = std::make_shared(kSoftmaxGradExtOpName); auto fusion_node = graph->NewCNode({NewValueNode(prim), input0, input1, input2}); MS_EXCEPTION_IF_NULL(fusion_node); fusion_node->set_scope(node->scope()); fusion_node->set_abstract(node->abstract()); - AnfAlgo::CopyNodeAttr(kAttrKeepDims, sum, fusion_node); + AnfAlgo::CopyNodeAttr(kAttrKeepDims, "keepdims", sum, fusion_node); AnfAlgo::CopyNodeAttr(kAttrAxis, sum, fusion_node); return fusion_node; } diff --git a/mindspore/ccsrc/pre_activate/ascend/ir_fusion/softmax_grad_ext_fusion.h b/mindspore/ccsrc/pre_activate/ascend/ir_fusion/softmax_grad_ext_fusion.h index 70c5658e60..59032e6973 100644 --- a/mindspore/ccsrc/pre_activate/ascend/ir_fusion/softmax_grad_ext_fusion.h +++ b/mindspore/ccsrc/pre_activate/ascend/ir_fusion/softmax_grad_ext_fusion.h @@ -17,13 +17,15 @@ #define MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_IR_FUSION_SOFTMAX_GRAD_EXT_FUSION_H_ #include +#include #include "pre_activate/common/optimizer.h" namespace mindspore { namespace opt { class SoftmaxGradExtFusion : public PatternProcessPass { public: - explicit SoftmaxGradExtFusion(bool multigraph = true) : 
PatternProcessPass("softmax_grad_ext_fusion", multigraph) { + explicit SoftmaxGradExtFusion(const std::string &name = "softmax_grad_ext_fusion", bool multigraph = true) + : PatternProcessPass(name, multigraph) { input0_ = std::make_shared(); input1_ = std::make_shared(); input2_ = std::make_shared(); @@ -33,12 +35,28 @@ class SoftmaxGradExtFusion : public PatternProcessPass { const BaseRef DefinePattern() const override; const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override; - private: + protected: VarPtr input0_; VarPtr input1_; VarPtr input2_; VarPtr sum_var_; }; + +class SoftmaxGradExtFusionV2 : public SoftmaxGradExtFusion { + public: + explicit SoftmaxGradExtFusionV2(bool multigraph = true) + : SoftmaxGradExtFusion("softmax_grad_ext_fusion_v2", multigraph) {} + ~SoftmaxGradExtFusionV2() override = default; + const BaseRef DefinePattern() const override; +}; + +class SoftmaxGradExtFusionV3 : public SoftmaxGradExtFusion { + public: + explicit SoftmaxGradExtFusionV3(bool multigraph = true) + : SoftmaxGradExtFusion("softmax_grad_ext_fusion_v3", multigraph) {} + ~SoftmaxGradExtFusionV3() override = default; + const BaseRef DefinePattern() const override; +}; } // namespace opt } // namespace mindspore #endif // MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_IR_FUSION_SOFTMAX_GRAD_EXT_FUSION_H_ diff --git a/mindspore/ccsrc/pre_activate/ascend/ir_fusion/square_sum_fusion.cc b/mindspore/ccsrc/pre_activate/ascend/ir_fusion/square_sum_fusion.cc index c3884ff70a..6261b63882 100644 --- a/mindspore/ccsrc/pre_activate/ascend/ir_fusion/square_sum_fusion.cc +++ b/mindspore/ccsrc/pre_activate/ascend/ir_fusion/square_sum_fusion.cc @@ -18,6 +18,7 @@ #include #include #include +#include #include "session/anf_runtime_algorithm.h" #include "common/utils.h" @@ -50,6 +51,8 @@ CNodePtr GenerateSquareSumV1(const FuncGraphPtr &graph, const CNodePtr &square, square_sumv1->set_scope(sum->scope()); AnfAlgo::CopyNodeAttr(kAttrAxis, sum, square_sumv1); 
AnfAlgo::CopyNodeAttr(kAttrKeepDims, sum, square_sumv1); + auto names = MakeValue>({prim::kPrimSquare->name(), prim::kPrimReduceSum->name()}); + AnfAlgo::SetNodeAttr(kAttrDatadumpOriginalNames, names, square_sumv1); return square_sumv1; } @@ -71,6 +74,8 @@ CNodePtr GenerateSquareSumV2(const FuncGraphPtr &graph, const CNodePtr &square, square_sumv2->set_scope(sum->scope()); AnfAlgo::CopyNodeAttr(kAttrAxis, sum, square_sumv2); AnfAlgo::CopyNodeAttr(kAttrKeepDims, sum, square_sumv2); + auto names = MakeValue>({prim::kPrimSquare->name(), prim::kPrimReduceSum->name()}); + AnfAlgo::SetNodeAttr(kAttrDatadumpOriginalNames, names, square_sumv2); return square_sumv2; } diff --git a/mindspore/ccsrc/pre_activate/ascend/ir_fusion/transpose_transdata_fusion.cc b/mindspore/ccsrc/pre_activate/ascend/ir_fusion/transpose_transdata_fusion.cc index 1651718703..e45fc2637f 100644 --- a/mindspore/ccsrc/pre_activate/ascend/ir_fusion/transpose_transdata_fusion.cc +++ b/mindspore/ccsrc/pre_activate/ascend/ir_fusion/transpose_transdata_fusion.cc @@ -53,7 +53,7 @@ const AnfNodePtr TransposeTransDataFusion::Process(const FuncGraphPtr &func_grap new_transdata_builder->SetProcessor(transdata_kernel_build_info->processor()); auto new_fusion_transdata = std::make_shared(kTransDataOpName); - if (supported_checker_->CheckAiCoreSupported(transdata_cnode, new_transdata_builder->Build())) { + if (supported_checker_->CheckAICoreSupported(transdata_cnode, new_transdata_builder->Build())) { std::vector inputs = {NewValueNode(new_fusion_transdata), utils::cast((*equiv)[input_varptr_])}; auto new_node = func_graph->NewCNode(inputs); diff --git a/mindspore/ccsrc/pre_activate/common/common_backend_optimization.cc b/mindspore/ccsrc/pre_activate/common/common_backend_optimization.cc index 7ba42a60a0..b930ac69c9 100644 --- a/mindspore/ccsrc/pre_activate/common/common_backend_optimization.cc +++ b/mindspore/ccsrc/pre_activate/common/common_backend_optimization.cc @@ -28,6 +28,7 @@ namespace mindspore { namespace 
opt { void BackendCommonOptimization(const std::shared_ptr &kernel_graph) { + MS_EXCEPTION_IF_NULL(kernel_graph); MS_LOG(INFO) << "start common opt graph:" << kernel_graph->graph_id(); auto context_ptr = MsContext::GetInstance(); MS_EXCEPTION_IF_NULL(context_ptr); @@ -37,7 +38,8 @@ void BackendCommonOptimization(const std::shared_ptr &kern save_graphs_path = "."; } if (save_graphs) { - std::string file_path = save_graphs_path + "/" + "hwopt_common_before.ir"; + std::string file_path = + save_graphs_path + "/hwopt_common_before_graph_" + std::to_string(kernel_graph->graph_id()) + ".ir"; DumpIR(file_path, kernel_graph); } auto optimizer = std::make_shared(); @@ -51,7 +53,8 @@ void BackendCommonOptimization(const std::shared_ptr &kern (void)optimizer->Optimize(kernel_graph); kernel_graph->SetExecOrderByDefault(); if (save_graphs) { - std::string file_path = save_graphs_path + "/" + "hwopt_common_after.ir"; + std::string file_path = + save_graphs_path + "/hwopt_common_after_graph_" + std::to_string(kernel_graph->graph_id()) + ".ir"; DumpIR(file_path, kernel_graph); } } diff --git a/mindspore/ccsrc/pre_activate/common/helper.cc b/mindspore/ccsrc/pre_activate/common/helper.cc index 9be537775e..1c2ade201c 100644 --- a/mindspore/ccsrc/pre_activate/common/helper.cc +++ b/mindspore/ccsrc/pre_activate/common/helper.cc @@ -16,6 +16,7 @@ #include "pre_activate/common/helper.h" #include +#include #include #include #include @@ -45,6 +46,7 @@ bool IsDepend(const FuncGraphPtr &graph, const AnfNodePtr &node1, const AnfNodeP std::vector node_list = TopoSort(graph->get_return()); std::map> control_depend_map; for (auto &nd : node_list) { + MS_EXCEPTION_IF_NULL(nd); if (AnfAlgo::CheckPrimitiveType(nd, prim::kPrimControlDepend)) { auto control_depend = nd->cast(); auto prior_node = control_depend->input(kControlDependPriorIndex); @@ -100,9 +102,12 @@ bool UnVisited(const BaseRef &n) { auto prim_py = value->cast(); MS_EXCEPTION_IF_NULL(prim_py); return !prim_py->HasAttr(kAttrVisited); - 
} else { - return false; + } else if (IsValueNode(in)) { + auto func_graph = GetValueNode(in); + MS_EXCEPTION_IF_NULL(func_graph); + return !func_graph->has_flag(kAttrVisited); } + return false; } return false; } @@ -157,6 +162,7 @@ const AnfNodePtr EliminateDependTransop(const FuncGraphPtr &func_graph, const An MS_EXCEPTION_IF_NULL(func_graph); auto transop_cnode = CheckAnfNodeIfCNodeAndInputSize(node, kTransOpInputNum); + MS_EXCEPTION_IF_NULL(transop_cnode); auto depend_cnode = CheckAnfNodeIfCNodeAndInputSize(transop_cnode->input(kCastInputNum - 1), kDependInputNum); auto prev_transop_cnode = CheckAnfNodeIfCNodeAndInputSize(depend_cnode->input(1), kTransOpInputNum); MS_EXCEPTION_IF_NULL(depend_cnode->input(kDependInputNum - 1)); @@ -185,9 +191,12 @@ bool Visited(const BaseRef &n) { auto prim_py = value->cast(); MS_EXCEPTION_IF_NULL(prim_py); return prim_py->HasAttr(kAttrVisited); - } else { - return false; + } else if (IsValueNode(in)) { + auto func_graph = GetValueNode(in); + MS_EXCEPTION_IF_NULL(func_graph); + return func_graph->has_flag(kAttrVisited); } + return false; } return false; } @@ -381,7 +390,7 @@ tensor::TensorPtr CreateTupleTensor(const ValueTuplePtr &value_tuple) { bool IsNopNode(const AnfNodePtr &node) { auto context_ptr = MsContext::GetInstance(); MS_EXCEPTION_IF_NULL(context_ptr); - if (context_ptr->device_target() != kAscendDevice) { + if (context_ptr->device_target() != kAscendDevice && context_ptr->device_target() != kGPUDevice) { return false; } static std::unordered_set nop_nodes = {prim::kPrimReshape->name(), kExpandDimsOpName, @@ -473,15 +482,36 @@ void RemoveNopNode(session::KernelGraph *const graph) { } } -bool IsUsedByOthers(const FuncGraphPtr &graph, const AnfNodePtr &node) { +std::shared_ptr>> GetRealNodeUsedList(const FuncGraphPtr &graph, + const AnfNodePtr &node) { + auto output_node_list = std::make_shared>>(); MS_EXCEPTION_IF_NULL(graph); - MS_EXCEPTION_IF_NULL(node); auto manager = graph->manager(); 
MS_EXCEPTION_IF_NULL(manager); - if (manager->node_users().find(node) == manager->node_users().end()) { + auto iter = manager->node_users().find(node); + if (iter == manager->node_users().end()) { MS_LOG(EXCEPTION) << "node has no output in manager"; } - return manager->node_users()[node].size() > 1; + auto output_info_list = iter->second; + for (const auto &output_info : output_info_list) { + if (AnfAlgo::GetCNodeName(output_info.first) == prim::kPrimControlDepend->name()) { + continue; + } + if (AnfAlgo::GetCNodeName(output_info.first) == prim::kPrimDepend->name() && + output_info.second == kDependAttachNodeIndex) { + continue; + } + output_node_list->push_back(output_info); + } + return output_node_list; +} + +bool IsUsedByOthers(const FuncGraphPtr &graph, const AnfNodePtr &node) { + MS_EXCEPTION_IF_NULL(graph); + MS_EXCEPTION_IF_NULL(node); + auto output_node_list = GetRealNodeUsedList(graph, node); + MS_EXCEPTION_IF_NULL(output_node_list); + return output_node_list->size() > 1; } AnfNodePtr CreatTupleGetItemNode(const FuncGraphPtr &func_graph, const AnfNodePtr &node, size_t output_idx) { @@ -545,14 +575,22 @@ bool AnfEqual(const BaseRef &a, const BaseRef &b) { if (utils::isa(a) && utils::isa(b)) { auto a_node = utils::cast(a); auto b_node = utils::cast(b); + MS_EXCEPTION_IF_NULL(a_node); + MS_EXCEPTION_IF_NULL(b_node); if (IsValueNode(a_node) && IsValueNode(b_node)) { auto a_value_node = a_node->cast(); + MS_EXCEPTION_IF_NULL(a_value_node); auto a_value = a_value_node->value(); + MS_EXCEPTION_IF_NULL(a_value); auto a_prim = a_value->cast(); + MS_EXCEPTION_IF_NULL(a_prim); auto b_value_node = b_node->cast(); + MS_EXCEPTION_IF_NULL(b_value_node); auto b_value = b_value_node->value(); + MS_EXCEPTION_IF_NULL(b_value); auto b_prim = b_value->cast(); + MS_EXCEPTION_IF_NULL(b_prim); return a_prim->name() == b_prim->name(); } else if (a_node->isa() && b_node->isa()) { @@ -704,5 +742,44 @@ AnfNodePtr GetAnfNodeByVar(const EquivPtr &equiv, const VarPtr &var_node) { } 
return res; } + +bool CompareTupleGetitem(const AnfNodePtr &n1, const AnfNodePtr &n2) { + MS_EXCEPTION_IF_NULL(n1); + MS_EXCEPTION_IF_NULL(n2); + auto n1_cnode = n1->cast(); + auto n2_cnode = n2->cast(); + MS_EXCEPTION_IF_NULL(n1_cnode); + MS_EXCEPTION_IF_NULL(n2_cnode); + auto index_input1 = n1_cnode->input(kInputNodeOutputIndexInTupleGetItem); + MS_EXCEPTION_IF_NULL(index_input1); + auto value_node1 = index_input1->cast(); + MS_EXCEPTION_IF_NULL(value_node1); + auto index_input2 = n2_cnode->input(kInputNodeOutputIndexInTupleGetItem); + MS_EXCEPTION_IF_NULL(index_input2); + auto value_node2 = index_input2->cast(); + MS_EXCEPTION_IF_NULL(value_node2); + return GetValue(value_node1->value()) < GetValue(value_node2->value()); +} + +bool GetBoolAttr(const AnfNodePtr &node, const std::string &attr_name) { + MS_EXCEPTION_IF_NULL(node); + if (!node->isa()) { + MS_LOG(INFO) << "node is not a cnode"; + return false; + } + auto cnode = node->cast(); + MS_EXCEPTION_IF_NULL(cnode); + return AnfAlgo::HasNodeAttr(attr_name, cnode) && AnfAlgo::GetNodeAttr(node, attr_name); +} + +bool CheckSupportDataType(const AnfNodePtr &node, const std::set &supported_data_type_set) { + MS_EXCEPTION_IF_NULL(node); + TypeId data_type = AnfAlgo::GetOutputInferDataType(node, 0); + if (supported_data_type_set.find(data_type) != supported_data_type_set.end()) { + return true; + } + MS_LOG(DEBUG) << "Not supported data type. 
Node:" << node->DebugString(); + return false; +} } // namespace opt } // namespace mindspore diff --git a/mindspore/ccsrc/pre_activate/common/helper.h b/mindspore/ccsrc/pre_activate/common/helper.h index d315f6b5d9..49a1d47d0c 100644 --- a/mindspore/ccsrc/pre_activate/common/helper.h +++ b/mindspore/ccsrc/pre_activate/common/helper.h @@ -18,7 +18,9 @@ #include #include +#include #include +#include #include #include "ir/func_graph.h" #include "session/kernel_graph.h" @@ -65,6 +67,7 @@ constexpr size_t kBNGrad3OutputNum = 1; constexpr size_t kBNTrainingReduceOutputNum = 2; constexpr size_t kBNTrainingUpdateOutputNum = 5; constexpr size_t kBNTrainingUpdateV2OutputNum = 3; +constexpr size_t kBNTrainingUpdateV3OutputNum = 5; constexpr size_t kBNTrainingUpdateGradOutputNum = 2; constexpr size_t kSingleOutputNum = 1; @@ -94,6 +97,7 @@ constexpr size_t kBiasAddInputNum = 3; constexpr size_t kTopkInputNum = 3; constexpr size_t kLarsV2InputNum = 5; constexpr size_t kFusedMulApplyMomentumOutputNum = 2; +constexpr size_t kSplitInputNum = 2; enum FusedBatchNormInput { kX = 1, @@ -152,6 +156,8 @@ tensor::TensorPtr CreateTensorWithValueTuple(const ValueTuplePtr &value_tuple_pt tensor::TensorPtr CreateTupleTensor(const ValueTuplePtr &value_tuple); +bool IsAllNopNode(const session::KernelGraph *const graph); + bool IsNopNode(const AnfNodePtr &node); void HideNopNode(session::KernelGraph *const graph); @@ -162,6 +168,9 @@ AnfNodePtr CreatTupleGetItemNode(const FuncGraphPtr &func_graph, const AnfNodePt bool IsUsedByOthers(const FuncGraphPtr &graph, const AnfNodePtr &node); +std::shared_ptr>> GetRealNodeUsedList(const FuncGraphPtr &graph, + const AnfNodePtr &node); + void ConstInputToAttr(const CNodePtr &cnode, const std::unordered_set &input_attrs); bool AnfEqual(const BaseRef &a, const BaseRef &b); @@ -176,6 +185,15 @@ bool IsSameNode(const EquivPtr &equiv1, const EquivPtr &equiv2, const VarPtr &va // Get anf_node from equiv by var_node AnfNodePtr GetAnfNodeByVar(const EquivPtr 
&equiv, const VarPtr &var_node); + +// Compare tuple getitem's index, return bool[n1's index < n2's index] +bool CompareTupleGetitem(const AnfNodePtr &n1, const AnfNodePtr &n2); + +// Get attr which is bool from cnode +bool GetBoolAttr(const AnfNodePtr &node, const std::string &attr_name); + +// Check node's data type is in supported data type set +bool CheckSupportDataType(const AnfNodePtr &node, const std::set &supported_data_type_set); } // namespace opt } // namespace mindspore #endif // MINDSPORE_CCSRC_PRE_ACTIVATE_COMMON_HELPER_H_ diff --git a/mindspore/ccsrc/pre_activate/common/node_pass.cc b/mindspore/ccsrc/pre_activate/common/node_pass.cc index a6e93d2f07..876da8667b 100644 --- a/mindspore/ccsrc/pre_activate/common/node_pass.cc +++ b/mindspore/ccsrc/pre_activate/common/node_pass.cc @@ -22,6 +22,7 @@ #include "ir/anf.h" #include "ir/func_graph.h" #include "ir/manager.h" +#include "session/anf_runtime_algorithm.h" namespace mindspore { namespace opt { @@ -52,8 +53,13 @@ bool NodePass::Run(const FuncGraphPtr &func_graph) { if (new_node && IsValueNode(new_node)) { auto const_func_graph = GetValueNode(new_node); MS_EXCEPTION_IF_NULL(const_func_graph); - todo.push_back(const_func_graph->output()); + if (!const_func_graph->has_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL)) { + todo.push_back(const_func_graph->output()); + } } else if (new_node && new_node->isa()) { + if (AnfAlgo::IsGraphKernel(new_node)) { + todo.push_back(new_node); + } auto cnode = new_node->cast(); MS_EXCEPTION_IF_NULL(cnode); auto inputs = cnode->inputs(); diff --git a/mindspore/ccsrc/pre_activate/common/optimizer.cc b/mindspore/ccsrc/pre_activate/common/optimizer.cc index fa51a0bd8c..71a523ea1d 100644 --- a/mindspore/ccsrc/pre_activate/common/optimizer.cc +++ b/mindspore/ccsrc/pre_activate/common/optimizer.cc @@ -86,11 +86,8 @@ void GraphOptimizer::AddPassManager(const PassManagerPtr &pass_manager) { FuncGraphPtr GraphOptimizer::Optimize(const FuncGraphPtr &func_graph, bool run_only_once) { 
MS_EXCEPTION_IF_NULL(func_graph); run_only_once_ = (pass_managers_.size() == 1) ? true : run_only_once; - auto manager = func_graph->manager(); - if (manager == nullptr) { - manager = Manage(func_graph, false); - func_graph->set_manager(manager); - } + // Performance risk by creating new manager each time + auto manager = Manage(func_graph, true); bool changed = true; while (changed) { diff --git a/mindspore/ccsrc/pre_activate/mem_reuse/mem_copy_manager.h b/mindspore/ccsrc/pre_activate/mem_reuse/mem_copy_manager.h index 49d1884a48..ea9947b41b 100644 --- a/mindspore/ccsrc/pre_activate/mem_reuse/mem_copy_manager.h +++ b/mindspore/ccsrc/pre_activate/mem_reuse/mem_copy_manager.h @@ -83,9 +83,9 @@ class MemCopyManager { virtual DeviceAddressPtr UpdateSwapInQueue() { return nullptr; } - virtual bool AllocHostPinnedMem(size_t size, void **addr) { return true; } + virtual bool AllocHostPinnedMem(size_t size, void **addr) const { return true; } - virtual void FreeHostPinnedMem(void *addr) {} + virtual void FreeHostPinnedMem(void *addr) const {} virtual void ClearSwapQueue() {} }; diff --git a/mindspore/ccsrc/pre_activate/mem_reuse/mem_dynamic_allocator.cc b/mindspore/ccsrc/pre_activate/mem_reuse/mem_dynamic_allocator.cc index a2dfce2241..095f8f6495 100644 --- a/mindspore/ccsrc/pre_activate/mem_reuse/mem_dynamic_allocator.cc +++ b/mindspore/ccsrc/pre_activate/mem_reuse/mem_dynamic_allocator.cc @@ -150,7 +150,7 @@ size_t DynamicMemPoolBestFit::CalMemBlockAllocSize(size_t size) { alloc_mem_size = alloc_mem_size * 2; } alloc_mem_size = std::min(alloc_mem_size, device_free_mem_size); - return AlignMemorySize(alloc_mem_size); + return alloc_mem_size; } bool DynamicMemPoolBestFit::IsDivide(size_t tensor_size, size_t mem_buf_size) const { diff --git a/mindspore/ccsrc/pre_activate/mem_reuse/mem_reuse.cc b/mindspore/ccsrc/pre_activate/mem_reuse/mem_reuse.cc index aaa0c155e4..2927b1204f 100644 --- a/mindspore/ccsrc/pre_activate/mem_reuse/mem_reuse.cc +++ 
b/mindspore/ccsrc/pre_activate/mem_reuse/mem_reuse.cc @@ -18,6 +18,8 @@ #include #include #include "pre_activate/mem_reuse/mem_reuse_checker.h" +#include "pre_activate/common/helper.h" + namespace mindspore { namespace memreuse { bool MemReuseUtil::InitDynamicOutputKernelRef() { @@ -226,7 +228,11 @@ KernelRefCountPtr MemReuseUtil::GetKernelInputRef(const CNodePtr &kernel, size_t << AnfAlgo::GetInputTensorNum(kernel); } auto input_node = kernel->input(input_idx + 1); - auto kernel_input = AnfAlgo::VisitKernel(input_node, 0); + // Graph may be all nop nodes and not remove nop node, so this can not skip nop node. + auto kernel_input = AnfAlgo::VisitKernelWithReturnType(input_node, 0, false); + if (IsPrimitive(kernel_input.first, prim::kPrimMakeTuple)) { + MS_LOG(EXCEPTION) << "Input node [" << input_node->DebugString() << "]'s input " << input_idx << " is MakeTuple"; + } auto result = GetRef(kernel_input.first, SizeToInt(kernel_input.second)); return result; } @@ -252,6 +258,7 @@ void MemReuseUtil::SetKernelDefMap() { void MemReuseUtil::SetKernelDefInputs() { for (const auto &kernel : graph_->execution_order()) { + MS_EXCEPTION_IF_NULL(kernel); auto key = kernel.get(); // find kernel_def according to cnode addr auto iter = kernel_map_.find(key); @@ -264,7 +271,11 @@ void MemReuseUtil::SetKernelDefInputs() { if (ref_ptr != nullptr) { // set the inputs of this kernel_def auto input_node = AnfAlgo::GetInputNode(kernel, i); - auto input = AnfAlgo::VisitKernel(input_node, 0); + // Graph may be all nop nodes and not remove nop node, so this can not skip nop node. 
+ auto input = AnfAlgo::VisitKernelWithReturnType(input_node, 0, false); + if (IsPrimitive(input.first, prim::kPrimMakeTuple)) { + MS_LOG(EXCEPTION) << "Input node [" << input_node->DebugString() << "]'s input " << i << " is MakeTuple"; + } auto input_key = (input.first).get(); auto input_iter = kernel_map_.find(input_key); if (input_iter == kernel_map_.end()) { @@ -292,10 +303,47 @@ void MemReuseUtil::SetReuseRefCount() { } } +void MemReuseUtil::SetSummaryNodesRefCount() { + bool summary_exist = graph_->summary_node_exist(); + if (!summary_exist) { + return; + } + + auto summary_nodes = graph_->summary_nodes(); + if (summary_nodes.empty()) { + return; + } + + for (auto &node_item : summary_nodes) { + auto node = node_item.second.first; + size_t index = IntToSize(node_item.second.second); + MS_LOG(INFO) << "set summary node's ref count, node: " << node->fullname_with_scope() << " index: " << index; + if (kernel_output_refs_.find(node.get()) != kernel_output_refs_.end()) { + KernelRefCountPtr kernel_ref = kernel_output_refs_[node.get()][index]; + kernel_ref->ref_count_ = kMaxRefCount; + kernel_ref->ref_count_dynamic_use_ = kMaxRefCount; + } else { + MS_LOG(WARNING) << "can't find summary node's kernel_def " << node->fullname_with_scope(); + } + } +#ifdef MEM_REUSE_DEBUG + auto graph = *graph_; + MemReuseChecker::GetInstance().CheckMemReuseIR(total_refs_list_, kernel_def_ptr_list_, &graph); +#endif +} + void MemReuseUtil::SetGraphOutputRefCount() { + auto is_all_nop_node = opt::IsAllNopNode(graph_); auto nodes = AnfAlgo::GetAllOutput(graph_->output(), {prim::kPrimTupleGetItem}); for (const auto &node : nodes) { - auto kernel_input = AnfAlgo::VisitKernelWithReturnType(node, 0); + session::KernelWithIndex kernel_input; + if (is_all_nop_node) { + // The graph does not remove the nop node. + kernel_input = AnfAlgo::VisitKernelWithReturnType(node, 0, false); + } else { + // The graph removes the nop node. 
+ kernel_input = AnfAlgo::VisitKernelWithReturnType(node, 0, true); + } MS_EXCEPTION_IF_NULL(kernel_input.first); if (!kernel_input.first->isa() || !AnfAlgo::IsRealKernel(kernel_input.first)) { continue; @@ -319,6 +367,7 @@ void MemReuseUtil::SetGraphOutputRefCount() { void MemReuseUtil::ResetDynamicUsedRefCount() { for (auto iter = kernel_output_refs_.begin(); iter != kernel_output_refs_.end(); ++iter) { for (auto &ref_count : iter->second) { + MS_EXCEPTION_IF_NULL(ref_count); ref_count->ref_count_dynamic_use_ = ref_count->ref_count_; } } @@ -330,6 +379,7 @@ void MemReuseUtil::SetAllInfo(KernelGraph *graph) { } SetKernelDefMap(); SetReuseRefCount(); + SetSummaryNodesRefCount(); SetWorkSpaceList(); #ifdef MEM_REUSE_DEBUG MemReuseChecker::GetInstance().CheckMemReuseIR(total_refs_list_, kernel_def_ptr_list_, graph); diff --git a/mindspore/ccsrc/pre_activate/mem_reuse/mem_reuse.h b/mindspore/ccsrc/pre_activate/mem_reuse/mem_reuse.h index 08029f231a..c7a129f1e9 100644 --- a/mindspore/ccsrc/pre_activate/mem_reuse/mem_reuse.h +++ b/mindspore/ccsrc/pre_activate/mem_reuse/mem_reuse.h @@ -63,6 +63,7 @@ class MemReuseUtil { void SetWkMap(const CNodePtr &kernel, KernelDef *kernel_def_ptr); void SetKernelDefInputs(); void SetReuseRefCount(); + void SetSummaryNodesRefCount(); // Set the reference count of graph output specially. void SetGraphOutputRefCount(); // Reset the dynamic used reference count by ref_count_. 
diff --git a/mindspore/ccsrc/pre_activate/mem_reuse/mem_reuse_checker.cc b/mindspore/ccsrc/pre_activate/mem_reuse/mem_reuse_checker.cc index cf92679187..5cd6a5f50e 100644 --- a/mindspore/ccsrc/pre_activate/mem_reuse/mem_reuse_checker.cc +++ b/mindspore/ccsrc/pre_activate/mem_reuse/mem_reuse_checker.cc @@ -48,7 +48,8 @@ void MemReuseChecker::CheckOutRef(const KernelRefs &kernel_refs, const CNodePtr auto iter = kernel_refs.find(key); auto node_name = AnfAlgo::GetCNodeName(c_node); if (iter == kernel_refs.end()) { - MS_LOG(EXCEPTION) << "kernel [" << node_name << "] has no output tensor"; + MS_LOG(EXCEPTION) << "kernel [" << node_name << "] has no output tensor, node: " << c_node->DebugString() + << " output index: " << output_idx; } if (output_idx >= iter->second.size()) { MS_LOG(INFO) << "invalid cnode: " << c_node->fullname_with_scope().c_str(); diff --git a/mindspore/ccsrc/pre_activate/mem_reuse/mem_swap_manager.cc b/mindspore/ccsrc/pre_activate/mem_reuse/mem_swap_manager.cc index c41eacc334..d81364edfb 100644 --- a/mindspore/ccsrc/pre_activate/mem_reuse/mem_swap_manager.cc +++ b/mindspore/ccsrc/pre_activate/mem_reuse/mem_swap_manager.cc @@ -28,7 +28,7 @@ void MemSwapManager::Init(const mindspore::session::KernelGraph *kernel_graph) { size_t kernel_index = 0; for (const auto &kernel : execution_order_) { // parse topo order of kernel - kernel_execution_info_.emplace(kernel.get(), kernel_index++); + (void)kernel_execution_info_.emplace(kernel.get(), kernel_index++); // parse tensor info auto kernel_mod = AnfAlgo::GetKernelMod(kernel); MS_EXCEPTION_IF_NULL(kernel_mod); @@ -144,7 +144,7 @@ void MemSwapManager::AddSwapInfo() { } void MemSwapManager::AddMemSwapTask(SwapKind swap_kind, const DeviceAddressPtr &device_address, - const HostAddress &host_address) { + const HostAddress &host_address) const { if (swap_kind == SwapKind::kDeviceToHost) { mem_copy_manager_->AddMemSwapOutTask(device_address, host_address); } else if (swap_kind == SwapKind::kHostToDevice) { @@ 
-152,9 +152,11 @@ void MemSwapManager::AddMemSwapTask(SwapKind swap_kind, const DeviceAddressPtr & } } -bool MemSwapManager::SyncMemCopyStream(SwapKind swap_kind) { return mem_copy_manager_->SyncMemCopyStream(swap_kind); } +bool MemSwapManager::SyncMemCopyStream(SwapKind swap_kind) const { + return mem_copy_manager_->SyncMemCopyStream(swap_kind); +} -DeviceAddressPtr MemSwapManager::UpdateSwapQueue(SwapKind swap_kind) { +DeviceAddressPtr MemSwapManager::UpdateSwapQueue(SwapKind swap_kind) const { if (swap_kind == SwapKind::kDeviceToHost) { return mem_copy_manager_->UpdateSwapOutQueue(); } else { @@ -298,7 +300,7 @@ void MemSwapManager::ReleaseHostPinnedMem() { host_addrs_list_.clear(); } -void MemSwapManager::ClearSwapQueue() { mem_copy_manager_->ClearSwapQueue(); } +void MemSwapManager::ClearSwapQueue() const { mem_copy_manager_->ClearSwapQueue(); } void MemSwapManager::ResetSwapInfo() { ClearSwapQueue(); diff --git a/mindspore/ccsrc/pre_activate/mem_reuse/mem_swap_manager.h b/mindspore/ccsrc/pre_activate/mem_reuse/mem_swap_manager.h index c19930000e..7e2823d27c 100644 --- a/mindspore/ccsrc/pre_activate/mem_reuse/mem_swap_manager.h +++ b/mindspore/ccsrc/pre_activate/mem_reuse/mem_swap_manager.h @@ -44,11 +44,12 @@ class MemSwapManager { void Init(const mindspore::session::KernelGraph *kernel_graph); - void AddMemSwapTask(SwapKind swap_kind, const DeviceAddressPtr &device_address, const HostAddress &host_address); + void AddMemSwapTask(SwapKind swap_kind, const DeviceAddressPtr &device_address, + const HostAddress &host_address) const; - bool SyncMemCopyStream(SwapKind swap_kind); + bool SyncMemCopyStream(SwapKind swap_kind) const; - DeviceAddressPtr UpdateSwapQueue(SwapKind swap_kind); + DeviceAddressPtr UpdateSwapQueue(SwapKind swap_kind) const; // retreat to find a workable swap scheme bool RetreatSwapInfo(); @@ -83,7 +84,7 @@ class MemSwapManager { void ReleaseHostPinnedMem(); - void ClearSwapQueue(); + void ClearSwapQueue() const; private: void AddSwapInfo(); 
diff --git a/mindspore/ccsrc/pre_activate/pass/add_atomic_clean.cc b/mindspore/ccsrc/pre_activate/pass/add_atomic_clean.cc new file mode 100644 index 0000000000..9df34a1c59 --- /dev/null +++ b/mindspore/ccsrc/pre_activate/pass/add_atomic_clean.cc @@ -0,0 +1,122 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "pre_activate/pass/add_atomic_clean.h" +#include +#include +#include +#include "operator/ops.h" +#include "utils/utils.h" +#include "utils/graph_utils.h" +#include "utils/log_adapter.h" +#include "session/anf_runtime_algorithm.h" +#include "session/kernel_graph.h" +#include "debug/anf_ir_dump.h" + +namespace mindspore { +namespace opt { +namespace { + +static std::vector g_output_idx; + +bool HasAtomic(const AnfNodePtr &input) { + if (IsPrimitiveCNode(input)) { + const auto &cnode = input->cast(); + const auto &prim = GetValueNode(cnode->input(0)); + return prim->HasAttr("atomic_add"); + } + return false; +} + +std::vector CalCleanSize(const CNodePtr &pre_node) { + MS_EXCEPTION_IF_NULL(pre_node); + std::vector clean_size_list; + // clean output + for (auto &index : g_output_idx) { + TypeId output_type_id = AnfAlgo::GetOutputDeviceDataType(pre_node, index); + size_t type_size = GetTypeByte(TypeIdToType(output_type_id)); + std::vector shape = AnfAlgo::GetOutputDeviceShape(pre_node, index); + auto size = std::accumulate(shape.begin(), shape.end(), type_size, std::multiplies()); + 
clean_size_list.push_back((size + kMemAlignSize + 31) / kMemAlignSize * kMemAlignSize); + } + MS_LOG(DEBUG) << "Clear output size: " << clean_size_list.size() << ", pre_node: " << pre_node->fullname_with_scope(); + return clean_size_list; +} + +CNodePtr CreateTbeAtomicCleanNode(const std::shared_ptr &kernel_graph, + const mindspore::CNodePtr &pre_node) { + MS_EXCEPTION_IF_NULL(kernel_graph); + MS_EXCEPTION_IF_NULL(pre_node); + auto clean_zero_prim = std::make_shared(kAtomicAddrCleanOpName); + auto new_value_node = NewValueNode(clean_zero_prim); + std::vector inputs = {new_value_node}; + CNodePtr clean_zero = kernel_graph->NewCNode(inputs); + AbstractBasePtr abstract = std::make_shared(); + clean_zero->set_abstract(abstract); + auto builder = std::make_shared(); + builder->SetKernelType(KernelType::TBE_KERNEL); + AnfAlgo::SetSelectKernelBuildInfo(builder->Build(), clean_zero.get()); + auto clean_size = CalCleanSize(pre_node); + AnfAlgo::SetNodeAttr(kAttrAtomicAddMemSize, MakeValue(clean_size), clean_zero); + AnfAlgo::SetNodeAttr(kAttrAtomicOutputIndexs, MakeValue(g_output_idx), clean_zero); + AnfAlgo::SetStreamDistinctionLabel(AnfAlgo::GetStreamDistinctionLabel(pre_node.get()), clean_zero.get()); + return clean_zero; +} +} // namespace + +void AddAtomicClean(const std::shared_ptr &kernel_graph) { + MS_EXCEPTION_IF_NULL(kernel_graph); + auto mng = kernel_graph->manager(); + if (mng == nullptr) { + mng = Manage(kernel_graph, true); + kernel_graph->set_manager(mng); + } + auto &todos = kernel_graph->execution_order(); + for (auto iter = todos.cbegin(); iter != todos.end(); ++iter) { + auto node = *iter; + if (AnfAlgo::IsGraphKernel(node) && kernel_graph->nodes().contains(node)) { + auto fg = GetValueNode(node->input(kAnfPrimitiveIndex)); + MS_EXCEPTION_IF_NULL(fg); + auto input = fg->get_return()->input(1); + if (IsPrimitiveCNode(input, prim::kPrimMakeTuple)) { + const auto &cnode = input->cast(); + for (size_t i = 0; i < cnode->inputs().size(); ++i) { + if 
(HasAtomic(cnode->input(i))) { + g_output_idx.push_back(i - 1); + } + } + } else if (HasAtomic(input)) { + g_output_idx.push_back(0); + } + + if (!g_output_idx.empty()) { + auto zero_node = CreateTbeAtomicCleanNode(kernel_graph, node); + auto depend = kernel_graph->NewCNode({NewValueNode(prim::kPrimDepend), node->input(1), zero_node}); + std::vector new_input = node->inputs(); + new_input[1] = depend; + auto new_cnode = std::make_shared(new_input, kernel_graph); + // Set abstract + new_cnode->set_abstract(node->abstract()); + // Set kernel info + new_cnode->set_kernel_info(node->kernel_info_ptr()); + mng->Replace(node, new_cnode); + g_output_idx.clear(); + } + } + } +} +} // namespace opt +} // namespace mindspore diff --git a/mindspore/ccsrc/pre_activate/pass/add_atomic_clean.h b/mindspore/ccsrc/pre_activate/pass/add_atomic_clean.h new file mode 100644 index 0000000000..bb1edb0e35 --- /dev/null +++ b/mindspore/ccsrc/pre_activate/pass/add_atomic_clean.h @@ -0,0 +1,29 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_PASS_ADD_ATOMIC_CLEAN_H_ +#define MINDSPORE_CCSRC_PRE_ACTIVATE_PASS_ADD_ATOMIC_CLEAN_H_ + +#include +#include "session/kernel_graph.h" + +namespace mindspore { +namespace opt { +void AddAtomicClean(const std::shared_ptr &kernel_graph); +} // namespace opt +} // namespace mindspore + +#endif // MINDSPORE_CCSRC_PRE_ACTIVATE_PASS_ADD_ATOMIC_CLEAN_H diff --git a/mindspore/ccsrc/pre_activate/pass/common_subexpression_elimination.cc b/mindspore/ccsrc/pre_activate/pass/common_subexpression_elimination.cc index f8604d7638..9af50eac33 100644 --- a/mindspore/ccsrc/pre_activate/pass/common_subexpression_elimination.cc +++ b/mindspore/ccsrc/pre_activate/pass/common_subexpression_elimination.cc @@ -45,6 +45,8 @@ bool BackendCSE::CheckReplace(const AnfNodePtr &main, const AnfNodePtr &node) co auto node_value = GetValueNode(node); if (main_value->isa() && node_value->isa()) { replace = false; + } else if (main_value->isa() && node_value->isa()) { + replace = (AbsOf(main) == AbsOf(node)) && CheckEqualKernelBuildInfo(main, node); } else { replace = (AbsOf(main) == AbsOf(node)) && (*main_value == *node_value); } diff --git a/mindspore/ccsrc/pre_activate/pass/communication_op_fusion.cc b/mindspore/ccsrc/pre_activate/pass/communication_op_fusion.cc index fc878dd881..aa4690abcb 100644 --- a/mindspore/ccsrc/pre_activate/pass/communication_op_fusion.cc +++ b/mindspore/ccsrc/pre_activate/pass/communication_op_fusion.cc @@ -253,6 +253,13 @@ bool CommunicationOpFusion::Run(const FuncGraphPtr &func_graph) { if (it.second.communication_op_nodes.size() <= 1) { continue; } + auto first_node = it.second.communication_op_nodes[0]; + if (AnfAlgo::HasNodeAttr(kAttrIndex, first_node) && AnfAlgo::GetNodeAttr(first_node, kAttrIndex) > 0) { + std::stable_sort(it.second.communication_op_nodes.begin(), it.second.communication_op_nodes.end(), + [](const CNodePtr &a, const CNodePtr &b) { + return AnfAlgo::GetNodeAttr(a, kAttrIndex) < AnfAlgo::GetNodeAttr(b, 
kAttrIndex); + }); + } size_t segment_num = 0; std::vector segment_index; if (GetSplitSegments(it.second, &segment_num, &segment_index, it.first)) { diff --git a/mindspore/ccsrc/pre_activate/pass/const_input_to_attr_registry.cc b/mindspore/ccsrc/pre_activate/pass/const_input_to_attr_registry.cc index cc8a1341be..6a557388ad 100644 --- a/mindspore/ccsrc/pre_activate/pass/const_input_to_attr_registry.cc +++ b/mindspore/ccsrc/pre_activate/pass/const_input_to_attr_registry.cc @@ -36,6 +36,9 @@ ConstInputToAttrInfoRegistry::ConstInputToAttrInfoRegistry() { Register(prim::kPrimReduceSum->name(), {1}); Register(prim::kPrimReduceMean->name(), {1}); Register(prim::kPrimGatherV2->name(), {2}); + Register(prim::kPrimEmbeddingLookup->name(), {2, 3, 4, 5}); + Register(prim::kPrimEmbeddingLookupCommGrad->name(), {1}); + Register(prim::kPrimSubscalar->name(), {1}); Register(prim::kPrimTranspose->name(), {1}); Register(prim::kPrimUnsortedSegmentSum->name(), {2}); Register(prim::kPrimOneHot->name(), {1}); @@ -44,6 +47,7 @@ ConstInputToAttrInfoRegistry::ConstInputToAttrInfoRegistry() { Register(prim::kPrimCumProd->name(), {1}); Register(prim::kPrimReduceAll->name(), {1}); Register(prim::kPrimUnsortedSegmentMin->name(), {2}); + Register(kSparseGatherV2, {2}); Register(kUnsortedSegmentProdOpName, {2}); Register(kSimpleMeanGradOpName, {1}); Register(kMeanGradOpName, {1}); diff --git a/mindspore/ccsrc/pre_activate/pass/convert_const_input_to_attr.cc b/mindspore/ccsrc/pre_activate/pass/convert_const_input_to_attr.cc index 1f9e2712a6..38d629c415 100644 --- a/mindspore/ccsrc/pre_activate/pass/convert_const_input_to_attr.cc +++ b/mindspore/ccsrc/pre_activate/pass/convert_const_input_to_attr.cc @@ -26,6 +26,7 @@ #include "utils/context/ms_context.h" #include "operator/ops.h" #include "session/anf_runtime_algorithm.h" +#include "kernel/common_utils.h" namespace mindspore { namespace opt { @@ -34,14 +35,24 @@ const AnfNodePtr ConvertConstInputToAttr::Process(const FuncGraphPtr &, const An if 
(node == nullptr || !AnfAlgo::IsRealCNodeKernel(node)) { return nullptr; } - CNodePtr cnode = node->cast(); + std::vector todos; + if (AnfAlgo::IsGraphKernel(node)) { + auto sub_graph = AnfAlgo::GetCNodeFuncGraphPtr(node); + MS_EXCEPTION_IF_NULL(sub_graph); + kernel::GetValidKernelNodes(sub_graph, &todos); + } else { + todos.push_back(node); + } - ConstInputToAttrInfoRegister reg; - if (!ConstInputToAttrInfoRegistry::Instance().GetRegisterByOpName(AnfAlgo::GetCNodeName(cnode), ®)) { - return nullptr; + for (auto &t : todos) { + CNodePtr cnode = t->cast(); + ConstInputToAttrInfoRegister reg; + if (!ConstInputToAttrInfoRegistry::Instance().GetRegisterByOpName(AnfAlgo::GetCNodeName(cnode), ®)) { + continue; + } + ConstInputToAttr(cnode, reg.GetConstInputAttrInfo()); } - ConstInputToAttr(cnode, reg.GetConstInputAttrInfo()); - return cnode; + return node; } } // namespace opt } // namespace mindspore diff --git a/mindspore/ccsrc/pre_activate/pass/convert_const_input_to_tensor_input.cc b/mindspore/ccsrc/pre_activate/pass/convert_const_input_to_tensor_input.cc index 56be2e273d..b4f98cc6d7 100644 --- a/mindspore/ccsrc/pre_activate/pass/convert_const_input_to_tensor_input.cc +++ b/mindspore/ccsrc/pre_activate/pass/convert_const_input_to_tensor_input.cc @@ -17,15 +17,39 @@ #include #include +#include #include "utils/graph_utils.h" #include "pre_activate/common/helper.h" #include "session/anf_runtime_algorithm.h" #include "session/kernel_graph.h" +#include "kernel/common_utils.h" +#include "device/kernel_info.h" namespace mindspore { namespace opt { namespace { +ValueNodePtr MakeValueNode(const ValueNodePtr &value_node) { + MS_EXCEPTION_IF_NULL(value_node); + ValueNodePtr new_value_node = std::make_shared(value_node->value()); + new_value_node->set_abstract(value_node->abstract()); + // create kernel_info fo new value node + auto kernel_info = std::make_shared(); + new_value_node->set_kernel_info(kernel_info); + // create kernel_build_info for new value node + auto 
kernel_build_info_builder = std::make_shared(); + // set the format of value_node to DEFAULT_FORMAT + kernel_build_info_builder->SetOutputsFormat(std::vector{kOpFormat_DEFAULT}); + // set value node initial device data type = infer data type + std::vector types; + for (size_t index = 0; index < AnfAlgo::GetOutputTensorNum(value_node); ++index) { + types.push_back(kTypeUnknown); + } + kernel_build_info_builder->SetOutputsDeviceType(types); + AnfAlgo::SetSelectKernelBuildInfo(kernel_build_info_builder->Build(), new_value_node.get()); + return new_value_node; +} + AnfNodePtr CreateTensorInput(const KernelGraphPtr &kernel_graph, const AnfNodePtr &input_node) { MS_EXCEPTION_IF_NULL(input_node); auto value_node = input_node->cast(); @@ -50,6 +74,8 @@ AnfNodePtr CreateTensorInput(const KernelGraphPtr &kernel_graph, const AnfNodePt if (kernel_graph != nullptr) { tensor_input = kernel_graph->NewValueNode(tensor_input); kernel_graph->AddValueNodeToGraph(tensor_input); + } else { + tensor_input = MakeValueNode(tensor_input); } tensor_input->set_scope(input_node->scope()); return tensor_input; @@ -89,6 +115,26 @@ AnfNodePtr ConstInputToTensorInput(const FuncGraphPtr &func_graph, const CNodePt } return nullptr; } + +AnfNodePtr ProcessGraphKernelOp(const AnfNodePtr &node) { + auto sub_graph = AnfAlgo::GetCNodeFuncGraphPtr(node); + MS_EXCEPTION_IF_NULL(sub_graph); + auto mng = sub_graph->manager(); + MS_EXCEPTION_IF_NULL(mng); + std::vector todo; + std::vector> graph_rets; + kernel::GetValidKernelNodes(sub_graph, &todo); + kernel::GetGraphRealOutput(sub_graph, &graph_rets); + + for (auto &t : todo) { + auto t_new_node = ConstInputToTensorInput(sub_graph, t->cast()); + if (t_new_node != nullptr && t_new_node != t) { + (void)mng->Replace(t, t_new_node); + } + } + + return node; +} } // namespace const AnfNodePtr ConvertConstInputToTensorInput::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node, @@ -96,8 +142,11 @@ const AnfNodePtr 
ConvertConstInputToTensorInput::Process(const FuncGraphPtr &fun if (node == nullptr || func_graph == nullptr || !AnfAlgo::IsRealCNodeKernel(node)) { return nullptr; } - CNodePtr cnode = node->cast(); - return ConstInputToTensorInput(func_graph, cnode); + if (AnfAlgo::IsGraphKernel(node)) { + return ProcessGraphKernelOp(node); + } else { + return ConstInputToTensorInput(func_graph, node->cast()); + } } } // namespace opt } // namespace mindspore diff --git a/mindspore/ccsrc/pre_activate/pass/convert_tuple_input_to_dynamic_input.cc b/mindspore/ccsrc/pre_activate/pass/convert_tuple_input_to_dynamic_input.cc index ccc4fd5265..a03087c1a4 100644 --- a/mindspore/ccsrc/pre_activate/pass/convert_tuple_input_to_dynamic_input.cc +++ b/mindspore/ccsrc/pre_activate/pass/convert_tuple_input_to_dynamic_input.cc @@ -21,10 +21,37 @@ #include "session/anf_runtime_algorithm.h" #include "pre_activate/common/helper.h" #include "session/kernel_graph.h" +#include "kernel/common_utils.h" +#include "device/kernel_info.h" namespace mindspore { namespace opt { namespace { +bool MakeValueNode(const AnfNodePtr &node) { + auto value_node = node->cast(); + if (value_node == nullptr) { + return false; + } + + // create kernel_info fo new value node + auto kernel_info = std::make_shared(); + value_node->set_kernel_info(kernel_info); + // create kernel_build_info for new value node + auto kernel_build_info_builder = std::make_shared(); + // set the format of value_node to DEFAULT_FORMAT + kernel_build_info_builder->SetOutputsFormat(std::vector{kOpFormat_DEFAULT}); + // set value node initial device data type = infer data type + TypeId infer_data_type; + if (AnfAlgo::GetOutputTensorNum(value_node) == 0) { + infer_data_type = kTypeUnknown; + } else { + infer_data_type = AnfAlgo::GetOutputInferDataType(value_node, 0); + } + kernel_build_info_builder->SetOutputsDeviceType(std::vector{infer_data_type}); + AnfAlgo::SetSelectKernelBuildInfo(kernel_build_info_builder->Build(), value_node.get()); + return 
true; +} + void ConvertTupleOuputToPlantInputs(const FuncGraphPtr &graph, const AnfNodePtr &input_node, std::vector *plant_inputs, std::vector *dyn_input_sizes) { MS_EXCEPTION_IF_NULL(plant_inputs); @@ -50,12 +77,12 @@ void ConvertTupleOuputToPlantInputs(const FuncGraphPtr &graph, const AnfNodePtr (void)std::copy(convert_inputs.begin(), convert_inputs.end(), std::back_inserter(*plant_inputs)); } -CNodePtr ConvertMakeTupleInputToPlantInputs(const FuncGraphPtr &graph, const CNodePtr &cnode_ptr) { +void ConvertMakeTupleInputToPlantInputs(const FuncGraphPtr &graph, const CNodePtr &cnode_ptr) { MS_EXCEPTION_IF_NULL(cnode_ptr); MS_EXCEPTION_IF_NULL(graph); auto &ori_args = cnode_ptr->inputs(); if (ori_args.size() < 1) { - return nullptr; + return; } std::vector plant_inputs; std::vector dyn_input_sizes; @@ -68,8 +95,17 @@ CNodePtr ConvertMakeTupleInputToPlantInputs(const FuncGraphPtr &graph, const CNo auto cnode = input_node->cast(); MS_EXCEPTION_IF_NULL(cnode); auto inputs = cnode->inputs(); - (void)std::copy(inputs.begin() + 1, inputs.end(), std::back_inserter(plant_inputs)); - } else if (AnfAlgo::IsTupleOutput(input_node)) { + for (size_t j = 1; j < inputs.size(); ++j) { + MS_EXCEPTION_IF_NULL(inputs[j]); + if (IsValueNode(inputs[j])) { + auto success = MakeValueNode(inputs[j]); + if (!success) { + MS_LOG(WARNING) << "Make value node failed, " << inputs[j]->DebugString(); + } + } + plant_inputs.push_back(inputs[j]); + } + } else if (input_node->Type() != nullptr && AnfAlgo::IsTupleOutput(input_node)) { ConvertTupleOuputToPlantInputs(graph, input_node, &plant_inputs, &dyn_input_sizes); } else { dyn_input_sizes.push_back(-1); @@ -81,7 +117,6 @@ CNodePtr ConvertMakeTupleInputToPlantInputs(const FuncGraphPtr &graph, const CNo AnfAlgo::SetNodeAttr(kAttrDynInputSizes, MakeValue(dyn_input_sizes), cnode_ptr); cnode_ptr->set_inputs(plant_inputs); } - return cnode_ptr; } } // namespace @@ -96,7 +131,18 @@ const AnfNodePtr ConvertTupleInputToDynamicInput::Process(const 
FuncGraphPtr &fu if (node == nullptr || !node->isa() || !AnfAlgo::IsRealKernel(node)) { return nullptr; } - return ConvertMakeTupleInputToPlantInputs(func_graph, node->cast()); + if (AnfAlgo::IsGraphKernel(node)) { + auto sub_graph = AnfAlgo::GetCNodeFuncGraphPtr(node); + MS_EXCEPTION_IF_NULL(sub_graph); + std::vector todos; + kernel::GetValidKernelNodes(sub_graph, &todos); + for (auto &t : todos) { + ConvertMakeTupleInputToPlantInputs(sub_graph, t->cast()); + } + } else { + ConvertMakeTupleInputToPlantInputs(func_graph, node->cast()); + } + return node; } } // namespace opt } // namespace mindspore diff --git a/mindspore/ccsrc/pre_activate/pass/convert_tuple_output_to_maketuple.cc b/mindspore/ccsrc/pre_activate/pass/convert_tuple_output_to_maketuple.cc index 66b3dc1d88..a5e51411bc 100644 --- a/mindspore/ccsrc/pre_activate/pass/convert_tuple_output_to_maketuple.cc +++ b/mindspore/ccsrc/pre_activate/pass/convert_tuple_output_to_maketuple.cc @@ -47,8 +47,7 @@ CNodePtr ConvertTupleInputToMakeTuple(const FuncGraphPtr &graph, const CNodePtr convert_inputs.push_back(input_node); } } - cnode_ptr->set_inputs(convert_inputs); - return cnode_ptr; + return graph->NewCNode(convert_inputs); } } // namespace @@ -68,8 +67,9 @@ const AnfNodePtr ConvertTupleOutputToMaketuple::Process(const FuncGraphPtr &func if (IsPrimitiveCNode(cnode, prim::kPrimTupleGetItem) || IsPrimitiveCNode(cnode, prim::kPrimControlDepend)) { return nullptr; } - if (std::any_of(cnode->inputs().begin() + 1, cnode->inputs().end(), - [](const AnfNodePtr &node) { return AnfAlgo::IsRealKernel(node) && AnfAlgo::IsTupleOutput(node); })) { + if (std::any_of(cnode->inputs().begin() + 1, cnode->inputs().end(), [](const AnfNodePtr &node) { + return node->Type() != nullptr && AnfAlgo::IsRealKernel(node) && AnfAlgo::IsTupleOutput(node); + })) { return ConvertTupleInputToMakeTuple(func_graph, cnode); } return nullptr; diff --git a/mindspore/ccsrc/pre_activate/pass/eliminate_redundant_op.cc 
b/mindspore/ccsrc/pre_activate/pass/eliminate_redundant_op.cc index 2fc971881d..4d3dcfccc0 100644 --- a/mindspore/ccsrc/pre_activate/pass/eliminate_redundant_op.cc +++ b/mindspore/ccsrc/pre_activate/pass/eliminate_redundant_op.cc @@ -18,10 +18,12 @@ #include #include #include +#include #include "session/anf_runtime_algorithm.h" #include "utils/utils.h" #include "pre_activate/common/helper.h" #include "operator/ops.h" +#include "kernel/common_utils.h" namespace mindspore { namespace opt { @@ -125,13 +127,7 @@ void EliminateRedundantOp::Init() { kTransDataOpName, std::pair(kTransDataOpName, TransDataOpEliminateCondition))); } -const AnfNodePtr EliminateRedundantOp::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node, - const EquivPtr &) const { - MS_EXCEPTION_IF_NULL(node); - auto cnode = node->cast(); - if (cnode == nullptr || func_graph == nullptr) { - return nullptr; - } +const AnfNodePtr EliminateRedundantOp::DoEliminate(const FuncGraphPtr &func_graph, const CNodePtr &cnode) const { // match the first name auto name1 = AnfAlgo::GetCNodeName(cnode); auto it = redundant_process_map_.find(name1); @@ -160,5 +156,35 @@ const AnfNodePtr EliminateRedundantOp::Process(const FuncGraphPtr &func_graph, c return ProcessMatchedNodes(func_graph, cnode, prev_cnode, &pass_vector); } + +const AnfNodePtr EliminateRedundantOp::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node, + const EquivPtr &) const { + MS_EXCEPTION_IF_NULL(node); + auto cnode = node->cast(); + if (cnode == nullptr || func_graph == nullptr) { + return nullptr; + } + + if (AnfAlgo::IsGraphKernel(node)) { + // do eliminate for ops in graph kernel. 
+ auto sub_graph = AnfAlgo::GetCNodeFuncGraphPtr(node); + MS_EXCEPTION_IF_NULL(sub_graph); + auto mng = sub_graph->manager(); + MS_EXCEPTION_IF_NULL(mng); + std::vector todo; + kernel::GetValidKernelNodes(sub_graph, &todo); + for (auto &t : todo) { + CNodePtr t_cnode = t->cast(); + MS_EXCEPTION_IF_NULL(t_cnode); + auto t_new_node = DoEliminate(sub_graph, t_cnode); + if (t_new_node != nullptr && t_new_node != t) { + (void)mng->Replace(t, t_new_node); + } + } + return node; + } + // do eliminate for single op. + return DoEliminate(func_graph, cnode); +} } // namespace opt } // namespace mindspore diff --git a/mindspore/ccsrc/pre_activate/pass/eliminate_redundant_op.h b/mindspore/ccsrc/pre_activate/pass/eliminate_redundant_op.h index 9e0dacecb1..c44190f645 100644 --- a/mindspore/ccsrc/pre_activate/pass/eliminate_redundant_op.h +++ b/mindspore/ccsrc/pre_activate/pass/eliminate_redundant_op.h @@ -40,6 +40,7 @@ class EliminateRedundantOp : public PatternProcessPass { private: void Init(); + const AnfNodePtr DoEliminate(const FuncGraphPtr &func_graph, const CNodePtr &cnode) const; std::unordered_map redundant_process_map_; }; } // namespace opt diff --git a/mindspore/ccsrc/pre_activate/pass/erase_visit_attr.cc b/mindspore/ccsrc/pre_activate/pass/erase_visit_attr.cc index 4ea817df85..3b566b4f7c 100644 --- a/mindspore/ccsrc/pre_activate/pass/erase_visit_attr.cc +++ b/mindspore/ccsrc/pre_activate/pass/erase_visit_attr.cc @@ -16,6 +16,8 @@ #include "pre_activate/pass/erase_visit_attr.h" #include +#include +#include "kernel/common_utils.h" #include "session/anf_runtime_algorithm.h" #include "pre_activate/common/helper.h" @@ -28,7 +30,20 @@ const BaseRef EraseVisitAttr::DefinePattern() const { } const AnfNodePtr EraseVisitAttr::Process(const FuncGraphPtr &, const AnfNodePtr &node, const EquivPtr &) const { - AnfAlgo::EraseNodeAttr(kAttrVisited, node); + if (node != nullptr && AnfAlgo::IsRealCNodeKernel(node)) { + if (AnfAlgo::IsGraphKernel(node)) { + auto fg = 
AnfAlgo::GetCNodeFuncGraphPtr(node); + MS_EXCEPTION_IF_NULL(fg); + std::vector todos; + kernel::GetValidKernelNodes(fg, &todos); + for (auto &t : todos) { + AnfAlgo::EraseNodeAttr(kAttrVisited, t); + } + } + AnfAlgo::EraseNodeAttr(kAttrVisited, node); + } else { + AnfAlgo::EraseNodeAttr(kAttrVisited, node); + } return nullptr; } } // namespace opt diff --git a/mindspore/ccsrc/pre_activate/pass/fuse_basic.cc b/mindspore/ccsrc/pre_activate/pass/fuse_basic.cc new file mode 100644 index 0000000000..84edd5c5e2 --- /dev/null +++ b/mindspore/ccsrc/pre_activate/pass/fuse_basic.cc @@ -0,0 +1,222 @@ + +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "pre_activate/pass/fuse_basic.h" +#include "pre_activate/pass/fuse_graph_kernel.h" + +#include +#include +#include +#include +#include +#include + +#include "operator/ops.h" +#include "utils/utils.h" +#include "utils/graph_utils.h" +#include "pre_activate/common/helper.h" +#include "session/anf_runtime_algorithm.h" +#include "vm/segment_runner.h" +#include "debug/draw.h" +#include "debug/anf_ir_dump.h" +#include "ir/func_graph_cloner.h" + +namespace mindspore { +namespace opt { +namespace { +std::vector get_fusable_basic_ops(bool is_before_kernel_select) { + std::vector fusable_basic_ops = {prim::kPrimTensorAdd, prim::kPrimMul, prim::kPrimSub, + prim::kPrimExpandDims}; + if (!is_before_kernel_select) { + fusable_basic_ops.push_back(prim::kPrimCast); + } + return fusable_basic_ops; +} + +IncludeType IncludeFusedBasicOpForward(const AnfNodePtr &cur_node, const GraphKernelInfo &info, + const AnfNodePtr &node) { + if (cur_node == node) { + return FOLLOW; + } + if (!IsPrimitiveCNode(node)) { + return EXCLUDE; + } + + auto fusable_basic_ops = get_fusable_basic_ops(info.is_before_kernel_select); + bool is_fusable = std::any_of(fusable_basic_ops.begin(), fusable_basic_ops.end(), + [&node](const PrimitivePtr &prim) { return IsPrimitiveCNode(node, prim); }); + + return is_fusable ? FOLLOW : EXCLUDE; +} + +std::vector FindFuseCNodes(const CNodePtr &cnode, bool is_before_kernel_select) { + GraphKernelInfo info; + info.is_before_kernel_select = is_before_kernel_select; + // Search fusable nodes according input direction. 
+ auto include_func_forward = std::bind(IncludeFusedBasicOpForward, cnode, info, std::placeholders::_1); + auto used_nodes = DeepLinkedGraphSearch(cnode, include_func_forward); + if (used_nodes.size() > 1) { + used_nodes = RemoveCircle(used_nodes, false); + } + TopoSortForNodeList(&used_nodes); + return used_nodes; +} + +void RemoveControlDependOut(const FuncGraphPtr &fg, AnfNodePtrList *outputs, const FuncGraphManagerPtr &mng) { + AnfNodeSet outputs_set; + for (auto out : *outputs) { + outputs_set.insert(out); + } + + AnfNodePtrList vir_outputs; + std::unordered_map eqv; + auto fg_outputs = fg->output(); + if (IsPrimitiveCNode(fg_outputs, prim::kPrimMakeTuple)) { + auto cnode = fg_outputs->cast(); + for (size_t i = 1; i < cnode->size(); ++i) { + vir_outputs.push_back(cnode->input(i)); + } + } else { + vir_outputs.push_back(fg_outputs); + } + + if (vir_outputs.size() != outputs->size()) { + MS_LOG(EXCEPTION) << "The size of virtual output of the fg is not the same with the real output"; + } + bool has_erase_outs = false; + size_t index = -1; + for (auto it = outputs->begin(); it != outputs->end();) { + index++; + auto out = *it; + eqv[out] = vir_outputs[index]; + auto users = mng->node_users()[out]; + bool is_only_control_depend_use = true; + std::vector control_depend_use_index; + std::vector control_depend_nodes; + AnfNodePtr use_out = nullptr; + for (auto &user : users) { + auto use_node = user.first; + if (outputs_set.count(use_node) == 0 && !(IsPrimitiveCNode(use_node, prim::kPrimControlDepend))) { + is_only_control_depend_use = false; + continue; + } + if (outputs_set.count(use_node) != 0) { + use_out = use_node; + } + + if (IsPrimitiveCNode(use_node, prim::kPrimControlDepend)) { + control_depend_nodes.push_back(use_node->cast()); + control_depend_use_index.push_back(user.second); + } + } + + if (is_only_control_depend_use && !control_depend_nodes.empty()) { + MS_EXCEPTION_IF_NULL(use_out); + it = outputs->erase(it); + for (size_t i = 0; i < 
control_depend_nodes.size(); ++i) { + auto control_depend_node = control_depend_nodes[i]; + std::vector new_control_depend_inputs; + for (size_t j = 0; j < control_depend_node->size(); ++j) { + if (j == control_depend_use_index[i]) { + new_control_depend_inputs.push_back(use_out); + } else { + new_control_depend_inputs.push_back(control_depend_node->input(j)); + } + } + auto new_control_depend = control_depend_node->func_graph()->NewCNode(new_control_depend_inputs); + mng->Replace(control_depend_node, new_control_depend); + has_erase_outs = true; + } + } else { + it++; + } + } + + if (!has_erase_outs) { + return; + } + + AnfNodePtr fg_new_output; + if (outputs->size() > 1) { + std::vector output_args; + output_args.push_back(NewValueNode(prim::kPrimMakeTuple)); + (void)std::transform(std::begin(*outputs), std::end(*outputs), std::back_inserter(output_args), + [&eqv](const AnfNodePtr &o) -> AnfNodePtr { return eqv[o]; }); + // Set output for AnfGraph + fg_new_output = fg->NewCNode(output_args); + } else { + fg_new_output = eqv[(*outputs)[0]]; + } + fg->set_output(fg_new_output, true); +} + +void FuseBasic(const std::shared_ptr &kernel_graph, const std::vector &todos, + std::unordered_set *fused_ops, bool is_before_kernel_select) { + auto mng = kernel_graph->manager(); + for (auto iter = todos.cbegin(); iter != todos.cend(); ++iter) { + auto node = (*iter)->cast(); + if (node == nullptr) { + continue; + } + if (fused_ops->count(node)) { + continue; + } + auto fusable_basic_ops = get_fusable_basic_ops(is_before_kernel_select); + bool is_basic_op = std::any_of(fusable_basic_ops.begin(), fusable_basic_ops.end(), + [&node](const PrimitivePtr &prim) { return IsPrimitiveCNode(node, prim); }); + if (!is_basic_op || !kernel_graph->nodes().contains(node)) { + continue; + } + + auto fuse_nodes = FindFuseCNodes(node, is_before_kernel_select); + if (fuse_nodes.size() <= 1) { + continue; + } + + FuncGraphPtr fg; + AnfNodePtrList inputs; + AnfNodePtrList outputs; + std::tie(fg, 
inputs, outputs) = compile::TransformSegmentToAnfGraph(fuse_nodes); + RemoveControlDependOut(fg, &outputs, mng); + auto fuse_new_node = CreateNewFuseCNode(kernel_graph, fg, inputs, outputs, is_before_kernel_select); + + ReplaceNewFuseCNode(kernel_graph, fuse_new_node, outputs); + + // Set graph kernel attr + std::string fuse_op_name = ""; + for (auto &fuse_node : fuse_nodes) { + fuse_op_name += AnfAlgo::GetCNodePrimitive(fuse_node)->name() + "_"; + } + fused_ops->insert(fuse_nodes.begin(), fuse_nodes.end()); + fg->set_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL, MakeValue(fuse_op_name)); + } +} +} // namespace + +void FuseBasic(const std::shared_ptr &kernel_graph, bool is_before_kernel_select) { + MS_EXCEPTION_IF_NULL(kernel_graph); + auto mng = kernel_graph->manager(); + if (mng == nullptr) { + mng = Manage(kernel_graph, true); + kernel_graph->set_manager(mng); + } + std::unordered_set fused_ops; + auto todos = TopoSort(kernel_graph->get_return()); + std::reverse(todos.begin(), todos.end()); + FuseBasic(kernel_graph, todos, &fused_ops, is_before_kernel_select); +} +} // namespace opt +} // namespace mindspore diff --git a/mindspore/ccsrc/pre_activate/pass/fuse_basic.h b/mindspore/ccsrc/pre_activate/pass/fuse_basic.h new file mode 100644 index 0000000000..fbbf5d9937 --- /dev/null +++ b/mindspore/ccsrc/pre_activate/pass/fuse_basic.h @@ -0,0 +1,29 @@ + +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_PASS_FUSE_BASIC_H_ +#define MINDSPORE_CCSRC_PRE_ACTIVATE_PASS_FUSE_BASIC_H_ + +#include +#include "pre_activate/common/optimizer.h" +#include "session/kernel_graph.h" + +namespace mindspore { +namespace opt { +void FuseBasic(const std::shared_ptr &kernel_graph, bool is_before_kernel_select); +} // namespace opt +} // namespace mindspore +#endif // MINDSPORE_CCSRC_PRE_ACTIVATE_PASS_FUSE_BASIC_H_ diff --git a/mindspore/ccsrc/pre_activate/pass/fuse_graph_kernel.cc b/mindspore/ccsrc/pre_activate/pass/fuse_graph_kernel.cc new file mode 100644 index 0000000000..591b210335 --- /dev/null +++ b/mindspore/ccsrc/pre_activate/pass/fuse_graph_kernel.cc @@ -0,0 +1,562 @@ + +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "pre_activate/pass/fuse_graph_kernel.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "operator/ops.h" +#include "utils/utils.h" +#include "utils/graph_utils.h" +#include "pre_activate/common/helper.h" +#include "session/anf_runtime_algorithm.h" +#include "vm/segment_runner.h" +#include "debug/draw.h" +#include "debug/anf_ir_dump.h" +#include "ir/func_graph_cloner.h" + +namespace mindspore { +namespace opt { +std::vector get_fusable_basic_ops(bool is_before_kernel_select) { + std::vector fusable_basic_ops = { + prim::kPrimAddN, prim::kPrimTensorAdd, prim::kPrimMul, prim::kPrimSub, prim::kPrimMaximum, + prim::kPrimMinimum, prim::kPrimNeg, prim::kPrimRealDiv, prim::kPrimPow, prim::kPrimSqrt, + prim::kPrimReciprocal, prim::kPrimExpandDims, prim::kPrimLessEqual}; + if (!is_before_kernel_select) { + fusable_basic_ops.push_back(prim::kPrimCast); + } + return fusable_basic_ops; +} + +std::vector get_fusable_basic_ops_with_reduce(bool is_before_kernel_select) { + std::vector fusable_basic_ops_with_reduce; + if (!is_before_kernel_select) { + fusable_basic_ops_with_reduce.push_back(prim::kPrimCast); + } + return fusable_basic_ops_with_reduce; +} + +std::vector get_reduce_ops() { + std::vector reduce_ops = {prim::kPrimReduceSum, prim::kPrimReduceMean, prim::kPrimReduceMin, + prim::kPrimReduceMax, prim::kPrimReduceAll}; + return reduce_ops; +} + +void GetGraphKernelInfo(const FuncGraphPtr fg, GraphKernelInfo *info) { + MS_EXCEPTION_IF_NULL(fg); + auto reduce_ops = get_reduce_ops(); + const auto &nodes = fg->nodes(); + info->op_type = ELEWISE; + info->cal_step = -1; + info->reduce_op_num = 0; + for (auto node : nodes) { + auto cnode = node->cast(); + if (cnode == nullptr) { + continue; + } + info->cal_step++; + auto prim = GetValueNode(cnode->input(0)); + if (prim != nullptr) { + bool is_reudce = std::any_of(reduce_ops.begin(), reduce_ops.end(), [&prim](const PrimitivePtr &op) { + return op->hash() == prim->hash() 
&& op->name() == prim->name(); + }); + if (is_reudce) { + info->op_type = REDUCE; + info->reduce_op_num++; + } + } + } +} + +bool IsFuse(const GraphKernelInfo &info, const AnfNodePtr &node) { + auto fusable_basic_ops = get_fusable_basic_ops(info.is_before_kernel_select); + auto fusable_basic_ops_with_reduce = get_fusable_basic_ops_with_reduce(info.is_before_kernel_select); + bool is_fusable = false; + if (info.op_type == REDUCE && + (info.cal_step >= MAX_REDUCE_OP_FUSION_CAL_STEP || info.reduce_op_num >= MAX_REDUCE_OP_FUSION_REDUCE_NUM)) { + is_fusable = std::any_of(fusable_basic_ops_with_reduce.begin(), fusable_basic_ops_with_reduce.end(), + [&node](const PrimitivePtr &prim) { return IsPrimitiveCNode(node, prim); }); + } else { + is_fusable = std::any_of(fusable_basic_ops.begin(), fusable_basic_ops.end(), + [&node](const PrimitivePtr &prim) { return IsPrimitiveCNode(node, prim); }); + } + + return is_fusable; +} + +IncludeType IncludeFusedBasicOpForward(const AnfNodePtr &cur_node, const GraphKernelInfo &info, + const AnfNodePtr &node) { + if (cur_node == node) { + return FOLLOW; + } + if (!IsPrimitiveCNode(node)) { + return EXCLUDE; + } + + bool is_fusable = IsFuse(info, node); + return is_fusable ? FOLLOW : EXCLUDE; +} + +IncludeType IncludeFusedBasicOpBackward(const AnfNodePtr &cur_node, const GraphKernelInfo &info, + const AnfNodePtr &node) { + if (cur_node == node) { + return FOLLOW; + } + if (AnfAlgo::IsGraphKernel(node)) { + auto cnode = node->cast(); + auto fg = GetValueNode(cnode->input(kAnfPrimitiveIndex)); + auto fg_attr_val = fg->get_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL); + MS_EXCEPTION_IF_NULL(fg_attr_val); + auto fg_attr = GetValue(fg_attr_val); + if (fg_attr == kApplyMomentumOpName) { + return FOLLOW; + } + return EXCLUDE; + } + if (!IsPrimitiveCNode(node)) { + return EXCLUDE; + } + + bool is_fusable = IsFuse(info, node); + return is_fusable ? 
FOLLOW : EXCLUDE; +} + +bool CheckCircle(const std::set &fused_op_set, const AnfNodePtr &check_node, + std::set *cached_unconnected_set) { + if (!check_node->isa() || AnfAlgo::IsGraphKernel(check_node)) { + return false; + } + + auto cnode = check_node->cast(); + const auto &inputs = cnode->inputs(); + // there is a input not in fused_op_set, but the input depends on the fused_op_set + bool has_circle = false; + for (auto input : inputs) { + if (input->isa() && !fused_op_set.count(input)) { + std::set done; + std::vector todos = {input}; + while (!todos.empty()) { + auto node = todos.back(); + todos.pop_back(); + if (done.count(node) || cached_unconnected_set->count(node)) { + continue; + } + + done.insert(node); + if (fused_op_set.count(node)) { + has_circle = true; + break; + } + + if (node->isa()) { + auto cnode_ptr = node->cast(); + for (auto it : cnode_ptr->inputs()) { + if (it->isa()) { + todos.push_back(it); + } + } + } + } + + if (has_circle) { + return true; + } + cached_unconnected_set->insert(done.begin(), done.end()); + } + } + + return false; +} + +bool IsMakeTupleOut(const AnfNodePtr &out, AnfNodePtrList *real_outs) { + if (IsPrimitiveCNode(out, prim::kPrimMakeTuple)) { + auto &inputs = out->cast()->inputs(); + for (size_t i = 1; i < inputs.size(); ++i) { + real_outs->push_back(inputs[i]); + } + return true; + } + + if (AnfAlgo::GetCNodeFuncGraphPtr(out) != nullptr) { + auto fg = AnfAlgo::GetCNodeFuncGraphPtr(out); + auto fg_out = fg->output(); + if (IsPrimitiveCNode(fg_out, prim::kPrimMakeTuple)) { + auto inputs = fg_out->cast()->inputs(); + for (size_t i = 1; i < inputs.size(); ++i) { + real_outs->push_back(inputs[i]); + } + return true; + } + } + return false; +} + +std::vector RemoveCircle(const std::vector &fused_op, bool is_backward) { + std::set cached_unconnected_set; + std::set fused_op_set(fused_op.begin(), fused_op.end()); + auto include = [&fused_op_set](const AnfNodePtr &node) { + if (fused_op_set.count(node)) { + return FOLLOW; + } + 
return EXCLUDE; + }; + for (auto iter = fused_op.rbegin(); iter != fused_op.rend(); ++iter) { + bool has_circle = CheckCircle(fused_op_set, *iter, &cached_unconnected_set); + // delete the circle node and the node which depend on the circle node in fused op + if (has_circle) { + auto mng = (*iter)->func_graph()->manager(); + std::vector erase_nodes; + if (is_backward) { + erase_nodes = DeepUsersSearch(*iter, include, mng); + } else { + erase_nodes = DeepLinkedGraphSearch(*iter, include); + } + for (auto erase_node : erase_nodes) { + fused_op_set.erase(erase_node); + } + } + } + + std::vector res; + for (auto node : fused_op) { + if (fused_op_set.count(node)) { + res.push_back(node); + } + } + return res; +} + +void TopoSortForNodeList(std::vector *lst) { + if (lst->size() < 2) { + return; + } + + std::vector res; + std::set node_sets(lst->begin(), lst->end()); + std::map> ins; + std::map> outs; + std::queue q; + for (auto node : *lst) { + auto cnode = node->cast(); + MS_EXCEPTION_IF_NULL(cnode); + for (auto input : cnode->inputs()) { + if (!node_sets.count(input)) { + continue; + } + // out_degree + outs[input].insert(node); + // in_degree + ins[node].insert(input); + } + if (!ins.count(node)) { + ins[node] = {}; + } + } + + for (auto p : ins) { + if (p.second.size() == 0) { + q.push(p.first); + } + } + + while (!q.empty()) { + auto node = q.front(); + q.pop(); + res.push_back(node); + if (!outs.count(node)) { + continue; + } + for (auto out : outs[node]) { + if (!ins.count(out)) { + continue; + } + ins[out].erase(node); + if (ins[out].size() == 0) { + q.push(out); + } + } + } + + lst->assign(res.begin(), res.end()); +} + +std::vector FindFuseCNodes(const CNodePtr &cnode, bool is_before_kernel_select) { + auto func_graph = cnode->func_graph(); + auto graph_kernel_g = GetValueNode(cnode->input(0)); + GraphKernelInfo info; + info.is_before_kernel_select = is_before_kernel_select; + GetGraphKernelInfo(graph_kernel_g, &info); + auto mng = func_graph->manager(); + // 
Search fusable nodes according input direction. + auto include_func_forward = std::bind(IncludeFusedBasicOpForward, cnode, info, std::placeholders::_1); + auto used_nodes = DeepLinkedGraphSearch(cnode, include_func_forward); + std::reverse(used_nodes.begin(), used_nodes.end()); + // Search fusable nodes according output direction. + auto include_func_backward = std::bind(IncludeFusedBasicOpBackward, cnode, info, std::placeholders::_1); + auto user_nodes = DeepUsersSearch(cnode, include_func_backward, mng); + + used_nodes.insert(used_nodes.end(), user_nodes.begin() + 1, user_nodes.end()); + if (used_nodes.size() > 1) { + used_nodes = RemoveCircle(used_nodes); + } + TopoSortForNodeList(&used_nodes); + return used_nodes; +} + +AbstractBasePtr GetOutputAbstract(const AnfNodePtr &node, size_t output_idx) { + auto out_spec = node->abstract(); + if (out_spec->isa()) { + return out_spec->cast()->elements()[output_idx]; + } + return out_spec; +} + +AnfNodePtr CreateNewFuseCNode(const std::shared_ptr &kernel_graph, const FuncGraphPtr &fg, + const AnfNodePtrList &inputs, const AnfNodePtrList &outputs, + bool is_before_kernel_select) { + auto func_node = NewValueNode(fg); + std::vector fn_inputs; + fn_inputs.push_back(func_node); + fn_inputs.insert(fn_inputs.end(), inputs.begin(), inputs.end()); + auto fuse_cnode = kernel_graph->NewCNode(fn_inputs); + // Set output abstract + if (outputs.size() > 1) { + std::vector out_specs; + for (size_t i = 0; i < outputs.size(); ++i) { + out_specs.push_back(outputs[i]->abstract()); + } + auto out_spec = std::make_shared(out_specs); + fuse_cnode->set_abstract(out_spec); + } else { + fuse_cnode->set_abstract(outputs[0]->abstract()); + } + // Set parameter abstract. 
+ for (size_t i = 0; i < inputs.size(); ++i) { + auto kernel_with_index = AnfAlgo::VisitKernel(inputs[i], 0); + auto input_abs = GetOutputAbstract(kernel_with_index.first, kernel_with_index.second); + fg->parameters()[i]->set_abstract(input_abs); + if (is_before_kernel_select) { + fg->parameters()[i]->set_kernel_info(std::make_shared()); + } + } + // Set kernel info. + if (!is_before_kernel_select) { + std::vector graph_input_format; + std::vector graph_input_type; + std::vector graph_output_format; + std::vector graph_output_type; + for (size_t i = 0; i < inputs.size(); ++i) { + auto kernel_with_index = AnfAlgo::VisitKernel(inputs[i], 0); + auto input_format = AnfAlgo::GetOutputFormat(kernel_with_index.first, kernel_with_index.second); + graph_input_format.push_back(input_format); + auto input_type = AnfAlgo::GetOutputDeviceDataType(kernel_with_index.first, kernel_with_index.second); + graph_input_type.push_back(input_type); + auto input_abs = GetOutputAbstract(kernel_with_index.first, kernel_with_index.second); + fg->parameters()[i]->set_abstract(input_abs); + } + auto new_outputs = outputs; + if (outputs.size() == 1 && AnfAlgo::IsGraphKernel(outputs[0])) { + std::vector real_outs; + if (IsMakeTupleOut(outputs[0], &real_outs)) { + new_outputs = real_outs; + } + } + for (size_t i = 0; i < new_outputs.size(); ++i) { + auto kernel_with_index = AnfAlgo::VisitKernel(new_outputs[i], 0); + auto output_format = AnfAlgo::GetOutputFormat(kernel_with_index.first, kernel_with_index.second); + auto output_type = AnfAlgo::GetOutputDeviceDataType(kernel_with_index.first, kernel_with_index.second); + graph_output_format.push_back(output_format); + graph_output_type.push_back(output_type); + } + kernel::KernelBuildInfo::KernelBuildInfoBuilder graph_info_builder; + graph_info_builder.SetInputsFormat(graph_input_format); + graph_info_builder.SetInputsDeviceType(graph_input_type); + graph_info_builder.SetOutputsFormat(graph_output_format); + 
graph_info_builder.SetOutputsDeviceType(graph_output_type); + graph_info_builder.SetProcessor(kernel::Processor::AICORE); + graph_info_builder.SetKernelType(KernelType::AKG_KERNEL); + graph_info_builder.SetFusionType(kernel::FusionType::OPAQUE); + auto graph_selected_info = graph_info_builder.Build(); + AnfAlgo::SetSelectKernelBuildInfo(graph_selected_info, fuse_cnode.get()); + } + return fuse_cnode; +} + +void ReplaceNewFuseCNode(const std::shared_ptr &kernel_graph, const AnfNodePtr &new_fuse_cnode, + const AnfNodePtrList &outputs) { + MS_EXCEPTION_IF_NULL(kernel_graph); + auto mng = kernel_graph->manager(); + MS_EXCEPTION_IF_NULL(mng); + // single out + if (outputs.size() == 1) { + mng->Replace(outputs[0], new_fuse_cnode); + return; + } + + std::vector fn_inputs; + for (size_t out_idx = 0; out_idx < outputs.size(); out_idx++) { + AnfNodePtrList real_outs; + // not make tuple out, replace + if (!IsMakeTupleOut(outputs[out_idx], &real_outs)) { + fn_inputs.clear(); + fn_inputs.push_back(NewValueNode(prim::kPrimTupleGetItem)); + fn_inputs.push_back(new_fuse_cnode); + fn_inputs.push_back(NewValueNode(MakeValue(SizeToInt(out_idx)))); + auto new_out = kernel_graph->NewCNode(fn_inputs); + new_out->set_abstract(outputs[out_idx]->abstract()); + mng->Replace(outputs[out_idx], new_out); + continue; + } + + // the out is make tuple , modify the get_item node's value + auto users = mng->node_users()[outputs[out_idx]]; + for (auto &user : users) { + auto use_node = user.first; + if (use_node->isa() && (IsPrimitiveCNode(use_node, prim::kPrimTupleGetItem))) { + auto get_item_cnode = use_node->cast(); + auto value_input = get_item_cnode->input(kInputNodeOutputIndexInTupleGetItem); + MS_EXCEPTION_IF_NULL(value_input); + auto value_node = value_input->cast(); + MS_EXCEPTION_IF_NULL(value_node); + int item_idx = GetValue(value_node->value()); + int new_item_idx = SizeToInt(out_idx) + item_idx; + fn_inputs.clear(); + fn_inputs.push_back(NewValueNode(prim::kPrimTupleGetItem)); + 
fn_inputs.push_back(new_fuse_cnode); + fn_inputs.push_back(NewValueNode(new_item_idx)); + auto new_out = kernel_graph->NewCNode(fn_inputs); + new_out->set_abstract(get_item_cnode->abstract()); + mng->Replace(get_item_cnode, new_out); + } + } + } +} + +AnfNodePtrList EliminateMakeTuple(FuncGraphPtr *fg, FuncGraphManagerPtr *mng) { + AnfNodePtrList outs; + auto out_node = (*fg)->output(); + if (IsPrimitiveCNode(out_node, prim::kPrimMakeTuple)) { + std::vector output_args; + auto out_cnode = out_node->cast(); + for (auto out : out_cnode->inputs()) { + if (IsPrimitiveCNode(out, prim::kPrimMakeTuple)) { + auto inputs = out->cast()->inputs(); + for (size_t i = 1; i < inputs.size(); ++i) { + output_args.push_back(inputs[i]); + } + } else { + output_args.push_back(out); + } + } + if (output_args.size() != out_cnode->inputs().size()) { + auto new_out = (*fg)->NewCNode(output_args); + (*mng)->Replace(out_node, new_out); + } + + for (size_t i = 1; i < output_args.size(); ++i) { + outs.push_back(output_args[i]); + } + return outs; + } + + outs.push_back(out_node); + return outs; +} + +AnfNodePtrList GetExpandOuts(const AnfNodePtrList &outs) { + AnfNodePtrList res; + if (outs.size() <= 1) { + return outs; + } + + for (auto out : outs) { + AnfNodePtrList real_outs; + if (IsMakeTupleOut(out, &real_outs)) { + res.insert(res.end(), real_outs.begin(), real_outs.end()); + continue; + } + res.push_back(out); + } + return res; +} + +void FuseGraphKernel(const std::shared_ptr &kernel_graph, bool is_before_kernel_select) { + MS_EXCEPTION_IF_NULL(kernel_graph); + auto mng = kernel_graph->manager(); + if (mng == nullptr) { + mng = Manage(kernel_graph, true); + kernel_graph->set_manager(mng); + } + auto &todos = kernel_graph->execution_order(); + for (auto iter = todos.cbegin(); iter != todos.cend(); ++iter) { + auto node = *iter; + if (!AnfAlgo::IsGraphKernel(node) || !kernel_graph->nodes().contains(node)) { + continue; + } + + auto origin_fg = AnfAlgo::GetCNodeFuncGraphPtr(node); + auto 
fg_attr = origin_fg->get_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL); + if (fg_attr != nullptr) { + auto fg_name = GetValue(fg_attr); + if (graph_kernel_black_list.count(fg_name) != 0) { + continue; + } + } + + auto fuse_nodes = FindFuseCNodes(node, is_before_kernel_select); + if (fuse_nodes.size() <= 1) { + continue; + } + + FuncGraphPtr fg; + AnfNodePtrList inputs; + AnfNodePtrList outputs; + std::tie(fg, inputs, outputs) = compile::TransformSegmentToAnfGraph(fuse_nodes); + + // Remove nest make tuple in outs + auto expand_out = GetExpandOuts(outputs); + auto fuse_new_node = CreateNewFuseCNode(kernel_graph, fg, inputs, expand_out, is_before_kernel_select); + + ReplaceNewFuseCNode(kernel_graph, fuse_new_node, outputs); + + // Inline origin graphkernel + auto cnodes = fg->GetOrderedCnodes(); + for (const auto &n : cnodes) { + if (!AnfAlgo::IsGraphKernel(n)) { + continue; + } + auto graph_kernel_g = GetValueNode(n->input(0)); + AnfNodePtrList ins; + ins.insert(ins.end(), n->inputs().begin() + 1, n->inputs().end()); + auto out = InlineClone(graph_kernel_g, fg, ins, n->input(0)->scope()); + mng->Replace(n, out); + } + + EliminateMakeTuple(&fg, &mng); + // Set graphkernel flag + auto ori_fg = GetValueNode(node->input(kAnfPrimitiveIndex)); + fg->set_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL, ori_fg->get_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL)); + } +} +} // namespace opt +} // namespace mindspore diff --git a/mindspore/ccsrc/pre_activate/pass/fuse_graph_kernel.h b/mindspore/ccsrc/pre_activate/pass/fuse_graph_kernel.h new file mode 100644 index 0000000000..a5a26765a3 --- /dev/null +++ b/mindspore/ccsrc/pre_activate/pass/fuse_graph_kernel.h @@ -0,0 +1,63 @@ + +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_PASS_FUSE_GRAPH_KERNEL_H_ +#define MINDSPORE_CCSRC_PRE_ACTIVATE_PASS_FUSE_GRAPH_KERNEL_H_ + +#include +#include +#include +#include +#include "pre_activate/common/optimizer.h" +#include "session/kernel_graph.h" + +namespace mindspore { +namespace opt { +enum GraphKernelType { + ELEWISE = 0, // only contain elewise basic ops + REDUCE, // contain reduce ops + CUBE, // contain cube ops +}; +struct GraphKernelInfo { + GraphKernelType op_type = ELEWISE; + bool is_before_kernel_select = false; + int reduce_op_num = 0; + int cal_step = 0; +}; + +// when reduce graph kernel's cal step is greater than this number, not fuse +const int MAX_REDUCE_OP_FUSION_CAL_STEP = 5; +// when reduce graph kernel contain reduce op num is greater than this number, not fuse +const int MAX_REDUCE_OP_FUSION_REDUCE_NUM = 2; + +const std::set graph_kernel_black_list = {"BNTrainingUpdateSum", "ApplyMomentum", "LayerNormForward", + "LambNextMV", "LambUpdateWithLR"}; + +std::vector RemoveCircle(const std::vector &fused_op, bool is_backward = true); + +void TopoSortForNodeList(std::vector *lst); + +AnfNodePtr CreateNewFuseCNode(const std::shared_ptr &kernel_graph, const FuncGraphPtr &fg, + const AnfNodePtrList &inputs, const AnfNodePtrList &outputs, + bool is_before_kernel_select); + +void ReplaceNewFuseCNode(const std::shared_ptr &kernel_graph, const AnfNodePtr &new_fuse_cnode, + const AnfNodePtrList &outputs); + +void FuseGraphKernel(const std::shared_ptr &kernel_graph, bool is_before_kernel_select = false); +} // namespace opt +} // 
namespace mindspore +#endif // MINDSPORE_CCSRC_PRE_ACTIVATE_PASS_FUSE_GRAPH_KERNEL_H_ diff --git a/mindspore/ccsrc/pre_activate/pass/optimize_dependence.cc b/mindspore/ccsrc/pre_activate/pass/optimize_dependence.cc index 86a90a4dfe..1d5f909e7d 100644 --- a/mindspore/ccsrc/pre_activate/pass/optimize_dependence.cc +++ b/mindspore/ccsrc/pre_activate/pass/optimize_dependence.cc @@ -44,11 +44,11 @@ AnfNodePtr GetReplaceNode(const AnfNodePtr &node) { return cnode->input(kSingleInputIndex); } -bool ReplaceMakeTuple(const FuncGraphPtr &func_graph, const CNodePtr &cnode) { +AnfNodePtr ReplaceMakeTuple(const FuncGraphPtr &func_graph, const CNodePtr &cnode) { MS_EXCEPTION_IF_NULL(func_graph); MS_EXCEPTION_IF_NULL(cnode); if (AnfAlgo::GetCNodeName(cnode) != prim::kPrimMakeTuple->name()) { - return false; + return nullptr; } std::vector new_make_tuple_inputs; bool need_update = false; @@ -75,17 +75,16 @@ bool ReplaceMakeTuple(const FuncGraphPtr &func_graph, const CNodePtr &cnode) { auto manager = func_graph->manager(); MS_EXCEPTION_IF_NULL(manager); manager->Replace(cnode, new_make_tuple); + return new_make_tuple; } - return true; + return nullptr; } } // namespace const BaseRef OptimizeDependence::DefinePattern() const { - VarPtr X = std::make_shared("X"); - MS_EXCEPTION_IF_NULL(X); - VarPtr Y = std::make_shared("Y"); - MS_EXCEPTION_IF_NULL(Y); - return VectorRef({prim::kPrimDepend, X, Y}); + VarPtr X = std::make_shared(); + VarPtr Xs = std::make_shared(); + return VectorRef({X, Xs}); } const AnfNodePtr OptimizeDependence::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node, @@ -95,29 +94,31 @@ const AnfNodePtr OptimizeDependence::Process(const FuncGraphPtr &func_graph, con if (!node->isa()) { return nullptr; } + auto node_name = AnfAlgo::GetCNodeName(node); + if (node_name != prim::kPrimControlDepend->name() && node_name != prim::kPrimDepend->name()) { + return nullptr; + } + size_t index = 0; auto depend_cnode = node->cast(); MS_EXCEPTION_IF_NULL(depend_cnode); - 
CheckCNodeInputSize(depend_cnode, kDependInputNum); - auto replacing_node = depend_cnode->input(kDependInputNum - 1); - MS_EXCEPTION_IF_NULL(replacing_node); - if (!replacing_node->isa()) { - return nullptr; + std::vector new_depend_inputs = {depend_cnode->input(kAnfPrimitiveIndex)}; + if (node_name == prim::kPrimDepend->name()) { + index = 1; + new_depend_inputs.push_back(depend_cnode->input(kRealInputIndexInDepend)); } - auto replacing_cnode = replacing_node->cast(); - MS_EXCEPTION_IF_NULL(replacing_cnode); - // Deal with the make_tuple with TransData or Cast inputs. - if (ReplaceMakeTuple(func_graph, replacing_cnode)) { - return nullptr; + if (AnfAlgo::GetInputTensorNum(depend_cnode) < 2) { + MS_LOG(EXCEPTION) << "The depend node input size is at less size 2,but got " + << AnfAlgo::GetInputTensorNum(depend_cnode) << depend_cnode->DebugString(); } - AnfNodePtr replace_node = GetReplaceNode(replacing_cnode); - if (replace_node == nullptr) { - MS_LOG(DEBUG) << "Can not find the TransData or Cast with single output node. 
Depend node: " << node->DebugString(); - return nullptr; + auto input_num = AnfAlgo::GetInputTensorNum(depend_cnode); + while (index < input_num) { + auto replace_node = GetConvertNode(func_graph, node, index); + MS_EXCEPTION_IF_NULL(replace_node); + new_depend_inputs.push_back(replace_node); + ++index; } - std::vector new_depend_inputs = {depend_cnode->input(kAnfPrimitiveIndex), - depend_cnode->input(kRealInputIndexInDepend), replace_node}; auto kernel_graph = func_graph->cast>(); - CNodePtr new_depend; + CNodePtr new_depend = nullptr; if (kernel_graph == nullptr) { new_depend = func_graph->NewCNode(new_depend_inputs); MS_EXCEPTION_IF_NULL(new_depend); @@ -130,5 +131,31 @@ const AnfNodePtr OptimizeDependence::Process(const FuncGraphPtr &func_graph, con } return new_depend; } + +const AnfNodePtr OptimizeDependence::GetConvertNode(const FuncGraphPtr &graph, const AnfNodePtr &node, + const size_t index) const { + MS_EXCEPTION_IF_NULL(graph); + MS_EXCEPTION_IF_NULL(node); + auto depend_cnode = node->cast(); + auto replacing_node = AnfAlgo::GetInputNode(depend_cnode, index); + MS_EXCEPTION_IF_NULL(replacing_node); + if (!replacing_node->isa()) { + return replacing_node; + } + auto replacing_cnode = replacing_node->cast(); + MS_EXCEPTION_IF_NULL(replacing_cnode); + // Deal with the make_tuple with TransData or Cast inputs. + auto make_tuple_replace_node = ReplaceMakeTuple(graph, replacing_cnode); + if (make_tuple_replace_node != nullptr) { + return make_tuple_replace_node; + } + AnfNodePtr replace_node = GetReplaceNode(replacing_cnode); + if (replace_node == nullptr) { + MS_LOG(DEBUG) << "Can not find the TransData or Cast with single output node. 
Depend node: " << node->DebugString(); + return replacing_node; + } + return replace_node; +} + } // namespace opt } // namespace mindspore diff --git a/mindspore/ccsrc/pre_activate/pass/optimize_dependence.h b/mindspore/ccsrc/pre_activate/pass/optimize_dependence.h index d2995cdd30..30027b790a 100644 --- a/mindspore/ccsrc/pre_activate/pass/optimize_dependence.h +++ b/mindspore/ccsrc/pre_activate/pass/optimize_dependence.h @@ -27,6 +27,7 @@ class OptimizeDependence : public PatternProcessPass { ~OptimizeDependence() override = default; const BaseRef DefinePattern() const override; const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override; + const AnfNodePtr GetConvertNode(const FuncGraphPtr &graph, const AnfNodePtr &node, const size_t index) const; }; } // namespace opt } // namespace mindspore diff --git a/mindspore/ccsrc/predict/converter/attr_utils/convert_util.h b/mindspore/ccsrc/predict/converter/attr_utils/convert_util.h index d29e5e532e..5c7551a190 100644 --- a/mindspore/ccsrc/predict/converter/attr_utils/convert_util.h +++ b/mindspore/ccsrc/predict/converter/attr_utils/convert_util.h @@ -36,7 +36,6 @@ using GraphDefT = mindspore::predict::GraphDefT; using TensorDefT = mindspore::predict::TensorDefT; using SubGraphDefT = mindspore::predict::SubGraphDefT; using SubGraphPtr = std::unique_ptr; -using NodeDef = mindspore::predict::NodeDefT; using MsDataType = mindspore::predict::DataType; using MsFormat = mindspore::predict::Format; using MsKernelKey = void *; diff --git a/mindspore/ccsrc/predict/converter/kernel2ms.cc b/mindspore/ccsrc/predict/converter/kernel2ms.cc index 32cdee1350..902efac720 100644 --- a/mindspore/ccsrc/predict/converter/kernel2ms.cc +++ b/mindspore/ccsrc/predict/converter/kernel2ms.cc @@ -108,8 +108,7 @@ bool Kernel2Ms::SetGraphOutputIdx(const KernelGraphPtr &kernel_graph_ptr, const } bool Kernel2Ms::SetOpOutputIdx(const CNodePtr &c_node_ptr, const TensorPtr &output_tensor, - const TensorCachePtr 
&tensor_cache, int ref_count, size_t order_index, - NodeDef *ms_node) { + const TensorCachePtr &tensor_cache, int ref_count, size_t order_index, OpDefT *ms_node) { MS_EXCEPTION_IF_NULL(c_node_ptr); MS_EXCEPTION_IF_NULL(output_tensor); MS_EXCEPTION_IF_NULL(ms_node); @@ -123,7 +122,7 @@ bool Kernel2Ms::SetOpOutputIdx(const CNodePtr &c_node_ptr, const TensorPtr &outp std::vector tensor_shape; (void)std::transform(host_shape.begin(), host_shape.end(), std::back_inserter(tensor_shape), SizeToInt); int outputIndex = tensor_cache->addExTensor(tensor_key, output_tensor, ref_count, tensor_shape, KERNEL); - ms_node->opDef->outputIndex.push_back(outputIndex); + ms_node->outputIndex.push_back(outputIndex); return true; } @@ -164,7 +163,7 @@ void Kernel2Ms::GetRealInpoutsPtr(const AnfNodePtr &node, std::vectoropDef->inputIndex.push_back(ex_tensor_ptr->index_); + ms_node->inputIndex.push_back(ex_tensor_ptr->index_); } } return true; @@ -397,19 +396,17 @@ bool Kernel2Ms::SetGraphOpTensors(const KernelGraphPtr &kernel_graph_ptr, const return false; } auto kernel_key = node_indexs_[kernel.get()]; - std::unique_ptr ms_node(new NodeDef); + std::unique_ptr ms_node(new OpDefT); + ms_node->name = kernel->fullname_with_scope(); ms_node->fmkType = mindspore::predict::FmkType_CAFFE; - std::unique_ptr ms_op(new OpDefT()); auto c_name = AnfAlgo::GetCNodeName(kernel); auto fun = predict::convert::OpAttrFactory::GetInstance()->GetPackFun(c_name); if (fun == nullptr) { - MS_LOG(ERROR) << "get node [" << kernel->fullname_with_scope() << "] attr failed."; - return false; - } else if (!fun(kernel, ms_op.get())) { + MS_LOG(WARNING) << "get node [" << kernel->fullname_with_scope() << "] attr failed."; + } else if (!fun(kernel, ms_node.get())) { MS_LOG(ERROR) << "set node [" << kernel->fullname_with_scope() << "] attr failed."; return false; } - ms_node->opDef = std::move(ms_op); auto output_size = AnfAlgo::GetOutputTensorNum(kernel); int nodeRefCount = SizeToInt(output_size); for (size_t j = 0; j < 
output_size; ++j) { @@ -466,7 +463,7 @@ bool Kernel2Ms::KernelGraph2MsGraph(const KernelGraphPtr &kernel_graph_ptr) { if (!SetOpInputIdx(kernels[i], tensor_cache_ptr_, ms_node)) { return false; } - std::unique_ptr ms_node_tmp(ms_node); + std::unique_ptr ms_node_tmp(ms_node); sub_ms_graph->nodes.emplace_back(std::move(ms_node_tmp)); } if (!SetAllTensors(tensor_cache_ptr_, sub_ms_graph.get())) { diff --git a/mindspore/ccsrc/predict/converter/kernel2ms.h b/mindspore/ccsrc/predict/converter/kernel2ms.h index f991ecc94a..7013f88107 100644 --- a/mindspore/ccsrc/predict/converter/kernel2ms.h +++ b/mindspore/ccsrc/predict/converter/kernel2ms.h @@ -64,10 +64,10 @@ class Kernel2Ms { bool SetAllTensors(const TensorCachePtr &tensor_cache, SubGraphDefT *sub_graph_def_t); - bool SetOpInputIdx(const CNodePtr &c_node_ptr, const TensorCachePtr &tensor_cache, NodeDef *ms_node); + bool SetOpInputIdx(const CNodePtr &c_node_ptr, const TensorCachePtr &tensor_cache, OpDefT *ms_node); bool SetOpOutputIdx(const CNodePtr &c_node_ptr, const TensorPtr &output_tensor, const TensorCachePtr &tensor_cache, - int ref_count, size_t order_index, NodeDef *ms_node); + int ref_count, size_t order_index, OpDefT *ms_node); bool SetGraphOutputIdx(const KernelGraphPtr &kernel_graph_ptr, const TensorCachePtr &tensor_cache, SubGraphDefT *sub_graph_def_t, AllOutputTensors *all_output_tensors); @@ -102,7 +102,7 @@ class Kernel2Ms { bool SetMemResue() const; SubGraphPtr sub_ms_graph_; AllOutputTensors all_output_tensors_; - std::vector tmp_op_nodes_; + std::vector tmp_op_nodes_; std::unordered_map node_indexs_; std::unordered_map index_nodes_; int graph_index_ = 0; diff --git a/mindspore/ccsrc/predict/converter/lite_model/op_attr_packer.cc b/mindspore/ccsrc/predict/converter/lite_model/op_attr_packer.cc index e6fec3d540..52648812be 100644 --- a/mindspore/ccsrc/predict/converter/lite_model/op_attr_packer.cc +++ b/mindspore/ccsrc/predict/converter/lite_model/op_attr_packer.cc @@ -33,6 +33,14 @@ bool 
CastPacker(const CNodePtr &c_node_ptr, OpDefT *ms_op); bool MeanPacker(const CNodePtr &c_node_ptr, OpDefT *ms_op); bool SoftmaxPacker(const CNodePtr &c_node_ptr, OpDefT *ms_op); bool ScalePacker(const CNodePtr &c_node_ptr, OpDefT *ms_op); +bool AddFoldPacker(const CNodePtr &c_node_ptr, OpDefT *ms_op); +bool ArgMaxPacker(const CNodePtr &c_node_ptr, OpDefT *ms_op); +bool BatchNormFoldPacker(const CNodePtr &c_node_ptr, OpDefT *ms_op); +bool FakeQuantWithMinMaxPacker(const CNodePtr &c_node_ptr, OpDefT *ms_op); +bool FakeQuantWithMinMaxPerChannelPacker(const CNodePtr &c_node_ptr, OpDefT *ms_op); +bool MulPacker(const CNodePtr &c_node_ptr, OpDefT *ms_op); +bool MulFoldPacker(const CNodePtr &c_node_ptr, OpDefT *ms_op); +bool SqueezePacker(const CNodePtr &c_node_ptr, OpDefT *ms_op); OpAttrFactory::OpAttrFactory() { pack_funs_ = {{"Conv2D", Conv2dPacker}, @@ -60,23 +68,31 @@ OpAttrFactory::OpAttrFactory() { {"TensorAdd", AddPacker}, {"SoftMax", SoftmaxPacker}, {"SimpleMean", MeanPacker}, - {"Scale", ScalePacker}}; + {"ReduceMean", MeanPacker}, + {"AddFold", AddFoldPacker}, + {"ArgMax", ArgMaxPacker}, + {"BatchNorm", BatchNormFoldPacker}, + {"FakeQuantPerLayer", FakeQuantWithMinMaxPacker}, + {"FakeQuantPerChannel", FakeQuantWithMinMaxPerChannelPacker}, + {"Mul", MulPacker}, + {"MulFold", MulFoldPacker}, + {"Squeeze", SqueezePacker}}; } OpAttrPackFun OpAttrFactory::GetPackFun(const std::string &opType) { if (pack_funs_.find(opType) == pack_funs_.end()) { - MS_LOG(ERROR) << "Op Attr pack fun [\" << opType << \"] not found."; + MS_LOG(WARNING) << "Op Attr pack fun [" << opType << "] not found."; return nullptr; } return pack_funs_[opType]; } -mindspore::predict::DataFormatType GetAttrFormat(const std::string &format) { +mindspore::predict::Format GetAttrFormat(const std::string &format) { if (format == kOpFormat_NCHW) { - return predict::DataFormatType::DataFormatType_NCHW; + return predict::Format::Format_NCHW; } else if (format == kOpFormat_NHWC) { - return 
predict::DataFormatType::DataFormatType_NHWC; + return predict::Format::Format_NHWC; } else { - return predict::DataFormatType::DataFormatType_UNKNOW; + return predict::Format::Format_NUM_OF_FORMAT; } } diff --git a/mindspore/ccsrc/predict/converter/lite_model/op_attr_packer.h b/mindspore/ccsrc/predict/converter/lite_model/op_attr_packer.h index 83d0f9287b..89e38d1871 100644 --- a/mindspore/ccsrc/predict/converter/lite_model/op_attr_packer.h +++ b/mindspore/ccsrc/predict/converter/lite_model/op_attr_packer.h @@ -48,7 +48,7 @@ class OpAttrFactory { std::unordered_map pack_funs_; }; -mindspore::predict::DataFormatType GetAttrFormat(const std::string &format); +mindspore::predict::Format GetAttrFormat(const std::string &format); mindspore::predict::PadMode GetAttrPadMode(const std::string &pad_mode); } // namespace convert diff --git a/mindspore/ccsrc/predict/converter/lite_model/operations/add_packer.cc b/mindspore/ccsrc/predict/converter/lite_model/operations/add_packer.cc index 81a2d3a9af..02a9bda65e 100644 --- a/mindspore/ccsrc/predict/converter/lite_model/operations/add_packer.cc +++ b/mindspore/ccsrc/predict/converter/lite_model/operations/add_packer.cc @@ -25,7 +25,6 @@ bool AddPacker(const CNodePtr &c_node_ptr, OpDefT *ms_op) { } std::unique_ptr attr(new AddT()); MS_EXCEPTION_IF_NULL(attr); - attr->format = predict::DataFormatType::DataFormatType_NCHW; ms_op->name = c_node_ptr->fullname_with_scope(); ms_op->attr.type = OpT_Add; ms_op->attr.value = attr.release(); diff --git a/mindspore/ccsrc/predict/converter/lite_model/operations/addfold_packer.cc b/mindspore/ccsrc/predict/converter/lite_model/operations/addfold_packer.cc new file mode 100644 index 0000000000..b6affd5001 --- /dev/null +++ b/mindspore/ccsrc/predict/converter/lite_model/operations/addfold_packer.cc @@ -0,0 +1,34 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with 
the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "predict/converter/lite_model/op_attr_packer.h" + +namespace mindspore { +namespace predict { +namespace convert { +bool AddFoldPacker(const CNodePtr &c_node_ptr, OpDefT *ms_op) { + if (c_node_ptr == nullptr || ms_op == nullptr) { + return false; + } + std::unique_ptr attr(new AddFoldT()); + MS_EXCEPTION_IF_NULL(attr); + ms_op->attr.type = OpT_AddFold; + ms_op->attr.value = attr.release(); + return true; +} +} // namespace convert +} // namespace predict +} // namespace mindspore diff --git a/mindspore/ccsrc/predict/converter/lite_model/operations/argmax_packer.cc b/mindspore/ccsrc/predict/converter/lite_model/operations/argmax_packer.cc new file mode 100644 index 0000000000..4df643704c --- /dev/null +++ b/mindspore/ccsrc/predict/converter/lite_model/operations/argmax_packer.cc @@ -0,0 +1,34 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "predict/converter/lite_model/op_attr_packer.h" + +namespace mindspore { +namespace predict { +namespace convert { +bool ArgMaxPacker(const CNodePtr &c_node_ptr, OpDefT *ms_op) { + if (c_node_ptr == nullptr || ms_op == nullptr) { + return false; + } + std::unique_ptr attr(new ArgMaxT()); + MS_EXCEPTION_IF_NULL(attr); + ms_op->attr.type = OpT_ArgMax; + ms_op->attr.value = attr.release(); + return true; +} +} // namespace convert +} // namespace predict +} // namespace mindspore diff --git a/mindspore/ccsrc/predict/converter/lite_model/operations/batchnormfold_packer.cc b/mindspore/ccsrc/predict/converter/lite_model/operations/batchnormfold_packer.cc new file mode 100644 index 0000000000..f05f3894be --- /dev/null +++ b/mindspore/ccsrc/predict/converter/lite_model/operations/batchnormfold_packer.cc @@ -0,0 +1,34 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "predict/converter/lite_model/op_attr_packer.h" + +namespace mindspore { +namespace predict { +namespace convert { +bool BatchNormFoldPacker(const CNodePtr &c_node_ptr, OpDefT *ms_op) { + if (c_node_ptr == nullptr || ms_op == nullptr) { + return false; + } + std::unique_ptr attr(new BatchNormFoldT()); + MS_EXCEPTION_IF_NULL(attr); + ms_op->attr.type = OpT_BatchNormFold; + ms_op->attr.value = attr.release(); + return true; +} +} // namespace convert +} // namespace predict +} // namespace mindspore diff --git a/mindspore/ccsrc/predict/converter/lite_model/operations/fakequantwithminmax_packer.cc b/mindspore/ccsrc/predict/converter/lite_model/operations/fakequantwithminmax_packer.cc new file mode 100644 index 0000000000..195a4fde9f --- /dev/null +++ b/mindspore/ccsrc/predict/converter/lite_model/operations/fakequantwithminmax_packer.cc @@ -0,0 +1,34 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "predict/converter/lite_model/op_attr_packer.h" + +namespace mindspore { +namespace predict { +namespace convert { +bool FakeQuantWithMinMaxPacker(const CNodePtr &c_node_ptr, OpDefT *ms_op) { + if (c_node_ptr == nullptr || ms_op == nullptr) { + return false; + } + std::unique_ptr attr(new FakeQuantWithMinMaxT()); + MS_EXCEPTION_IF_NULL(attr); + ms_op->attr.type = OpT_FakeQuantWithMinMax; + ms_op->attr.value = attr.release(); + return true; +} +} // namespace convert +} // namespace predict +} // namespace mindspore diff --git a/mindspore/ccsrc/predict/converter/lite_model/operations/fakequantwithminmaxperchannel_packer.cc b/mindspore/ccsrc/predict/converter/lite_model/operations/fakequantwithminmaxperchannel_packer.cc new file mode 100644 index 0000000000..0074c87646 --- /dev/null +++ b/mindspore/ccsrc/predict/converter/lite_model/operations/fakequantwithminmaxperchannel_packer.cc @@ -0,0 +1,34 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "predict/converter/lite_model/op_attr_packer.h" + +namespace mindspore { +namespace predict { +namespace convert { +bool FakeQuantWithMinMaxPerChannelPacker(const CNodePtr &c_node_ptr, OpDefT *ms_op) { + if (c_node_ptr == nullptr || ms_op == nullptr) { + return false; + } + std::unique_ptr attr(new FakeQuantWithMinMaxPerChannelT()); + MS_EXCEPTION_IF_NULL(attr); + ms_op->attr.type = OpT_FakeQuantWithMinMaxPerChannel; + ms_op->attr.value = attr.release(); + return true; +} +} // namespace convert +} // namespace predict +} // namespace mindspore diff --git a/mindspore/ccsrc/predict/converter/lite_model/operations/mul_packer.cc b/mindspore/ccsrc/predict/converter/lite_model/operations/mul_packer.cc new file mode 100644 index 0000000000..6c430e79e7 --- /dev/null +++ b/mindspore/ccsrc/predict/converter/lite_model/operations/mul_packer.cc @@ -0,0 +1,34 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "predict/converter/lite_model/op_attr_packer.h" + +namespace mindspore { +namespace predict { +namespace convert { +bool MulPacker(const CNodePtr &c_node_ptr, OpDefT *ms_op) { + if (c_node_ptr == nullptr || ms_op == nullptr) { + return false; + } + std::unique_ptr attr(new MulT()); + MS_EXCEPTION_IF_NULL(attr); + ms_op->attr.type = OpT_Mul; + ms_op->attr.value = attr.release(); + return true; +} +} // namespace convert +} // namespace predict +} // namespace mindspore diff --git a/mindspore/ccsrc/predict/converter/lite_model/operations/mulflod_packer.cc b/mindspore/ccsrc/predict/converter/lite_model/operations/mulflod_packer.cc new file mode 100644 index 0000000000..1df7204875 --- /dev/null +++ b/mindspore/ccsrc/predict/converter/lite_model/operations/mulflod_packer.cc @@ -0,0 +1,35 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "predict/converter/lite_model/op_attr_packer.h" + +namespace mindspore { +namespace predict { +namespace convert { +bool MulFoldPacker(const CNodePtr &c_node_ptr, OpDefT *ms_op) { + if (c_node_ptr == nullptr || ms_op == nullptr) { + return false; + } + std::unique_ptr attr(new MulFoldT()); + MS_EXCEPTION_IF_NULL(attr); + ms_op->name = c_node_ptr->fullname_with_scope(); + ms_op->attr.type = OpT_MulFold; + ms_op->attr.value = attr.release(); + return true; +} +} // namespace convert +} // namespace predict +} // namespace mindspore diff --git a/mindspore/ccsrc/predict/converter/lite_model/operations/pooling_packer.cc b/mindspore/ccsrc/predict/converter/lite_model/operations/pooling_packer.cc index 4eeb643817..edfdcda040 100644 --- a/mindspore/ccsrc/predict/converter/lite_model/operations/pooling_packer.cc +++ b/mindspore/ccsrc/predict/converter/lite_model/operations/pooling_packer.cc @@ -36,7 +36,6 @@ bool PoolingPacker(const CNodePtr &c_node_ptr, OpDefT *ms_op) { attr->poolingMode = mindspore::predict::PoolMode::PoolMode_MEAN_POOLING; } else if (c_name == "GlobalPool") { ms_op->name = c_node_ptr->fullname_with_scope(); - attr->poolingMode = mindspore::predict::PoolMode::PoolMode_GLOBAL_POOING; } else { MS_LOG(ERROR) << "unknowed pooling type."; return false; @@ -53,7 +52,6 @@ bool PoolingPacker(const CNodePtr &c_node_ptr, OpDefT *ms_op) { attr->padDown = 0; attr->padLeft = 0; attr->padRight = 0; - attr->caffeMode = false; ms_op->attr.type = OpT_Pooling; ms_op->attr.value = attr.release(); return true; diff --git a/mindspore/ccsrc/predict/converter/lite_model/operations/reshape_packer.cc b/mindspore/ccsrc/predict/converter/lite_model/operations/reshape_packer.cc index cd8b72a8ac..a0a263631d 100644 --- a/mindspore/ccsrc/predict/converter/lite_model/operations/reshape_packer.cc +++ b/mindspore/ccsrc/predict/converter/lite_model/operations/reshape_packer.cc @@ -25,7 +25,7 @@ bool ReshapePacker(const CNodePtr &c_node_ptr, OpDefT *ms_op) { } 
std::unique_ptr attr(new ReshapeT()); MS_EXCEPTION_IF_NULL(attr); - attr->format = predict::DataFormatType::DataFormatType_NCHW; + attr->format = predict::Format::Format_NCHW; ms_op->name = c_node_ptr->fullname_with_scope(); ms_op->attr.type = OpT_Reshape; ms_op->attr.value = attr.release(); diff --git a/mindspore/ccsrc/predict/converter/lite_model/operations/scale_packer.cc b/mindspore/ccsrc/predict/converter/lite_model/operations/scale_packer.cc index 7b4f6f6283..356775247d 100644 --- a/mindspore/ccsrc/predict/converter/lite_model/operations/scale_packer.cc +++ b/mindspore/ccsrc/predict/converter/lite_model/operations/scale_packer.cc @@ -25,7 +25,7 @@ bool ScalePacker(const CNodePtr &c_node_ptr, OpDefT *ms_op) { } std::unique_ptr attr(new ScaleT()); MS_EXCEPTION_IF_NULL(attr); - attr->format = predict::DataFormatType::DataFormatType_NCHW; + attr->format = predict::Format::Format_NCHW; ms_op->name = c_node_ptr->fullname_with_scope(); ms_op->attr.type = OpT_Scale; ms_op->attr.value = attr.release(); diff --git a/mindspore/ccsrc/predict/converter/lite_model/operations/squeeze_packer.cc b/mindspore/ccsrc/predict/converter/lite_model/operations/squeeze_packer.cc new file mode 100644 index 0000000000..7e836fe021 --- /dev/null +++ b/mindspore/ccsrc/predict/converter/lite_model/operations/squeeze_packer.cc @@ -0,0 +1,38 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "predict/converter/lite_model/op_attr_packer.h" + +namespace mindspore { +namespace predict { +namespace convert { +bool SqueezePacker(const CNodePtr &c_node_ptr, OpDefT *ms_op) { + if (c_node_ptr == nullptr || ms_op == nullptr) { + return false; + } + std::unique_ptr attr(new SqueezeT()); + MS_EXCEPTION_IF_NULL(attr); + + std::vector kernel_axis_value = AnfAlgo::GetNodeAttr>(c_node_ptr, "axis"); + attr->axis = kernel_axis_value; + + ms_op->attr.type = OpT_Squeeze; + ms_op->attr.value = attr.release(); + return true; +} +} // namespace convert +} // namespace predict +} // namespace mindspore diff --git a/mindspore/ccsrc/predict/predict.cc b/mindspore/ccsrc/predict/predict.cc index d81dcd3321..bbb12c3787 100644 --- a/mindspore/ccsrc/predict/predict.cc +++ b/mindspore/ccsrc/predict/predict.cc @@ -22,12 +22,15 @@ namespace mindspore { namespace predictmodel { -void StepConvertGraph(const KernelGraphPtrNew &kernel_graph_ptr) { +void StepConvertGraph(const KernelGraphPtr &kernel_graph_ptr) { MS_LOG(INFO) << "start convert_graph step"; // get kernel_graph. 
this graph can be origin or device, depends on which steps to persistence MS_EXCEPTION_IF_NULL(kernel_graph_ptr); bool save_ms_model = MsContext::GetInstance()->save_ms_model_flag(); if (save_ms_model) { + if (kernel_graph_ptr->inputs().empty()) { + return; + } // set convert_mode: convert cpu info or convert Davnici executor::Kernel2Ms::GetInstance().set_convert_mode(executor::kConvertCpuMode); // convert kernel_graph to sub_ms_graph @@ -46,6 +49,9 @@ void StepConvertWeight(const std::vector &inputs) { bool save_ms_model = MsContext::GetInstance()->save_ms_model_flag(); std::string save_path = MsContext::GetInstance()->save_ms_model_path(); if (save_ms_model) { + if (inputs.empty()) { + return; + } MS_LOG(INFO) << "save ms model is true to path " << save_path; if (!executor::Kernel2Ms::GetInstance().KernelInput2MS(inputs)) { MS_LOG(WARNING) << "convert mindspore kernel input failed"; @@ -59,15 +65,5 @@ void StepConvertWeight(const std::vector &inputs) { } } } - -executor::TargetMode GetDeviceTarget(const std::string &device_target) { - if (device_target == "GPU") { - return executor::kGPUTarget; - } else if (device_target == "Ascend") { - return executor::kCPUTarget; - } else { - return executor::kUnknowTarget; - } -} } // namespace predictmodel } // namespace mindspore diff --git a/mindspore/ccsrc/predict/predict.h b/mindspore/ccsrc/predict/predict.h index 04184fe77c..7c65f16619 100644 --- a/mindspore/ccsrc/predict/predict.h +++ b/mindspore/ccsrc/predict/predict.h @@ -19,16 +19,14 @@ #include #include -#include #include "session/session_basic.h" #include "predict/converter/kernel2ms.h" namespace mindspore { namespace predictmodel { -using KernelGraphPtrNew = std::shared_ptr; -void StepConvertGraph(const KernelGraphPtrNew &kernel_graph_ptr); +using KernelGraphPtr = std::shared_ptr; +void StepConvertGraph(const KernelGraphPtr &kernel_graph_ptr); void StepConvertWeight(const std::vector &inputs); -executor::TargetMode GetDeviceTarget(const std::string 
&device_target); } // namespace predictmodel } // namespace mindspore #endif // MINDSPORE_MINDSPORE_CCSRC_PREDICT_H_ diff --git a/mindspore/ccsrc/predict/schema/ms.fbs b/mindspore/ccsrc/predict/schema/ms.fbs index a114fc444e..7c3dcfb498 100644 --- a/mindspore/ccsrc/predict/schema/ms.fbs +++ b/mindspore/ccsrc/predict/schema/ms.fbs @@ -13,42 +13,26 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - + include "op.fbs"; namespace mindspore.predict; -enum DataType : int { - DT_FLOAT = 0, - DT_FLOAT16 = 1, - DT_INT8 = 2, - DT_INT32 = 3, - DT_UINT8 = 4, - DT_UINT32 = 8, - DT_UNDEFINED = 16 -} - -enum Format : int { - NCHW = 0, - NHWC, - NC4HW4 = 100, - NUM_OF_FORMAT -} - -enum MSConst: int { +enum MSCONST: int { WEIGHT_REFCOUNT = 999 } -table QuantizationDef { - // Quantized value q, corresponding float value r: - // r = scale * (q - zero_point), where scale = (rmax - rmin) / (qmax - qmin) - min: [float]; - max: [float]; - scale: [float]; - zero_point: [long]; +table QuantParam { + scale: double; + zeroPoint: int; + min: double = 0; + max: double = 0; + narrowRange: bool = true; + numBits: int = 8; +} - // Tensor shape of the specifies dimension. 
- dimension: int; +table QuantParamArray { + param: [QuantParam]; //pre-channel } table TensorDef { @@ -60,7 +44,6 @@ table TensorDef { refCount: int; offset: int; data: [ubyte]; - quantization: QuantizationDef; } union OpT { @@ -70,7 +53,6 @@ union OpT { Conv2D, FusedBatchNorm, CaffeBatchNorm, - Squeeze, BiasAdd, Pooling, DepthwiseConv2D, @@ -85,57 +67,134 @@ union OpT { Eltwise, NetOutput, Add, + Sub, MatMul, StridedSlice, Power, Slice, Stack, Mul, + RealDiv, Pad, Maximum, + Minimum, CaffePReLU, + LeakyReLU, ArgMax, + ArgMin, Exp, CaffeCrop, Range, + Rsqrt, ExpandDims, Tile, - Cast -// Split + Cast, + Shape, + Nchw2Nhwc, + Nhwc2Nchw, + QuantDTypeCast, + Split, + Permute, + FakeQuantWithMinMaxVars, + Equal, + Less, + Greater, + Min, + Floor, + Abs, + Neg, + Cos, + Sin, + Sqrt, + Square, + Constant, + Log, + Tan, + Atan, + Asin, + Clip, + Transpose, + Squeeze, + Unsqueeze, + Upsample, + Dropout, + Broadcast, + Lrn, + Prelu, + ZerosLike, + TopK, + SpaceToDepth, + SpaceToBatch, + SparseToDense, + ReverseSequence, + Rank, + Gather, + GatherNd, + Fill, + Elu, + DepthToSpace, + BatchToSpace, + AddN, + Ceil, + EmbeddingLookup, + EmbeddingLookupSparse, + FloorDiv, + FloorMod, + L2Norm, + LocalResponseNormalization, + MatrixDiag, + Reduce, + Reverse, + Round, + Select, + Scatter, + Unique, + Unstack, + LogicalAnd, + LogicalOr, + LogicalXor, + LogicalNot, + OnnxInt8Quantize, + OnnxInt8Dequantize, + FakeQuantWithMinMax, + FakeQuantWithMinMaxPerChannel, + BatchNormFold, + MulFold, + AddFold, + SquaredDifference } enum QuantType: int { QUANT_NONE, - QUANT_INT8 + AwareTrainning, + WeightQuant, + PostTraining +} + +enum FmkType: int { + TF, + CAFFE, + ONNX, + MS, + TFLITE } table OpDef { name: string; + fmkType: FmkType; attr: OpT; inputIndex: [uint]; outputIndex: [uint]; - isLastConv: bool; quantType: QuantType = QUANT_NONE; + quantParam: [QuantParamArray]; } - -enum FmkType: int { - TF, - CAFFE -} - -table NodeDef { - fmkType: FmkType; - opDef: OpDef; -} - - table SubGraphDef 
{ name: string; inputIndex: [uint]; outputIndex: [uint]; mempoolSize: uint; - nodes: [NodeDef]; + nodes: [OpDef]; allTensors: [TensorDef]; // weight + input + output } diff --git a/mindspore/ccsrc/predict/schema/op.fbs b/mindspore/ccsrc/predict/schema/op.fbs index d48f11b4d1..9286c2b2d3 100644 --- a/mindspore/ccsrc/predict/schema/op.fbs +++ b/mindspore/ccsrc/predict/schema/op.fbs @@ -22,12 +22,30 @@ enum ResizeMethod: byte { NEAREST_NEIGHBOR = 1 } -enum DataFormatType : byte { // todo combine with mslite.h::Format - UNKNOW = -1, +enum DataType : int { + DT_FLOAT = 0, + DT_FLOAT16 = 1, + DT_INT8 = 2, + DT_INT32 = 3, + DT_UINT8 = 4, + DT_INT16 = 5, + DT_UINT32 = 8, + DT_INT64 = 9, + DT_UINT16 = 10, + DT_UNDEFINED = 16 +} + +enum Format : int { NCHW = 0, - NHWC = 1, - HWC = 2, // for input image or resize - CHW = 3, // for input image or resize + NHWC, + HWKC, + HWCK, + KCHW, + CKHW, + KHWC, + CHWK, + NC4HW4 = 100, + NUM_OF_FORMAT } enum ActivationType : byte { @@ -42,26 +60,47 @@ enum ActivationType : byte { SOFTSIGN = 8, SOFTPLUS = 9, TANH = 10, - UNKNOW = 11 + SELU = 11, + HSWISH = 12, + HSIGMOID = 13, + THRESHOLDRELU = 14, + LINEAR = 15, + UNKNOW = 16 +} + +enum ReduceType : byte { + REDUCE_MAX = 0, + REDUCE_MEAN = 1, + REDUCE_ALL = 2, + REDUCE_ANY = 3, + REDUCE_LOG_SUM_EXP = 4, + REDUCE_PROD = 5, + REDUCE_SUM = 6, + UNKNOW = 7 } enum PoolMode : byte { MAX_POOLING = 0, MEAN_POOLING = 1, - GLOBAL_POOING = 2 } enum EltwiseMode : byte { PROD = 0, SUM = 1, - MAXIMUM = 2 + MAXIMUM = 2, + UNKNOW = 3 } enum PadMode : byte { - NOTSET=0, - SAME=1, - VALID=2, - CAFFE_CEIL_NEW=4 + NOTSET = 0, + SAME = 1, + VALID = 2, + CAFFE = 4 +} + +enum RoundMode : byte { + FLOOR = 0, + CEIL = 1 } enum PaddingMode : byte { @@ -77,7 +116,9 @@ table Pad { } table Maximum { - format: DataFormatType = 0; +} + +table Minimum { } table Concat { @@ -94,7 +135,7 @@ table Activation { } table Conv2D { - format: DataFormatType = 0; + format: Format = 0; group: int; channelIn: int; channelOut: int; 
@@ -114,15 +155,29 @@ table Conv2D { } table FusedBatchNorm { - epsilon: float; // eg. epsilon=0.001 + epsilon: float = 0.00001; // eg. epsilon=0.001 + momentum: float = 0.9; + spatial: int = 1; } table CaffeBatchNorm { epsilon: float; // eg. epsilon=0.001 } -table Squeeze { - axis: [int]; +table Shape { +} + +table Nchw2Nhwc { + +} + +table Nhwc2Nchw { + +} + +table FakeQuantWithMinMaxVars { + narrowRange: bool; + numBits: int; } table BiasAdd { @@ -130,8 +185,9 @@ table BiasAdd { } table Pooling { - format: DataFormatType = 0; + format: Format = 0; poolingMode: PoolMode; + global: bool = false; windowW: int; windowH: int; strideW: int; @@ -141,12 +197,11 @@ table Pooling { padDown: int; padLeft: int; padRight: int; - // todo replace with padValueMode in convolution pooling and so on - caffeMode: bool = false; + roundMode: RoundMode; } table DepthwiseConv2D { - format: DataFormatType = 0; + format: Format = 0; channelIn: int; channelMultiplier: int; kernelW: int; @@ -165,7 +220,7 @@ table DepthwiseConv2D { } table DeDepthwiseConv2D { - format: DataFormatType = 0; + format: Format = 0; channelIn: int; channelMultiplier: int; kernelW: int; @@ -185,7 +240,7 @@ table DeDepthwiseConv2D { table Resize { - format: DataFormatType = 0; + format: Format = 0; method: ResizeMethod; newHeight: long; newWidth: long; @@ -194,7 +249,7 @@ table Resize { } table DetectionPostProcess { - format: DataFormatType = 0; + format: Format = 0; inputSize: int; hScale: float; wScale: float; @@ -210,8 +265,8 @@ table DetectionPostProcess { } table FullConnection { - format: DataFormatType = 0; hasBias: bool; + axis: int; } // Mean(input_tensor, axis, keep_dims) @@ -221,7 +276,7 @@ table Mean { } table DeConv2D { - format: DataFormatType = 0; + format: Format = 0; group: int; channelIn: int; channelOut: int; @@ -241,34 +296,88 @@ table DeConv2D { } table Scale { - format: DataFormatType = 0; + format: Format = 0; } table Eltwise { - format: DataFormatType = 0; mode: EltwiseMode; - // todo 
repeat coeff (default 1) } table Add { - format: DataFormatType = 0; +} + +table Sub { +} + +table Mul { +} + +table RealDiv { +} + +table Rsqrt { +} + +table Equal { +} + +table Less { +} + +table Greater { +} + +table Min { } table Slice { - format: DataFormatType = 0; + format: Format = 0; begin: [int]; - end: [int]; - stride: [int]; + size: [int]; } -table Mul { +table Floor { +} + +table Abs { +} + +table Neg { } table Exp { } +table Cos { +} + +table Sin { +} + +table Sqrt { +} + +table Square { +} + +table Ceil { +} + +table Log { +} + +table Tan { +} + +table Atan { +} + +table Asin { +} + table Reshape { - format: DataFormatType = 0; + format: Format = 0; + shape: [long]; } table Power { @@ -280,13 +389,20 @@ table Power { table ArgMax { axis: int; outMaxValue: bool; - topK: int; + topK: int = 1; + keepDims: bool; + axisType: int; +} + +table ArgMin { + axis: int; + outMaxValue: bool; + topK: int = 1; keepDims: bool; axisType: int; } table NetOutput { - format: DataFormatType = 0; } table MatMul { @@ -298,6 +414,10 @@ table CaffePReLU { channelShared : bool = false; } +table LeakyReLU { + negativeSlope: float; +} + table StridedSlice { beginMask: int; endMask: int; @@ -317,6 +437,7 @@ table Stack { } table Range { + dType: DataType; start: int; limit: int; delta: int; @@ -335,13 +456,244 @@ table Cast { dstT: int; } -//table Split { -// numberSplit: int; -// sizeSplits: [int]; -// splitDim: int; -//} +table QuantDTypeCast { + srcT: DataType; + dstT: DataType; +} + +table Split { + numberSplit: int; + sizeSplits: [int]; + splitDim: int; +} table CaffeCrop { axis : long; offsets : [long]; } + +table Permute { + order: [long]; +} + +table Clip { + max: float; + min: float; +} + +table Constant { +} + + +table Elu { + alpha: float = 1.0; +} + +table Broadcast { +} + +table Lrn { + alpha: float = 0.0001; + beta: float = 0.75; + bias: float = 1.0; + size: int; +} + +enum ReduceMode : byte { + ReduceMean = 0, + ReduceMax = 1, + ReduceMin = 2, + ReduceProd = 3, + 
ReduceSum = 4, + ReduceSumSquare = 5 +} + +table Reduce { + axes: [int]; + keepDims: int; + mode: ReduceMode; +} + +table Prelu { + slope: [float]; +} + +table Transpose { + perm: [int]; + conjugate: bool = false; +} + +table Squeeze { + axis: [int]; +} + +table Unsqueeze { + axis: [int]; +} + +table Upsample { + mode: string; + scales: [float]; +} + +table Dropout { + ratio : float = 0.5; +} + +table LocalResponseNormalization { + depth_radius: int; + bias: float; + alpha: float; + beta: float; +} + +table ZerosLike { +} + +table TopK { + k : int; + sorted : bool = true; +} + +table SpaceToDepth { + blockSize : int; + format: Format = 0; +} + +table SpaceToBatch { + blockShape : [int]; + paddings : [int]; +} + +table SparseToDense { + validateIndices: bool; +} + +table ReverseSequence { + seqAxis: int; + batchAxis: int; +} + +table Rank { +} + + +table Gather { + axis: int; + batchDims: int; +} + +table GatherNd { + batchDims: int; +} + +table Fill { + dims: [int]; +} + +table DepthToSpace { + blockSize: int; + format: Format = 0; +} + + +table BatchToSpace { + blockShape: [int]; + crops: [int]; +} + +table AddN { + N: int; +} + + +table EmbeddingLookup { + ids: [int]; + maxNorm: float; +} + +table EmbeddingLookupSparse { + spIds: [int]; + spWeights: [float]; + //combiner: Combiner=0; + maxNortm: float; +} + +table FloorDiv { +} + +table FloorMod { +} + +table L2Norm { + axis: [int]; + epsilon: float; +} + +table LogicalAnd { +} + +table LogicalOr { +} + +table LogicalXor { +} + +table LogicalNot { +} + +table MatrixDiag { + k: int; + numRows: int; + numCols: int; + paddingValue: float; +} + +table Select { +} + +table TfReduce { + type: ReduceType = 7; +} + +table Reverse { + axis: [int]; +} + +table Round { +} + +table Scatter { +} + +table Unique { +} + +table Unstack { + num: int; + axis: int; +} + +table OnnxInt8Quantize { +} + +table OnnxInt8Dequantize { +} + +table FakeQuantWithMinMax { +} + +table FakeQuantWithMinMaxPerChannel { +} + +table BatchNormFold { 
+} + +table MulFold { +} + +table AddFold { +} + +table SquaredDifference { +} diff --git a/mindspore/ccsrc/pynative/base.h b/mindspore/ccsrc/pynative/base.h index fc143da3c1..60ae869227 100644 --- a/mindspore/ccsrc/pynative/base.h +++ b/mindspore/ccsrc/pynative/base.h @@ -45,7 +45,7 @@ enum PynativeStatusCode { PYNATIVE_UNKNOWN_STATE = 0XFF }; -enum RunOpArgsEnum { PY_PRIM = 0, PY_NAME, PY_INPUTS, PY_INPUT_MASK, PY_ARGS_NUM }; +enum RunOpArgsEnum { PY_PRIM = 0, PY_NAME, PY_INPUTS, PY_ARGS_NUM }; struct OpExecInfo { PrimitivePyPtr py_primitive; @@ -57,9 +57,9 @@ struct OpExecInfo { py::dict op_attrs; }; using OpExecInfoPtr = std::shared_ptr; -OpExecInfoPtr GenerateOpExecInfo(const py::args &args); +OpExecInfoPtr GenerateOpExecInfo(const py::args &args, py::list *const out_args); -const std::set ignore_infer_prim = {"partial", "make_ref"}; +const std::set ignore_infer_prim = {"make_ref"}; } // namespace pynative } // namespace mindspore diff --git a/mindspore/ccsrc/pynative/pynative_execute.cc b/mindspore/ccsrc/pynative/pynative_execute.cc index 6f0a4e5790..75653ff5d2 100644 --- a/mindspore/ccsrc/pynative/pynative_execute.cc +++ b/mindspore/ccsrc/pynative/pynative_execute.cc @@ -22,17 +22,30 @@ #include #include +#include "ir/param_value_py.h" #include "utils/any.h" #include "utils/utils.h" #include "utils/context/ms_context.h" #include "operator/ops.h" +#include "operator/composite/composite.h" #include "operator/composite/do_signature.h" #include "pipeline/parse/data_converter.h" +#include "pipeline/parse/parse_base.h" +#include "pipeline/parse/resolve.h" #include "pipeline/static_analysis/prim.h" #include "session/session_factory.h" #include "pre_activate/pass/const_input_to_attr_registry.h" #include "pre_activate/common/helper.h" +#include "pipeline/action.h" + #include "pynative/base.h" +#include "pybind_api/api_register.h" +#include "vm/transform.h" + +#include "optimizer/ad/grad.h" +#include "pipeline/resource.h" +#include "pipeline/pipeline.h" +#include 
"pipeline/pass.h" #ifdef ENABLE_GE #include "pynative/pynative_execute_ge.h" @@ -40,77 +53,139 @@ const char SINGLE_OP_GRAPH[] = "single_op_graph"; // primitive unable to infer value for constant input in PyNative mode -const std::set vm_operators = {"partial", "depend", "make_ref", "zeros_like_tensor"}; +const std::set vm_operators = {"make_ref", "HookBackward", "stop_gradient"}; namespace mindspore { namespace pynative { + static std::shared_ptr session = nullptr; +PynativeExecutorPtr PynativeExecutor::executor_ = nullptr; +std::mutex PynativeExecutor::instance_lock_; +ResourcePtr PynativeExecutor::resource_; + inline ValuePtr PyAttrValue(const py::object &obj) { - ValuePtr converted_ret = nullptr; - bool converted = parse::ConvertData(obj, &converted_ret); - if (!converted) { + ValuePtr converted_ret = parse::data_converter::PyDataToValue(obj); + if (!converted_ret) { MS_LOG(EXCEPTION) << "Attribute convert error with type:" << std::string(py::str(obj)); } return converted_ret; } -py::tuple ConvertInputs(const PrimitivePyPtr &prim, const py::tuple &py_args) { - auto signature = prim->signatures(); - std::vector dtypes; - (void)std::transform(signature.begin(), signature.end(), std::back_inserter(dtypes), - [](const Signature &sig) { return sig.dtype; }); - int empty_dtype_count = std::count(dtypes.begin(), dtypes.end(), SignatureEnumDType::kDTypeEmptyDefaultValue); - if (dtypes.size() == 0 || static_cast(dtypes.size()) == empty_dtype_count) { - return py_args; +std::string GetId(const py::object &obj) { + py::object to_process = obj; + std::string prefix = ""; + if (py::isinstance(to_process)) { + auto p_list = py::cast(to_process); + if (p_list.size() == 0) { + return "empty"; + } + prefix = "tuple:"; + std::string key = ""; + for (size_t i = 0; i < p_list.size(); ++i) { + key += std::string(py::str(GetId(p_list[i]))) + ":"; + } + return prefix + key; + } + if (py::isinstance(to_process)) { + return prefix + std::string(py::str(to_process)); } - std::map> 
type_indexs; + if (py::isinstance(to_process)) { + return prefix + std::string(py::str(to_process)); + } + if (py::isinstance(to_process)) { + auto tensor_ptr = py::cast(to_process); + return prefix + tensor_ptr->id(); + } + + py::object ret = parse::python_adapter::CallPyFn(parse::PYTHON_MOD_PARSE_MODULE, parse::PYTHON_MOD_GET_OBJ_ID, obj); + return py::cast(ret); +} + +py::object GetTupleObj(const py::object &obj) { + py::module mod = parse::python_adapter::GetPyModule(parse::PYTHON_MOD_PARSE_MODULE); + py::object obj_tuple = parse::python_adapter::CallPyModFn(mod, parse::PYTHON_MOD_GET_DEFAULT_INPUT, obj); + return obj_tuple; +} + +std::map> GetTypeIndex(const std::vector &dtypes) { + std::map> type_indexes; for (size_t i = 0; i < dtypes.size(); ++i) { - auto it = type_indexs.find(dtypes[i]); - if (it == type_indexs.end()) { - (void)type_indexs.insert(std::make_pair(dtypes[i], std::vector{i})); + auto it = type_indexes.find(dtypes[i]); + if (it == type_indexes.end()) { + (void)type_indexes.insert(std::make_pair(dtypes[i], std::vector{i})); } else { it->second.push_back(i); } } + return type_indexes; +} + +std::map GetDstType(const py::tuple &py_args, + const std::map> &type_indexes) { std::map dst_type; - for (auto it = type_indexs.begin(); it != type_indexs.end(); (void)++it) { + for (auto it = type_indexes.begin(); it != type_indexes.end(); (void)++it) { auto type = it->first; - auto indexs = it->second; - if (indexs.size() < 2) { + auto indexes = it->second; + if (indexes.size() < 2) { continue; } - size_t m_index = indexs[0]; - for (size_t i = 1; i < indexs.size(); ++i) { - if (py::isinstance(py_args[indexs[i]])) { - m_index = indexs[i]; + size_t m_index = indexes[0]; + for (size_t i = 1; i < indexes.size(); ++i) { + if (py::isinstance(py_args[indexes[i]])) { + m_index = indexes[i]; } } (void)dst_type.insert(std::make_pair(type, m_index)); } - py::tuple py_inputs(py_args.size()); + return dst_type; +} + +py::tuple ConvertInputs(const PrimitivePyPtr &prim, 
const py::list &args, py::tuple *const out_args, + py::list *out_args_list) { + auto &py_args = *out_args; + py::tuple input_mask(args.size()); + for (size_t i = 0; i < args.size(); ++i) { + if (py::hasattr(args[i], "__parameter__")) { + input_mask[i] = true; + } else { + input_mask[i] = false; + } + py_args[i] = GetTupleObj(args[i]); + } + auto signature = prim->signatures(); + std::vector dtypes; + (void)std::transform(signature.begin(), signature.end(), std::back_inserter(dtypes), + [](const Signature &sig) { return sig.dtype; }); + int empty_dtype_count = std::count(dtypes.begin(), dtypes.end(), SignatureEnumDType::kDTypeEmptyDefaultValue); + if (dtypes.size() == 0 || static_cast(dtypes.size()) == empty_dtype_count) { + return input_mask; + } + auto type_indexes = GetTypeIndex(dtypes); + auto dst_type = GetDstType(py_args, type_indexes); for (size_t i = 0; i < py_args.size(); ++i) { auto it = dst_type.find(dtypes[i]); if (it != dst_type.end() && it->second != i && (py::isinstance(py_args[i]) || py::isinstance(py_args[i]))) { auto tensor_ptr = py::cast(py_args[it->second]); if (py::isinstance(py_args[i])) { - py_inputs[i] = std::make_shared(py::cast(py_args[i]), tensor_ptr->Dtype()); + py_args[i] = std::make_shared(py::cast(py_args[i]), tensor_ptr->Dtype()); + (*out_args_list)[i] = py_args[i]; } else { - py_inputs[i] = std::make_shared(py::cast(py_args[i]), tensor_ptr->Dtype()); + py_args[i] = std::make_shared(py::cast(py_args[i]), tensor_ptr->Dtype()); + (*out_args_list)[i] = py_args[i]; } continue; } - py_inputs[i] = py_args[i]; } - return py_inputs; + return input_mask; } -void PynativeInfer(const PrimitivePyPtr &prim, const py::tuple &py_args, OpExecInfo *const op_exec_info) { +void PynativeInfer(const PrimitivePyPtr &prim, const py::list &py_args, OpExecInfo *const op_exec_info) { size_t size = py_args.size(); AbstractBasePtrList args_spec_list; for (size_t i = 0; i < size; i++) { ValuePtr input_value = PyAttrValue(py_args[i]); - if 
(py::isinstance(py_args[i])) { + if (!py::hasattr(prim->GetPyObj(), "const_value") && input_value->isa()) { args_spec_list.emplace_back(abstract::FromValueInside(input_value, true)); } else { args_spec_list.emplace_back(abstract::FromValueInside(input_value, false)); @@ -120,9 +195,9 @@ void PynativeInfer(const PrimitivePyPtr &prim, const py::tuple &py_args, OpExecI op_exec_info->abstract = infer_res; } -OpExecInfoPtr GenerateOpExecInfo(const py::args &args) { +OpExecInfoPtr GenerateOpExecInfo(const py::args &args, py::list *const out_args) { if (args.size() != PY_ARGS_NUM) { - MS_LOG(ERROR) << "Four args are needed by RunOp"; + MS_LOG(ERROR) << "Three args are needed by RunOp"; return nullptr; } auto op_exec_info = std::make_shared(); @@ -133,15 +208,18 @@ OpExecInfoPtr GenerateOpExecInfo(const py::args &args) { if (pyobj == nullptr) { MS_LOG(EXCEPTION) << "pyobj is empty"; } - py::tuple py_args = ConvertInputs(prim, args[PY_INPUTS]); + + py::list a = args[PY_INPUTS]; + size_t input_num = a.size(); + op_exec_info->op_inputs = py::tuple(input_num); + + op_exec_info->inputs_mask = ConvertInputs(prim, args[PY_INPUTS], &op_exec_info->op_inputs, out_args); // use python infer method if (ignore_infer_prim.find(op_exec_info->op_name) == ignore_infer_prim.end()) { - PynativeInfer(prim, py_args, op_exec_info.get()); + PynativeInfer(prim, op_exec_info->op_inputs, op_exec_info.get()); } op_exec_info->py_primitive = prim; op_exec_info->op_attrs = py::getattr(args[PY_PRIM], "attrs"); - op_exec_info->op_inputs = py_args; - op_exec_info->inputs_mask = args[PY_INPUT_MASK]; if (op_exec_info->op_inputs.size() != op_exec_info->inputs_mask.size()) { MS_LOG(ERROR) << "Op:" << op_exec_info->op_name << " inputs size not equal op_mask"; return nullptr; @@ -154,9 +232,13 @@ std::string GetSingleOpGraphInfo(const OpExecInfoPtr &op_exec_info, MS_EXCEPTION_IF_NULL(op_exec_info); std::string graph_info; // get input tensor info - for (const auto &input_tensor : input_tensors) { - 
MS_EXCEPTION_IF_NULL(input_tensor); - (void)graph_info.append(input_tensor->GetShapeAndDataTypeInfo() + "_"); + size_t input_num = op_exec_info->op_inputs.size(); + for (size_t index = 0; index < input_num; ++index) { + auto input = op_exec_info->op_inputs[index]; + if (py::isinstance(input)) { + auto tensor_ptr = py::cast(input); + (void)graph_info.append(tensor_ptr->GetShapeAndDataTypeInfo() + "_"); + } } // get prim and abstract info MS_EXCEPTION_IF_NULL(op_exec_info->abstract); @@ -171,6 +253,23 @@ py::object RunOpInVM(const OpExecInfoPtr &op_exec_info, PynativeStatusCode *stat MS_EXCEPTION_IF_NULL(status); MS_EXCEPTION_IF_NULL(op_exec_info); MS_EXCEPTION_IF_NULL(op_exec_info->py_primitive); + if (op_exec_info->op_name == "HookBackward") { + auto op_inputs = op_exec_info->op_inputs; + py::tuple result(op_inputs.size()); + for (size_t i = 0; i < op_inputs.size(); i++) { + py::object input = op_inputs[i]; + if (py::hasattr(input, "__parameter__")) { + result[i] = py::getattr(input, "data"); + } else { + auto tensor = py::cast(op_inputs[i]); + auto new_tensor = std::make_shared(tensor->data()); + result[i] = new_tensor; + } + } + *status = PYNATIVE_SUCCESS; + MS_LOG(INFO) << "RunOpInVM end"; + return std::move(result); + } auto func = op_exec_info->py_primitive->GetComputeFunction(); if (py::isinstance(func)) { MS_LOG(ERROR) << "VM failed to get func"; @@ -237,6 +336,27 @@ void ConvertValueTupleToTensor(const py::object &input_object, std::vectorpush_back(tensor_ptr); } +void ConvertMultiPyObjectToTensor(const py::object &input_object, const PrimitivePtr &op_prim, + std::vector *input_tensors, int *tensor_mask) { + MS_EXCEPTION_IF_NULL(op_prim); + MS_EXCEPTION_IF_NULL(input_tensors); + MS_EXCEPTION_IF_NULL(tensor_mask); + + if (!py::isinstance(input_object)) { + MS_LOG(EXCEPTION) << "The input should be a tuple!"; + } + auto tuple_inputs = py::cast(input_object); + if (tuple_inputs.size() == 0) { + MS_LOG(EXCEPTION) << "The size of input list or tuple is 0!"; + } 
+ if (py::isinstance(tuple_inputs[0])) { + PlantTensorTupleToVector(tuple_inputs, op_prim, input_tensors); + } else { + ConvertValueTupleToTensor(input_object, input_tensors); + *tensor_mask = kValueNodeTensorMask; + } +} + void ConvertPyObjectToTensor(const py::object &input_object, const PrimitivePtr &op_prim, std::vector *input_tensors, int *tensor_mask) { MS_EXCEPTION_IF_NULL(op_prim); @@ -251,20 +371,20 @@ void ConvertPyObjectToTensor(const py::object &input_object, const PrimitivePtr } else if (py::isinstance(input_object)) { tensor_ptr = std::make_shared(py::cast(input_object), kInt32); *tensor_mask = kValueNodeTensorMask; - } else if (py::isinstance(input_object)) { - tensor_ptr = std::make_shared(py::cast(input_object), nullptr); } else if (py::isinstance(input_object)) { tensor_ptr = std::make_shared(py::cast(input_object), nullptr); - } else if (py::isinstance(input_object)) { + } else if (py::isinstance(input_object)) { + auto list_inputs = py::cast(input_object); + py::tuple tuple_inputs(list_inputs.size()); + for (size_t i = 0; i < tuple_inputs.size(); ++i) { + tuple_inputs[i] = list_inputs[i]; + } + ConvertMultiPyObjectToTensor(tuple_inputs, op_prim, input_tensors, tensor_mask); return; } else if (py::isinstance(input_object)) { - auto tuple_inputs = py::cast(input_object); - if (py::isinstance(tuple_inputs[0])) { - PlantTensorTupleToVector(tuple_inputs, op_prim, input_tensors); - } else { - ConvertValueTupleToTensor(input_object, input_tensors); - *tensor_mask = kValueNodeTensorMask; - } + ConvertMultiPyObjectToTensor(input_object, op_prim, input_tensors, tensor_mask); + return; + } else if (py::isinstance(input_object)) { return; } else { MS_LOG(EXCEPTION) << "Run op inputs type is invalid!"; @@ -288,7 +408,6 @@ void ConstructInputTensor(const OpExecInfoPtr &op_run_info, std::vector *te opt::ConstInputToAttrInfoRegister reg; bool reg_exist = opt::ConstInputToAttrInfoRegistry::Instance().GetRegisterByOpName(op_run_info->op_name, ®); size_t input_num 
= op_run_info->op_inputs.size(); - MS_LOG(INFO) << "py input size: " << input_num; for (size_t index = 0; index < input_num; ++index) { // convert const input to attr if (reg_exist && @@ -386,22 +505,61 @@ py::object RunOpWithBackendPolicy(MsBackendPolicy backend_policy, const OpExecIn return result; } -py::tuple RunOp(const py::args &args) { - py::object result; - // returns a null py::tuple on error - py::tuple err_ret(0); - PynativeStatusCode status = PYNATIVE_UNKNOWN_STATE; +AnfNodePtr PynativeExecutor::MakeCNode(const OpExecInfoPtr &op_exec_info, const py::args &args, const py::tuple &out) { + if (!grad_flag_ || graph_info_map_.size() == 0) { + return nullptr; + } + std::vector inputs; + auto prim = op_exec_info->py_primitive; + inputs.push_back(NewValueNode(prim)); + py::tuple op_masks = op_exec_info->inputs_mask; + AbstractBasePtrList args_spec_list; + for (size_t i = 0; i < args.size(); i++) { + auto node = GetInput(args[i], op_masks[i]); + args_spec_list.push_back(node->abstract()); + inputs.push_back(node); + } - OpExecInfoPtr op_exec_info = GenerateOpExecInfo(args); - MS_EXCEPTION_IF_NULL(op_exec_info); - if (op_exec_info->abstract != nullptr) { - py::dict output = abstract::ConvertAbstractToPython(op_exec_info->abstract); - if (!output["value"].is_none()) { - py::tuple value_ret(1); - value_ret[0] = output["value"]; - return value_ret; + auto cnode = curr_g_->NewCNode(inputs); + MS_LOG(DEBUG) << "MakeCnode set node " << cnode->DebugString(4); + py::object out_real = out; + if (out.size() == 1) { + MS_LOG(DEBUG) << "MakeCnode out size is one."; + out_real = out[0]; + } + std::string obj_id = GetId(out_real); + if (py::isinstance(out_real)) { + auto value = py::cast(out_real); + if (value.size() > 1) { + for (int i = 0; i < static_cast(value.size()); i++) { + auto value_id = GetId(value[i]); + MS_LOG(DEBUG) << "MakeCnode set node id " << value_id; + set_obj_node_map(curr_g_, value_id, cnode, i); + } } } + MS_LOG(DEBUG) << "MakeCnode set node id " << 
obj_id; + set_obj_node_map(curr_g_, obj_id, cnode); + set_pyobj(curr_g_, obj_id); + return cnode; +} + +AnfNodePtr PynativeExecutor::GetObjNode(const py::object &obj) { + auto &out = graph_info_map_[curr_g_].obj_node_map[GetId(obj)]; + if (out.second.size() == 1 && out.second[0] == -1) { + return out.first; + } + auto node = out.first; + MS_LOG(DEBUG) << "output size " << out.second.size() << node->DebugString(); + for (auto &idx : out.second) { + std::vector tuple_get_item_inputs{NewValueNode(prim::kPrimTupleGetItem), node, NewValueNode(idx)}; + node = curr_g_->NewCNode(tuple_get_item_inputs); + } + MS_LOG(DEBUG) << "GetObjNode output" << node->DebugString(6); + return node; +} + +py::tuple RunOp(const OpExecInfoPtr &op_exec_info, const py::args &args) { MS_LOG(INFO) << "RunOp start, op name is: " << op_exec_info->op_name; mindspore::parse::python_adapter::set_python_env_flag(true); MsBackendPolicy backend_policy; @@ -422,16 +580,442 @@ py::tuple RunOp(const py::args &args) { if (vm_operators.find(op_exec_info->op_name) != vm_operators.end()) { backend_policy = kMsBackendVmOnly; } - result = RunOpWithBackendPolicy(backend_policy, op_exec_info, &status); + PynativeStatusCode status = PYNATIVE_UNKNOWN_STATE; + // returns a null py::tuple on error + py::tuple err_ret(0); + py::object result = RunOpWithBackendPolicy(backend_policy, op_exec_info, &status); if (status != PYNATIVE_SUCCESS) { MS_LOG(ERROR) << "Failed to run " << op_exec_info->op_name; return err_ret; } - MS_LOG(INFO) << "RunOp end"; + auto node = PynativeExecutor::GetInstance()->MakeCNode(op_exec_info, args, result); + if (node != nullptr) { + node->set_abstract(op_exec_info->abstract); + MS_LOG(DEBUG) << "RunOp MakeCnode,new node is: " << node->DebugString(); + } + MS_LOG(DEBUG) << "RunOp end"; return result; } +py::tuple RunOp(const py::args &args) { + MS_LOG(DEBUG) << "RunOp start" << args.size(); + py::list args_input = args[PY_INPUTS]; + + OpExecInfoPtr op_exec_info = GenerateOpExecInfo(args, 
&args_input); + MS_EXCEPTION_IF_NULL(op_exec_info); + + if (op_exec_info->abstract != nullptr) { + py::dict output = abstract::ConvertAbstractToPython(op_exec_info->abstract); + if (!output["value"].is_none()) { + py::tuple value_ret(1); + value_ret[0] = output["value"]; + return value_ret; + } + if (py::hasattr(op_exec_info->py_primitive->GetPyObj(), "const_value")) { + py::tuple value_ret(1); + value_ret[0] = ""; + return value_ret; + } + } + return RunOp(op_exec_info, args_input); +} + void ClearPyNativeSession() { session = nullptr; } + +PynativeExecutor::~PynativeExecutor() { ClearRes(); } + +PynativeExecutor::PynativeExecutor() { grad_flag_ = false; } + +void PynativeExecutor::NewGraph(const py::object &cell, const py::args &args) { + auto cell_id = GetId(cell); + if (cell_graph_map_.count(cell_id) != 0) { + MS_LOG(DEBUG) << "Newgraph already compiled"; + return; + } + + auto g = std::make_shared(); + + if (top_g_ == nullptr) { + top_g_ = curr_g_ = g; + df_builder_ = std::make_shared(); + MS_LOG(DEBUG) << "First new graph" << top_g_.get(); + Pushp(); + } else { + Pushp(); + curr_g_ = g; + } + if (graph_info_map_.count(g) == 0) { + graph_info_map_[g] = GraphInfo(); + } + for (size_t i = 0; i < args.size(); i++) { + auto new_param = g->add_parameter(); + std::string param_obj = GetId(args[i]); + graph_info_map_[g].param_map[param_obj] = new_param; + } +} + +AnfNodePtr PynativeExecutor::MakeValueNode(const py::object &obj, const std::string &obj_id) { + ValuePtr converted_ret = nullptr; + parse::ConvertData(obj, &converted_ret); + auto node = NewValueNode(converted_ret); + set_obj_node_map(curr_g_, obj_id, node); + return node; +} + +AnfNodePtr PynativeExecutor::GetInput(const py::object &obj, const py::object &op_mask) { + AnfNodePtr node = nullptr; + std::string obj_id = GetId(obj); + + if (op_mask != nullptr && py::cast(op_mask)) { + MS_LOG(DEBUG) << "Topgraph free parameter"; + // get the parameter name from parameter object + auto name_attr = 
mindspore::parse::python_adapter::GetPyObjAttr(obj, "name"); + if (py::isinstance(name_attr)) { + MS_LOG(EXCEPTION) << "Parameter object should have name attribute"; + } + std::string param_name = py::cast(name_attr); + if (graph_info_map_[df_builder_].param_map.count(obj_id) == 0) { + auto free_param = df_builder_->add_parameter(); + free_param->set_name(param_name); + auto free_param_new = std::make_shared(obj); + free_param->set_default_param(free_param_new); + free_param->debug_info()->set_name(param_name); + MS_LOG(DEBUG) << "Top graph set free parameter " << obj_id; + graph_info_map_[df_builder_].param_map[obj_id] = free_param; + return free_param; + } + return graph_info_map_[df_builder_].param_map[obj_id]; + } + + // if input is graph output + if (graph_info_map_[curr_g_].param_map.count(obj_id) != 0) { + // op(x, y) + node = graph_info_map_[curr_g_].param_map[obj_id]; + } else if (graph_info_map_[curr_g_].obj_node_map.count(obj_id) != 0) { + // out = op(op1(x, y)) + // out = op(cell1(x, y)) + // out = op(cell1(x, y)[0]) + node = GetObjNode(obj); + } else if (py::isinstance(obj)) { + // out = op((x, y)) + // out = cell((x, y)) + auto tuple = obj.cast(); + + // cell((1,2)): support not mix (scalar, tensor) + if (tuple.size() > 0 && !py::isinstance(tuple[0])) { + return MakeValueNode(obj, obj_id); + } + + std::vector args; + args.push_back(NewValueNode(prim::kPrimMakeTuple)); + + auto tuple_size = static_cast(tuple.size()); + for (int i = 0; i < tuple_size; i++) { + args.push_back(GetInput(tuple[i], py::object())); + } + auto cnode = curr_g_->NewCNode(args); + set_obj_node_map(curr_g_, GetId(obj), cnode); + node = cnode; + } else { + node = MakeValueNode(obj, obj_id); + } + + MS_LOG(DEBUG) << "Now getinput node " << node->ToString() << obj_id; + return node; +} + +// for output[0][1] need getitem multi +void PynativeExecutor::SetTupleOutput(const py::object &obj, const AnfNodePtr &cnode, std::vector idx) { + if (py::isinstance(obj)) { + auto tuple = 
obj.cast(); + for (int i = 0; i < static_cast(tuple.size()); i++) { + std::vector tmp = idx; + tmp.push_back(i); + set_obj_node_map(curr_g_, GetId(tuple[i]), cnode, tmp); + SetTupleOutput(tuple[i], cnode, tmp); + } + } +} + +void PynativeExecutor::Pushp() { graph_p_.push(curr_g_); } + +void PynativeExecutor::Popp() { + if (graph_p_.empty()) { + MS_LOG(EXCEPTION) << "Stack graph_p_ is empty"; + } + curr_g_ = graph_p_.top(); + graph_p_.pop(); +} + +void PynativeExecutor::EndGraph(const py::object &cell, const py::object &out, const py::args &args) { + auto cell_id = GetId(cell); + if (cell_graph_map_.count(cell_id) != 0) { + MS_LOG(DEBUG) << "Endgraph already compiled"; + return; + } + cell_graph_map_[cell_id] = curr_g_; + auto out_id = GetId(out); + if (!graph_info_map_[curr_g_].obj_node_map.count(out_id) && !graph_info_map_[curr_g_].param_map.count(out_id)) { + // cell construct return x, y + if (py::isinstance(out)) { + std::vector args; + args.push_back(NewValueNode(prim::kPrimMakeTuple)); + + auto tuple = out.cast(); + MS_LOG(DEBUG) << "End graph start tuple size" << tuple.size(); + auto tuple_size = static_cast(tuple.size()); + auto cnode = curr_g_->NewCNode(args); + for (int i = 0; i < tuple_size; i++) { + args.push_back(GetInput(tuple[i], py::object())); + set_obj_node_map(curr_g_, GetId(tuple[i]), cnode, i); + SetTupleOutput(tuple[i], cnode, std::vector{i}); + } + cnode->set_inputs(args); + set_obj_node_map(curr_g_, out_id, cnode); + } else { + MS_LOG(ERROR) << "Graph has no this out: " << out_id; + return; + } + } + EndGraphByOutId(out_id, cell, out, args); +} + +void PynativeExecutor::EndGraphByOutId(const std::string &out_id, const py::object &cell, const py::object &out, + const py::args &args) { + AnfNodePtr output_node; + if (graph_info_map_[curr_g_].param_map.count(out_id)) { + output_node = graph_info_map_[curr_g_].param_map[out_id]; + } else { + output_node = GetObjNode(out); + } + curr_g_->set_output(output_node); + std::vector inputs; + 
inputs.push_back(NewValueNode(curr_g_)); + MS_LOG(DEBUG) << "Current graph" << curr_g_->output()->DebugString(); + resource_->manager()->AddFuncGraph(curr_g_); + // custom bprop debug + if (py::hasattr(cell, parse::CUSTOM_BPROP_NAME)) { + MS_LOG(DEBUG) << "Use cell custom bprop function."; + FuncGraphPtr bprop_graph = parse::ConvertToBpropCut(cell); + if (bprop_graph != nullptr) { + (void)curr_g_->transforms().insert(std::make_pair(parse::CUSTOM_BPROP_NAME, FuncGraphTransform(bprop_graph))); + (void)bprop_graph->transforms().insert(std::make_pair("primal", FuncGraphTransform(curr_g_))); + } + } + auto newfg = ad::Grad(curr_g_, resource_, curr_g_ == top_g_); + if (curr_g_ != top_g_) { + Popp(); + for (size_t i = 0; i < args.size(); i++) { + auto input = GetInput(args[i], py::object()); + inputs.push_back(input); + } + auto out_cnode = curr_g_->NewCNode(inputs); + set_pyobj(curr_g_, GetId(cell)); + if (py::isinstance(out)) { + auto out_list = py::cast(out); + auto out_size = static_cast(out_list.size()); + for (int i = 0; i < out_size; i++) { + set_obj_node_map(curr_g_, GetId(out_list[i]), out_cnode, i); + SetTupleOutput(out_list[i], out_cnode, std::vector{i}); + } + } + set_obj_node_map(curr_g_, GetId(out), out_cnode); + } else { + parse::ResolveFuncGraph(newfg, resource_); + resource_->set_func_graph(newfg); + } +} + +std::vector PynativeExecutor::GetWeightsArgs(const py::object &weights) { + std::vector w_args; + if (py::hasattr(weights, "__parameter_tuple__")) { + auto tuple = weights.cast(); + MS_LOG(DEBUG) << "GradNet start weights tuple size" << tuple.size(); + w_args.push_back(NewValueNode(prim::kPrimMakeTuple)); + for (size_t it = 0; it < tuple.size(); ++it) { + auto param = tuple[it]; + auto param_id = GetId(param); + AnfNodePtr para_node = nullptr; + if (graph_info_map_[df_builder_].param_map.count(param_id)) { + para_node = graph_info_map_[df_builder_].param_map[param_id]; + + AnfNodePtr value = parse::GetMixedPrecisionCastHelp(df_builder_, para_node); + 
AnfNodePtr make_ref = NewValueNode(prim::kPrimMakeRef); + auto refkey = std::make_shared(para_node->cast()->name()); + AnfNodePtr ref_key_node = NewValueNode(refkey); + AnfNodePtr ref_node = df_builder_->NewCNode({make_ref, ref_key_node, value, para_node}); + + w_args.push_back(ref_node); + } + } + } else { + MS_LOG(EXCEPTION) << "training not paramter_tuple"; + } + return w_args; +} + +abstract::AbstractBasePtrList PynativeExecutor::GetArgsSpec(const py::args &args) { + abstract::AbstractBasePtrList args_spec; + std::size_t size = args.size(); + for (std::size_t i = 0; i < size; i++) { + ValuePtr converted = nullptr; + bool succ = parse::ConvertData(args[i], &converted); + if (!succ) { + MS_LOG(EXCEPTION) << "Args convert error"; + } + bool broaden = true; + auto abs = abstract::FromValue(converted, broaden); + args_spec.push_back(abs); + auto param_node = std::static_pointer_cast(df_builder_->parameters()[i]); + param_node->set_abstract(abs); + } + + for (const auto ¶m : df_builder_->parameters()) { + auto param_node = std::static_pointer_cast(param); + if (param_node->has_default()) { + auto param_value = std::dynamic_pointer_cast(param_node->default_param()); + AbstractBasePtr ptr = abstract::FromValue(parse::data_converter::PyDataToValue(param_value->value()), true); + if (ptr == nullptr) { + MS_LOG(EXCEPTION) << "Args convert error"; + } + args_spec.push_back(ptr); + param_node->set_abstract(ptr); + } + } + + return args_spec; +} + +void PynativeExecutor::GradNet(const GradOperationPtr &grad, const py::object &cell, const py::object &weights, + const py::args &args) { + MS_LOG(INFO) << "GradNet start" << args.size(); + + std::size_t size = args.size(); + auto cell_id = GetId(cell); + if (graph_map_.count(cell_id) != 0) { + MS_LOG(DEBUG) << "GradNet already compiled"; + return; + } + MS_LOG(DEBUG) << "GradNet first compiled"; + std::vector new_params; + for (size_t i = 0; i < size; i++) { + ParameterPtr p = std::make_shared(df_builder_); + 
new_params.push_back(p); + } + MS_LOG(DEBUG) << "GradNet start weight size" << df_builder_->parameters().size(); + new_params.insert(new_params.end(), df_builder_->parameters().begin(), df_builder_->parameters().end()); + df_builder_->set_parameters(new_params); + resource_->manager()->SetParameters(df_builder_, new_params); + + std::vector w_args = GetWeightsArgs(weights); + MS_EXCEPTION_IF_NULL(resource_->func_graph()); + auto g = GradGraph(resource_->func_graph(), grad, w_args, size); + resource_->set_func_graph(g); + resource_->manager()->KeepRoots({g}); + + // get the parameters items and add the value to args_spec + abstract::AbstractBasePtrList args_spec = GetArgsSpec(args); + MS_LOG(DEBUG) << "Args_spec size" << args_spec.size(); + + resource_->set_args_spec(args_spec); + MS_LOG(DEBUG) << "Start opt"; + + // Create backend and session + resource_->results()[pipeline::kBackend] = compile::CreateBackend(); + + graph_map_[cell_id] = g; + PynativeOptimizeAction(resource_); + TaskEmitAction(resource_); + ExecuteAction(resource_); + resource_->Clean(); + ad::CleanRes(); + pipeline::ReclaimOptimizer(); +} + +void PynativeExecutor::Clear(const std::string &flag) { + if (flag == "resource") { + MS_LOG(INFO) << "Clear res"; + Clean(); + // Maybe exit in the pynative runing op, so need reset pynative flag. 
+ auto ms_context = MsContext::GetInstance(); + if (ms_context != nullptr) { + ms_context->set_enable_pynative_infer(false); + } + return; + } + MS_LOG(INFO) << "Clear"; + top_g_ = nullptr; + curr_g_ = nullptr; + graph_info_map_.clear(); + std::stack().swap(graph_p_); +} + +void PynativeExecutor::Clean() { + MS_LOG(INFO) << "Clean all res"; + Clear(); + grad_flag_ = false; + df_builder_ = nullptr; + ad::CleanRes(); + pipeline::ReclaimOptimizer(); +} + +void PynativeExecutor::ClearRes() { + Clean(); + resource_.reset(); +} + +py::object PynativeExecutor::Run(const py::tuple &args, const py::object &phase) { + VectorRef arg_list; + pipeline::ProcessVmArgInner(args, resource_, &arg_list); + if (resource_->results().find(pipeline::kOutput) == resource_->results().end() || + !resource_->results()[pipeline::kOutput].is()) { + MS_LOG(EXCEPTION) << "Can't find run graph func for "; + } + compile::VmEvalFuncPtr run = resource_->results()[pipeline::kOutput].cast(); + if (run == nullptr) { + MS_LOG(EXCEPTION) << "Can't find run graph func for "; + } + + std::string backend = MsContext::GetInstance()->backend_policy(); + + MS_LOG(DEBUG) << "Eval run" << backend; + BaseRef value = (*run)(arg_list); + MS_LOG(DEBUG) << "Run end" << value.ToString(); + return BaseRefToPyData(value); +} + +FuncGraphPtr PynativeExecutor::GradGraph(FuncGraphPtr g, const GradOperationPtr &grad_op, + const std::vector &weights, size_t arg_size) { + auto nparam = top_g_->parameters().size(); + std::ostringstream ss; + ss << "grad{" << nparam << "}"; + df_builder_->set_flag(FUNC_GRAPH_FLAG_CORE, true); + df_builder_->debug_info()->set_name(ss.str()); + + auto df = grad_op->GetGrad(NewValueNode(g), nullptr, top_g_->parameters(), weights); + std::vector inputs = {NewValueNode(df)}; + for (size_t i = 0; i < arg_size; ++i) { + inputs.push_back(df_builder_->parameters()[i]); + } + auto out = df_builder_->NewCNode(inputs); + df_builder_->set_output(out); + resource_->manager()->AddFuncGraph(df); + 
resource_->manager()->AddFuncGraph(df_builder_); + return df_builder_; +} + +REGISTER_PYBIND_DEFINE(PynativeExecutor_, ([](const py::module *m) { + (void)py::class_>(*m, "PynativeExecutor_") + .def_static("get_instance", &PynativeExecutor::GetInstance, "PynativeExecutor get_instance.") + .def("new_graph", &PynativeExecutor::NewGraph, "pynative new a graph.") + .def("end_graph", &PynativeExecutor::EndGraph, "pynative end a graph.") + .def("grad_net", &PynativeExecutor::GradNet, "pynative grad graph.") + .def("clear", &PynativeExecutor::Clear, "pynative clear status.") + .def("__call__", &PynativeExecutor::Run, py::arg("args"), py::arg("phase") = py::str(""), + "Executor run function.") + .def("set_grad_flag", &PynativeExecutor::set_grad_flag, py::arg("flag") = py::bool_(false), + "Executor set grad flag."); + })); } // namespace pynative } // namespace mindspore diff --git a/mindspore/ccsrc/pynative/pynative_execute.h b/mindspore/ccsrc/pynative/pynative_execute.h index 65be3b2ab2..310cf0cb1e 100644 --- a/mindspore/ccsrc/pynative/pynative_execute.h +++ b/mindspore/ccsrc/pynative/pynative_execute.h @@ -22,23 +22,103 @@ #include #include #include +#include +#include #include "pybind11/pybind11.h" #include "pynative/base.h" #include "utils/context/ms_context.h" +#include "ir/anf.h" +#include "pipeline/resource.h" +#include "operator/composite/composite.h" namespace mindspore { namespace pynative { namespace py = pybind11; +using ResourcePtr = std::shared_ptr; +using GradOperationPtr = std::shared_ptr; py::object RunOpInVM(const OpExecInfoPtr &op_exec_info, PynativeStatusCode *status); py::tuple RunOp(const py::args &args); +py::tuple ConvertInputs(const PrimitivePyPtr &prim, const py::list &py_args, py::tuple *const out_args, + py::list *out_args_list); + void ClearPyNativeSession(); +struct GraphInfo { + std::unordered_map param_map; + std::unordered_map>> obj_node_map; + AnfNodePtr output; + std::vector objects; +}; + +class PynativeExecutor : public 
std::enable_shared_from_this { + public: + static std::shared_ptr GetInstance() { + std::lock_guard i_lock(instance_lock_); + if (executor_ == nullptr) { + executor_ = std::shared_ptr(new (std::nothrow) PynativeExecutor()); + resource_ = std::make_shared(); + } + return executor_; + } + void NewGraph(const py::object &cell, const py::args &args); + void EndGraph(const py::object &cell, const py::object &out, const py::args &args); + void EndGraphByOutId(const std::string &out_id, const py::object &cell, const py::object &out, const py::args &args); + std::vector GetWeightsArgs(const py::object &weights); + abstract::AbstractBasePtrList GetArgsSpec(const py::args &args); + void GradNet(const GradOperationPtr &grad, const py::object &cell, const py::object &weights, const py::args &args); + void Clear(const std::string &flag = ""); + void Clean(); + void ClearRes(); + bool grad_flag() { return grad_flag_; } + void set_grad_flag(bool flag) { grad_flag_ = flag; } + AnfNodePtr GetInput(const py::object &obj, const py::object &op_mask); + AnfNodePtr GetObjNode(const py::object &obj); + FuncGraphPtr curr_g() { return curr_g_; } + void set_pyobj(FuncGraphPtr g, const std::string obj) { graph_info_map_[g].objects.push_back(obj); } + void set_obj_node_map(FuncGraphPtr g, const std::string obj, AnfNodePtr node) { + graph_info_map_[g].obj_node_map[obj] = std::make_pair(node, std::vector{-1}); + } + void set_obj_node_map(FuncGraphPtr g, const std::string obj, AnfNodePtr node, int index) { + graph_info_map_[g].obj_node_map[obj] = std::make_pair(node, std::vector{index}); + } + void set_obj_node_map(FuncGraphPtr g, const std::string obj, AnfNodePtr node, std::vector index) { + graph_info_map_[g].obj_node_map[obj] = std::make_pair(node, index); + } + AnfNodePtr MakeCNode(const OpExecInfoPtr &op_exec_info, const py::args &args, const py::tuple &out); + py::object Run(const py::tuple &args, const py::object &phase); + + void Pushp(); + void Popp(); + FuncGraphPtr 
GradGraph(FuncGraphPtr g, const GradOperationPtr &grad_op, const std::vector &weights, + size_t arg_size); + void SetTupleOutput(const py::object &obj, const AnfNodePtr &cnode, std::vector idx); + AnfNodePtr MakeValueNode(const py::object &obj, const std::string &obj_id); + + ~PynativeExecutor(); + + private: + PynativeExecutor(); + static std::shared_ptr executor_; + static std::mutex instance_lock_; + static ResourcePtr resource_; + bool grad_flag_; + std::unordered_map graph_map_; + std::unordered_map cell_graph_map_; + std::unordered_map graph_info_map_; + std::stack graph_p_; + FuncGraphPtr top_g_; + FuncGraphPtr df_builder_; + FuncGraphPtr curr_g_; +}; + +using PynativeExecutorPtr = std::shared_ptr; + } // namespace pynative } // namespace mindspore diff --git a/mindspore/ccsrc/session/CMakeLists.txt b/mindspore/ccsrc/session/CMakeLists.txt index 2824af8a5d..782eb51183 100644 --- a/mindspore/ccsrc/session/CMakeLists.txt +++ b/mindspore/ccsrc/session/CMakeLists.txt @@ -23,6 +23,7 @@ if (ENABLE_D) file(GLOB_RECURSE _D_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "ascend_session.cc" "ascend_control_parser.cc" + "ascend_inference_session.cc" ) list(APPEND _SESSION_SRC_LIST ${_D_SRC_LIST}) endif () diff --git a/mindspore/ccsrc/session/anf_runtime_algorithm.cc b/mindspore/ccsrc/session/anf_runtime_algorithm.cc index 6cc68457e5..5db7dbc324 100644 --- a/mindspore/ccsrc/session/anf_runtime_algorithm.cc +++ b/mindspore/ccsrc/session/anf_runtime_algorithm.cc @@ -178,12 +178,29 @@ bool AnfRuntimeAlgorithm::CheckPrimitiveType(const AnfNodePtr &node, const Primi return IsPrimitive(cnode->input(kAnfPrimitiveIndex), primitive_type); } +FuncGraphPtr AnfRuntimeAlgorithm::GetCNodeFuncGraphPtr(const AnfNodePtr &node) { + MS_EXCEPTION_IF_NULL(node); + auto cnode = node->cast(); + MS_EXCEPTION_IF_NULL(cnode); + auto attr_input = cnode->input(kAnfPrimitiveIndex); + MS_EXCEPTION_IF_NULL(attr_input); + auto value_node = attr_input->cast(); + MS_EXCEPTION_IF_NULL(value_node); + auto 
value = value_node->value(); + MS_EXCEPTION_IF_NULL(value); + return value->cast(); +} + std::string AnfRuntimeAlgorithm::GetCNodeName(const AnfNodePtr &node) { MS_EXCEPTION_IF_NULL(node); if (node->isa()) { auto primitive = AnfAlgo::GetCNodePrimitive(node); - MS_EXCEPTION_IF_NULL(primitive); - return primitive->name(); + if (primitive != nullptr) { + return primitive->name(); + } + auto func_graph = AnfAlgo::GetCNodeFuncGraphPtr(node); + MS_EXCEPTION_IF_NULL(func_graph); + return func_graph->ToString(); } MS_LOG(EXCEPTION) << "Unknown anf node type " << node->DebugString(); } @@ -198,9 +215,16 @@ void AnfRuntimeAlgorithm::SetNodeAttr(const std::string &key, const ValuePtr &va if (!node->isa()) { MS_LOG(EXCEPTION) << "Only cnode has attr, but this anf is " << node->DebugString(); } + // single op cnode. auto primitive = AnfAlgo::GetCNodePrimitive(node); - MS_EXCEPTION_IF_NULL(primitive); - primitive->set_attr(key, value); + if (primitive != nullptr) { + primitive->set_attr(key, value); + return; + } + // graph kernel cnode. + auto fg = AnfAlgo::GetCNodeFuncGraphPtr(node); + MS_EXCEPTION_IF_NULL(fg); + fg->set_attr(key, value); } void AnfRuntimeAlgorithm::CopyNodeAttr(const std::string &key, const AnfNodePtr &from, const AnfNodePtr &to) { @@ -241,16 +265,33 @@ void AnfRuntimeAlgorithm::EraseNodeAttr(const std::string &key, const AnfNodePtr if (!node->isa()) { MS_LOG(EXCEPTION) << "Only cnode has attr, but this anf is " << node->DebugString(); } + // single op cnode. auto primitive = AnfAlgo::GetCNodePrimitive(node); - MS_EXCEPTION_IF_NULL(primitive); - primitive->EraseAttr(key); + if (primitive != nullptr) { + primitive->EraseAttr(key); + return; + } + // graph kernel cnode. 
+ auto fg = AnfAlgo::GetCNodeFuncGraphPtr(node); + MS_EXCEPTION_IF_NULL(fg); + fg->erase_flag(key); } bool AnfRuntimeAlgorithm::HasNodeAttr(const std::string &key, const CNodePtr &node) { MS_EXCEPTION_IF_NULL(node); + if (!node->isa()) { + MS_LOG(WARNING) << "Only cnode has attr, but this anf is " << node->DebugString(); + return false; + } + // single op cnode. auto primitive = AnfAlgo::GetCNodePrimitive(node); - MS_EXCEPTION_IF_NULL(primitive); - return primitive->HasAttr(key); + if (primitive != nullptr) { + return primitive->HasAttr(key); + } + // graph kernel cnode. + auto fg = AnfAlgo::GetCNodeFuncGraphPtr(node); + MS_EXCEPTION_IF_NULL(fg); + return fg->has_flag(key); } size_t AnfRuntimeAlgorithm::GetInputTensorNum(const AnfNodePtr &node) { @@ -544,9 +585,10 @@ TypeId AnfRuntimeAlgorithm::GetPrevNodeOutputDeviceDataType(const AnfNodePtr &an } // get output device addr of anf_node -const DeviceAddress *AnfRuntimeAlgorithm::GetOutputAddr(const AnfNodePtr &node, size_t output_idx) { +const DeviceAddress *AnfRuntimeAlgorithm::GetOutputAddr(const AnfNodePtr &node, size_t output_idx, + bool visit_nop_node) { MS_EXCEPTION_IF_NULL(node); - if (opt::IsNopNode(node)) { + if (opt::IsNopNode(node) && visit_nop_node) { auto cnode = node->cast(); MS_EXCEPTION_IF_NULL(cnode); if (cnode->inputs().size() == 2) { @@ -565,9 +607,10 @@ const DeviceAddress *AnfRuntimeAlgorithm::GetOutputAddr(const AnfNodePtr &node, return addr; } -DeviceAddressPtr AnfRuntimeAlgorithm::GetMutableOutputAddr(const AnfNodePtr &node, size_t output_idx) { +DeviceAddressPtr AnfRuntimeAlgorithm::GetMutableOutputAddr(const AnfNodePtr &node, size_t output_idx, + bool visit_nop_node) { MS_EXCEPTION_IF_NULL(node); - if (opt::IsNopNode(node)) { + if (opt::IsNopNode(node) && visit_nop_node) { auto cnode = node->cast(); MS_EXCEPTION_IF_NULL(cnode); if (cnode->inputs().size() == 2) { @@ -598,14 +641,16 @@ bool AnfRuntimeAlgorithm::OutputAddrExist(const AnfNodePtr &node, size_t output_ return 
kernel_info->OutputAddrExist(output_idx); } -const DeviceAddress *AnfRuntimeAlgorithm::GetPrevNodeOutputAddr(const AnfNodePtr &anf_node, size_t input_idx) { +const DeviceAddress *AnfRuntimeAlgorithm::GetPrevNodeOutputAddr(const AnfNodePtr &anf_node, size_t input_idx, + bool visit_nop_node) { KernelWithIndex kernel_with_index = AnfAlgo::GetPrevNodeOutput(anf_node, input_idx); - return AnfRuntimeAlgorithm::GetOutputAddr(kernel_with_index.first, kernel_with_index.second); + return AnfRuntimeAlgorithm::GetOutputAddr(kernel_with_index.first, kernel_with_index.second, visit_nop_node); } -DeviceAddressPtr AnfRuntimeAlgorithm::GetPrevNodeMutableOutputAddr(const AnfNodePtr &anf_node, size_t input_idx) { +DeviceAddressPtr AnfRuntimeAlgorithm::GetPrevNodeMutableOutputAddr(const AnfNodePtr &anf_node, size_t input_idx, + bool visit_nop_node) { KernelWithIndex kernel_with_index = AnfAlgo::GetPrevNodeOutput(anf_node, input_idx); - return AnfRuntimeAlgorithm::GetMutableOutputAddr(kernel_with_index.first, kernel_with_index.second); + return AnfRuntimeAlgorithm::GetMutableOutputAddr(kernel_with_index.first, kernel_with_index.second, visit_nop_node); } // set output device addr of anf_node @@ -778,6 +823,26 @@ bool AnfRuntimeAlgorithm::IsRealCNodeKernel(const AnfNodePtr &node) { return IsRealKernel(node); } +bool AnfRuntimeAlgorithm::IsGraphKernel(const AnfNodePtr &node) { + MS_EXCEPTION_IF_NULL(node); + // graph kernel should be a real cnode kernel. + if (!IsRealCNodeKernel(node)) { + return false; + } + + auto cnode = node->cast(); + MS_EXCEPTION_IF_NULL(cnode); + auto input = cnode->input(kAnfPrimitiveIndex); + // graph kernel should has func_graph as first input. 
+ if (!IsValueNode(input)) { + return false; + } + + auto func_graph = GetValueNode(input); + MS_EXCEPTION_IF_NULL(func_graph); + return func_graph->has_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL); +} + bool AnfRuntimeAlgorithm::IsParameterWeight(const ParameterPtr &node) { MS_EXCEPTION_IF_NULL(node); return node->has_default(); @@ -976,5 +1041,78 @@ bool AnfRuntimeAlgorithm::IsSwitchCall(const CNodePtr &call_node) { } MS_LOG(EXCEPTION) << "Unexpected input1 of call node,input1:" << input1->DebugString(); } + +bool AnfRuntimeAlgorithm::IsScalarInput(const CNodePtr &cnode, size_t index) { + auto shape = AnfAlgo::GetPrevNodeOutputInferShape(cnode, index); + if (shape.empty()) { + return true; + } + return shape.size() == kShape1dDims && shape[0] == 1; +} + +bool AnfRuntimeAlgorithm::IsScalarOutput(const CNodePtr &cnode, size_t index) { + auto shape = AnfAlgo::GetPrevNodeOutputInferShape(cnode, index); + if (shape.empty()) { + return true; + } + return shape.size() == kShape1dDims && shape[0] == 1; +} + +void AnfRuntimeAlgorithm::ReorderExecList(NotNull *> node_list) { + std::vector all_opt_list; + std::vector non_opt_list; + + for (const auto &node : *node_list) { + MS_EXCEPTION_IF_NULL(node); + if (kOptOperatorSet.find(AnfAlgo::GetCNodeName(node)) != kOptOperatorSet.end()) { + all_opt_list.emplace_back(node); + } else { + non_opt_list.emplace_back(node); + } + } + node_list->clear(); + std::copy(non_opt_list.begin(), non_opt_list.end(), std::back_inserter(*node_list)); + std::copy(all_opt_list.begin(), all_opt_list.end(), std::back_inserter(*node_list)); +} + +TypeId AnfRuntimeAlgorithm::GetCNodeOutputPrecision(const AnfNodePtr &node) { + MS_EXCEPTION_IF_NULL(node); + auto prim = AnfAlgo::GetCNodePrimitive(node); + if (prim == nullptr) { + return kTypeUnknown; + } + + TypeId except_type = kTypeUnknown; + if (prim->GetAttr(kAttrOutputPrecision) != nullptr) { + auto output_type_str = GetValue(prim->GetAttr(kAttrOutputPrecision)); + if (output_type_str == "float16") { + 
except_type = kNumberTypeFloat16; + } else if (output_type_str == "float32") { + except_type = kNumberTypeFloat32; + } else { + MS_LOG(EXCEPTION) << "The fix precision must be float16 or float32, but got " << output_type_str; + } + } + + return except_type; +} + +TypeId AnfRuntimeAlgorithm::GetPrevNodeOutputPrecision(const AnfNodePtr &node, size_t input_idx) { + if (!node->isa()) { + MS_LOG(EXCEPTION) << node->DebugString() << ", input node is not CNode."; + } + auto cnode = node->cast(); + MS_EXCEPTION_IF_NULL(cnode); + if (input_idx + 1 >= cnode->inputs().size()) { + MS_LOG(EXCEPTION) << "Input index " << input_idx << " is larger than input number " << GetInputTensorNum(cnode); + } + auto input_node = cnode->input(input_idx + 1); + MS_EXCEPTION_IF_NULL(input_node); + auto kernel_with_index = VisitKernel(input_node, 0); + if (!kernel_with_index.first->isa()) { + return kTypeUnknown; + } + return GetCNodeOutputPrecision(kernel_with_index.first); +} } // namespace session } // namespace mindspore diff --git a/mindspore/ccsrc/session/anf_runtime_algorithm.h b/mindspore/ccsrc/session/anf_runtime_algorithm.h index 10ae5282e0..c46f0b5955 100644 --- a/mindspore/ccsrc/session/anf_runtime_algorithm.h +++ b/mindspore/ccsrc/session/anf_runtime_algorithm.h @@ -54,6 +54,8 @@ class AnfRuntimeAlgorithm { static PrimitivePtr GetCNodePrimitive(const AnfNodePtr &node); // check whether anf node is a node of 'primitive_type',such as make_tuple is a cnode of kPrimMakeTuple static bool CheckPrimitiveType(const AnfNodePtr &node, const PrimitivePtr &primitive_type); + // get cnode primitive + static FuncGraphPtr GetCNodeFuncGraphPtr(const AnfNodePtr &node); // get kernel_name of anf node static std::string GetCNodeName(const AnfNodePtr &node); // get detail info of anf node @@ -121,14 +123,16 @@ class AnfRuntimeAlgorithm { // get output select data type from prev node,input_index is the input index of current node related to prev node static TypeId GetPrevNodeOutputDeviceDataType(const 
AnfNodePtr &node, size_t input_idx); // get output device addr of anf_node - static const DeviceAddress *GetOutputAddr(const AnfNodePtr &node, size_t output_idx); + static const DeviceAddress *GetOutputAddr(const AnfNodePtr &node, size_t output_idx, bool visit_nop_node = true); // get mutable output device addr of anf_node - static DeviceAddressPtr GetMutableOutputAddr(const AnfNodePtr &node, size_t output_idx); + static DeviceAddressPtr GetMutableOutputAddr(const AnfNodePtr &node, size_t output_idx, bool visit_nop_node = true); // check whether output addr is exist or not static bool OutputAddrExist(const AnfNodePtr &node, size_t output_idx); // get address from prev node,input_index is the input index of current node related to prev node - static const DeviceAddress *GetPrevNodeOutputAddr(const AnfNodePtr &node, size_t input_idx); - static DeviceAddressPtr GetPrevNodeMutableOutputAddr(const AnfNodePtr &anf_node, size_t input_idx); + static const DeviceAddress *GetPrevNodeOutputAddr(const AnfNodePtr &node, size_t input_idx, + bool visit_nop_node = true); + static DeviceAddressPtr GetPrevNodeMutableOutputAddr(const AnfNodePtr &anf_node, size_t input_idx, + bool visit_nop_node = true); // set output device addr of anf_node static void SetOutputAddr(const DeviceAddressPtr &addr, size_t output_idx, AnfNode *node); // set workspace device addr of anf_node @@ -159,6 +163,8 @@ class AnfRuntimeAlgorithm { static bool IsRealKernel(const AnfNodePtr &node); // checkout whether the anf node is a real kernel that is a cnode and can run on device static bool IsRealCNodeKernel(const AnfNodePtr &node); + // checkout whether the anf node is a graph kernel. 
+ static bool IsGraphKernel(const AnfNodePtr &node); // check parameter is weight or data static bool IsParameterWeight(const ParameterPtr &node); // set stream id of kernel,which will be set in stream assign and be used in stream generate @@ -185,6 +191,14 @@ class AnfRuntimeAlgorithm { static FuncGraphPtr GetValueNodeFuncGraph(const AnfNodePtr &node); static std::vector GetCallNodeKernelGraph(const CNodePtr &call_node); static bool IsSwitchCall(const CNodePtr &call_node); + static bool IsScalarInput(const CNodePtr &cnode, size_t index); + static bool IsScalarOutput(const CNodePtr &cnode, size_t index); + static void ReorderExecList(NotNull *> node_list); + static bool IsWhileTrueGraph(const KernelGraphPtr &child_graph); + // get fix output precision of cnode. + static TypeId GetCNodeOutputPrecision(const AnfNodePtr &node); + // get fix output precision from prev node, input_idx is the input index of current node related to prev node. + static TypeId GetPrevNodeOutputPrecision(const AnfNodePtr &node, size_t input_idx); }; } // namespace session using AnfAlgo = session::AnfRuntimeAlgorithm; diff --git a/mindspore/ccsrc/session/ascend_control_parser.cc b/mindspore/ccsrc/session/ascend_control_parser.cc index 2853caa732..868b968d9e 100644 --- a/mindspore/ccsrc/session/ascend_control_parser.cc +++ b/mindspore/ccsrc/session/ascend_control_parser.cc @@ -18,6 +18,7 @@ #include #include "session/ascend_control_parser.h" #include "session/anf_runtime_algorithm.h" +#include "utils/union_find_set.h" static constexpr size_t kCNodePrim = 0; static constexpr size_t kCNodeCallArg = 1; @@ -32,35 +33,126 @@ static constexpr size_t kCNodeSwitchLayerLength = 3; namespace mindspore { namespace session { +static void InitUnionFindSet(NotNull kg, const NotNull *> union_find_set, + const NotNull *> memo) { + if (memo->find(kg.get()) != memo->end()) { + return; + } + memo->insert(kg.get()); -void AscendControlParser::ChildGraphDataAssign(const std::map &graph_id_map) { - for (auto &iter 
: graph_id_map) { - auto &kg = iter.second; - MS_EXCEPTION_IF_NULL(kg); - auto real_inputs = kg->real_inputs(); - for (auto &it : real_inputs) { - auto ¶meter = it.first; - auto &args = it.second; - for (auto &arg : args) { - MS_EXCEPTION_IF_NULL(arg); - if (arg->isa()) { - MS_LOG(INFO) << "Parameter should be reused, no need insert assign, parameter: " << parameter->DebugString() - << ", arg:" << arg->DebugString(); - continue; - } - auto target_graph_iter = graph_id_map.find(AnfAlgo::GetGraphId(arg.get())); - if (target_graph_iter == graph_id_map.end()) { - MS_LOG(EXCEPTION) << "Graph id " << AnfAlgo::GetGraphId(arg.get()) << " not found."; - } - InsertAssignToGraph(NOT_NULL(target_graph_iter->second), NOT_NULL(arg), NOT_NULL(parameter)); + const std::vector>> &real_inputs = kg->real_inputs(); + for (auto &iter : real_inputs) { + auto ¶ = iter.first; + MS_EXCEPTION_IF_NULL(para); + if (para->isa()) { + union_find_set->Add(para); + } + for (auto &arg : iter.second) { + MS_EXCEPTION_IF_NULL(arg); + if (!arg->isa()) { + continue; } + union_find_set->Add(arg); } } + for (auto &child : kg->child_graph_order()) { + InitUnionFindSet(NOT_NULL(child), union_find_set, memo); + } +} + +static void UnionParentParameter(NotNull kg, const NotNull *> union_find_set, + const NotNull *> memo) { + if (memo->find(kg.get()) != memo->end()) { + return; + } + memo->insert(kg.get()); + + const std::vector>> &real_inputs = kg->real_inputs(); + for (auto &iter : real_inputs) { + auto ¶ = iter.first; + for (auto &arg : iter.second) { + MS_EXCEPTION_IF_NULL(arg); + if (!arg->isa()) { + continue; + } + union_find_set->Union(arg, para); + } + } + for (auto &child : kg->child_graph_order()) { + UnionParentParameter(NOT_NULL(child), union_find_set, memo); + } +} + +static UnionFindSet MakeUnionFindSet(NotNull root_kg) { + UnionFindSet result; + std::set memo; + InitUnionFindSet(root_kg, NOT_NULL(&result), NOT_NULL(&memo)); + memo.clear(); + UnionParentParameter(root_kg, NOT_NULL(&result), 
NOT_NULL(&memo)); + return result; +} + +static void RecursiveReplaceNode(NotNull kg, NotNull main_parameter, + const std::set ¶meter_reuse_set, + const NotNull *> memo) { + if (parameter_reuse_set.empty()) { + MS_LOG(EXCEPTION) << "parameter_reuse_set is empty."; + } + if (memo->find(kg.get()) != memo->end()) { + return; + } + memo->insert(kg.get()); + + for (auto ¶ : parameter_reuse_set) { + if (para == main_parameter.get()) { + continue; + } + MS_EXCEPTION_IF_NULL(para); + MS_LOG(INFO) << "Replace " << para->DebugString() << " of graph " << AnfAlgo::GetGraphId(para.get()) << " to " + << main_parameter->DebugString() << " of graph " << AnfAlgo::GetGraphId(main_parameter.get().get()); + kg->ReplaceNode(NOT_NULL(para), main_parameter); + } + + for (auto &child : kg->child_graph_order()) { + RecursiveReplaceNode(NOT_NULL(child), main_parameter, parameter_reuse_set, memo); + } +} + +static void ReuseParameter(NotNull root_kg, NotNull *> parameter_set) { + auto parameter_reuse_sets = parameter_set->GetSets(); + for (auto &[key, parameter_reuse_set] : parameter_reuse_sets) { + if (parameter_reuse_set.size() <= 1) { + continue; + } + + AnfNodePtr main_parameter = key; + std::set root_inputs_set; + const auto &root_inputs_vector = root_kg->inputs(); + root_inputs_set.insert(root_inputs_vector.begin(), root_inputs_vector.end()); + for (auto &node : parameter_reuse_set) { + if (root_inputs_set.find(node) != root_inputs_set.end()) { + main_parameter = node; + break; + } + } + + std::set memo; + RecursiveReplaceNode(root_kg, NOT_NULL(main_parameter), parameter_reuse_set, NOT_NULL(&memo)); + } +} + +CNodePtr GetNextRealKernel(const std::vector &list, size_t start) { + for (size_t i = start; i < list.size() - 1; ++i) { + if (!IsPrimitiveCNode(list[i], prim::kPrimPartial) && AnfAlgo::IsRealKernel(list[i])) { + return list[i]; + } + } + return nullptr; } void AscendControlParser::LinkGraph(NotNull kg) { std::set memo; - ProcessKernelGraph(kg, nullptr, nullptr, NOT_NULL(&memo)); 
+ (void)ProcessKernelGraph(kg, nullptr, nullptr, NOT_NULL(&memo)); std::map graph_id_map; for (auto &g : memo) { if (graph_id_map.find(g->graph_id()) != graph_id_map.end()) { @@ -69,16 +161,49 @@ void AscendControlParser::LinkGraph(NotNull kg) { } graph_id_map[g->graph_id()] = g; } + // Make UnionFindSet + UnionFindSet parameter_set = MakeUnionFindSet(kg); + // Reuse Parameter + ReuseParameter(kg, NOT_NULL(¶meter_set)); + // Insert Assign ChildGraphDataAssign(graph_id_map); } -CNodePtr AscendControlParser::GetNextRealKernel(const std::vector &list, size_t start) { - for (size_t i = start; i < list.size() - 1; ++i) { - if (!IsPrimitiveCNode(list[i], prim::kPrimPartial) && AnfAlgo::IsRealKernel(list[i])) { - return list[i]; +void AscendControlParser::ExecutorValidate(NotNull root_graph) { + std::set memo; + (void)RecurseGraph(root_graph, NOT_NULL(&memo)); +} + +void AscendControlParser::ChildGraphDataAssign(const std::map &graph_id_map) { + for (auto &iter : graph_id_map) { + auto &kg = iter.second; + MS_EXCEPTION_IF_NULL(kg); + std::set> memo; + const std::vector>> &real_inputs = kg->real_inputs(); + for (auto &it : real_inputs) { + auto ¶meter = it.first; + auto &args = it.second; + for (auto &arg : args) { + MS_EXCEPTION_IF_NULL(arg); + if (memo.find({parameter, arg}) != memo.end()) { + continue; + } else { + memo.emplace(parameter, arg); + } + if (arg->isa()) { + MS_EXCEPTION_IF_NULL(parameter); + MS_LOG(DEBUG) << "Parameter should be reused, no need insert assign, parameter: " << parameter->DebugString() + << ", arg:" << arg->DebugString(); + continue; + } + auto target_graph_iter = graph_id_map.find(AnfAlgo::GetGraphId(arg.get())); + if (target_graph_iter == graph_id_map.end()) { + MS_LOG(EXCEPTION) << "Graph id " << AnfAlgo::GetGraphId(arg.get()) << " not found."; + } + InsertMultipleAssignToGraph(NOT_NULL(target_graph_iter->second), NOT_NULL(arg), NOT_NULL(parameter)); + } } } - return nullptr; } NotNull AscendControlParser::ProcessKernelGraph(NotNull kg, 
const CNodePtr &last_node, @@ -99,25 +224,29 @@ NotNull AscendControlParser::ProcessKernelGraph(NotNullSetExecOrderByDefault(); const std::vector &nodes = kg->execution_order(); - if (nodes.empty()) { - MS_LOG(EXCEPTION) << "KernelGraph " << kg->ToString() << " has no cnodes!"; - } // 4. insert first_label - auto start_label = kg->NewCNode({std::make_shared(std::make_shared(kLabelSetOpName))}); - MS_LOG(INFO) << "Insert start label " << start_label->DebugString() << " to " << kg->ToString(); - kg->set_start_label(start_label); + CNodePtr start_label; + if (last_node != nullptr && last_label != nullptr) { + start_label = kg->NewCNode({std::make_shared(std::make_shared(kLabelSetOpName))}); + MS_LOG(INFO) << "Insert start label " << start_label->DebugString() << " to " << kg->ToString(); + kg->set_start_label(start_label); + } else { + // no goto node will jump to start label of root graph, so return a fake label + start_label = std::make_shared(std::vector(), FuncGraphPtr(nullptr)); + } + // 5. 
traverse for (size_t i = 0; i < nodes.size(); ++i) { auto &cnode = nodes[i]; if (cnode->size() < kCNodePrim + 1) { MS_LOG(EXCEPTION) << "Inputs of apply node is empty"; } - AnfNodePtr fn = cnode->input(kCNodePrim); + AnfNodePtr fn = cnode->input(kAnfPrimitiveIndex); if (!IsPrimitive(fn, prim::kPrimCall) || cnode->size() < kCNodeCallArg + 1) { MS_LOG(DEBUG) << "continue node " << cnode->DebugString(); continue; } - AnfNodePtr arg = cnode->input(kCNodeCallArg); + AnfNodePtr arg = cnode->input(kFirstDataInputIndex); if (IsValueNode(arg)) { RecurseCall(kg, NOT_NULL(cnode), GetNextRealKernel(nodes, i + 1), memo); } else if (!arg->isa()) { @@ -140,11 +269,10 @@ NotNull AscendControlParser::ProcessKernelGraph(NotNull kg, NotNull attch_node) { - std::vector inputs = {NewValueNode(std::make_shared("depend"))}; auto return_node = kg->get_return(); MS_EXCEPTION_IF_NULL(return_node); - inputs.push_back(return_node->input(1)); - inputs.push_back(attch_node.get()); + std::vector inputs = {NewValueNode(std::make_shared(prim::kPrimDepend->name())), + return_node->input(kFirstDataInputIndex), attch_node.get()}; auto depend_node = kg->NewCNode(inputs); return_node->set_input(1, depend_node); } @@ -161,17 +289,8 @@ void AscendControlParser::InsertControlDependToGraph(NotNull kg, void AscendControlParser::LinkParentGraph(NotNull kg, const CNodePtr &from_graph_call_node, const CNodePtr &last_label) { - auto origin_return = kg->get_return(); - const std::vector &origin_return_inputs = origin_return->inputs(); - // if entry graph, replace return with make_tuple - if (from_graph_call_node == nullptr || last_label == nullptr) { - MS_LOG(INFO) << kg->ToString() << " is entry graph."; - std::vector make_tuple_inputs = {std::make_shared(prim::kPrimMakeTuple)}; - make_tuple_inputs.insert(make_tuple_inputs.end(), origin_return_inputs.begin() + 1, origin_return_inputs.end()); - auto make_tuple = kg->NewCNode(make_tuple_inputs); - origin_return->set_inputs({origin_return->input(kCNodePrim), 
make_tuple}); - } else { - // else replace return with label_goto + // if not entry graph, replace return with label_goto + if (from_graph_call_node != nullptr && last_label != nullptr) { auto label_goto = kg->NewCNode({std::make_shared(std::make_shared(kLabelGotoOpName)), last_label}); MS_LOG(INFO) << "Insert end goto " << label_goto->DebugString() << " to " << kg->ToString(); @@ -181,10 +300,13 @@ void AscendControlParser::LinkParentGraph(NotNull kg, const CNod void AscendControlParser::RecurseCall(NotNull kg, NotNull cur_node, const CNodePtr &next_node, const NotNull *> memo) { - MS_LOG(INFO) << "process call func " << cur_node->DebugString(); + MS_LOG(INFO) << "Process call func " << cur_node->DebugString(); // 1 get kernel graph const std::vector &origin_inputs = cur_node->inputs(); + if (kCNodeCallArg >= origin_inputs.size()) { + MS_LOG(EXCEPTION) << "Index out of range,size:" << origin_inputs.size(); + } std::vector new_inputs = {std::make_shared(std::make_shared(kLabelGotoOpName))}; if (!IsValueNode(origin_inputs[kCNodeCallArg])) { MS_LOG(WARNING) << "Node " << cur_node->DebugString(10) << " index " << kCNodeCallArg << " is not a ValueNode"; @@ -208,12 +330,12 @@ void AscendControlParser::RecurseCall(NotNull kg, NotNullset_inputs(new_inputs); cur_node->set_abstract(nullptr); - MS_LOG(INFO) << "success process call func " << cur_node->DebugString(); + MS_LOG(INFO) << "Succeed processing call func " << cur_node->DebugString(); } void AscendControlParser::RecurseSwitch(NotNull kg, NotNull cur_node, const CNodePtr &next_node, const NotNull *> memo) { - MS_LOG(INFO) << "process switch node " << cur_node->DebugString(); + MS_LOG(INFO) << "Process switch node " << cur_node->DebugString(); if (cur_node->size() < kCNodeSwitchLength) { MS_LOG(EXCEPTION) << "Inputs of apply node must more than " << kCNodeSwitchLength; @@ -245,13 +367,13 @@ void AscendControlParser::RecurseSwitch(NotNull kg, NotNullset_inputs(new_switch_inputs); cur_node->set_abstract(nullptr); - 
MS_LOG(INFO) << "success process switch func " << cur_node->DebugString(); + MS_LOG(INFO) << "Succeed processing switch func " << cur_node->DebugString(); } void AscendControlParser::RecurseSwitchLayer(NotNull kg, NotNull cur_node, const CNodePtr &next_node, const NotNull *> memo) { - MS_LOG(INFO) << "process switch node " << cur_node->DebugString(); + MS_LOG(INFO) << "Process switch node " << cur_node->DebugString(); if (cur_node->size() < kCNodeSwitchLayerLength) { MS_LOG(EXCEPTION) << "Inputs of apply node must more than " << kCNodeSwitchLayerLength; @@ -272,6 +394,9 @@ void AscendControlParser::RecurseSwitchLayer(NotNull kg, NotNull } // 3 recurse sub graph const std::vector &origin_switch_inputs = cur_node->inputs(); + if (kCNodeSwitchCond >= origin_switch_inputs.size()) { + MS_LOG(EXCEPTION) << "Index out of range:" << origin_switch_inputs.size() << "."; + } std::vector new_switch_inputs = { std::make_shared(std::make_shared(kLabelSwitchOpName)), origin_switch_inputs[kCNodeSwitchCond]}; @@ -286,7 +411,7 @@ void AscendControlParser::RecurseSwitchLayer(NotNull kg, NotNull new_switch_inputs.insert(new_switch_inputs.end(), branch_partial.begin(), branch_partial.end()); cur_node->set_inputs(new_switch_inputs); cur_node->set_abstract(nullptr); - MS_LOG(INFO) << "success process switch layer " << cur_node->DebugString(); + MS_LOG(INFO) << "Succeed processing switch layer " << cur_node->DebugString(); } std::tuple AscendControlParser::ParsePartial(NotNull node) { @@ -295,15 +420,33 @@ std::tuple AscendControlParser::ParsePartial(NotNull(node.get()); + MS_EXCEPTION_IF_NULL(partial_cnode); if (partial_cnode->size() < kCNodePartialLength) { MS_LOG(EXCEPTION) << "Inputs of partial node must more than " << kCNodePartialLength; } - auto partial_inputs = partial_cnode->inputs(); - auto branch_kg = GetValueNode(partial_inputs[kCNodePartialFunc]); + const auto &partial_inputs = partial_cnode->inputs(); + if (kCNodePartialFunc >= partial_inputs.size()) { + MS_LOG(EXCEPTION) << 
"Index out of range:" << partial_inputs.size() << "."; + } + auto branch_kg = GetValueNode(partial_inputs[kCNodePartialFunc]); return {partial_cnode, branch_kg}; } +void AscendControlParser::InsertMultipleAssignToGraph(NotNull kg, NotNull from, + NotNull to) { + std::vector from_outputs = AnfAlgo::GetAllOutput(from, {prim::kPrimTupleGetItem}); + std::vector to_outputs = AnfAlgo::GetAllOutput(to, {prim::kPrimTupleGetItem}); + MS_LOG(INFO) << "Insert multi-assign from [" << from->DebugString() << "] to [" << to->DebugString() << "]"; + if (from_outputs.size() != to_outputs.size()) { + MS_LOG(EXCEPTION) << "From outputs size[" << from_outputs.size() << "] is not equal to to outputs size[" + << to_outputs.size() << "]"; + } + for (size_t i = 0; i < from_outputs.size(); i++) { + InsertAssignToGraph(kg, NOT_NULL(from_outputs[i]), NOT_NULL(to_outputs[i])); + } +} + void AscendControlParser::InsertAssignToGraph(NotNull kg, NotNull from, NotNull to) { if (AnfAlgo::OutputAddrExist(from, 0) && AnfAlgo::OutputAddrExist(to, 0) && @@ -316,7 +459,7 @@ void AscendControlParser::InsertAssignToGraph(NotNull kg, NotNul MS_LOG(INFO) << "Insert assign to graph " << kg->ToString() << " from " << from->DebugString() << " to " << to->DebugString(); // config inputs of assign node - std::vector inputs = {NewValueNode(std::make_shared("Assign")), to, from}; + std::vector inputs = {NewValueNode(std::make_shared(prim::kPrimAssign->name())), to, from}; // generate a new cnode auto assign_node = kg->NewCNode(inputs); MS_EXCEPTION_IF_NULL(assign_node); @@ -325,49 +468,24 @@ void AscendControlParser::InsertAssignToGraph(NotNull kg, NotNul InsertDependToGraph(kg, NOT_NULL(assign_node)); } -void AscendControlParser::LinkArgsToParam(NotNull to_graph, NotNull target_graph, - NotNull arg, NotNull param) { - if (IsPrimitiveCNode(arg, prim::kPrimMakeTuple) && IsPrimitiveCNode(param, prim::kPrimMakeTuple)) { - MS_LOG(INFO) << "Arg " << arg->DebugString() << " Param " << param->DebugString() << " is a 
tuple"; - CNodePtr cnode_arg = arg.get()->cast(); - CNodePtr cnode_param = param.get()->cast(); - MS_EXCEPTION_IF_NULL(cnode_arg); - MS_EXCEPTION_IF_NULL(cnode_param); - if (cnode_arg->size() != cnode_param->size()) { - MS_LOG(EXCEPTION) << "Arg " << arg->DebugString() << " size " << cnode_arg->size() << " but Param " - << param->DebugString() << " size " << cnode_param->size(); - } - - for (size_t i = 1; i < cnode_param->size(); ++i) { - LinkArgsToParam(to_graph, target_graph, NOT_NULL(cnode_arg->input(i)), NOT_NULL(cnode_param->input(i))); - } - } else if (arg->isa()) { - InsertAssignToGraph(target_graph, arg, param); - } else { - MS_LOG(EXCEPTION) << "Arg " << arg->DebugString() << " Param " << param->DebugString() << " unknown type."; - } -} - -void AscendControlParser::ExecutorValidate(NotNull root_graph) { - std::set memo; - (void)RecurseGraph(root_graph, NOT_NULL(&memo)); -} - std::vector AscendControlParser::RecurseGraph(NotNull graph, const NotNull *> memo) { - MS_LOG(INFO) << "graph:" << graph->graph_id() << " start"; - auto print_vector = [&](std::vector vec) -> void { - MS_LOG(INFO) << "graph:" << graph->graph_id() << "execution order"; - for (size_t i = 0; i < vec.size(); i++) { - MS_LOG(INFO) << "[" << i << "][" << vec[i]->DebugString() << "]"; - } - }; + MS_LOG(INFO) << "Graph:" << graph->graph_id() << " start"; if (memo->find(graph) != memo->end()) { return {}; } memo->insert(graph.get()); graph->SetExecOrderByDefault(); - const std::vector &cnodes = graph->execution_order(); + std::vector cnodes = graph->execution_order(); + + auto end_label_goto = graph->get_end_goto(); + if (cnodes.rbegin() != cnodes.rend() && *cnodes.rbegin() == end_label_goto) { + cnodes.pop_back(); + } + AnfAlgo::ReorderExecList(NOT_NULL(&cnodes)); + if (end_label_goto != nullptr) { + cnodes.push_back(end_label_goto); + } std::vector execution_order; uint32_t child_order_index = 0; @@ -377,45 +495,34 @@ std::vector AscendControlParser::RecurseGraph(NotNull if (node == 
graph->get_end_goto()) { continue; } - if (AnfAlgo::CheckPrimitiveType(node, prim::kPrimLabelGoto)) { - if (!CheckLabelIndex(child_order_index, 0, node, graph)) { - MS_LOG(EXCEPTION) << "Check label index fail"; - } - auto child_graph = graph->child_graph_order()[child_order_index++]; - if (child_graph == graph->parent_graph()) { - continue; - } - auto child_execution_order = RecurseGraph(NOT_NULL(child_graph), memo); - execution_order.insert(execution_order.end(), child_execution_order.begin(), child_execution_order.end()); - } else if (AnfAlgo::CheckPrimitiveType(node, prim::kPrimLabelSwitch)) { - std::vector label_switch_list = GetLabelSwitchList(node); + if (AnfAlgo::CheckPrimitiveType(node, prim::kPrimLabelSwitch)) { + std::vector label_switch_list = AnfAlgo::GetNodeAttr>(node, kAttrLabelSwitchList); for (auto iter = label_switch_list.rbegin(); iter != label_switch_list.rend(); ++iter) { if (!CheckLabelIndex(child_order_index, *iter, node, graph)) { MS_LOG(EXCEPTION) << "Check label index fail"; } - auto child_graph = graph->child_graph_order()[child_order_index++]; - if (child_graph == graph->parent_graph()) { - continue; + if (child_order_index >= graph->child_graph_order().size()) { + MS_LOG(EXCEPTION) << "Index out of range:" << graph->child_graph_order().size(); } + auto child_graph = graph->child_graph_order()[child_order_index++]; auto child_execution_order = RecurseGraph(NOT_NULL(child_graph), memo); execution_order.insert(execution_order.end(), child_execution_order.begin(), child_execution_order.end()); } + } else if (AnfAlgo::CheckPrimitiveType(node, prim::kPrimLabelGoto)) { + uint32_t label_index = AnfAlgo::GetNodeAttr(node, kAttrLabelIndex); + if (!CheckLabelIndex(child_order_index, label_index, node, graph)) { + MS_LOG(EXCEPTION) << "Check label index fail"; + } + auto child_graph = graph->child_graph_order()[child_order_index++]; + auto child_execution_order = RecurseGraph(NOT_NULL(child_graph), memo); + 
execution_order.insert(execution_order.end(), child_execution_order.begin(), child_execution_order.end()); } } graph->set_execution_order(execution_order); - print_vector(graph->execution_order()); + graph->PrintGraphExecuteOrder(); return execution_order; } -std::vector AscendControlParser::GetLabelSwitchList(const CNodePtr &node) { - if (!AnfAlgo::HasNodeAttr(kAttrLabelSwitchList, node)) { - MS_LOG(EXCEPTION) << "LabelSwitchKernel has no attr label_switch_list"; - } - auto primitive = AnfAlgo::GetCNodePrimitive(node); - MS_EXCEPTION_IF_NULL(primitive); - return GetValue>(primitive->GetAttr(kAttrLabelSwitchList)); -} - bool AscendControlParser::CheckLabelIndex(uint32_t order_index, uint32_t label_index, const CNodePtr &cur_label, NotNull graph) { const std::vector> &child_graph_order = graph->child_graph_order(); @@ -424,37 +531,23 @@ bool AscendControlParser::CheckLabelIndex(uint32_t order_index, uint32_t label_i MS_LOG(EXCEPTION) << "Child graph order is wrong, graph " << graph->ToString() << " child graph size " << child_graph_order.size() << " goto index " << order_index; } - - if (AnfAlgo::CheckPrimitiveType(cur_label, prim::kPrimLabelGoto)) { - // check label_goto and start_label in child graph - if (!AnfAlgo::HasNodeAttr(kAttrLabelIndex, cur_label)) { - MS_LOG(EXCEPTION) << "LabelSetKernel has no attr label_index"; - } - auto primitive = AnfAlgo::GetCNodePrimitive(cur_label); - MS_EXCEPTION_IF_NULL(primitive); - uint32_t label_goto_index = GetValue(primitive->GetAttr(kAttrLabelIndex)); - label_index = label_goto_index; - } - // get start_label_set_index of child graph auto child_graph = child_graph_order[order_index]; MS_EXCEPTION_IF_NULL(child_graph); + + // get start_label_set_index of child graph auto start_label_set = child_graph->get_start_label(); - if (!AnfAlgo::HasNodeAttr(kAttrLabelIndex, start_label_set)) { - MS_LOG(EXCEPTION) << "LabelSetKernel has no attr label_index"; - } - auto start_primitive = AnfAlgo::GetCNodePrimitive(start_label_set); - 
MS_EXCEPTION_IF_NULL(start_primitive); - uint32_t start_label_set_index = GetValue(start_primitive->GetAttr(kAttrLabelIndex)); + uint32_t start_label_set_index = AnfAlgo::GetNodeAttr(start_label_set, kAttrLabelIndex); if (label_index != start_label_set_index) { MS_LOG(WARNING) << cur_label->DebugString() << " index " << label_index << " but " << start_label_set->DebugString() << " index " << start_label_set_index << " current child graph order : " << order_index; return false; + } else { + return true; } - return true; } void AscendControlParser::UpdateChildGraphOrder(NotNull kg) { - MS_LOG(INFO) << "graph id:" << kg->graph_id(); + MS_LOG(INFO) << "Graph id:" << kg->graph_id(); kg->SetExecOrderByDefault(); auto call_nodes = kg->FindNodeByPrimitive(std::make_shared(prim::kPrimCall->name())); std::vector child_graph_order; @@ -474,6 +567,5 @@ void AscendControlParser::UpdateChildGraphOrder(NotNull kg) { } kg->set_child_graph_order(child_graph_order); } - } // namespace session } // namespace mindspore diff --git a/mindspore/ccsrc/session/ascend_control_parser.h b/mindspore/ccsrc/session/ascend_control_parser.h index bb1aee76af..73d68449b3 100644 --- a/mindspore/ccsrc/session/ascend_control_parser.h +++ b/mindspore/ccsrc/session/ascend_control_parser.h @@ -26,7 +26,6 @@ namespace mindspore { namespace session { - class AscendControlParser { public: static void ChildGraphDataAssign(const std::map &graph_id_map); @@ -53,15 +52,10 @@ class AscendControlParser { const CNodePtr &last_label); static std::tuple ParsePartial(NotNull node); - static void LinkArgsToParam(NotNull to_graph, NotNull target_graph, - NotNull arg, NotNull param); - + static void InsertMultipleAssignToGraph(NotNull kg, NotNull from, NotNull to); static void InsertAssignToGraph(NotNull kg, NotNull from, NotNull to); - static CNodePtr GetNextRealKernel(const std::vector &list, size_t start); - // root graph order - static std::vector GetLabelSwitchList(const CNodePtr &node); static bool 
CheckLabelIndex(uint32_t order_index, uint32_t label_index, const CNodePtr &cnode, NotNull graph); static std::vector RecurseGraph(NotNull graph, diff --git a/mindspore/ccsrc/session/ascend_inference_session.cc b/mindspore/ccsrc/session/ascend_inference_session.cc new file mode 100644 index 0000000000..ff53874502 --- /dev/null +++ b/mindspore/ccsrc/session/ascend_inference_session.cc @@ -0,0 +1,90 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "session/ascend_inference_session.h" +#include "operator/ops.h" +#include "ir/tensor.h" +#include "ir/anf.h" +#include "ir/param_value_py.h" +#include "device/kernel_runtime.h" +#include "session/anf_runtime_algorithm.h" +#include "common/utils.h" +#include "common/trans.h" +#include "kernel/tbe/tbe_python_funcs.h" +#include "utils/config_manager.h" +#include "utils/base_ref_extends.h" + +namespace mindspore { +namespace session { +void AscendInferenceSession::LoadInputData(const std::shared_ptr &kernel_graph, + const std::vector &inputs_const) const { + MS_EXCEPTION_IF_NULL(kernel_graph); + std::vector inputs(inputs_const); + auto input_nodes = kernel_graph->inputs(); + + auto ms_context = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(ms_context); + size_t no_weight_input = 0; + for (size_t i = 0; i < input_nodes.size(); ++i) { + tensor::TensorPtr tensor = nullptr; + if (!input_nodes[i]->isa()) { + MS_LOG(ERROR) << "Kernel graph inputs have anfnode which is not Parameter"; + continue; + } + auto pk_node = input_nodes[i]->cast(); + MS_EXCEPTION_IF_NULL(pk_node); + if (AnfAlgo::IsParameterWeight(pk_node)) { + auto param_value = std::dynamic_pointer_cast(pk_node->default_param()); + MS_EXCEPTION_IF_NULL(param_value); + auto py_param = param_value->value(); + MS_EXCEPTION_IF_NULL(py_param); + py::array py_array = py_param.cast(); + tensor = std::make_shared(py_array); + } else { + tensor = inputs[no_weight_input++]; + } + MS_EXCEPTION_IF_NULL(tensor); + if (AnfAlgo::OutputAddrExist(pk_node, 0)) { + auto device_address = AnfAlgo::GetMutableOutputAddr(pk_node, 0); + bool need_sync = false; + if (ms_context->enable_pynative_infer()) { + if (tensor->device_address().get() == nullptr || tensor->device_address() != device_address) { + need_sync = true; + } + } else { + if (tensor->is_dirty()) { + need_sync = true; + } else if (tensor->device_address() != device_address) { + (void)tensor->data_sync(); + need_sync = true; + } + } + if (need_sync) { + if 
(ms_context->execution_mode() == kPynativeMode || AnfAlgo::IsParameterWeight(pk_node)) { + tensor->set_device_address(device_address); + } + MS_EXCEPTION_IF_NULL(device_address); + if (!device_address->SyncHostToDevice(trans::GetRuntimePaddingShape(pk_node, 0), + LongToSize(tensor->data().nbytes()), tensor->data_type(), + tensor->data_c(false))) { + MS_LOG(EXCEPTION) << "SyncHostToDevice failed."; + } + } + } + tensor->set_dirty(false); + } +} +} // namespace session +} // namespace mindspore diff --git a/mindspore/ccsrc/session/ascend_inference_session.h b/mindspore/ccsrc/session/ascend_inference_session.h new file mode 100644 index 0000000000..53be881f93 --- /dev/null +++ b/mindspore/ccsrc/session/ascend_inference_session.h @@ -0,0 +1,45 @@ +/** + * Copyright 2019 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_CCSRC_SESSION_ASCEND_INFERENCE_SESSION_H +#define MINDSPORE_CCSRC_SESSION_ASCEND_INFERENCE_SESSION_H +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "session/ascend_session.h" +#include "session/kernel_graph.h" +#include "kernel/kernel.h" +#include "session/session_factory.h" +#include "session/ascend_control_parser.h" + +namespace mindspore { +namespace session { +class AscendInferenceSession : public AscendSession { + public: + AscendInferenceSession() = default; + ~AscendInferenceSession() = default; + void LoadInputData(const std::shared_ptr &kernel_graph, + const std::vector &inputs_const) const; +}; +MS_REG_SESSION(kDavinciInferenceDevice, AscendInferenceSession); +} // namespace session +} // namespace mindspore +#endif // MINDSPORE_CCSRC_SESSION_ASCEND_INFERENCE_SESSION_H diff --git a/mindspore/ccsrc/session/ascend_session.cc b/mindspore/ccsrc/session/ascend_session.cc index f1b15b27ab..bae10ed943 100644 --- a/mindspore/ccsrc/session/ascend_session.cc +++ b/mindspore/ccsrc/session/ascend_session.cc @@ -29,6 +29,7 @@ #include "device/ascend/ascend_kernel_runtime.h" #include "device/ascend/ascend_device_address.h" #include "pre_activate/ascend/ascend_backend_optimization.h" +#include "pre_activate/common/common_backend_optimization.h" #include "device/kernel_adjust.h" #include "device/ascend/ascend_stream_assign.h" #include "device/ascend/ascend_label_assign.h" @@ -37,6 +38,7 @@ #include "ir/scalar.h" #include "debug/anf_ir_dump.h" #include "debug/anf_ir_utils.h" +#include "debug/draw.h" #include "common/utils.h" #include "pre_activate/common/helper.h" #include "device/kernel_runtime_manager.h" @@ -48,7 +50,7 @@ namespace mindspore { namespace session { const size_t kInvalidIndex = SIZE_MAX; namespace { -void DumpGraphExeOrder(const std::vector &execution_order) { +void DumpGraphExeOrder(const std::vector &execution_order, const std::string &tag = "") { MS_LOG(INFO) << "Dump 
execution_order size " << execution_order.size(); MS_LOG(INFO) << "[index][stream_label][graph_id][node string]"; int i = 0; @@ -60,6 +62,24 @@ void DumpGraphExeOrder(const std::vector &execution_order) { << "[" << cnode->DebugString() << "]"; i++; } + + std::stringstream buf; + buf << "================== execution order ==================\n"; + if (!tag.empty()) { + buf << tag << "\n"; + } + buf << "execution_order size: " << execution_order.size() << "\n"; + i = 0; + for (auto &cnode : execution_order) { + MS_EXCEPTION_IF_NULL(cnode); + buf << i << ":\n"; + buf << "\t" << cnode->DebugString() << "\n"; + buf << "\t" << AnfAlgo::GetStreamDistinctionLabel(cnode.get()) << "\n"; + buf << "\t" << AnfAlgo::GetGraphId(cnode.get()) << "\n"; + i++; + } + buf << "================== execution order ==================\n"; + // std::cout << buf.str() << std::endl; } void DumpGraphInputArgs(const VectorRef &args) { @@ -104,6 +124,7 @@ std::vector GetRealArgs(const KernelGraphPtr graph, const VectorRef &ar if (abstract->isa() && !AnfAlgo::CheckPrimitiveType(anf_node, prim::kPrimTupleGetItem)) { auto tuple_abstract = abstract->cast(); + MS_EXCEPTION_IF_NULL(tuple_abstract); real_args_size += tuple_abstract->size(); continue; } @@ -131,34 +152,6 @@ std::vector GetRealArgs(const KernelGraphPtr graph, const VectorRef &ar return real_args; } -void ClearRunOpMemoryResource(const KernelGraphPtr &kernel_graph) { - MS_EXCEPTION_IF_NULL(kernel_graph); - // clear input parameter memory resource - for (const auto &input_node : kernel_graph->inputs()) { - MS_EXCEPTION_IF_NULL(input_node); - AnfAlgo::SetOutputAddr(nullptr, 0, input_node.get()); - } - // clear input value node memory resource - for (const auto &value_node : kernel_graph->graph_value_nodes()) { - MS_EXCEPTION_IF_NULL(value_node); - AnfAlgo::SetOutputAddr(nullptr, 0, value_node.get()); - } - for (const auto &cnode : kernel_graph->execution_order()) { - MS_EXCEPTION_IF_NULL(cnode); - // clear output memory resource - for (size_t 
index = 0; index < AnfAlgo::GetOutputTensorNum(cnode); ++index) { - AnfAlgo::SetOutputAddr(nullptr, index, cnode.get()); - } - // clear workspace memory resource - auto kernel_mod = AnfAlgo::GetKernelMod(cnode); - MS_EXCEPTION_IF_NULL(kernel_mod); - auto workspace_lists = kernel_mod->GetWorkspaceSizeList(); - for (size_t index = 0; index < workspace_lists.size(); ++index) { - AnfAlgo::SetWorkspaceAddr(nullptr, index, cnode.get()); - } - } -} - std::vector GetCNodes(const std::vector &anf_nodes) { std::vector cnodes = {}; size_t i = 0; @@ -206,39 +199,32 @@ static std::vector> GetChildList(const std::vector ¶meters, const std::vector &args, + KernelGraph *child_graph) { + MS_EXCEPTION_IF_NULL(child_graph); + MS_LOG(INFO) << "Start bind parameter of child graph:" << child_graph->graph_id(); + if (args.empty()) { + return; + } + if (parameters.size() != args.size()) { + MS_LOG(EXCEPTION) << "Graph:" << child_graph->graph_id() << " parameters size:" << parameters.size() + << " and args size:" << args.size() << " not equal!"; + } + child_graph->SetExecOrderByDefault(); + for (size_t i = 0; i < parameters.size(); i++) { + if (args[i] == parameters[i]) { + child_graph->SetRealInput(parameters[i], args[i]); + MS_LOG(INFO) << "Parameter and arg are same."; + continue; + } + child_graph->SetRealInput(parameters[i], args[i]); + } +} + // if a call has kernel input, it's a child graph split from ME, so these kernel input should be set into real input of // graph.For example, call input = (prim,graph,kernel1,kernel2),then real_input = [kernel1,kernel2] static void UpdateRealInput(NotNull graph) { auto call_nodes = graph->FindNodeByPrimitive(prim::kPrimCall); - auto bind_call_arg_with_parameter = [&](const std::vector ¶meters, - const std::vector &args, KernelGraph *child_graph) -> void { - MS_EXCEPTION_IF_NULL(child_graph); - MS_LOG(INFO) << "start bind parameter of child graph:" << child_graph->graph_id(); - if (args.empty()) { - return; - } - if (parameters.size() != 
args.size()) { - MS_LOG(EXCEPTION) << "graph:" << child_graph->graph_id() << " parameters size:" << parameters.size() - << " and args size:" << args.size() << " not equal!"; - } - child_graph->SetExecOrderByDefault(); - for (size_t i = 0; i < parameters.size(); i++) { - if (args[i] == parameters[i]) { - child_graph->SetRealInput(parameters[i], args[i]); - MS_LOG(INFO) << "Parameter and arg are same"; - continue; - } - // if arg is a parameter ,then reuse this parameter - if (args[i]->isa()) { - MS_LOG(INFO) << "Parameter:" << parameters[i]->DebugString() << " of graph:" << child_graph->graph_id() - << " reuse parameter:" << args[i]->DebugString() - << " of graph:" << AnfAlgo::GetGraphId(args[i].get()); - child_graph->ReplaceNode(parameters[i], args[i]); - continue; - } - child_graph->SetRealInput(parameters[i], args[i]); - } - }; for (auto &call_node : call_nodes) { MS_EXCEPTION_IF_NULL(call_node); auto child_graphs = AnfAlgo::GetCallNodeKernelGraph(call_node); @@ -247,7 +233,7 @@ static void UpdateRealInput(NotNull graph) { std::vector real_args = std::vector(call_node->inputs().begin() + 2, call_node->inputs().end()); std::vector child_inputs = child_graphs[0]->inputs(); - bind_call_arg_with_parameter(child_inputs, real_args, child_graphs[0].get()); + BindCallArgsWithParameter(child_inputs, real_args, child_graphs[0].get()); call_node->set_inputs(std::vector(call_node->inputs().begin(), call_node->inputs().begin() + 2)); } else if (child_graphs.size() == 2) { auto get_partial_args = [&](size_t input_index) -> std::vector { @@ -264,8 +250,8 @@ static void UpdateRealInput(NotNull graph) { std::vector(partial_cnode->inputs().begin(), partial_cnode->inputs().begin() + 2)); return ret; }; - bind_call_arg_with_parameter(child_graphs[0]->inputs(), get_partial_args(2), child_graphs[0].get()); - bind_call_arg_with_parameter(child_graphs[1]->inputs(), get_partial_args(3), child_graphs[1].get()); + BindCallArgsWithParameter(child_graphs[0]->inputs(), get_partial_args(2), 
child_graphs[0].get()); + BindCallArgsWithParameter(child_graphs[1]->inputs(), get_partial_args(3), child_graphs[1].get()); } } } @@ -273,7 +259,7 @@ static void UpdateRealInput(NotNull graph) { static void RecurseToUpdateCallRealInput(NotNull graph, const NotNull *> memo) { memo->insert(graph.get()); - MS_LOG(INFO) << "start graph id:" << graph->graph_id(); + MS_LOG(INFO) << "Start graph id:" << graph->graph_id(); for (auto &child_graph : graph->child_graph_order()) { if (memo->find(child_graph) != memo->end()) { MS_LOG(INFO) << "Child graph:" << child_graph->graph_id() @@ -298,37 +284,57 @@ GraphId AscendSession::CompileGraph(const AnfNodePtrList &lst, const AnfNodePtrL GraphId AscendSession::CompileGraph(NotNull func_graph) { MS_LOG(INFO) << "start"; - auto graph = ConstructKernelGraph(func_graph); + std::vector all_graphs; + auto root_graph = ConstructKernelGraph(func_graph, &all_graphs); + BackendOptimization(all_graphs); // split switch - SplitGraphs(NOT_NULL(graph)); + SplitGraphs(NOT_NULL(root_graph)); // insert goto labels and label_sets - LinkChildGraphs(NOT_NULL(graph)); + LinkChildGraphs(NOT_NULL(root_graph)); // resource initialize InitRuntimeResource(); // assign label - AssignLabel(NOT_NULL(graph)); - // recurse compile child graph + AssignLabel(NOT_NULL(root_graph)); + // recurse compile child root_graph std::set memo; - RecurseCompileGraph(NOT_NULL(graph), NOT_NULL(&memo)); - // root graph valiate,include genearte execute order and so on - RootGraphExecutorValidate(NOT_NULL(graph)); + RecurseCompileGraph(NOT_NULL(root_graph), NOT_NULL(&memo)); + // root root_graph valiate,include genearte execute order and so on + RootGraphExecutorValidate(NOT_NULL(root_graph)); // adjust kernel - AdjustKernel(graph); + AdjustKernel(root_graph); // assign stream - AssignStream(graph); + AssignStream(root_graph); + // insert profiling point + device::KernelAdjust::GetInstance().Profiling(NOT_NULL(root_graph.get())); // build kernel - BuildKernel(graph); + 
BuildKernel(root_graph); // alloc mem - MemoryAlloc(graph.get()); + MemoryAlloc(root_graph.get()); // task generate - GenerateTaskInfo(graph); + GenerateTaskInfo(root_graph); // load task into device - LoadTask(graph); - // return the graph id to backend - auto graph_id = graph->graph_id(); + LoadTask(root_graph); + // return the root_graph id to backend + auto graph_id = root_graph->graph_id(); return graph_id; } +void AscendSession::SetFinalGraphSummaryFlag(const std::shared_ptr &kernel_graph) { + MS_EXCEPTION_IF_NULL(kernel_graph); + auto graph_order = GetGraphOrder(kernel_graph->graph_id()); + for (auto graph_id : graph_order) { + auto child_graph = GetGraph(graph_id); + if (child_graph == nullptr) { + continue; + } + if (child_graph->summary_node_exist()) { + kernel_graph->set_summary_node_exist(true); + return; + } + } + kernel_graph->set_summary_node_exist(false); +} + void AscendSession::BuildGraph(GraphId graph_id) { MS_LOG(INFO) << "start"; auto graph = GetGraph(graph_id); @@ -344,6 +350,7 @@ void AscendSession::BuildGraph(GraphId graph_id) { InsertAllAssigns(); // insert switch and active to child graph MergeSwitchCompile(); + SetFinalGraphSummaryFlag(graph); // OptChildGraphs auto graph_order = GetGraphOrder(final_graph_id_); auto &graph_type = GetGraphOrderType(final_graph_id_); @@ -355,6 +362,7 @@ void AscendSession::BuildGraph(GraphId graph_id) { auto child_graph = GetGraph(graph_order[i]); CompileChildGraph(child_graph); } + GetSummaryNodes(graph.get()); // merge child graph MergeGraphExecOrder(); } else { @@ -394,8 +402,28 @@ void AscendSession::CompileChildGraph(const KernelGraphPtr &child_graph) { MS_EXCEPTION_IF_NULL(child_graph); MS_LOG(INFO) << "CompileChildGraph " << child_graph->ToString(); opt::AscendBackendIRFusionOptimization(child_graph); + opt::AscendBackendFuseBasicOpt(child_graph, true); + opt::AscendBackendGraphKernelOpt(child_graph, true); + child_graph->SetExecOrderByDefault(); + auto context_ptr = MsContext::GetInstance(); + 
MS_EXCEPTION_IF_NULL(context_ptr); + bool save_graphs = context_ptr->save_graphs_flag(); + auto save_graphs_path = context_ptr->save_graphs_path(); + if (save_graphs_path.empty()) { + save_graphs_path = "."; + } + if (save_graphs) { + std::string file_path = + save_graphs_path + "/" + "select_kernel_before" + "_graph_" + std::to_string(child_graph->graph_id()) + ".ir"; + DumpIR(file_path, child_graph); + } // select kernel build info SelectKernel(*child_graph); + if (save_graphs) { + std::string file_path = + save_graphs_path + "/" + "select_kernel_after" + "_graph_" + std::to_string(child_graph->graph_id()) + ".ir"; + DumpIR(file_path, child_graph); + } // convert kernel Graph to model predictmodel::StepConvertGraph(child_graph); // optimize graph @@ -411,7 +439,6 @@ void AscendSession::RunGraph(const GraphId &graph_id, const std::vectorexecutable()) { @@ -512,7 +539,7 @@ py::tuple AscendSession::RunOp(const OpRunInfo &op_run_info, const GraphInfo &gr } py::object tuple_obj = utils::cast(output_tensors).object_; py::tuple tuple_tensors = py::cast(tuple_obj); - ClearRunOpMemoryResource(graph); + RunOpMemoryClear(graph.get()); MS_LOG(INFO) << "Run op " << op_run_info.op_name << " finish!"; return tuple_tensors; } @@ -531,13 +558,17 @@ void AscendSession::SelectKernel(const KernelGraph &kernel_graph) const { } MS_LOG(INFO) << "Select ApplyKernel: " << cnode->DebugString(); } - if (raise_precision_count > 0) { - MS_LOG(WARNING) << "There has " << raise_precision_count - << " node/nodes used raise precision to selected the kernel!"; - } - if (reduce_precision_count > 0) { - MS_LOG(WARNING) << "There has " << reduce_precision_count - << " node/nodes used reduce precision to selected the kernel!"; + auto ms_context = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(ms_context); + if (ms_context->execution_mode() == kGraphMode) { + if (raise_precision_count > 0) { + MS_LOG(WARNING) << "There has " << raise_precision_count + << " node/nodes used raise precision to selected 
the kernel!"; + } + if (reduce_precision_count > 0) { + MS_LOG(WARNING) << "There has " << reduce_precision_count + << " node/nodes used reduce precision to selected the kernel!"; + } } MS_LOG(INFO) << "Finish!"; } @@ -553,8 +584,12 @@ void AscendSession::InitRuntimeResource() { } void AscendSession::HardwareOptimize(const std::shared_ptr &kernel_graph) const { + device::ascend::KernelPreBuild(kernel_graph.get()); MS_LOG(INFO) << "HardwareOptimize start!"; opt::AscendBackendOptimization(kernel_graph); + opt::AscendGraphKernelCommonProcess(kernel_graph); + opt::AscendBackendFuseBasicOpt(kernel_graph, false); + opt::AscendBackendAddAtomicClean(kernel_graph); MS_EXCEPTION_IF_NULL(kernel_graph); kernel_graph->SetExecOrderByDefault(); MS_LOG(INFO) << "HardwareOptimize Finish!"; @@ -562,7 +597,6 @@ void AscendSession::HardwareOptimize(const std::shared_ptr &kernel_ void AscendSession::AdjustKernel(const std::shared_ptr &kernel_graph) const { MS_LOG(INFO) << "Start!"; - device::KernelAdjust::GetInstance().Reorder(kernel_graph); opt::HideNopNode(kernel_graph.get()); // Insert CLearZero op // prepare for next step from json get atomic info @@ -595,7 +629,7 @@ void AscendSession::RunOpAdjustKernel(const std::shared_ptr &kernel void AscendSession::AssignStream(const std::shared_ptr &kernel_graph) const { MS_LOG(INFO) << "Start!"; - device::ascend::AscendStreamAssign::GetInstance().AssignStreamNew(kernel_graph); + device::ascend::AscendStreamAssign::GetInstance().AssignStream(kernel_graph); MS_LOG(INFO) << "Finish!"; } @@ -642,6 +676,13 @@ void AscendSession::RunOpMemoryAlloc(const std::vector &input MS_LOG(INFO) << "Finish!"; } +void AscendSession::RunOpMemoryClear(KernelGraph *kernel_graph) const { + MS_EXCEPTION_IF_NULL(kernel_graph); + auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_); + MS_EXCEPTION_IF_NULL(runtime_instance); + runtime_instance->RunOpClearMemory(kernel_graph); +} + void 
AscendSession::GenerateTaskInfo(const std::shared_ptr &kernel_graph) const { MS_LOG(INFO) << "Start!"; (void)device::KernelAdjust::GetInstance().StepLoadCtrlInputs(kernel_graph); @@ -698,14 +739,15 @@ void AscendSession::ExportChildGraphs(const GraphId graph_id) { save_graphs_path = "."; } if (graph_id == final_graph_id_) { - auto &graph_order = GetGraphOrder(final_graph_id_); - auto &graph_type = GetGraphOrderType(final_graph_id_); + const auto &graph_order = GetGraphOrder(final_graph_id_); + const auto &graph_type = GetGraphOrderType(final_graph_id_); for (size_t i = 0; i < graph_order.size(); i++) { if (graph_type[i] == BRANCH_END || graph_type[i] == BRANCH_START) { continue; } - auto child_graph = GetGraph(graph_order[i]); + const auto child_graph = GetGraph(graph_order[i]); MS_LOG(DEBUG) << "Start export child graph " << graph_order[i]; + MS_EXCEPTION_IF_NULL(child_graph); std::string file_path = save_graphs_path + "/graph_build_" + std::to_string(child_graph->graph_id()) + ".ir"; DumpIR(file_path, child_graph, true); DumpIRProto(child_graph, "vm_build_" + std::to_string(child_graph->graph_id())); @@ -755,29 +797,47 @@ GraphId AscendSession::SetFinalGraphInput(const std::vector &args) { return final_graph_id_; } -void AscendSession::GetSummaryNodes(const KernelGraph *graph, - std::unordered_map> *summary) { - MS_LOG(DEBUG) << "Update summary Start"; +void AscendSession::RecurseGetSummaryNodes(KernelGraph *graph, + std::map> *summary) { MS_EXCEPTION_IF_NULL(graph); MS_EXCEPTION_IF_NULL(summary); - summary->clear(); // if final graph have no child graph auto graph_order_iter = graph_execute_orders_.find(graph->graph_id()); if (graph_order_iter == graph_execute_orders_.end()) { - SessionBasic::GetSummaryNodes(graph, summary); + SessionBasic::GetSummaryNodes(graph); + auto summary_nodes = graph->summary_nodes(); + (*summary).insert(summary_nodes.begin(), summary_nodes.end()); return; } // for every child graph, find summary nodes auto graph_order = 
GetGraphOrder(graph->graph_id()); for (size_t i = 0; i < graph_order.size(); i++) { auto child_graph = GetGraph(graph_order[i]); - SessionBasic::GetSummaryNodes(child_graph.get(), summary); + if (child_graph == nullptr) { + continue; + } + SessionBasic::GetSummaryNodes(child_graph.get()); + auto child_graph_summary = child_graph->summary_nodes(); + (*summary).insert(child_graph_summary.begin(), child_graph_summary.end()); + RecurseGetSummaryNodes(child_graph.get(), summary); } - MS_LOG(DEBUG) << "Update summary end size: " << (*summary).size(); + graph->set_summary_nodes(*summary); +} + +void AscendSession::GetSummaryNodes(KernelGraph *graph) { + MS_LOG(DEBUG) << "Update summary Start"; + MS_EXCEPTION_IF_NULL(graph); + auto summary_nodes = graph->summary_nodes(); + std::map> summary; + summary.insert(summary_nodes.begin(), summary_nodes.end()); + RecurseGetSummaryNodes(graph, &summary); + graph->set_summary_nodes(summary); + MS_LOG(DEBUG) << "Update summary end size: " << summary.size(); } AnfNodePtr AscendSession::CreateFakeOutput(GraphId fake_graph_id, const AnfNodePtr &true_output) { auto fake_graph = GetGraph(fake_graph_id); + MS_EXCEPTION_IF_NULL(fake_graph); auto output_item_with_index = AnfAlgo::VisitKernelWithReturnType(true_output, 0); auto create_parameter = [&](const AbstractBasePtr &abstract) -> AnfNodePtr { auto parameter = fake_graph->NewParameter(); @@ -798,7 +858,7 @@ AnfNodePtr AscendSession::CreateFakeOutput(GraphId fake_graph_id, const AnfNodeP if (abstract->isa()) { auto tuple_abstract = abstract->cast(); MS_EXCEPTION_IF_NULL(tuple_abstract); - MS_LOG(INFO) << "tuple_size [" << tuple_abstract->size() << "]"; + MS_LOG(INFO) << "Tuple size [" << tuple_abstract->size() << "]"; return create_parameter((*tuple_abstract)[output_idx]); } return create_parameter(cnode->abstract()); @@ -990,6 +1050,7 @@ void AscendSession::SwitchCompile(GraphId cond_graph_id, GraphId true_graph_id, if (false_graph_id != kInvalidGraphId) { // false graph and condition in 
graph same stream auto condition_graph = GetGraph(cond_graph_id); + MS_EXCEPTION_IF_NULL(condition_graph); SetStreamDistinctionLabel(GetGraph(false_graph_id), condition_graph->stream_distinction_label(), true); // if false graph is a condition graph and has been switch compiled before,it's false should be updated again auto cond_it = switches_.find(false_graph_id); @@ -997,6 +1058,9 @@ void AscendSession::SwitchCompile(GraphId cond_graph_id, GraphId true_graph_id, cond_graph_id = cond_it->first; false_graph_id = cond_it->second.second; condition_graph = GetGraph(cond_graph_id); + if (condition_graph == nullptr) { + continue; + } SetStreamDistinctionLabel(GetGraph(false_graph_id), condition_graph->stream_distinction_label(), true); cond_it = switches_.find(false_graph_id); } @@ -1133,7 +1197,7 @@ void AscendSession::SetChildGraphParameter(const AnfNodePtr &front_anf, GraphId MS_EXCEPTION_IF_NULL(backend_arg); MS_LOG(INFO) << "Reuse node [" << backend_arg->DebugString() << "], old node[" << backend_parameter->DebugString() << "] will be replaced."; - to_graph->ReplaceNode(backend_parameter, backend_arg); + to_graph->ReplaceNode(NOT_NULL(backend_parameter), NOT_NULL(backend_arg)); return; } MS_LOG(INFO) << "Assign of node" << backend_arg->DebugString() << " of graph " << from_graph_id << " to node" @@ -1429,22 +1493,44 @@ void AscendSession::SyncInitialTenosrToDevice() { } } -std::vector AscendSession::ConstructSplitedGraph(const KernelGraphPtr &new_kernel_graph, - const std::vector &list) { - MS_EXCEPTION_IF_NULL(new_kernel_graph); - MS_LOG(INFO) << "start contruct splited kernel graph:" << new_kernel_graph->graph_id(); +static void ConstructSplitedGraphOutput(const KernelGraphPtr &new_kernel_graph, const std::vector &list) { // count the output of every anf node std::set has_output_nodes; for (auto &anf_node : list) { + MS_EXCEPTION_IF_NULL(anf_node); for (auto &input : anf_node->inputs()) { (void)has_output_nodes.insert(input); } } + + auto make_tuple_primitve = 
NewValueNode(std::make_shared(prim::kPrimMakeTuple->name())); + std::vector make_tuple_inputs = {make_tuple_primitve}; + int output_idx = 0; + MS_EXCEPTION_IF_NULL(new_kernel_graph); + for (auto &anf_node : list) { + if (AnfAlgo::CheckPrimitiveType(anf_node, prim::kPrimReturn)) { + new_kernel_graph->set_return(anf_node); + } + if (has_output_nodes.find(anf_node) == has_output_nodes.end()) { + MS_LOG(INFO) << "Output[" << output_idx++ << "]:" << anf_node->DebugString(); + make_tuple_inputs.push_back(anf_node); + } + } + if (new_kernel_graph->get_return() == nullptr) { + new_kernel_graph->set_output(new_kernel_graph->NewCNode(make_tuple_inputs)); + } +} + +std::vector AscendSession::ConstructSplitedGraph(const KernelGraphPtr &new_kernel_graph, + const std::vector &list) { + MS_EXCEPTION_IF_NULL(new_kernel_graph); + MS_LOG(INFO) << "start contruct splited kernel graph:" << new_kernel_graph->graph_id(); MS_LOG(INFO) << "Construct input of kernel graph:" << new_kernel_graph->graph_id(); std::vector call_node_inputs; std::vector new_graph_inputs; // create new parameter from cnode for (auto &anf_node : list) { + MS_EXCEPTION_IF_NULL(anf_node); auto cnode = anf_node->cast(); for (size_t input_idx = 1; input_idx < cnode->inputs().size(); input_idx++) { auto input = cnode->inputs()[input_idx]; @@ -1479,26 +1565,21 @@ std::vector AscendSession::ConstructSplitedGraph(const KernelGraphPt MS_EXCEPTION_IF_NULL(graph_inputs); graph_inputs->clear(); std::copy(new_graph_inputs.begin(), new_graph_inputs.end(), std::back_inserter(*graph_inputs)); + MS_LOG(INFO) << "Construct output of kernel graph:" << new_kernel_graph->graph_id(); - auto make_tuple_primitve = NewValueNode(std::make_shared(prim::kPrimMakeTuple->name())); - std::vector make_tuple_inputs = {make_tuple_primitve}; - int output_idx = 0; - for (auto &anf_node : list) { - if (AnfAlgo::CheckPrimitiveType(anf_node, prim::kPrimReturn)) { - new_kernel_graph->set_return(anf_node); - } - if (has_output_nodes.find(anf_node) == 
has_output_nodes.end()) { - MS_LOG(INFO) << "output[" << output_idx++ << "]:" << anf_node->DebugString(); - make_tuple_inputs.push_back(anf_node); - } - } - if (new_kernel_graph->get_return() == nullptr) { - new_kernel_graph->set_output(new_kernel_graph->NewCNode(make_tuple_inputs)); - } + ConstructSplitedGraphOutput(new_kernel_graph, list); MS_LOG(INFO) << "end"; return call_node_inputs; } +void AscendSession::BackendOptimization(const std::vector &all_graphs) { + MS_LOG(INFO) << "Start BackendCommonOptimization"; + for (auto &graph : all_graphs) { + opt::BackendCommonOptimization(graph); + } + MS_LOG(INFO) << "End."; +} + void AscendSession::SplitGraphs(NotNull root_graph) { std::set memo; // if root graph output is a call node ,the root graph is condition graph of 'if' sentence @@ -1512,43 +1593,50 @@ void AscendSession::SplitGraphs(NotNull root_graph) { RecurseSplitGraph(root_graph, NOT_NULL(&memo)); } memo.clear(); + // add maketuple to the end of the last child graph to suit old process + auto output_graph = root_graph->child_graph_order().empty() ? 
root_graph : root_graph->child_graph_order().back(); + auto make_tuple = output_graph->NewCNode( + {NewValueNode(std::make_shared(prim::kPrimMakeTuple->name())), output_graph->output()}); + output_graph->set_output(make_tuple); // replace the real input if the real input is a call RecurseToUpdateCallRealInput(root_graph, NOT_NULL(&memo)); } +AnfNodePtr AscendSession::BindNewCallToNewGraph(NotNull graph, + const std::vector &child_graph_list) { + // if child graph list only has a call ,then return the exist call + if (child_graph_list.size() == 1 && AnfAlgo::CheckPrimitiveType(child_graph_list[0], prim::kPrimCall)) { + return child_graph_list[0]; + } + // create new child graph + auto child_graph = NewKernelGraph(); + MS_EXCEPTION_IF_NULL(child_graph); + // create new value node to bind child graph + auto graph_value_node = graph->NewValueNode(NewValueNode(child_graph)); + std::vector new_call_input = {NewValueNode(std::make_shared(prim::kPrimCall->name())), + graph_value_node}; + // set the graph id of all node of child graph + for (auto &child_graph_node : child_graph_list) { + AnfAlgo::SetGraphId(child_graph->graph_id(), child_graph_node.get()); + } + auto call_node_args = ConstructSplitedGraph(child_graph, child_graph_list); + std::copy(call_node_args.begin(), call_node_args.end(), std::back_inserter(new_call_input)); + auto new_call = graph->NewCNode(new_call_input); + AnfAlgo::SetNodeAttr("graph_id", MakeValue(graph->graph_id()), new_call); + return new_call; +} + void AscendSession::SplitGraph(NotNull graph, const std::set &cut_prims) { - MS_LOG(INFO) << "start,graph_id:" << graph->graph_id(); + MS_LOG(INFO) << "Start,graph_id:" << graph->graph_id(); auto apply_list = GetCNodes(TopoSort(graph->get_return())); // update the root graph child graph order AscendControlParser::UpdateChildGraphOrder(graph); // get child list from current graph std::vector> child_graph_lists = GetChildList(apply_list, cut_prims); - auto bind_new_call_to_new_graph = [&](std::vector 
child_graph_list) -> AnfNodePtr { - // if child graph list only has a call ,then return the exist call - if (child_graph_list.size() == 1 && AnfAlgo::CheckPrimitiveType(child_graph_list[0], prim::kPrimCall)) { - return child_graph_list[0]; - } - // create new child graph - auto child_graph = NewKernelGraph(); - MS_EXCEPTION_IF_NULL(child_graph); - // create new value node to bind child graph - auto graph_value_node = graph->NewValueNode(NewValueNode(child_graph)); - std::vector new_call_input = {NewValueNode(std::make_shared(prim::kPrimCall->name())), - graph_value_node}; - // set the graph id of all node of child graph - for (auto &child_graph_node : child_graph_list) { - AnfAlgo::SetGraphId(child_graph->graph_id(), child_graph_node.get()); - } - auto call_node_args = ConstructSplitedGraph(child_graph, child_graph_list); - std::copy(call_node_args.begin(), call_node_args.end(), std::back_inserter(new_call_input)); - auto new_call = graph->NewCNode(new_call_input); - AnfAlgo::SetNodeAttr("graph id", MakeValue(graph->graph_id()), new_call); - return new_call; - }; if (child_graph_lists.size() > 1) { std::list depend_input = {}; for (size_t call_index = 0; call_index < child_graph_lists.size(); call_index++) { - auto call_node = bind_new_call_to_new_graph(child_graph_lists[call_index]); + auto call_node = BindNewCallToNewGraph(graph, child_graph_lists[call_index]); MS_EXCEPTION_IF_NULL(call_node); // if call node is the last call of true graph,no need create child graph after that auto child_graphs = AnfAlgo::GetCallNodeKernelGraph(call_node->cast()); @@ -1605,6 +1693,5 @@ void AscendSession::RecurseCompileGraph(NotNull graph, const Not RecurseCompileGraph(NOT_NULL(child_graph), memo); } } - } // namespace session } // namespace mindspore diff --git a/mindspore/ccsrc/session/ascend_session.h b/mindspore/ccsrc/session/ascend_session.h index 529304714c..7857330115 100755 --- a/mindspore/ccsrc/session/ascend_session.h +++ b/mindspore/ccsrc/session/ascend_session.h @@ 
-67,8 +67,8 @@ class AscendSession : public SessionBasic { void SetActive(GraphId, GraphId) override; // compile child graph when session have multiple child graphs void CompileChildGraph(const KernelGraphPtr &child_graph); - void GetSummaryNodes(const KernelGraph *graph, - std::unordered_map> *summary) override; + void RecurseGetSummaryNodes(KernelGraph *graph, std::map> *summary); + void GetSummaryNodes(KernelGraph *graph); private: void InitRuntimeResource(); @@ -81,6 +81,7 @@ class AscendSession : public SessionBasic { void BuildKernel(const std::shared_ptr &kernel_graph) const; void MemoryAlloc(KernelGraph *kernel_graph) const; void RunOpMemoryAlloc(const std::vector &input_tensors, KernelGraph *kernel_graph) const; + void RunOpMemoryClear(KernelGraph *kernel_graph) const; void GenerateTaskInfo(const std::shared_ptr &kernel_graph) const; void LoadTask(const std::shared_ptr &kernel_graph) const; void ExecTask(const std::shared_ptr &kernel_graph) const; @@ -101,12 +102,14 @@ class AscendSession : public SessionBasic { void SplitGraph(NotNull graph, const std::set &cut_prims); // split graphs with recurse from root graph void SplitGraphs(NotNull root_graph); + void BackendOptimization(const std::vector &all_graphs); void LinkChildGraphs(NotNull graph); void RootGraphExecutorValidate(NotNull graph); std::vector ConstructSplitedGraph(const KernelGraphPtr &new_kernel_graph, const std::vector &list); void RecurseCompileGraph(NotNull graph, const NotNull *> memo); void RecurseSplitGraph(NotNull graph, const NotNull *> memo); + AnfNodePtr BindNewCallToNewGraph(NotNull graph, const std::vector &child_graph_list); // merge execution order list of child graphs void MergeGraphExecOrder(); @@ -148,6 +151,7 @@ class AscendSession : public SessionBasic { AnfNodePtr CreateFakeOutput(GraphId final_graph_id, const AnfNodePtr &true_output); // sync intial tensors' data to device void SyncInitialTenosrToDevice(); + void SetFinalGraphSummaryFlag(const std::shared_ptr 
&kernel_graph); // member variables // key is final_graph_id,value is child graph execute order of final graph diff --git a/mindspore/ccsrc/session/cpu_session.cc b/mindspore/ccsrc/session/cpu_session.cc index 32e3d8b6cc..e70e551022 100644 --- a/mindspore/ccsrc/session/cpu_session.cc +++ b/mindspore/ccsrc/session/cpu_session.cc @@ -28,6 +28,23 @@ namespace mindspore { namespace session { +ParameterPtr CPUSession::CreateNewParameterFromParameter(const AnfNodePtr &anf, bool valid_input, KernelGraph *graph) { + MS_EXCEPTION_IF_NULL(anf); + if (!anf->isa()) { + MS_LOG(EXCEPTION) << "anf[" << anf->DebugString() << "] is not a parameter"; + } + auto valid_inputs = graph->MutableValidInputs(); + MS_EXCEPTION_IF_NULL(valid_inputs); + auto graph_inputs = graph->MutableInputs(); + MS_EXCEPTION_IF_NULL(graph_inputs); + TraceManager::DebugTrace(std::make_shared(anf->debug_info())); + ParameterPtr new_parameter = graph->NewParameter(anf->cast()); + TraceManager::EndTrace(); + graph_inputs->push_back(new_parameter); + valid_inputs->push_back(valid_input); + return new_parameter; +} + GraphId CPUSession::CompileGraph(const AnfNodePtrList &lst, const AnfNodePtrList &outputs) { auto graph_id = graph_sum_; auto graph = ConstructKernelGraph(lst, outputs); @@ -46,16 +63,35 @@ void CPUSession::RunGraph(const GraphId &graph_id, const std::vector need_sync_outputs; + runtime_.BindInputOutput(kernel_graph.get(), inputs, outputs, &need_sync_outputs); MS_LOG(INFO) << "Run graph start"; predictmodel::StepConvertWeight(inputs); auto execution_order = kernel_graph->execution_order(); Reorder(&execution_order); + + bool enable_summary = summary_callback_ != nullptr; kernel_graph->set_execution_order(execution_order); + NamedSummaryOutputs summary_outputs; + if (enable_summary) { + GetSummaryNodes(kernel_graph.get()); + summary_outputs = kernel_graph->summary_nodes(); + runtime_.IncreaseSummaryRefCount(summary_outputs); + } + bool ret = runtime_.Run(kernel_graph.get()); if (!ret) { 
MS_LOG(EXCEPTION) << "Run graph failed"; } + for (auto output : need_sync_outputs) { + (void)output->data_sync(); + } + + if (enable_summary) { + Summary(kernel_graph.get()); + runtime_.DecreaseSummaryRefCount(summary_outputs); + } + MS_LOG(INFO) << "Run graph end"; } diff --git a/mindspore/ccsrc/session/cpu_session.h b/mindspore/ccsrc/session/cpu_session.h index c53b0d2d8c..36b987e840 100644 --- a/mindspore/ccsrc/session/cpu_session.h +++ b/mindspore/ccsrc/session/cpu_session.h @@ -35,6 +35,9 @@ class CPUSession : public SessionBasic { GraphId CompileGraph(const AnfNodePtrList &lst, const AnfNodePtrList &outputs) override; void RunGraph(const GraphId &graph_id, const std::vector &inputs, VectorRef *outputs) override; + protected: + ParameterPtr CreateNewParameterFromParameter(const AnfNodePtr &anf, bool valid_input, KernelGraph *graph) override; + private: void SetKernelInfo(const KernelGraph *kernel_graph); void BuildKernel(const KernelGraph *kernel_graph); diff --git a/mindspore/ccsrc/session/gpu_session.cc b/mindspore/ccsrc/session/gpu_session.cc index b843514793..e67a922567 100644 --- a/mindspore/ccsrc/session/gpu_session.cc +++ b/mindspore/ccsrc/session/gpu_session.cc @@ -22,6 +22,7 @@ #include "pre_activate/common/pass_manager.h" #include "pre_activate/common/helper.h" #include "pre_activate/pass/communication_op_fusion.h" +#include "pre_activate/pass/getitem_tuple.h" #include "device/kernel_runtime_manager.h" #include "predict/predict.h" #include "common/utils.h" @@ -51,9 +52,11 @@ void GPUSession::StartKernelRT() const { } void GPUSession::Optimize(const std::shared_ptr &kernel_graph) { + MS_EXCEPTION_IF_NULL(kernel_graph); auto optimizer = std::make_shared(); auto pm = std::make_shared(); pm->AddPass(std::make_shared()); + pm->AddPass(std::make_shared()); optimizer->AddPassManager(pm); (void)optimizer->Optimize(kernel_graph); kernel_graph->SetExecOrderByDefault(); @@ -72,7 +75,6 @@ void GPUSession::AllocateMemory(KernelGraph *kernel_graph) const { 
MS_EXCEPTION_IF_NULL(kernel_graph); auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_); MS_EXCEPTION_IF_NULL(runtime_instance); - // opt::RemoveNopNode(kernel_graph); runtime_instance->AssignMemory(kernel_graph); } @@ -81,10 +83,16 @@ void GPUSession::RunOpAllocateMemory(const std::vector &input MS_EXCEPTION_IF_NULL(kernel_graph); auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_); MS_EXCEPTION_IF_NULL(runtime_instance); - // opt::RemoveNopNode(kernel_graph); runtime_instance->RunOpAssignMemory(input_tensors, kernel_graph); } +void GPUSession::RunOpClearMemory(KernelGraph *kernel_graph) const { + MS_EXCEPTION_IF_NULL(kernel_graph); + auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_); + MS_EXCEPTION_IF_NULL(runtime_instance); + runtime_instance->RunOpClearMemory(kernel_graph); +} + void GPUSession::LoadInputData(const std::shared_ptr &kernel_graph, const std::vector &inputs_const) const { std::vector inputs(inputs_const); @@ -101,17 +109,19 @@ void GPUSession::LoadInputData(const std::shared_ptr &kernel_graph, if (input_node->isa() && AnfAlgo::OutputAddrExist(input_node, 0)) { auto pk_node = input_node->cast(); auto device_address = AnfAlgo::GetMutableOutputAddr(pk_node, 0); + auto tensor_address = tensor->device_address(); bool need_sync = false; if (ms_context->enable_pynative_infer()) { - if (tensor->device_address().get() == nullptr || tensor->device_address() != device_address) { + if (tensor_address.get() == nullptr || tensor_address != device_address) { need_sync = true; } - } else { - if (tensor->is_dirty()) { + } else if (tensor->is_dirty()) { + need_sync = true; + } else if (tensor_address != device_address) { + if (tensor_address->DeviceType() == device_address->DeviceType()) { + AnfAlgo::SetOutputAddr(tensor_address, 0, pk_node.get()); + } else { need_sync = true; - } else 
if (tensor->device_address() != device_address) { - AnfAlgo::SetOutputAddr(tensor->device_address(), 0, pk_node.get()); - need_sync = false; } } if (need_sync) { @@ -140,6 +150,7 @@ GraphId GPUSession::CompileGraph(const AnfNodePtrList &lst, const AnfNodePtrList // Construct graph, if successfully, graph_sum_ + 1 auto graph_id = graph_sum_; auto graph = ConstructKernelGraph(lst, outputs); + MS_EXCEPTION_IF_NULL(graph); // Select kernel build info SelectKernel(graph); // Convert kernel Graph to model @@ -150,14 +161,18 @@ GraphId GPUSession::CompileGraph(const AnfNodePtrList &lst, const AnfNodePtrList Optimize(graph); // Assign CUDA streams AssignStream(graph); - // Remove NoOp from execution graph - // opt::HideNopNode(graph.get()); + // Hide NoOp from execution graph + opt::HideNopNode(graph.get()); // Build kernel if node is cnode BuildKernel(graph); // Set graph execution order before memory alloc, ensure that memory alloc is according to the reorder graph auto execution_order = graph->execution_order(); Reorder(&execution_order); graph->set_execution_order(execution_order); + // Get summary nodes. + GetSummaryNodes(graph.get()); + // Remove NoOp from execution graph + opt::RemoveNopNode(graph.get()); // Alloc memory, including static memory and dynamic memory AllocateMemory(graph.get()); MS_EXCEPTION_IF_NULL(context_); @@ -194,11 +209,17 @@ void GPUSession::RunGraph(const GraphId &graph_id, const std::vector &input_tensors, const std::vector &tensors_mask) { + // Check if the graph cache exists. 
+ if (run_op_graphs_.find(graph_info) != run_op_graphs_.end()) { + return; + } // Prepare the graph auto kernel_graph = ConstructSingleOpGraph(op_run_info, input_tensors, tensors_mask); MS_EXCEPTION_IF_NULL(kernel_graph); SelectKernel(kernel_graph); StartKernelRT(); + // Hide NoOp from execution graph + opt::HideNopNode(kernel_graph.get()); BuildKernel(kernel_graph); run_op_graphs_[graph_info] = kernel_graph; } @@ -207,6 +228,8 @@ py::tuple GPUSession::RunOp(const OpRunInfo &op_run_info, const GraphInfo &graph const std::vector &input_tensors) { auto kernel_graph = run_op_graphs_[graph_info]; MS_EXCEPTION_IF_NULL(kernel_graph); + // Remove NoOp from execution graph + opt::RemoveNopNode(kernel_graph.get()); RunOpAllocateMemory(input_tensors, kernel_graph.get()); // Execute the computation LoadInputData(kernel_graph, input_tensors); @@ -222,7 +245,7 @@ py::tuple GPUSession::RunOp(const OpRunInfo &op_run_info, const GraphInfo &graph } py::object tuple_obj = utils::cast(output_tensors).object_; py::tuple tuple_tensors = py::cast(tuple_obj); - run_op_graphs_.clear(); + RunOpClearMemory(kernel_graph.get()); return tuple_tensors; } } // namespace gpu diff --git a/mindspore/ccsrc/session/gpu_session.h b/mindspore/ccsrc/session/gpu_session.h index b396e4a9ba..0dfb815abe 100644 --- a/mindspore/ccsrc/session/gpu_session.h +++ b/mindspore/ccsrc/session/gpu_session.h @@ -59,6 +59,8 @@ class GPUSession : public SessionBasic { void RunOpAllocateMemory(const std::vector &input_tensors, KernelGraph *kernel_graph) const; + void RunOpClearMemory(KernelGraph *kernel_graph) const; + void LoadInputData(const std::shared_ptr &kernel_graph, const std::vector &inputs_const) const override; diff --git a/mindspore/ccsrc/session/kernel_graph.cc b/mindspore/ccsrc/session/kernel_graph.cc index c1992b7cc0..7e9bb62aab 100644 --- a/mindspore/ccsrc/session/kernel_graph.cc +++ b/mindspore/ccsrc/session/kernel_graph.cc @@ -24,6 +24,7 @@ #include "device/kernel_info.h" #include 
"kernel/kernel_build_info.h" #include "device/kernel_runtime_manager.h" +#include "kernel/common_utils.h" namespace mindspore { namespace session { @@ -43,12 +44,28 @@ void PushNoVisitedNode(const AnfNodePtr &node, std::queue *que, std::vector GetCallRealOutputs(const AnfNodePtr &call_node) { auto item_with_index = AnfAlgo::VisitKernelWithReturnType(call_node, 0); - MS_EXCEPTION_IF_NULL(item_with_index.first); - if (!AnfAlgo::CheckPrimitiveType(item_with_index.first, prim::kPrimCall)) { - return {item_with_index.first}; + AnfNodePtr node = item_with_index.first; + MS_EXCEPTION_IF_NULL(node); + if (AnfAlgo::CheckPrimitiveType(node, prim::kPrimMakeTuple)) { + auto outputs = AnfAlgo::GetAllOutput(node); + std::set memo; + std::vector new_output; + for (auto &output : outputs) { + if (memo.find(output) != memo.end()) { + continue; + } + memo.insert(output); + new_output.push_back(output); + } + if (new_output.size() == 1 && AnfAlgo::CheckPrimitiveType(new_output[0], prim::kPrimCall)) { + node = new_output[0]; + } + } + if (!AnfAlgo::CheckPrimitiveType(node, prim::kPrimCall)) { + return {node}; } std::vector real_inputs; - auto child_graphs = AnfAlgo::GetCallNodeKernelGraph(item_with_index.first->cast()); + auto child_graphs = AnfAlgo::GetCallNodeKernelGraph(node->cast()); for (const auto &child_graph : child_graphs) { if (child_graph->get_output_null()) { continue; @@ -59,6 +76,31 @@ std::vector GetCallRealOutputs(const AnfNodePtr &call_node) { } return real_inputs; } + +AnfNodePtr MakeValueNode(const AnfNodePtr &node) { + auto value_node = node->cast(); + if (value_node == nullptr) { + return nullptr; + } + + ValueNodePtr new_value_node = std::make_shared(value_node->value()); + new_value_node->set_abstract(value_node->abstract()); + // create kernel_info fo new value node + auto kernel_info = std::make_shared(); + new_value_node->set_kernel_info(kernel_info); + // create kernel_build_info for new value node + auto kernel_build_info_builder = std::make_shared(); + // 
set the format of value_node to DEFAULT_FORMAT + kernel_build_info_builder->SetOutputsFormat(std::vector{kOpFormat_DEFAULT}); + // set value node initial device data type = infer data type + std::vector types; + for (size_t index = 0; index < AnfAlgo::GetOutputTensorNum(value_node); ++index) { + types.push_back(kTypeUnknown); + } + kernel_build_info_builder->SetOutputsDeviceType(types); + AnfAlgo::SetSelectKernelBuildInfo(kernel_build_info_builder->Build(), new_value_node.get()); + return new_value_node; +} } // namespace std::vector KernelGraph::outputs() const { auto graph_output = output(); @@ -215,7 +257,8 @@ CNodePtr KernelGraph::NewCNode(const std::vector &inputs) { auto cnode = FuncGraph::NewCNode(inputs); MS_EXCEPTION_IF_NULL(cnode); cnode->set_abstract(std::make_shared()); - // create kernel_info from new parameter + CreateKernelInfoFromNewParameter(cnode); + auto kernel_info = std::make_shared(); std::vector feature_map_input_indexs; // if the node only has the primitive(such as getNext) or the node's input has a feature map input @@ -241,6 +284,41 @@ CNodePtr KernelGraph::NewCNode(const std::vector &inputs) { return cnode; } +void KernelGraph::CreateKernelInfoFromNewParameter(const CNodePtr &cnode) { + if (!AnfAlgo::IsGraphKernel(cnode)) { + return; + } + auto func_graph = AnfAlgo::GetCNodeFuncGraphPtr(cnode); + MS_EXCEPTION_IF_NULL(func_graph); + + std::vector node_list; + std::vector input_list; + std::vector output_list; + kernel::GetValidKernelNodes(func_graph, &node_list, &input_list, &output_list); + for (auto &anf_node : node_list) { + MS_EXCEPTION_IF_NULL(anf_node); + auto kernel_info = std::make_shared(); + anf_node->set_kernel_info(kernel_info); + auto anf_cnode = anf_node->cast(); + MS_EXCEPTION_IF_NULL(anf_cnode); + for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(anf_cnode); ++i) { + auto input_node = anf_cnode->input(i + 1); + MS_EXCEPTION_IF_NULL(input_node); + if (IsValueNode(input_node)) { + auto new_input_node = 
MakeValueNode(input_node); + if (new_input_node != nullptr) { + anf_cnode->set_input(i + 1, new_input_node); + } + } + } + } + for (auto &anf_node : input_list) { + MS_EXCEPTION_IF_NULL(anf_node); + auto kernel_info = std::make_shared(); + anf_node->set_kernel_info(kernel_info); + } +} + CNodePtr KernelGraph::NewCNode(const CNodePtr &cnode) { MS_EXCEPTION_IF_NULL(cnode); auto new_cnode = std::make_shared(*cnode); @@ -336,21 +414,7 @@ std::vector KernelGraph::SplitTupleValueNodeToNodeList(const ValueNo ValueNodePtr KernelGraph::NewValueNode(const ValueNodePtr &value_node) { MS_EXCEPTION_IF_NULL(value_node); - ValueNodePtr new_value_node = std::make_shared(value_node->value()); - new_value_node->set_abstract(value_node->abstract()); - // create kernel_info fo new value node - auto kernel_info = std::make_shared(); - kernel_info->SetFeatureMapFlag(false); - new_value_node->set_kernel_info(kernel_info); - // create kernel_build_info for new value node - auto kernel_build_info_builder = std::make_shared(); - // set the format of value_node to DEFAULT_FORMAT - auto output_tensor_num = AnfAlgo::GetOutputTensorNum(value_node); - kernel_build_info_builder->SetOutputsFormat(std::vector(output_tensor_num, kOpFormat_DEFAULT)); - // set value node initial device data type = infer data type - std::vector types = std::vector(output_tensor_num, kTypeUnknown); - kernel_build_info_builder->SetOutputsDeviceType(types); - AnfAlgo::SetSelectKernelBuildInfo(kernel_build_info_builder->Build(), new_value_node.get()); + auto new_value_node = MakeValueNode(value_node)->cast(); AnfAlgo::SetGraphId(graph_id_, new_value_node.get()); return new_value_node; } @@ -377,8 +441,8 @@ void KernelGraph::FrontBackendlMapUpdate(const AnfNodePtr &old_backend_anf, cons MS_EXCEPTION_IF_NULL(old_backend_anf); MS_EXCEPTION_IF_NULL(new_backend_anf); if (old_backend_anf == new_backend_anf) { - MS_LOG(INFO) << "old:" << old_backend_anf->DebugString() << ",new:" << new_backend_anf->DebugString(); - 
MS_LOG(EXCEPTION) << "old can't be same with new"; + MS_LOG(DEBUG) << "old same with new:" << old_backend_anf->DebugString(); + return; } if (backend_front_anf_map_.find(old_backend_anf) == backend_front_anf_map_.end()) { MS_LOG(DEBUG) << "old_backend_anf " << old_backend_anf->DebugString() << " is not exist in the map"; @@ -482,7 +546,13 @@ void KernelGraph::UpdateControlDependRelations(const std::vector &de depend_nodes = GetOutputNodes(depend_node); } for (auto &first_node : prior_nodes) { + if (AnfAlgo::CheckPrimitiveType(first_node, prim::kPrimControlDepend)) { + continue; + } for (auto &second_node : depend_nodes) { + if (AnfAlgo::CheckPrimitiveType(second_node, prim::kPrimControlDepend)) { + continue; + } MS_EXCEPTION_IF_NULL(first_node); MS_EXCEPTION_IF_NULL(second_node); MS_LOG(INFO) << "Add first node:" << first_node->DebugString() << ",second node:" << second_node->DebugString(); @@ -581,9 +651,7 @@ bool KernelGraph::RemoveValueNodeFromGraph(const ValueNodePtr &value_node) { return false; } -void KernelGraph::ReplaceNode(const AnfNodePtr &old_anf_node, AnfNodePtr new_anf_node) { - MS_EXCEPTION_IF_NULL(old_anf_node); - MS_EXCEPTION_IF_NULL(new_anf_node); +void KernelGraph::ReplaceNode(NotNull old_anf_node, NotNull new_anf_node) { MS_EXCEPTION_IF_NULL(inputs_); auto it = node_output_edges_.find(old_anf_node); if (it != node_output_edges_.end()) { @@ -598,16 +666,16 @@ void KernelGraph::ReplaceNode(const AnfNodePtr &old_anf_node, AnfNodePtr new_anf continue; } for (size_t i = 1; i < output_node_inputs.size(); i++) { - if (output_node_inputs[i] == old_anf_node) { + if (output_node_inputs[i] == old_anf_node.get()) { output_cnode->set_input(i, new_anf_node); } } // update graph inputs for (size_t i = 0; i < inputs_->size(); i++) { - if ((*inputs_)[i] == old_anf_node) { + if ((*inputs_)[i] == old_anf_node.get()) { MS_LOG(INFO) << "Replace input of graph:" << graph_id_ << ", old graph input: " << old_anf_node->DebugString() << ",new graph input:" << 
new_anf_node->DebugString(); - (*inputs_)[i] = new_anf_node; + (*inputs_)[i] = new_anf_node.get(); break; } } @@ -615,22 +683,29 @@ void KernelGraph::ReplaceNode(const AnfNodePtr &old_anf_node, AnfNodePtr new_anf // update front to backend map FrontBackendlMapUpdate(old_anf_node, new_anf_node); // update output depend relations - node_output_edges_[new_anf_node] = it->second; + node_output_edges_[new_anf_node.get()] = it->second; (void)node_output_edges_.erase(old_anf_node); } // update graph inputs in child graph - auto it_real_inputs = real_inputs_.find(old_anf_node); + auto it_real_inputs = std::find_if(real_inputs_.begin(), real_inputs_.end(), + [&old_anf_node](const std::pair> &n) -> bool { + return n.first == old_anf_node.get(); + }); if (it_real_inputs != real_inputs_.end()) { + // erase old parameter in map + auto old_args = it_real_inputs->second; + real_inputs_.erase(it_real_inputs); // insert new parameter to map - auto iter = real_inputs_.find(new_anf_node); + auto iter = std::find_if(real_inputs_.begin(), real_inputs_.end(), + [&new_anf_node](const std::pair> &n) -> bool { + return n.first == new_anf_node.get(); + }); if (iter != real_inputs_.end()) { MS_LOG(WARNING) << new_anf_node->DebugString() << " already exist in real inputs, will be rewrited."; - iter->second = it_real_inputs->second; + iter->second = old_args; } else { - real_inputs_[new_anf_node] = it_real_inputs->second; + real_inputs_.emplace_back(new_anf_node, old_args); } - // erase old parameter in map - real_inputs_.erase(old_anf_node); } } @@ -672,73 +747,69 @@ void KernelGraph::SetRealInput(const AnfNodePtr ¶meter, const AnfNodePtr &ar MS_LOG(INFO) << "parameter: " << parameter->DebugString() << ", real input : " << arg->DebugString(); MS_EXCEPTION_IF_NULL(parameter); MS_EXCEPTION_IF_NULL(arg); - if (real_inputs_.find(parameter) == real_inputs_.end()) { - real_inputs_[parameter] = std::set(); - } - auto &args = real_inputs_[parameter]; - (void)args.insert(arg); -} - -std::set 
KernelGraph::GetRealInput(const AnfNodePtr ¶meter) { - MS_EXCEPTION_IF_NULL(parameter); - auto iter = real_inputs_.find(parameter); + auto iter = std::find_if( + real_inputs_.begin(), real_inputs_.end(), + [¶meter](const std::pair> &n) -> bool { return n.first == parameter; }); if (iter != real_inputs_.end()) { - return iter->second; + auto &args = iter->second; + args.push_back(arg); + } else { + real_inputs_.emplace_back(parameter, std::vector(1, arg)); } - MS_LOG(EXCEPTION) << parameter->DebugString() << " not found."; } void KernelGraph::UpdateCallRealInput() { MS_LOG(INFO) << "Update graph id: " << graph_id_; - std::map> real_inputs_map; - std::vector> replace_list; + std::vector>> real_inputs_map; for (auto &it : real_inputs_) { auto parameter = it.first; MS_EXCEPTION_IF_NULL(parameter); auto real_inputs = it.second; std::vector new_real_inputs; - std::set erase_real_inputs; for (auto &real_input : real_inputs) { // if real input is a call node ,find the child graph output act as the new real input auto item_with_index = AnfAlgo::VisitKernelWithReturnType(real_input, 0); MS_EXCEPTION_IF_NULL(item_with_index.first); - if (AnfAlgo::CheckPrimitiveType(item_with_index.first, prim::kPrimCall)) { - (void)erase_real_inputs.insert(item_with_index.first); - new_real_inputs = GetCallRealOutputs(item_with_index.first); - continue; - } - } - for (auto &erase_node : erase_real_inputs) { - MS_LOG(INFO) << "paramter: " << parameter->DebugString() << " erase real input:" << erase_node->DebugString(); - (void)real_inputs.erase(erase_node); - } - for (auto &new_real_input : new_real_inputs) { - MS_LOG(INFO) << "paramter: " << parameter->DebugString() - << " insert real input:" << new_real_input->DebugString(); - (void)real_inputs.insert(new_real_input); - if (new_real_input->isa()) { - replace_list.emplace_back(parameter, new_real_input); - parameter = new_real_input; - } + auto tmp_real_input = GetCallRealOutputs(item_with_index.first); + std::copy(tmp_real_input.begin(), 
tmp_real_input.end(), std::back_inserter(new_real_inputs)); } - real_inputs_map[parameter] = real_inputs; - } - for (auto [parameter, arg] : replace_list) { - ReplaceNode(parameter, arg); + real_inputs_map.emplace_back(parameter, new_real_inputs); } real_inputs_ = real_inputs_map; } -std::string KernelGraph::ToString() const { return std::string("kernel_graph_").append(std::to_string(graph_id_)); } +void KernelGraph::PrintGraphExecuteOrder() const { + MS_LOG(INFO) << "graph:" << graph_id_ << "execution order"; + for (size_t i = 0; i < execution_order_.size(); i++) { + CNodePtr cur_cnode_ptr = execution_order_[i]; + MS_EXCEPTION_IF_NULL(cur_cnode_ptr); + std::string event_str; + std::string label_str; + if (AnfAlgo::HasNodeAttr(kAttrEventId, cur_cnode_ptr)) { + event_str = ", event_id[" + std::to_string(AnfAlgo::GetNodeAttr(cur_cnode_ptr, kAttrEventId)) + "]"; + } -KernelGraph::~KernelGraph() { - auto context = MsContext::GetInstance(); - if (!context) { - return; - } - if (context->execution_mode() == kGraphMode) { - device::KernelRuntimeManager::Instance().ClearGraphResource(graph_id_); + if (AnfAlgo::HasNodeAttr(kAttrLabelIndex, cur_cnode_ptr)) { + label_str = ", label_id[" + std::to_string(AnfAlgo::GetNodeAttr(cur_cnode_ptr, kAttrLabelIndex)) + "]"; + } + + if (AnfAlgo::HasNodeAttr(kAttrLabelSwitchList, cur_cnode_ptr)) { + auto label_list = AnfAlgo::GetNodeAttr>(cur_cnode_ptr, kAttrLabelSwitchList); + label_str = ", label_id["; + for (size_t j = 0; j < label_list.size(); ++j) { + label_str += std::to_string(label_list[j]) + (j + 1 < label_list.size() ? 
", " : "]"); + } + } + + MS_LOG(INFO) << "index[" << i << "], node name[" << cur_cnode_ptr->fullname_with_scope() << "], logic id[" + << AnfAlgo::GetStreamDistinctionLabel(cur_cnode_ptr.get()) << "], stream id[" + << AnfAlgo::GetStreamId(cur_cnode_ptr) << "], node info[" << cur_cnode_ptr->DebugString() << "]" + << event_str << label_str; } } + +std::string KernelGraph::ToString() const { return std::string("kernel_graph_").append(std::to_string(graph_id_)); } + +KernelGraph::~KernelGraph() { device::KernelRuntimeManager::Instance().ClearGraphResource(graph_id_); } } // namespace session } // namespace mindspore diff --git a/mindspore/ccsrc/session/kernel_graph.h b/mindspore/ccsrc/session/kernel_graph.h index 98a007d1a1..3009ab0ce9 100644 --- a/mindspore/ccsrc/session/kernel_graph.h +++ b/mindspore/ccsrc/session/kernel_graph.h @@ -40,6 +40,7 @@ class KernelGraph : public FuncGraph { inputs_ = std::make_shared>(); execution_order_ = {}; executable_ = true; + summary_node_exist_ = false; stream_distinction_label_ = kInvalidDistincLabel; } ~KernelGraph() override; @@ -50,6 +51,7 @@ class KernelGraph : public FuncGraph { std::vector *MutableInputs() const { return inputs_.get(); } std::vector outputs() const; CNodePtr NewCNode(const std::vector &inputs) override; + void CreateKernelInfoFromNewParameter(const CNodePtr &cnode); CNodePtr NewCNode(const CNodePtr &cnode); ParameterPtr NewParameter(const ParameterPtr ¶meter = nullptr); ValueNodePtr NewValueNode(const ValueNodePtr &value_node = nullptr); @@ -90,11 +92,15 @@ class KernelGraph : public FuncGraph { bool executable() const { return executable_; } // set executable of graph void set_executable(bool executable) { executable_ = executable; } + // set summary_node of graph + void set_summary_node_exist(bool summary_node_exist) { summary_node_exist_ = summary_node_exist; } + // check whether exist summary node in graph + bool summary_node_exist() const { return summary_node_exist_; } // set invalid inputs for control 
sink std::vector *MutableValidInputs() { return &valid_inputs_; } std::vector valid_inputs() const { return valid_inputs_; } // replace node in graph - void ReplaceNode(const AnfNodePtr &old_anf_node, AnfNodePtr new_anf_node); + void ReplaceNode(NotNull old_anf_node, NotNull new_anf_node); // set stream label of graph void set_stream_distinction_label(uint32_t stream_label) { stream_distinction_label_ = stream_label; } // get stream label of graph @@ -122,8 +128,7 @@ class KernelGraph : public FuncGraph { // find anf node in graph std::vector FindNodeByPrimitive(const PrimitivePtr &primitive) const; // get real inputs - const std::map> &real_inputs() const { return real_inputs_; } - std::set GetRealInput(const AnfNodePtr ¶meter); + const std::vector>> &real_inputs() const { return real_inputs_; } void SetRealInput(const AnfNodePtr ¶meter, const AnfNodePtr &arg); // used to dump ir std::string ToString() const override; @@ -136,6 +141,9 @@ class KernelGraph : public FuncGraph { CNodePtr get_end_goto() { return end_goto_; } bool get_output_null() { return null_output_; } void set_output_null(bool is_output_null) { null_output_ = is_output_null; } + void PrintGraphExecuteOrder() const; + const std::map> &summary_nodes() const { return summary_nodes_; } + void set_summary_nodes(const std::map> &nodes) { summary_nodes_ = nodes; } private: // remove value node form graph @@ -169,8 +177,11 @@ class KernelGraph : public FuncGraph { // record map between ref final output anf with index and ref origin input with index std::map ref_out_in_map_; std::unordered_map>> node_output_edges_; + std::map> summary_nodes_; // graph needn't execute bool executable_; + // exist summary node in graph + bool summary_node_exist_; // valid inputs std::vector valid_inputs_; @@ -186,7 +197,7 @@ class KernelGraph : public FuncGraph { // parameter graph std::shared_ptr parent_graph_; // record real parameters,inputs_ is the formal parameters - std::map> real_inputs_; + std::vector>> real_inputs_; 
CNodePtr start_label_; CNodePtr end_goto_; diff --git a/mindspore/ccsrc/session/session.cc b/mindspore/ccsrc/session/session.cc new file mode 100644 index 0000000000..90e02b37ff --- /dev/null +++ b/mindspore/ccsrc/session/session.cc @@ -0,0 +1,174 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include "include/inference.h" +#include "session/session.h" +#include "utils/load_onnx/anf_converter.h" +#include "session/session_basic.h" +#include "session/session_factory.h" +#include "utils/base_ref_utils.h" +#include "kernel/oplib/oplib.h" +#ifdef ENABLE_D +#include "utils/context/ms_context.h" +#include "session/ascend_session.h" +#else +#include "session/cpu_session.h" +#endif + +namespace py = pybind11; +namespace mindspore::inference { +std::shared_ptr LoadModel(const char *model_buf, size_t size, const std::string &device) { + inference::Session::RegAllOp(); + auto anf_graph = lite::AnfConverter::RunAnfConverter(model_buf, size); + return anf_graph; +} + +void ExitInference() { + auto ms_context = MsContext::GetInstance(); + if (ms_context == nullptr) { + MS_LOG(ERROR) << "Get Context failed!"; + return; + } + if (!ms_context->CloseTsd()) { + MS_LOG(ERROR) << "Inference CloseTsd failed!"; + return; + } +} + +std::shared_ptr MSSession::CreateSession(const std::string &device, uint32_t device_id) { + auto session = std::make_shared(); + auto ret = session->Init(device, 
device_id); + if (ret != 0) { + return nullptr; + } + return session; +} + +void Session::RegAllOp() { + static std::mutex init_mutex; + static bool Initialized = false; + + std::lock_guard lock(init_mutex); + if (Initialized) { + return; + } + Initialized = true; + MsContext::GetInstance()->set_execution_mode(kGraphMode); + Py_Initialize(); + auto c_expression = PyImport_ImportModule("mindspore._c_expression"); + if (c_expression == nullptr) { + MS_LOG(EXCEPTION) << "Failed to import mindspore._c_expression module."; + return; + } + PyObject *c_expression_dict = PyModule_GetDict(c_expression); + + PyObject *op_info_loader_class = PyDict_GetItemString(c_expression_dict, "OpInfoLoaderPy"); + if (op_info_loader_class == nullptr) { + MS_LOG(EXCEPTION) << "Failed to get op_info_loader_class from mindspore._c_expression."; + return; + } + PyObject *op_info_loader = PyInstanceMethod_New(op_info_loader_class); + if (op_info_loader == nullptr) { + MS_LOG(EXCEPTION) << "Failed to create op_info_loader instance."; + return; + } + PyObject *op_info_loader_ins = PyObject_CallObject(op_info_loader, nullptr); + if (op_info_loader_ins == nullptr) { + MS_LOG(EXCEPTION) << "Failed to call op_info_loader instance."; + return; + } + auto all_ops_info_vector_addr_ul = PyObject_CallMethod(op_info_loader_ins, "get_all_ops_info", nullptr); + if (all_ops_info_vector_addr_ul == nullptr) { + MS_LOG(EXCEPTION) << "Failed to call get_all_ops_addr."; + return; + } + auto all_ops_info_vector_addr = PyLong_AsVoidPtr(all_ops_info_vector_addr_ul); + auto all_ops_info = static_cast *>(all_ops_info_vector_addr); + for (auto op_info : *all_ops_info) { + kernel::OpLib::RegOpInfo(std::shared_ptr(op_info)); + } + all_ops_info->clear(); + delete all_ops_info; + Py_DECREF(op_info_loader); + Py_DECREF(op_info_loader_class); + Py_DECREF(c_expression_dict); + Py_DECREF(c_expression); + return; +} + +uint32_t Session::CompileGraph(std::shared_ptr funcGraphPtr) { + MS_ASSERT(session_impl_ != nullptr); + auto 
graph_id = session_impl_->CompileGraph(NOT_NULL(funcGraphPtr)); + py::gil_scoped_release gil_release; + return graph_id; +} + +MultiTensor Session::RunGraph(uint32_t graph_id, const std::vector> &inputs) { + std::vector inTensors; + inTensors.resize(inputs.size()); + bool has_error = false; + std::transform(inputs.begin(), inputs.end(), inTensors.begin(), + [&has_error](const std::shared_ptr &tensor_ptr) -> tensor::TensorPtr { + if (tensor_ptr == nullptr) { + MS_LOG(WARNING) << "input MSTensor is nullptr, return nullptr"; + has_error = true; + return nullptr; + } + auto tensor = static_cast(tensor_ptr.get()); + if (tensor == nullptr) { + MS_LOG(ERROR) << "Can not cast input MSTensor to tensor"; + has_error = true; + return nullptr; + } + return tensor->tensor(); + }); + if (has_error) { + MS_LOG(ERROR) << "Init Tensor failed, returning empty result"; + std::vector> multiTensor; + return multiTensor; + } + VectorRef outputs; + session_impl_->RunGraph(graph_id, inTensors, &outputs); + + return TransformVectorRefToMultiTensor(outputs); +} + +int Session::Init(const std::string &device, uint32_t device_id) { + RegAllOp(); + auto ms_context = MsContext::GetInstance(); + ms_context->set_execution_mode(kGraphMode); + ms_context->set_device_target(kAscendDevice); + session_impl_ = session::SessionFactory::Get().Create(device); + if (session_impl_ == nullptr) { + MS_LOG(ERROR) << "Session create failed!, please make sure target device:" << device << " is available."; + return -1; + } + session_impl_->Init(device_id); + if (ms_context == nullptr) { + MS_LOG(ERROR) << "Get Context failed!"; + return -1; + } + if (!ms_context->OpenTsd()) { + MS_LOG(ERROR) << "Session init OpenTsd failed!"; + return -1; + } + return 0; +} + +Session::Session() = default; +} // namespace mindspore::inference diff --git a/mindspore/ccsrc/session/session.h b/mindspore/ccsrc/session/session.h new file mode 100644 index 0000000000..b608163067 --- /dev/null +++ b/mindspore/ccsrc/session/session.h @@ 
-0,0 +1,50 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_CCSRC_SESSION_SESSION_H +#define MINDSPORE_CCSRC_SESSION_SESSION_H + +#include +#include +#include +#include +#include +#include + +#include "session/session_basic.h" +#include "ir/anf.h" +#include "include/inference.h" + +namespace mindspore { +namespace inference { +class Session : public MSSession { + public: + Session(); + + uint32_t CompileGraph(std::shared_ptr funcGraphPtr) override; + + MultiTensor RunGraph(uint32_t graph_id, const std::vector> &inputs) override; + + int Init(const std::string &device, uint32_t device_id); + + static void RegAllOp(); + + private: + std::shared_ptr session_impl_ = nullptr; + std::vector graph_id_; +}; +} // namespace inference +} // namespace mindspore +#endif // MINDSPORE_CCSRC_SESSION_SESSION_BASIC_H diff --git a/mindspore/ccsrc/session/session_basic.cc b/mindspore/ccsrc/session/session_basic.cc index d47cea188c..ff6fa8ff94 100644 --- a/mindspore/ccsrc/session/session_basic.cc +++ b/mindspore/ccsrc/session/session_basic.cc @@ -21,6 +21,7 @@ #include "pipeline/parse/data_converter.h" #include "ir/manager.h" #include "ir/param_value_py.h" +#include "kernel/common_utils.h" #include "operator/ops.h" #include "common/trans.h" #include "utils/context/ms_context.h" @@ -33,6 +34,7 @@ #include "common/utils.h" #include "ir/dtype.h" #include "ir/anf.h" +#include "ir/func_graph_cloner.h" 
namespace mindspore { namespace session { @@ -50,6 +52,7 @@ PyObject *GetParamDefaultInputTensor(const AnfNodePtr &node) { return nullptr; } auto param_value = std::dynamic_pointer_cast(parameter->default_param()); + MS_EXCEPTION_IF_NULL(param_value); auto py_param = param_value->value(); return py_param.ptr(); } @@ -67,7 +70,7 @@ BaseRef CreateOneTensor(const AnfNodePtr &node, size_t output_index, const Kerne } if (node->isa()) { for (size_t input_idx = 0; input_idx < graph.inputs().size(); input_idx++) { - if (input_idx > input_tensors.size()) { + if (input_idx >= input_tensors.size()) { MS_LOG(EXCEPTION) << "input idx:" << input_idx << "out of range:" << input_tensors.size(); } if (graph.inputs()[input_idx] == node) { @@ -147,6 +150,8 @@ BaseRef CreatTupleForOutput(const AnfNodePtr &anf, const KernelGraph &graph, } ValueNodePtr CreateNewValueNode(const AnfNodePtr &anf, KernelGraph *graph) { + MS_EXCEPTION_IF_NULL(anf); + MS_EXCEPTION_IF_NULL(graph); auto value_node = anf->cast(); MS_EXCEPTION_IF_NULL(value_node); auto value = value_node->value(); @@ -227,6 +232,7 @@ ValueNodePtr ConstructRunOpValueNode(const std::shared_ptr &graph, MS_EXCEPTION_IF_NULL(graph); MS_EXCEPTION_IF_NULL(input_tensor); auto value_node = std::make_shared(input_tensor); + MS_EXCEPTION_IF_NULL(value_node); // construct abstract of value node auto type_of_tensor = input_tensor->Dtype(); auto shape_of_tensor = input_tensor->shape(); @@ -240,6 +246,7 @@ ValueNodePtr ConstructRunOpValueNode(const std::shared_ptr &graph, ParameterPtr ConstructRunOpParameter(const std::shared_ptr &graph, const tensor::TensorPtr &input_tensor, int tensor_mask) { + MS_EXCEPTION_IF_NULL(graph); auto param = graph->NewParameter(); MS_EXCEPTION_IF_NULL(param); if (tensor_mask == kParameterWeightTensorMask) { @@ -291,6 +298,20 @@ void DumpGraphOutput(const Any &any, size_t recurse_level = 0) { (void)tab_str.append(any.ToString()); MS_LOG(INFO) << tab_str; } + +bool ExistSummaryNode(const KernelGraph *graph) { + 
MS_EXCEPTION_IF_NULL(graph); + auto ret = graph->get_return(); + MS_EXCEPTION_IF_NULL(ret); + auto all_nodes = DeepLinkedGraphSearch(ret); + for (auto &n : all_nodes) { + if (IsPrimitiveCNode(n, prim::kPrimScalarSummary) || IsPrimitiveCNode(n, prim::kPrimTensorSummary) || + IsPrimitiveCNode(n, prim::kPrimImageSummary) || IsPrimitiveCNode(n, prim::kPrimHistogramSummary)) { + return true; + } + } + return false; +} } // namespace GraphId SessionBasic::graph_sum_ = 0; @@ -300,7 +321,7 @@ ParameterPtr SessionBasic::CreateNewParameterFromParameter(const AnfNodePtr &anf if (!anf->isa()) { MS_LOG(EXCEPTION) << "anf[" << anf->DebugString() << "] is not a parameter"; } - + MS_EXCEPTION_IF_NULL(graph); auto m_tensor = GetParamDefaultInputTensor(anf); auto valid_inputs = graph->MutableValidInputs(); MS_EXCEPTION_IF_NULL(valid_inputs); @@ -311,8 +332,9 @@ ParameterPtr SessionBasic::CreateNewParameterFromParameter(const AnfNodePtr &anf if (python_paras_ == nullptr) { python_paras_ = std::make_shared>(); } - if (python_paras_->find(m_tensor) != python_paras_->end() && GetGraphIdByNode(anf) == kInvalidGraphId) { - new_parameter = (*python_paras_)[m_tensor]; + auto iter = python_paras_->find(m_tensor); + if (iter != python_paras_->end()) { + new_parameter = iter->second; } else { TraceManager::DebugTrace(std::make_shared(anf->debug_info())); new_parameter = graph->NewParameter(anf->cast()); @@ -328,6 +350,7 @@ ParameterPtr SessionBasic::CreateNewParameterFromParameter(const AnfNodePtr &anf AnfNodePtr SessionBasic::CreateNewParameterFromCNode(const AnfNodePtr &anf, bool valid_input, KernelGraph *graph) { MS_EXCEPTION_IF_NULL(anf); + MS_EXCEPTION_IF_NULL(graph); MS_LOG(INFO) << "Create a new parameter from cnode[" << anf->DebugString() << "]"; auto parameters = CreateParameterFromTuple(anf, valid_input, graph); if (parameters.empty()) { @@ -353,10 +376,17 @@ CNodePtr SessionBasic::CreateNewCNode(const CNodePtr &cnode, bool valid_input, K MS_EXCEPTION_IF_NULL(other_graph_cnode); 
*from_other_graph = false; // get primitive of old node + std::vector cnode_inputs; auto prim = AnfAlgo::GetCNodePrimitive(cnode); - MS_EXCEPTION_IF_NULL(prim); - // push attr to inputs[0] of new cnode - std::vector cnode_inputs = {std::make_shared(std::make_shared(*prim))}; + if (prim != nullptr) { + // push attr to inputs[0] of new cnode + cnode_inputs.push_back(std::make_shared(std::make_shared(*prim))); + } else { + auto fg = AnfAlgo::GetCNodeFuncGraphPtr(cnode); + MS_EXCEPTION_IF_NULL(fg); + auto new_fg = BasicClone(fg); + cnode_inputs.push_back(std::make_shared(new_fg)); + } // if has multiple depends,only select first depend as parameter for (size_t input_idx = 1; input_idx < cnode->inputs().size(); input_idx++) { auto anf = cnode->inputs()[input_idx]; @@ -446,6 +476,8 @@ CNodePtr SessionBasic::CreateNewCNode(const CNodePtr &cnode, KernelGraph *graph) if (graph->GetBackendAnfByFrontAnf(anf) != nullptr) { cnode_inputs.emplace_back(graph->GetBackendAnfByFrontAnf(anf)); continue; + } else if (IsValueNode(anf)) { + continue; } MS_LOG(EXCEPTION) << "Unexpected input[" << anf->DebugString() << "]"; } @@ -457,6 +489,7 @@ CNodePtr SessionBasic::CreateNewCNode(const CNodePtr &cnode, KernelGraph *graph) ValueNodePtr SessionBasic::CreateValueNodeKernelGraph(const AnfNodePtr &anf, KernelGraph *graph) { MS_EXCEPTION_IF_NULL(anf); + MS_EXCEPTION_IF_NULL(graph); auto value_node = anf->cast(); MS_EXCEPTION_IF_NULL(value_node); auto sub_func_graph = AnfAlgo::GetValueNodeFuncGraph(anf); @@ -484,16 +517,27 @@ ValueNodePtr SessionBasic::CreateValueNodeKernelGraph(const AnfNodePtr &anf, Ker ParameterPtr SessionBasic::CreateNewParameter(const AnfNodePtr &anf, KernelGraph *graph) { MS_EXCEPTION_IF_NULL(anf); + MS_EXCEPTION_IF_NULL(graph); if (!anf->isa()) { MS_LOG(EXCEPTION) << "anf[" << anf->DebugString() << "] is not a parameter"; } - auto graph_inputs = graph->MutableInputs(); - MS_EXCEPTION_IF_NULL(graph_inputs); - TraceManager::DebugTrace(std::make_shared(anf->debug_info())); 
- auto new_parameter = graph->NewParameter(anf->cast()); - TraceManager::EndTrace(); - graph_inputs->push_back(new_parameter); - graph->FrontBackendlMapAdd(anf, new_parameter); + + auto m_tensor = GetParamDefaultInputTensor(anf); + ParameterPtr new_parameter = nullptr; + if (python_paras_ == nullptr) { + python_paras_ = std::make_shared>(); + } + auto iter = python_paras_->find(m_tensor); + if (iter != python_paras_->end()) { + new_parameter = iter->second; + } else { + TraceManager::DebugTrace(std::make_shared(anf->debug_info())); + new_parameter = graph->NewParameter(anf->cast()); + if (m_tensor != nullptr) { + (*python_paras_)[m_tensor] = new_parameter; + } + TraceManager::EndTrace(); + } return new_parameter; } @@ -501,6 +545,7 @@ ParameterPtr SessionBasic::CreateNewParameter(const AnfNodePtr &anf, KernelGraph KernelGraphPtr SessionBasic::ConstructKernelGraph(const AnfNodePtrList &lst, const AnfNodePtrList &outputs) { std::unordered_map other_graph_cnode; auto graph = NewKernelGraph(); + MS_EXCEPTION_IF_NULL(graph); MS_LOG(INFO) << "Create graph: " << graph->graph_id(); size_t from_other_graph_depend_num = 0; for (const auto &node : lst) { @@ -537,14 +582,20 @@ KernelGraphPtr SessionBasic::ConstructKernelGraph(const AnfNodePtrList &lst, con graph->set_manager(manager); } graph->SetExecOrderByDefault(); + if (ExistSummaryNode(graph.get())) { + graph->set_summary_node_exist(true); + } opt::BackendCommonOptimization(graph); return graph; } -std::shared_ptr SessionBasic::ConstructKernelGraph(const FuncGraphPtr &func_graph) { +std::shared_ptr SessionBasic::ConstructKernelGraph(const FuncGraphPtr &func_graph, + std::vector *all_out_graph) { MS_EXCEPTION_IF_NULL(func_graph); + MS_EXCEPTION_IF_NULL(all_out_graph); auto node_list = TopoSort(func_graph->get_return()); auto graph = NewKernelGraph(); + MS_EXCEPTION_IF_NULL(graph); front_backend_graph_map_[func_graph] = graph; MS_LOG(INFO) << "Create graph: " << graph->graph_id(); @@ -553,7 +604,11 @@ std::shared_ptr 
SessionBasic::ConstructKernelGraph(const FuncGraphP MS_EXCEPTION_IF_NULL(node); MS_LOG(DEBUG) << "Start create new cnode, node = " << node->DebugString(); if (node->isa()) { - (void)CreateNewParameter(node, graph.get()); + auto graph_inputs = graph->MutableInputs(); + MS_EXCEPTION_IF_NULL(graph_inputs); + auto new_parameter = CreateNewParameter(node, graph.get()); + graph_inputs->push_back(new_parameter); + graph->FrontBackendlMapAdd(node, new_parameter); continue; } else if (node->isa()) { if (!IsValueNode(node)) { @@ -563,10 +618,9 @@ std::shared_ptr SessionBasic::ConstructKernelGraph(const FuncGraphP // if input is a ValueNode FuncGraphPtr child_graph = AnfAlgo::GetValueNodeFuncGraph(node); if (front_backend_graph_map_.find(child_graph) != front_backend_graph_map_.end()) { - MS_LOG(INFO) << "FuncGraph: " << child_graph->ToString() << " has been transformed to KernelGraph."; is_trace_back = true; } else { - (void)ConstructKernelGraph(child_graph); + (void)ConstructKernelGraph(child_graph, all_out_graph); } (void)CreateValueNodeKernelGraph(node, graph.get()); } @@ -578,6 +632,7 @@ std::shared_ptr SessionBasic::ConstructKernelGraph(const FuncGraphP auto new_cnode = CreateNewCNode(cnode, graph.get()); MS_EXCEPTION_IF_NULL(new_cnode); new_cnode->set_abstract(cnode->abstract()); + new_cnode->set_fullname_with_scope(cnode->fullname_with_scope()); new_cnode->set_scope(cnode->scope()); graph->FrontBackendlMapAdd(node, new_cnode); if (AnfAlgo::CheckPrimitiveType(new_cnode, prim::kPrimReturn)) { @@ -587,29 +642,33 @@ std::shared_ptr SessionBasic::ConstructKernelGraph(const FuncGraphP } // if a graph jump back unconditionally, return op of this graph will never be executed, so output is null. 
graph->set_output_null(is_trace_back); + AddParameterToGraphInputs(func_graph->parameters(), graph.get()); + graph->SetExecOrderByDefault(); + if (ExistSummaryNode(graph.get())) { + graph->set_summary_node_exist(true); + } + all_out_graph->push_back(graph); + return graph; +} + +void SessionBasic::AddParameterToGraphInputs(const std::vector ¶meters, KernelGraph *graph) { + MS_EXCEPTION_IF_NULL(graph); auto graph_inputs = graph->MutableInputs(); MS_EXCEPTION_IF_NULL(graph_inputs); graph_inputs->clear(); - for (auto ¶meter : func_graph->parameters()) { + for (auto ¶meter : parameters) { MS_EXCEPTION_IF_NULL(parameter); auto backend_parameter = graph->GetBackendAnfByFrontAnf(parameter); if (backend_parameter == nullptr) { // for example "def f(x,y,z) {return x + y}", parameter z in unused - CreateNewParameterFromParameter(parameter, false, graph.get()); + auto new_parameter = CreateNewParameter(parameter, graph); + graph_inputs->push_back(new_parameter); MS_LOG(INFO) << "Can't find parameter:" << parameter->DebugString(); continue; } MS_LOG(INFO) << "graph[" << graph->graph_id() << "],parameter:" << parameter->DebugString(); graph_inputs->push_back(backend_parameter); } - MS_EXCEPTION_IF_NULL(context_); - FuncGraphManagerPtr manager = context_->manager(); - if (manager) { - manager->AddFuncGraph(graph); - graph->set_manager(manager); - } - graph->SetExecOrderByDefault(); - return graph; } // run graph steps @@ -650,7 +709,9 @@ void SessionBasic::LoadInputData(const std::shared_ptr &kernel_grap } } if (need_sync) { - tensor->set_device_address(device_address); + if (ms_context->execution_mode() == kPynativeMode || AnfAlgo::IsParameterWeight(pk_node)) { + tensor->set_device_address(device_address); + } MS_EXCEPTION_IF_NULL(device_address); if (!device_address->SyncHostToDevice(trans::GetRuntimePaddingShape(pk_node, 0), LongToSize(tensor->data().nbytes()), tensor->data_type(), @@ -674,8 +735,8 @@ void SessionBasic::UpdateOutputs(const std::shared_ptr &kernel_grap } auto 
anf_outputs = kernel_graph->outputs(); for (auto &item : anf_outputs) { - MS_LOG(INFO) << "update output[" << item->DebugString() << "]"; MS_EXCEPTION_IF_NULL(item); + MS_LOG(INFO) << "update output[" << item->DebugString() << "]"; if (AnfAlgo::IsTupleOutput(item) && AnfAlgo::IsRealKernel(item)) { outputs->emplace_back(CreatTupleForOutput(item, *kernel_graph, input_tensors)); continue; @@ -689,29 +750,15 @@ void SessionBasic::RegisterSummaryCallBackFunc(const CallBackFunc &callback) { summary_callback_ = callback; } -void SessionBasic::Reorder(std::vector *node_list) { - MS_EXCEPTION_IF_NULL(node_list); - std::vector all_opt_list; - std::vector non_opt_list; +void SessionBasic::Reorder(std::vector *node_list) { AnfAlgo::ReorderExecList(NOT_NULL(node_list)); } - for (const auto &node : *node_list) { - MS_EXCEPTION_IF_NULL(node); - if (kOptOperatorSet.find(AnfAlgo::GetCNodeName(node)) != kOptOperatorSet.end()) { - all_opt_list.emplace_back(node); - } else { - non_opt_list.emplace_back(node); - } - } - node_list->clear(); - (void)std::copy(non_opt_list.begin(), non_opt_list.end(), std::back_inserter(*node_list)); - (void)std::copy(all_opt_list.begin(), all_opt_list.end(), std::back_inserter(*node_list)); -} - -void SessionBasic::GetSummaryNodes(const KernelGraph *graph, - std::unordered_map> *summary) { +void SessionBasic::GetSummaryNodes(KernelGraph *graph) { MS_LOG(DEBUG) << "Update summary Start"; MS_EXCEPTION_IF_NULL(graph); - MS_EXCEPTION_IF_NULL(summary); + if (!graph->summary_node_exist()) { + return; + } + auto summary = graph->summary_nodes(); auto apply_list = TopoSort(graph->get_return()); for (auto &n : apply_list) { MS_EXCEPTION_IF_NULL(n); @@ -724,14 +771,16 @@ void SessionBasic::GetSummaryNodes(const KernelGraph *graph, } auto node = cnode->input(kSummaryGetItem); MS_EXCEPTION_IF_NULL(node); - auto item_with_index = AnfAlgo::VisitKernelWithReturnType(node, 0); + auto item_with_index = AnfAlgo::VisitKernelWithReturnType(node, 0, true); + 
MS_EXCEPTION_IF_NULL(item_with_index.first); if (!AnfAlgo::IsRealKernel(item_with_index.first)) { MS_LOG(EXCEPTION) << "Unexpected node:" << item_with_index.first->DebugString(); } - (*summary)[n->fullname_with_scope()] = item_with_index; + summary[n->fullname_with_scope()] = item_with_index; } } - MS_LOG(DEBUG) << "Update summary end size: " << (*summary).size(); + graph->set_summary_nodes(summary); + MS_LOG(DEBUG) << "Update summary end size: " << summary.size(); } void SessionBasic::Summary(KernelGraph *graph) { @@ -739,12 +788,12 @@ void SessionBasic::Summary(KernelGraph *graph) { return; } MS_EXCEPTION_IF_NULL(graph); - std::unordered_map> summary_outputs; - GetSummaryNodes(graph, &summary_outputs); - // do not exist summary node - if (summary_outputs.empty()) { + bool exist_summary = graph->summary_node_exist(); + if (!exist_summary) { return; } + GetSummaryNodes(graph); + auto summary_outputs = graph->summary_nodes(); std::map params_list; // fetch outputs apply kernel in session & run callback functions for (auto &output_item : summary_outputs) { @@ -775,6 +824,7 @@ CNodePtr SessionBasic::ConstructOutput(const AnfNodePtrList &outputs, const std: MS_EXCEPTION_IF_NULL(graph); std::vector output_args; for (const auto &output : outputs) { + MS_EXCEPTION_IF_NULL(output); MS_LOG(INFO) << "output:" << output->DebugString(); } auto FindEqu = [graph, outputs](const AnfNodePtr &out) -> AnfNodePtr { @@ -846,7 +896,9 @@ std::shared_ptr SessionBasic::ConstructSingleOpGraph(const OpRunInf } auto parameter = ConstructRunOpParameter(graph, input_tensors[i], tensors_mask[i]); inputs.push_back(parameter); - graph->MutableInputs()->push_back(parameter); + auto mutable_inputs = graph->MutableInputs(); + MS_EXCEPTION_IF_NULL(mutable_inputs); + mutable_inputs->push_back(parameter); } // set execution order auto cnode = graph->NewCNode(inputs); diff --git a/mindspore/ccsrc/session/session_basic.h b/mindspore/ccsrc/session/session_basic.h index b2e8c8894f..27171b7589 100755 --- 
a/mindspore/ccsrc/session/session_basic.h +++ b/mindspore/ccsrc/session/session_basic.h @@ -48,11 +48,7 @@ using OpRunInfoPtr = std::shared_ptr; class SessionBasic { public: - SessionBasic() : device_id_(0) { - graphs_ = {}; - run_op_graphs_ = {}; - summary_callback_ = nullptr; - } + SessionBasic() : context_(nullptr), summary_callback_(nullptr), device_id_(0) {} virtual void Init(uint32_t device_id) { device_id_ = device_id; } @@ -75,7 +71,8 @@ class SessionBasic { virtual void RegisterSummaryCallBackFunc(const CallBackFunc &callback); std::shared_ptr ConstructKernelGraph(const AnfNodePtrList &lst, const AnfNodePtrList &outputs); - std::shared_ptr ConstructKernelGraph(const FuncGraphPtr &func_graph); + std::shared_ptr ConstructKernelGraph(const FuncGraphPtr &func_graph, + std::vector *all_out_graph); CNodePtr CreateNewCNode(const CNodePtr &cnode, bool valid_input, KernelGraph *graph, bool *from_other_graph, std::unordered_map *other_graph_cnode); @@ -93,8 +90,7 @@ class SessionBasic { virtual GraphId GetGraphIdByNode(const AnfNodePtr &) const { return kInvalidGraphId; } virtual GraphId GetFinalRunGraph() const { return kInvalidGraphId; } virtual void SetActive(GraphId, GraphId) {} - virtual void GetSummaryNodes(const KernelGraph *graph, - std::unordered_map> *summary); + virtual void GetSummaryNodes(KernelGraph *graph); protected: virtual void LoadInputData(const std::shared_ptr &kernel_graph, @@ -114,10 +110,11 @@ class SessionBasic { BaseRef TransformBaseRefListToTuple(const BaseRef &base_ref); // create a new kernel graph and update the graph sum KernelGraphPtr NewKernelGraph(); - ParameterPtr CreateNewParameterFromParameter(const AnfNodePtr &anf, bool valid_input, KernelGraph *graph); + virtual ParameterPtr CreateNewParameterFromParameter(const AnfNodePtr &anf, bool valid_input, KernelGraph *graph); ValueNodePtr CreateValueNodeKernelGraph(const AnfNodePtr &anf, KernelGraph *graph); ParameterPtr CreateNewParameter(const AnfNodePtr &anf, KernelGraph *graph); 
AnfNodePtr CreateNewParameterFromCNode(const AnfNodePtr &anf, bool valid_input, KernelGraph *graph); + void AddParameterToGraphInputs(const std::vector ¶meters, KernelGraph *graph); std::unordered_map> graphs_; std::unordered_map> run_op_graphs_; @@ -129,6 +126,7 @@ class SessionBasic { }; using SessionPtr = std::shared_ptr; +using NamedSummaryOutputs = std::map>; } // namespace session } // namespace mindspore #endif // MINDSPORE_CCSRC_SESSION_SESSION_BASIC_H diff --git a/mindspore/ccsrc/transform/convert.cc b/mindspore/ccsrc/transform/convert.cc index 1450572e4b..a5726b078a 100644 --- a/mindspore/ccsrc/transform/convert.cc +++ b/mindspore/ccsrc/transform/convert.cc @@ -103,6 +103,7 @@ const char kNameReLU6[] = "ReLU6"; const char kNameReLU6Grad[] = "ReLU6Grad"; const char kNameElu[] = "Elu"; const char kNameEluGrad[] = "EluGrad"; +const char kNameTensorScatterUpdate[] = "TensorScatterUpdate"; const char kNameScatterUpdate[] = "ScatterUpdate"; const char kNameScatterNdUpdate[] = "ScatterNdUpdate"; const char kNameScatterMax[] = "ScatterMax"; @@ -182,6 +183,7 @@ const char kNameBinaryCrossEntropy[] = "BinaryCrossEntropy"; const char kNameBinaryCrossEntropyGrad[] = "BinaryCrossEntropyGrad"; const char kNameSparseApplyAdagrad[] = "SparseApplyAdagrad"; const char kNameSparseApplyFtrlD[] = "SparseApplyFtrlD"; +const char kNameApplyProximalAdagrad[] = "ApplyProximalAdagrad"; const char kNameAcosh[] = "Acosh"; const char kNameAcoshGrad[] = "AcoshGrad"; const char kNameFloorMod[] = "FloorMod"; @@ -203,6 +205,8 @@ const char kNameL2Loss[] = "L2Loss"; const char kNameCTCLoss[] = "CTCLoss"; const char kNameRange[] = "Range"; const char kNameSquareSumAll[] = "SquareSumAll"; +const char kNameAscendQuant[] = "AscendQuant"; +const char kNameAscendDequant[] = "AscendDequant"; // -----------------OpAdapter initialization-------------- std::unordered_map &DfGraphConvertor::get_adpt_map() { @@ -211,7 +215,7 @@ std::unordered_map &DfGraphConvertor::get_adpt_ma {string(kNameIOU), 
ADPT_DESC(Iou)}, {string(kNameGreaterEqual), ADPT_DESC(GreaterEqual)}, {string(kNameSlice), ADPT_DESC(SliceD)}, - {string(kNameApplyMomentum), ADPT_DESC(ApplyMomentum)}, + {string(kNameApplyMomentum), ADPT_DESC(ApplyMomentumD)}, {string(kNameMaxPool), ADPT_DESC(MaxPool)}, {string(kNameAvgPool), ADPT_DESC(AvgPool)}, {string(kNameMaxPoolWithArgmax), ADPT_DESC(MaxPoolWithArgmax)}, @@ -260,6 +264,7 @@ std::unordered_map &DfGraphConvertor::get_adpt_ma {string(kNameResizeBilinear), ADPT_DESC(ResizeBilinearV2D)}, {string(kNameZerosLike), ADPT_DESC(ZerosLike)}, {string(kNameOnesLike), ADPT_DESC(OnesLike)}, + {string(kNameTensorScatterUpdate), ADPT_DESC(TensorScatterUpdate)}, {string(kNameScatterUpdate), ADPT_DESC(ScatterUpdate)}, {string(kNameScatterNdUpdate), ADPT_DESC(ScatterNdUpdate)}, {string(kNameScatterMax), ADPT_DESC(ScatterMax)}, @@ -386,6 +391,7 @@ std::unordered_map &DfGraphConvertor::get_adpt_ma {string(kNameBinaryCrossEntropyGrad), ADPT_DESC(BinaryCrossEntropyGrad)}, {string(kNameSparseApplyAdagrad), ADPT_DESC(SparseApplyAdagradD)}, {string(kNameSparseApplyFtrlD), ADPT_DESC(SparseApplyFtrlD)}, + {string(kNameApplyProximalAdagrad), ADPT_DESC(ApplyProximalAdagradD)}, {string(kNameAcosh), ADPT_DESC(Acosh)}, {string(kNameAcoshGrad), ADPT_DESC(AcoshGrad)}, {string(kNameFloorMod), ADPT_DESC(FloorMod)}, @@ -393,7 +399,7 @@ std::unordered_map &DfGraphConvertor::get_adpt_ma {string(kNameDepthToSpace), ADPT_DESC(DepthToSpace)}, {string(kNameSign), ADPT_DESC(Sign)}, {string(kNameRound), ADPT_DESC(Round)}, - {string(kNameApplyFtrl), ADPT_DESC(ApplyFtrl)}, + {string(kNameApplyFtrl), ADPT_DESC(ApplyFtrlD)}, {string(kNameDiag), ADPT_DESC(Diag)}, {string(kNameDiagPart), ADPT_DESC(DiagPart)}, {string(kNameSpaceToBatch), ADPT_DESC(SpaceToBatchD)}, @@ -404,10 +410,12 @@ std::unordered_map &DfGraphConvertor::get_adpt_ma {string(kNameL2Loss), ADPT_DESC(L2Loss)}, {string(kNameCTCLoss), ADPT_DESC(CTCLoss)}, {string(kNameRange), ADPT_DESC(RangeD)}, - {string(kNameSquareSumAll), 
ADPT_DESC(SquareSumAll)}}; + {string(kNameSquareSumAll), ADPT_DESC(SquareSumAll)}, + {string(kNameAscendQuant), ADPT_DESC(AscendQuant)}, + {string(kNameAscendDequant), ADPT_DESC(AscendDequant)}}; #ifdef ENABLE_GE adpt_map[string(kNamePrint)] = ADPT_DESC(Print); - adpt_map[string(kNameApplyAdam)] = ADPT_DESC(ApplyAdam); + adpt_map[string(kNameApplyAdam)] = ADPT_DESC(ApplyAdamD); #endif return adpt_map; } @@ -957,8 +965,8 @@ void DfGraphConvertor::TraceOutput(const AnfNodePtr node) { for (unsigned int i = 1; i < c->inputs().size(); i++) { TraceOutput(c->input(i)); } - } else if (name == "depend") { - if (c->inputs().size() < 3) { // "depend" primitive have 3 inputs + } else if (name == "Depend") { + if (c->inputs().size() < 3) { // "Depend" primitive have 3 inputs MS_LOG(EXCEPTION) << "length of inputs is " << c->inputs().size() << ", which is less than 3"; } TraceOutput(c->input(1)); @@ -1181,7 +1189,7 @@ void DfGraphConvertor::SetOpInput(const OpAdapterPtr &adpt, const CNodePtr &node auto &inputs = node->inputs(); for (size_t i = 1; i < inputs.size(); i++) { auto pred = inputs[i]; - while (pred->isa() && GetCNodeFuncName(pred->cast()) == "depend") { + while (pred->isa() && GetCNodeFuncName(pred->cast()) == "Depend") { pred = pred->cast()->input(1); } // skip the None input @@ -1360,7 +1368,7 @@ AnfNodePtr DfGraphConvertor::TraceTupleGetItem(const CNodePtr &node, unsigned in AnfNodePtr DfGraphConvertor::TraceDepend(const CNodePtr &node) { auto cnode = node->cast(); - if (cnode->inputs().size() < 3) { // "depend" primitive have 3 inputs + if (cnode->inputs().size() < 3) { // "Depend" primitive have 3 inputs MS_LOG(EXCEPTION) << "length of inputs of depend is less than 3"; } return cnode->inputs()[1]; @@ -1481,7 +1489,7 @@ AnfNodePtr DfGraphConvertor::GetRealOpNode(AnfNodePtr node) { // depend apply inputs: depend,output,depended_node if (IsPrimitiveCNode(node, prim::kPrimDepend)) { auto depend_inputs = node->cast()->inputs(); - if (depend_inputs.size() != 3) { // 
"depend" primitive have 3 inputs + if (depend_inputs.size() != 3) { // "Depend" primitive have 3 inputs MS_LOG(ERROR) << "depend input items not correct"; error_ = FAILED; return node; @@ -1698,7 +1706,7 @@ void DfGraphConvertor::ConvertControlDependNode(const CNodePtr node) { bool DfGraphConvertor::CheckCNode(const std::string &name, const CNodePtr node) { // ignore apply node of return - if (name == "return" || name == "depend") { + if (name == "return" || name == "Depend") { return false; } diff --git a/mindspore/ccsrc/transform/convert.h b/mindspore/ccsrc/transform/convert.h index 8a63f00c6c..2f6c9bb0ad 100644 --- a/mindspore/ccsrc/transform/convert.h +++ b/mindspore/ccsrc/transform/convert.h @@ -102,22 +102,15 @@ class DfGraphConvertor { explicit DfGraphConvertor(const AnfGraphPtr &anf_graph) : anf_graph_(anf_graph), df_graph_(std::make_shared(anf_graph_->ToString())) { #if (!defined ENABLE_GE) || (defined ENABLE_INFER) - auto it_training = anf_graph->flags().find("training"); - if (it_training != anf_graph->flags().end()) { - training_ = it_training->second; - } else { - training_ = false; - } + training_ = anf_graph->has_flag("training"); #else training_ = ENABLE_TRAIN; #endif - auto it_distribute = anf_graph->flags().find("broadcast_flag"); - if (it_distribute != anf_graph->flags().end()) { + distribute_ = anf_graph->has_flag("broadcast_flag"); + if (anf_graph->has_flag("broadcast_flag")) { ConfigManager::GetInstance().set_parallel_strategy(ParallelStrategy::DISTRIBUTION); - distribute_ = it_distribute->second; } else { ConfigManager::GetInstance().set_parallel_strategy(ParallelStrategy::ONE_DEVICE); - distribute_ = false; } MS_LOG(INFO) << "Create DfGraphConvertor with training: " << training_ << ", distribute: " << distribute_; diff --git a/mindspore/ccsrc/transform/op_declare.cc b/mindspore/ccsrc/transform/op_declare.cc index ee59d56003..7e5e69beb6 100644 --- a/mindspore/ccsrc/transform/op_declare.cc +++ b/mindspore/ccsrc/transform/op_declare.cc @@ 
-127,11 +127,12 @@ INPUT_MAP(Constant) = EMPTY_INPUT_MAP; ATTR_MAP(Constant) = {{"value", ATTR_DESC(value, AnyTraits())}}; OUTPUT_MAP(Constant) = {{0, OUTPUT_DESC(y)}}; -// ApplyMomentum -INPUT_MAP(ApplyMomentum) = { +// ApplyMomentumD +INPUT_MAP(ApplyMomentumD) = { {1, INPUT_DESC(var)}, {2, INPUT_DESC(accum)}, {3, INPUT_DESC(lr)}, {4, INPUT_DESC(grad)}, {5, INPUT_DESC(momentum)}}; -ATTR_MAP(ApplyMomentum) = {{"use_nesterov", ATTR_DESC(use_nesterov, AnyTraits())}}; -OUTPUT_MAP(ApplyMomentum) = {{0, OUTPUT_DESC(var)}}; +ATTR_MAP(ApplyMomentumD) = {{"use_nesterov", ATTR_DESC(use_nesterov, AnyTraits())}, + {"use_locking", ATTR_DESC(use_locking, AnyTraits())}}; +OUTPUT_MAP(ApplyMomentumD) = {{0, OUTPUT_DESC(var)}, {1, OUTPUT_DESC(accum)}}; // ScalarSummary INPUT_MAP(Summary) = {{2, INPUT_DESC(x)}}; @@ -472,6 +473,15 @@ ATTR_MAP(ApplyAdam) = {{"use_locking", ATTR_DESC(use_locking, AnyTraits()) {"use_nesterov", ATTR_DESC(use_nesterov, AnyTraits())}}; OUTPUT_MAP(ApplyAdam) = {{0, OUTPUT_DESC(var)}}; +// ApplyAdamD +INPUT_MAP(ApplyAdamD) = {{1, INPUT_DESC(var)}, {2, INPUT_DESC(m)}, {3, INPUT_DESC(v)}, + {4, INPUT_DESC(beta1_power)}, {5, INPUT_DESC(beta2_power)}, {6, INPUT_DESC(lr)}, + {7, INPUT_DESC(beta1)}, {8, INPUT_DESC(beta2)}, {9, INPUT_DESC(epsilon)}, + {10, INPUT_DESC(grad)}}; +ATTR_MAP(ApplyAdamD) = {{"use_locking", ATTR_DESC(use_locking, AnyTraits())}, + {"use_nesterov", ATTR_DESC(use_nesterov, AnyTraits())}}; +OUTPUT_MAP(ApplyAdamD) = {{0, OUTPUT_DESC(var)}, {1, OUTPUT_DESC(m)}, {2, OUTPUT_DESC(v)}}; + // Relu6 INPUT_MAP(Relu6) = {{1, INPUT_DESC(x)}}; ATTR_MAP(Relu6) = EMPTY_ATTR_MAP; @@ -515,6 +525,11 @@ INPUT_MAP(Unpack) = {{1, INPUT_DESC(x)}}; ATTR_MAP(Unpack) = {{"axis", ATTR_DESC(axis, AnyTraits())}, {"num", ATTR_DESC(num, AnyTraits())}}; DYN_OUTPUT_MAP(Unpack) = {{0, DYN_OUTPUT_DESC(y)}}; +// TensorScatterUpdate +INPUT_MAP(TensorScatterUpdate) = {{1, INPUT_DESC(x)}, {2, INPUT_DESC(indices)}, {3, INPUT_DESC(updates)}}; +ATTR_MAP(TensorScatterUpdate) = 
EMPTY_ATTR_MAP; +OUTPUT_MAP(TensorScatterUpdate) = {{0, OUTPUT_DESC(y)}}; + // ScatterUpdate INPUT_MAP(ScatterUpdate) = {{1, INPUT_DESC(var)}, {2, INPUT_DESC(indices)}, {3, INPUT_DESC(updates)}}; ATTR_MAP(ScatterUpdate) = {{"use_locking", ATTR_DESC(use_locking, AnyTraits())}}; @@ -1155,6 +1170,12 @@ ATTR_MAP(SparseApplyAdagradD) = {{"lr", ATTR_DESC(lr, AnyTraits())}, {"use_locking", ATTR_DESC(use_locking, AnyTraits())}}; OUTPUT_MAP(SparseApplyAdagradD) = {{0, OUTPUT_DESC(var)}, {1, OUTPUT_DESC(accum)}}; +// ApplyProximalAdagradD +INPUT_MAP(ApplyProximalAdagradD) = {{1, INPUT_DESC(var)}, {2, INPUT_DESC(accum)}, {3, INPUT_DESC(lr)}, + {4, INPUT_DESC(l1)}, {5, INPUT_DESC(l2)}, {6, INPUT_DESC(grad)}}; +ATTR_MAP(ApplyProximalAdagradD) = {{"use_locking", ATTR_DESC(use_locking, AnyTraits())}}; +OUTPUT_MAP(ApplyProximalAdagradD) = {{0, OUTPUT_DESC(var)}, {1, OUTPUT_DESC(accum)}}; + // SparseApplyFtrlD INPUT_MAP(SparseApplyFtrlD) = {{1, INPUT_DESC(var)}, {2, INPUT_DESC(accum)}, @@ -1188,12 +1209,12 @@ INPUT_MAP(Round) = {{1, INPUT_DESC(x)}}; ATTR_MAP(Round) = EMPTY_ATTR_MAP; OUTPUT_MAP(Round) = {{0, OUTPUT_DESC(y)}}; -// ApplyFtrl -INPUT_MAP(ApplyFtrl) = {{1, INPUT_DESC(var)}, {2, INPUT_DESC(accum)}, {3, INPUT_DESC(linear)}, - {4, INPUT_DESC(grad)}, {5, INPUT_DESC(lr)}, {6, INPUT_DESC(l1)}, - {7, INPUT_DESC(l2)}, {8, INPUT_DESC(lr_power)}}; -ATTR_MAP(ApplyFtrl) = {{"use_locking", ATTR_DESC(use_locking, AnyTraits())}}; -OUTPUT_MAP(ApplyFtrl) = {{0, OUTPUT_DESC(var)}}; +// ApplyFtrlD +INPUT_MAP(ApplyFtrlD) = {{1, INPUT_DESC(var)}, {2, INPUT_DESC(accum)}, {3, INPUT_DESC(linear)}, + {4, INPUT_DESC(grad)}, {5, INPUT_DESC(lr)}, {6, INPUT_DESC(l1)}, + {7, INPUT_DESC(l2)}, {8, INPUT_DESC(lr_power)}}; +ATTR_MAP(ApplyFtrlD) = {{"use_locking", ATTR_DESC(use_locking, AnyTraits())}}; +OUTPUT_MAP(ApplyFtrlD) = {{0, OUTPUT_DESC(var)}, {1, OUTPUT_DESC(accum)}, {2, OUTPUT_DESC(linear)}}; // Diag INPUT_MAP(Diag) = {{1, INPUT_DESC(x)}}; @@ -1256,6 +1277,19 @@ ATTR_MAP(CTCLoss) = { 
{"ignore_longer_outputs_than_inputs", ATTR_DESC(ignore_longer_outputs_than_inputs, AnyTraits())}}; OUTPUT_MAP(CTCLoss) = {{0, OUTPUT_DESC(loss)}, {1, OUTPUT_DESC(gradient)}}; +// AscendQuant +INPUT_MAP(AscendQuant) = {{1, INPUT_DESC(x)}}; +ATTR_MAP(AscendQuant) = {{"scale", ATTR_DESC(scale, AnyTraits())}, + {"offset", ATTR_DESC(offset, AnyTraits())}, + {"sqrt_mode", ATTR_DESC(sqrt_mode, AnyTraits())}, + {"round_mode", ATTR_DESC(round_mode, AnyTraits())}}; +OUTPUT_MAP(AscendQuant) = {{0, OUTPUT_DESC(y)}}; + +// AscendDequant +INPUT_MAP(AscendDequant) = {{1, INPUT_DESC(x)}, {2, INPUT_DESC(deq_scale)}}; +ATTR_MAP(AscendDequant) = {{"sqrt_mode", ATTR_DESC(sqrt_mode, AnyTraits())}, + {"relu_flag", ATTR_DESC(relu_flag, AnyTraits())}}; +OUTPUT_MAP(AscendDequant) = {{0, OUTPUT_DESC(y)}}; #ifdef ENABLE_GE // Print INPUT_MAP(Print) = EMPTY_INPUT_MAP; diff --git a/mindspore/ccsrc/transform/op_declare.h b/mindspore/ccsrc/transform/op_declare.h index 3d1b6e7a7f..f64dc7b671 100755 --- a/mindspore/ccsrc/transform/op_declare.h +++ b/mindspore/ccsrc/transform/op_declare.h @@ -120,6 +120,8 @@ DECLARE_OP_ADAPTER(ResizeNearestNeighborV2Grad) DECLARE_OP_USE_OUTPUT(ResizeNearestNeighborV2Grad) DECLARE_OP_ADAPTER(ApplyAdam) DECLARE_OP_USE_OUTPUT(ApplyAdam) +DECLARE_OP_ADAPTER(ApplyAdamD) +DECLARE_OP_USE_OUTPUT(ApplyAdamD) DECLARE_OP_ADAPTER(Relu6) DECLARE_OP_USE_OUTPUT(Relu6) DECLARE_OP_ADAPTER(Relu6Grad) @@ -132,6 +134,8 @@ DECLARE_OP_ADAPTER(ZerosLike) DECLARE_OP_USE_OUTPUT(ZerosLike) DECLARE_OP_ADAPTER(OnesLike) DECLARE_OP_USE_OUTPUT(OnesLike) +DECLARE_OP_ADAPTER(TensorScatterUpdate) +DECLARE_OP_USE_OUTPUT(TensorScatterUpdate) DECLARE_OP_ADAPTER(ScatterUpdate) DECLARE_OP_USE_OUTPUT(ScatterUpdate) DECLARE_OP_ADAPTER(ScatterNdUpdate) @@ -319,8 +323,8 @@ DECLARE_OP_ADAPTER(Assign) DECLARE_OP_USE_OUTPUT(Assign) DECLARE_OP_ADAPTER(Constant) DECLARE_OP_USE_OUTPUT(Constant) -DECLARE_OP_ADAPTER(ApplyMomentum) -DECLARE_OP_USE_OUTPUT(ApplyMomentum) +DECLARE_OP_ADAPTER(ApplyMomentumD) 
+DECLARE_OP_USE_OUTPUT(ApplyMomentumD) // ** Summary Operations ** DECLARE_OP_ADAPTER(Summary) @@ -442,6 +446,8 @@ DECLARE_OP_ADAPTER(BinaryCrossEntropyGrad) DECLARE_OP_USE_OUTPUT(BinaryCrossEntropyGrad) DECLARE_OP_ADAPTER(SparseApplyAdagradD) DECLARE_OP_USE_OUTPUT(SparseApplyAdagradD) +DECLARE_OP_ADAPTER(ApplyProximalAdagradD) +DECLARE_OP_USE_OUTPUT(ApplyProximalAdagradD) DECLARE_OP_ADAPTER(SpaceToDepth) DECLARE_OP_USE_OUTPUT(SpaceToDepth) DECLARE_OP_ADAPTER(DepthToSpace) @@ -452,8 +458,8 @@ DECLARE_OP_ADAPTER(LarsV2Update) DECLARE_OP_USE_OUTPUT(LarsV2Update) DECLARE_OP_ADAPTER(Round) DECLARE_OP_USE_OUTPUT(Round) -DECLARE_OP_ADAPTER(ApplyFtrl) -DECLARE_OP_USE_OUTPUT(ApplyFtrl) +DECLARE_OP_ADAPTER(ApplyFtrlD) +DECLARE_OP_USE_OUTPUT(ApplyFtrlD) DECLARE_OP_ADAPTER(SparseApplyFtrlD) DECLARE_OP_USE_OUTPUT(SparseApplyFtrlD) DECLARE_OP_ADAPTER(Diag) @@ -475,6 +481,10 @@ DECLARE_OP_ADAPTER(L2Loss) DECLARE_OP_USE_OUTPUT(L2Loss) DECLARE_OP_ADAPTER(CTCLoss) DECLARE_OP_USE_OUTPUT(CTCLoss) +DECLARE_OP_ADAPTER(AscendQuant) +DECLARE_OP_USE_OUTPUT(AscendQuant) +DECLARE_OP_ADAPTER(AscendDequant) +DECLARE_OP_USE_OUTPUT(AscendDequant) #ifdef ENABLE_GE DECLARE_OP_ADAPTER(Print) DECLARE_OP_USE_DYN_INPUT(Print) diff --git a/mindspore/ccsrc/utils/CMakeLists.txt b/mindspore/ccsrc/utils/CMakeLists.txt index 71d68729b9..72f698a97e 100644 --- a/mindspore/ccsrc/utils/CMakeLists.txt +++ b/mindspore/ccsrc/utils/CMakeLists.txt @@ -5,5 +5,11 @@ if (NOT ENABLE_GE) list(REMOVE_ITEM _UTILS_SRC_LIST ${_UTILS_GE_SRC_FILES}) endif () +file(GLOB_RECURSE _UTILS_LITE_SRC_FILES + ./load_onnx/anf_converter.cc + ./load_onnx/anf_model_parser.cc + ) +list(REMOVE_ITEM _UTILS_SRC_LIST ${_UTILS_LITE_SRC_FILES}) + set_property(SOURCE ${_UTILS_SRC_LIST} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_UTILS) add_library(_mindspore_utils_obj OBJECT ${_UTILS_SRC_LIST}) diff --git a/mindspore/ccsrc/utils/base_ref_utils.cc b/mindspore/ccsrc/utils/base_ref_utils.cc new file mode 100644 index 
0000000000..87089c6266 --- /dev/null +++ b/mindspore/ccsrc/utils/base_ref_utils.cc @@ -0,0 +1,55 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include "utils/base_ref_utils.h" +#include "include/ms_tensor.h" +#include "ir/tensor.h" + +namespace mindspore { +void IterateFindTensor(std::vector> *msTensors, const VectorRef &ref_list) { + for (size_t i = 0; i < ref_list.size(); ++i) { + if (utils::isa(ref_list[i])) { + auto tensor_ptr = utils::cast>(ref_list[i]); + MS_EXCEPTION_IF_NULL(tensor_ptr); + auto tensor = new inference::Tensor(tensor_ptr); + msTensors->emplace_back(std::shared_ptr(tensor)); + } else if (utils::isa(ref_list[i])) { + auto ref_iter = utils::cast(ref_list[i]); + IterateFindTensor(msTensors, ref_iter); + } else { + MS_LOG(EXCEPTION) << "The output is not a tensor"; + } + } +} + +std::vector> TransformVectorRefToMultiTensor(const VectorRef &base_ref) { + std::vector> msTensors; + if (utils::isa(base_ref)) { + auto ref_list = utils::cast(base_ref); + IterateFindTensor(&msTensors, ref_list); + } else if (utils::isa(base_ref)) { + auto tensor_ptr = utils::cast>(base_ref); + MS_EXCEPTION_IF_NULL(tensor_ptr); + auto tensor = new inference::Tensor(tensor_ptr); + msTensors.emplace_back(std::shared_ptr(tensor)); + } else { + MS_LOG(EXCEPTION) << "The output is not a base ref list or a tensor!"; + } + return msTensors; +} +} // namespace mindspore diff --git 
a/mindspore/ccsrc/utils/base_ref_utils.h b/mindspore/ccsrc/utils/base_ref_utils.h new file mode 100644 index 0000000000..2503eab738 --- /dev/null +++ b/mindspore/ccsrc/utils/base_ref_utils.h @@ -0,0 +1,27 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include "utils/base_ref.h" +#include "include/ms_tensor.h" + +#ifndef MINDSPORE_CCSRC_UTILS_BASE_REF_UTILS_H +#define MINDSPORE_CCSRC_UTILS_BASE_REF_UTILS_H +namespace mindspore { +std::vector> TransformVectorRefToMultiTensor(const VectorRef &base_ref); +} // namespace mindspore +#endif // MINDSPORE_CCSRC_UTILS_BASE_REF_UTILS_H diff --git a/mindspore/ccsrc/utils/callbacks.cc b/mindspore/ccsrc/utils/callbacks.cc index ad9751c332..427cc5e568 100644 --- a/mindspore/ccsrc/utils/callbacks.cc +++ b/mindspore/ccsrc/utils/callbacks.cc @@ -26,9 +26,9 @@ namespace mindspore { namespace callbacks { -const char PYTHON_MOD_CALLBACK_MODULE[] = "mindspore.train.callback"; -const char PYTHON_FUN_PROCESS_CHECKPOINT[] = "_checkpoint_cb_for_save_op"; -const char PYTHON_FUN_PROCESS_SUMMARY[] = "_summary_cb_for_save_op"; +const char PYTHON_MOD_CALLBACK_MODULE[] = "mindspore.train.callback._callback"; +const char PYTHON_FUN_PROCESS_CHECKPOINT[] = "checkpoint_cb_for_save_op"; +const char PYTHON_FUN_PROCESS_SUMMARY[] = "summary_cb_for_save_op"; const char kSummary[] = "Summary"; const char kCheckPoint[] = "Save"; const int ONE_SHAPE = 1; diff --git 
a/mindspore/ccsrc/utils/callbacks_ge.cc b/mindspore/ccsrc/utils/callbacks_ge.cc index 151b78d010..3174ec4b15 100644 --- a/mindspore/ccsrc/utils/callbacks_ge.cc +++ b/mindspore/ccsrc/utils/callbacks_ge.cc @@ -25,9 +25,9 @@ namespace mindspore { namespace callbacks { -const char PYTHON_MOD_CALLBACK_MODULE[] = "mindspore.train.callback"; -const char PYTHON_FUN_PROCESS_CHECKPOINT[] = "_checkpoint_cb_for_save_op"; -const char PYTHON_FUN_PROCESS_SUMMARY[] = "_summary_cb_for_save_op"; +const char PYTHON_MOD_CALLBACK_MODULE[] = "mindspore.train.callback._callback"; +const char PYTHON_FUN_PROCESS_CHECKPOINT[] = "checkpoint_cb_for_save_op"; +const char PYTHON_FUN_PROCESS_SUMMARY[] = "summary_cb_for_save_op"; const char kSummary[] = "Summary"; const char kCheckPoint[] = "Save"; const int ONE_SHAPE = 1; diff --git a/mindspore/ccsrc/utils/context/ms_context.cc b/mindspore/ccsrc/utils/context/ms_context.cc index 0aacf2d2a1..35e053dd53 100644 --- a/mindspore/ccsrc/utils/context/ms_context.cc +++ b/mindspore/ccsrc/utils/context/ms_context.cc @@ -74,6 +74,7 @@ MsContext::MsContext(const std::string &policy, const std::string &target) { precompile_only_ = false; auto_mixed_precision_flag_ = false; enable_pynative_infer_ = false; + enable_pynative_hook_ = false; enable_dynamic_mem_pool_ = true; graph_memory_max_size_ = "0"; variable_memory_max_size_ = "0"; @@ -81,6 +82,9 @@ MsContext::MsContext(const std::string &policy, const std::string &target) { profiling_mode_ = false; profiling_options_ = "training_trace"; check_bprop_flag_ = false; + max_device_memory_ = kDefaultMaxDeviceMemory; + print_file_path_ = ""; + enable_graph_kernel_ = false; } std::shared_ptr MsContext::GetInstance() { diff --git a/mindspore/ccsrc/utils/context/ms_context.h b/mindspore/ccsrc/utils/context/ms_context.h index 9a91f391c9..9afe1fa5aa 100644 --- a/mindspore/ccsrc/utils/context/ms_context.h +++ b/mindspore/ccsrc/utils/context/ms_context.h @@ -41,9 +41,12 @@ const int kPynativeMode = 1; const char 
kCPUDevice[] = "CPU"; const char kGPUDevice[] = "GPU"; const char kAscendDevice[] = "Ascend"; +const char kDavinciInferenceDevice[] = "AscendInference"; const char kDavinciDevice[] = "Davinci"; const char KNpuLog[] = "_npu_log"; const std::set kTargetSet = {kCPUDevice, kGPUDevice, kAscendDevice, kDavinciDevice}; +// The default max available device memory is 1024GB. +const float kDefaultMaxDeviceMemory = 1024; class MsContext { public: @@ -62,6 +65,9 @@ class MsContext { bool enable_pynative_infer() const { return enable_pynative_infer_; } void set_enable_pynative_infer(bool enable_pynative_infer) { enable_pynative_infer_ = enable_pynative_infer; } + bool enable_pynative_hook() const { return enable_pynative_hook_; } + void set_enable_pynative_hook(bool enable_pynative_hook) { enable_pynative_hook_ = enable_pynative_hook; } + bool enable_task_sink() const { return enable_task_sink_; } void set_precompile_only(bool precompile_only) { precompile_only_ = precompile_only; } @@ -92,7 +98,7 @@ class MsContext { bool ir_fusion_flag() const { return ir_fusion_flag_; } bool loop_sink_flag() const { return enable_loop_sink_; } - + void set_loop_sink_flag(bool enable_loop_sink) { enable_loop_sink_ = enable_loop_sink; } void set_enable_mem_reuse(bool enable_mem_reuse) { enable_mem_reuse_ = enable_mem_reuse; } bool enable_mem_reuse() const { return enable_mem_reuse_; } @@ -135,6 +141,10 @@ class MsContext { variable_memory_max_size_ = variable_memory_max_size; } + const std::string &variable_memory_max_size() const { return variable_memory_max_size_; } + + const std::string &graph_memory_max_size() const { return graph_memory_max_size_; } + void set_enable_profiling(bool flag) { profiling_mode_ = flag; } bool enable_profiling() const { return profiling_mode_; } @@ -142,6 +152,14 @@ class MsContext { std::string profiling_options() const { return profiling_options_; } bool check_bprop_flag() const { return check_bprop_flag_; } void set_check_bprop_flag(bool check_bprop_flag) { 
check_bprop_flag_ = check_bprop_flag; } + void set_print_file_path(const std::string &file) { print_file_path_ = file; } + const std::string &print_file_path() const { return print_file_path_; } + + float max_device_memory() const { return max_device_memory_; } + void set_max_device_memory(float max_device_memory) { max_device_memory_ = max_device_memory; } + + void set_enable_graph_kernel(bool enable_graph_kernel) { enable_graph_kernel_ = enable_graph_kernel; } + bool enable_graph_kernel() const { return enable_graph_kernel_; } private: MsContext(const std::string &backend_policy, const std::string &target); @@ -156,6 +174,7 @@ class MsContext { uint32_t device_id_; int execution_mode_; bool enable_pynative_infer_; + bool enable_pynative_hook_; bool save_graphs_flag_; std::string save_graphs_path_; uint32_t tsd_ref_; @@ -182,6 +201,9 @@ class MsContext { bool profiling_mode_; std::string profiling_options_; bool check_bprop_flag_; + float max_device_memory_; + std::string print_file_path_; + bool enable_graph_kernel_; }; } // namespace mindspore diff --git a/mindspore/ccsrc/utils/convert_utils.cc b/mindspore/ccsrc/utils/convert_utils.cc index 45c292d545..6e28e38ed1 100644 --- a/mindspore/ccsrc/utils/convert_utils.cc +++ b/mindspore/ccsrc/utils/convert_utils.cc @@ -30,6 +30,7 @@ #include "pipeline/parse/parse_base.h" #include "ir/value.h" #include "ir/tensor.h" +#include "ir/param_value_py.h" #include "utils/base_ref_extends.h" namespace mindspore { @@ -105,7 +106,7 @@ py::object ValuePtrToPyData(const ValuePtr &value) { } ret = rets; } else if (value->isa()) { - ret = parse::python_adapter::CallPyFn(parse::PYTHON_MOD_PARSE_MODULE, parse::PYTHON_PARSE_CLASS_ELLIPSIS); + ret = py::ellipsis(); } else if (value->isa()) { auto slice = value->cast(); auto start = ValuePtrToPyData(slice->start()); @@ -426,7 +427,17 @@ bool IsGraphOutputValueNodeOrParameter(const AnfNodePtr &output, const py::tuple MS_EXCEPTION(UnknownError) << "Index " << index << " equal or larger than 
args size " << args.size() << " add Parameter count " << func_graph->hyper_param_count() << "."; } - *ret_val = args[index]; + if (index < args.size()) { + *ret_val = args[index]; + } else { + auto param = dyn_cast(params[index]); + MS_EXCEPTION_IF_NULL(param); + if (!param->has_default()) { + MS_LOG(EXCEPTION) << "Can not determine value of Parameter " << index << " (" << param->name() << ")"; + } + auto param_value = std::dynamic_pointer_cast(param->default_param()); + *ret_val = param_value->value().attr("data"); + } return true; } return false; diff --git a/mindspore/ccsrc/utils/graph_utils.h b/mindspore/ccsrc/utils/graph_utils.h index 0b49615523..93edda3e34 100644 --- a/mindspore/ccsrc/utils/graph_utils.h +++ b/mindspore/ccsrc/utils/graph_utils.h @@ -39,6 +39,7 @@ namespace mindspore { enum IncludeType { FOLLOW, NOFOLLOW, EXCLUDE }; using IncludeFunc = std::function; +using FilterFunc = std::function; using SuccFunc = std::function(AnfNodePtr)>; using SearchFunc = std::function(const AnfNodePtr &, const IncludeFunc &)>; @@ -58,6 +59,13 @@ std::vector DeepScopedGraphSearch(const AnfNodePtr &root, const Incl std::vector DeepUsedGraphSearch(const AnfNodePtr &root, const IncludeFunc &include = AlwaysInclude); std::vector DeepLinkedGraphSearch(const AnfNodePtr &root, const IncludeFunc &include = AlwaysInclude); +std::vector DeepScopedGraphSearchWithFilter(const AnfNodePtr &root, const IncludeFunc &include, + const FilterFunc &filter); + +class FuncGraphManager; +using FuncGraphManagerPtr = std::shared_ptr; +std::vector DeepUsersSearch(const AnfNodePtr &root, const IncludeFunc &include, + const FuncGraphManagerPtr &mng); std::vector TopoSort(const AnfNodePtr &root, const SuccFunc &succ = SuccIncoming, const IncludeFunc &include = AlwaysInclude); diff --git a/mindspore/ccsrc/utils/graph_utils_extends.cc b/mindspore/ccsrc/utils/graph_utils_extends.cc index 7c3991b638..0740c24236 100644 --- a/mindspore/ccsrc/utils/graph_utils_extends.cc +++ 
b/mindspore/ccsrc/utils/graph_utils_extends.cc @@ -26,6 +26,7 @@ #include #include "ir/visitor.h" +#include "ir/manager.h" #include "ir/func_graph.h" #include "debug/label.h" #include "utils/log_adapter.h" @@ -37,7 +38,8 @@ namespace mindspore { namespace { class DeepFirstSearcher : public AnfVisitor { public: - explicit DeepFirstSearcher(const IncludeFunc &include) : include_(include) {} + explicit DeepFirstSearcher(const IncludeFunc &include, const FilterFunc &filter = nullptr) + : include_(include), filter_(filter) {} ~DeepFirstSearcher() override = default; std::vector Search(const AnfNodePtr &root) { @@ -61,8 +63,9 @@ class DeepFirstSearcher : public AnfVisitor { if (incl == EXCLUDE) { return; } - - res_.push_back(node); + if (filter_ == nullptr || !filter_(node)) { + res_.push_back(node); + } if (incl == FOLLOW) { AnfVisitor::Visit(node); } @@ -71,6 +74,7 @@ class DeepFirstSearcher : public AnfVisitor { private: size_t seen_{0}; IncludeFunc include_; + FilterFunc filter_; std::vector res_{}; }; @@ -158,12 +162,36 @@ class DeepLinkedGraphSearcher : public DeepFirstSearcher { void Visit(const ValueNodePtr &) override {} }; + +class DeepUsersSearcher : public DeepFirstSearcher { + public: + explicit DeepUsersSearcher(const IncludeFunc &include, const FuncGraphManagerPtr &mng) + : DeepFirstSearcher(include), mng_(mng) {} + ~DeepUsersSearcher() override = default; + + void Visit(const CNodePtr &cnode) override { + auto &users = mng_->node_users()[cnode]; + for (auto iter = users.begin(); iter != users.end(); ++iter) { + DeepFirstSearcher::Visit(iter->first); + } + } + void Visit(const ValueNodePtr &) override {} + + private: + FuncGraphManagerPtr mng_; +}; } // namespace +// include for if expand the node the search, filter for if put the node to results. 
std::vector DeepScopedGraphSearch(const AnfNodePtr &root, const IncludeFunc &include) { return DeepScopedGraphSearcher(include).Search(root); } +std::vector DeepScopedGraphSearchWithFilter(const AnfNodePtr &root, const IncludeFunc &include, + const FilterFunc &filter) { + return DeepFirstSearcher(include, filter).Search(root); +} + std::vector DeepUsedGraphSearch(const AnfNodePtr &root, const IncludeFunc &include) { return DeepUsedGraphSearcher(include).Search(root); } @@ -171,4 +199,9 @@ std::vector DeepUsedGraphSearch(const AnfNodePtr &root, const Includ std::vector DeepLinkedGraphSearch(const AnfNodePtr &root, const IncludeFunc &include) { return DeepLinkedGraphSearcher(include).Search(root); } + +std::vector DeepUsersSearch(const AnfNodePtr &root, const IncludeFunc &include, + const FuncGraphManagerPtr &mng) { + return DeepUsersSearcher(include, mng).Search(root); +} } // namespace mindspore diff --git a/mindspore/ccsrc/utils/lineage.proto b/mindspore/ccsrc/utils/lineage.proto new file mode 100644 index 0000000000..dec6f9a3f6 --- /dev/null +++ b/mindspore/ccsrc/utils/lineage.proto @@ -0,0 +1,129 @@ +// Copyright 2020 Huawei Technologies Co., Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto2"; + +package mindspore.irpb; +option cc_enable_arenas = true; + + +// Event Protocol buffer, Top define +message LineageEvent { + // Timestamp + required double wall_time = 1; + + // The step of train. 
+ optional int64 step = 2; + + oneof what { + // An event file was started, with the specified version. + // Now version is "MindSpore.Event:1" + string version = 3; + + // Train lineage + TrainLineage train_lineage = 6; + + // Evaluation lineage + EvaluationLineage evaluation_lineage = 7; + + // Dataset graph + DatasetGraph dataset_graph = 9; + + // User defined info + UserDefinedInfo user_defined_info = 10; + } +} + +// User defined info +message UserDefinedInfo{ + // repeated user defined info + repeated UserDefinedInfo user_info = 1; + + // key/value which contains both scalar and dict + map map_dict = 2; + map map_int32 = 3; + map map_str = 4; + map map_double = 5; +} + +// TrainLineage records infos of a train. +message TrainLineage{ + message HyperParameters{ + optional string optimizer = 1; + optional float learning_rate = 2; + optional string loss_function = 3; + optional int32 epoch = 4; + optional string parallel_mode = 5; + optional int32 device_num = 6; + optional int32 batch_size = 8; + } + + message TrainDataset{ + optional string train_dataset_path = 1; + optional int32 train_dataset_size = 2; + } + + message Algorithm{ + optional string network = 1; + optional float loss = 2; + } + + message Model{ + optional string path = 3; + optional int64 size = 4; + } + + optional HyperParameters hyper_parameters = 1; + optional TrainDataset train_dataset = 2; + optional Algorithm algorithm = 3; + optional Model model = 4; +} + +//EvalLineage records infos of evaluation. 
+message EvaluationLineage{ + message ValidDataset{ + optional string valid_dataset_path = 1; + optional int32 valid_dataset_size = 2; + } + + optional string metric = 2; + optional ValidDataset valid_dataset = 3; +} + + +// DatasetGraph +message DatasetGraph { + repeated DatasetGraph children = 1; + optional OperationParameter parameter = 2; + repeated Operation operations = 3; + optional Operation sampler = 4; +} + +message Operation { + optional OperationParameter operationParam = 1; + repeated int32 size = 2; + repeated float weights = 3; +} + +message OperationParameter{ + map mapStr = 1; + map mapStrList = 2; + map mapBool = 3; + map mapInt = 4; + map mapDouble = 5; +} + +message StrList { + repeated string strValue = 1; +} diff --git a/mindspore/ccsrc/utils/load_onnx/anf_converter.cc b/mindspore/ccsrc/utils/load_onnx/anf_converter.cc new file mode 100644 index 0000000000..ad87d6ae8f --- /dev/null +++ b/mindspore/ccsrc/utils/load_onnx/anf_converter.cc @@ -0,0 +1,115 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "utils/load_onnx/anf_model_parser.h" +#include "utils/load_onnx/anf_converter.h" +#include "google/protobuf/io/zero_copy_stream_impl.h" +#include "proto/onnx.pb.h" +#include "utils/log_adapter.h" + +namespace mindspore { +namespace lite { + +const char WHITESPACE[] = "\t\n\v\f\r "; +const int FLAG_PREFIX_LEN = 2; + +void AnfConverter::Trim(std::string *input) { + if (input == nullptr) { + return; + } + if (input->empty()) { + return; + } + input->erase(0, input->find_first_not_of(WHITESPACE)); + input->erase(input->find_last_not_of(WHITESPACE) + 1); +} + +int AnfConverter::ValidateFileStr(const std::string &modelFile, std::string fileType) { + if (modelFile.size() > fileType.size()) { + if (modelFile.substr(modelFile.size() - fileType.size()) == fileType) { + return 0; + } else { + return 1; + } + } else { + return 1; + } +} + +bool AnfConverter::ReadOnnxFromBinary(const std::string &modelFile, google::protobuf::Message *onnx_model) { + std::unique_ptr onnx_file(new (std::nothrow) char[PATH_MAX]{0}); + int fd = open(onnx_file.get(), O_RDONLY); + google::protobuf::io::FileInputStream input(fd); + google::protobuf::io::CodedInputStream code_input(&input); + code_input.SetTotalBytesLimit(INT_MAX, 536870912); + bool ret = onnx_model->ParseFromCodedStream(&code_input); + if (!ret) { + MS_LOG(ERROR) << "load onnx file failed"; + return false; + } + (void)close(fd); + MS_LOG(INFO) << "enter ReadProtoFromBinary success!" 
<< std::endl; + return true; +} + +std::shared_ptr AnfConverter::RunAnfConverter(const std::string &file_path) { + std::string modelFile; + + std::string tmp = file_path; + Trim(&tmp); + const std::string flagItem(tmp); + + size_t pos = flagItem.find_first_of("="); + if (pos == std::string::npos) { + MS_LOG(ERROR) << "Trans data not support input format!"; + } else { + modelFile = flagItem.substr(pos + 1); + std::cout << "input protobuf file path is: " << flagItem.substr(pos + 1) << std::endl; + } + + if (ValidateFileStr(modelFile, ".pb") != 0) { + MS_LOG(EXCEPTION) << "INPUT ILLEGAL: modelFile must be *.pb"; + } + + onnx::ModelProto model_; + ReadOnnxFromBinary(modelFile, &model_); + MSANFModelParser model_parser; + FuncGraphPtr dstgraph_ptr = model_parser.Parse(model_); + return dstgraph_ptr; +} + +std::shared_ptr AnfConverter::RunAnfConverter(const char *buf, const size_t buf_size) { + Py_Initialize(); + MS_EXCEPTION_IF_NULL(buf); + std::string str((const char *)buf, buf_size); + onnx::ModelProto model_; + if (!model_.ParseFromString(str)) { + MS_LOG(EXCEPTION) << "Parse model from buffer fail!"; + } + MSANFModelParser model_parser; + FuncGraphPtr dstgraph_ptr = model_parser.Parse(model_); + return dstgraph_ptr; +} +} // namespace lite +} // namespace mindspore diff --git a/mindspore/ccsrc/utils/load_onnx/anf_converter.h b/mindspore/ccsrc/utils/load_onnx/anf_converter.h new file mode 100644 index 0000000000..4f5fe3971f --- /dev/null +++ b/mindspore/ccsrc/utils/load_onnx/anf_converter.h @@ -0,0 +1,39 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_UTILS_LOAD_ONNX_ANF_CONVERTER_H +#define MINDSPORE_CCSRC_UTILS_LOAD_ONNX_ANF_CONVERTER_H +#include +#include +#include "google/protobuf/io/zero_copy_stream_impl.h" +#include "proto/onnx.pb.h" +#include "ir/func_graph.h" + +namespace mindspore { +namespace lite { +class AnfConverter { + public: + static std::shared_ptr RunAnfConverter(const std::string &file_path); + static std::shared_ptr RunAnfConverter(const char *buf, const size_t buf_size); + + private: + static void Trim(std::string *input); + static int ValidateFileStr(const std::string &modelFile, std::string fileType); + static bool ReadOnnxFromBinary(const std::string &modelFile, google::protobuf::Message *onnx_model); +}; +} // namespace lite +} // namespace mindspore +#endif diff --git a/mindspore/ccsrc/utils/load_onnx/anf_model_parser.cc b/mindspore/ccsrc/utils/load_onnx/anf_model_parser.cc new file mode 100644 index 0000000000..e44eb23001 --- /dev/null +++ b/mindspore/ccsrc/utils/load_onnx/anf_model_parser.cc @@ -0,0 +1,571 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "utils/load_onnx/anf_model_parser.h" +#include +#include +#include +#include +#include +#include "google/protobuf/io/zero_copy_stream_impl.h" +#include "ir/tensor.h" +#include "ir/param_value_py.h" +#include "operator/ops.h" +#include "pipeline/static_analysis/abstract_value.h" +#include "proto/onnx.pb.h" +#include "utils/log_adapter.h" + +using std::string; + +namespace mindspore { +namespace lite { +static constexpr char kConstantValueNode[] = "Constant"; +static constexpr char kCNodeShapeAttr[] = "shape"; +static constexpr char kCNodeShape1Attr[] = "shape1"; +static constexpr char kCNodeShape2Attr[] = "shape2"; +enum ParseForm : int { + FORM_PARSE_TYPE = 0, + FORM_PARSE_SCALAR = 1, + FORM_PARSE_TENSOR = 2, +}; + +static std::map kParseTypeSwitchMap{ + {"type", FORM_PARSE_TYPE}, {"scalar", FORM_PARSE_SCALAR}, {"tensor", FORM_PARSE_TENSOR}}; + +static std::unordered_map kDefaultValueSwitchMap{ + {onnx::TensorProto_DataType_BOOL, kNumberTypeBool}, {onnx::TensorProto_DataType_INT8, kNumberTypeInt8}, + {onnx::TensorProto_DataType_INT16, kNumberTypeInt16}, {onnx::TensorProto_DataType_INT32, kNumberTypeInt32}, + {onnx::TensorProto_DataType_INT64, kNumberTypeInt64}, {onnx::TensorProto_DataType_UINT8, kNumberTypeUInt8}, + {onnx::TensorProto_DataType_UINT16, kNumberTypeUInt16}, {onnx::TensorProto_DataType_UINT32, kNumberTypeUInt32}, + {onnx::TensorProto_DataType_UINT64, kNumberTypeUInt64}, {onnx::TensorProto_DataType_FLOAT16, kNumberTypeFloat16}, + {onnx::TensorProto_DataType_FLOAT, kNumberTypeFloat32}, {onnx::TensorProto_DataType_DOUBLE, kNumberTypeFloat64}, + {onnx::TensorProto_DataType_STRING, kObjectTypeString}, +}; + +#define PARSE_ONNXATTR_IN_SCALAR_FORM(type, valuetype) \ + void ParseAttrInScalar_##type##_##valuetype(const PrimitivePtr &prim, const std::string &attr_name, \ + const onnx::TensorProto &attr_tensor) { \ + 
MS_EXCEPTION_IF_NULL(prim); \ + std::vector attr_value_vec; \ + for (int i = 0; i < attr_tensor.type##_data_size(); ++i) { \ + auto value = static_cast(attr_tensor.type##_data(i)); \ + attr_value_vec.push_back(MakeValue(value)); \ + } \ + if (attr_value_vec.size() == 1) { \ + prim->AddAttr(attr_name, attr_value_vec[0]); \ + } else { \ + prim->AddAttr(attr_name, std::make_shared(attr_value_vec)); \ + } \ + } + +PARSE_ONNXATTR_IN_SCALAR_FORM(double, double) +PARSE_ONNXATTR_IN_SCALAR_FORM(float, float) +PARSE_ONNXATTR_IN_SCALAR_FORM(string, string) +PARSE_ONNXATTR_IN_SCALAR_FORM(int32, int32) +PARSE_ONNXATTR_IN_SCALAR_FORM(int32, bool) +PARSE_ONNXATTR_IN_SCALAR_FORM(int64, int64) +PARSE_ONNXATTR_IN_SCALAR_FORM(uint64, uint64) + +bool MSANFModelParser::BuildParameterForFuncGraph(const ParameterPtr &node, const onnx::ValueInfoProto &value_proto) { + MS_EXCEPTION_IF_NULL(node); + if (!value_proto.has_type() || !value_proto.has_name()) { + MS_LOG(ERROR) << "onnx ValueInfoProto has no type or name! "; + return false; + } + node->set_name(value_proto.name()); + const auto &type_proto = value_proto.type(); + if (!type_proto.has_tensor_type()) { + MS_LOG(ERROR) << "onnx TypeProto has no tesor_type! "; + return false; + } + const onnx::TypeProto_Tensor &tensor_typeproto = type_proto.tensor_type(); + if (!tensor_typeproto.has_elem_type() || !tensor_typeproto.has_shape()) { + MS_LOG(ERROR) << "onnx TypeProto_Tensor has no elem_type or shape! 
"; + return false; + } + const onnx::TensorShapeProto &tensor_shape = tensor_typeproto.shape(); + std::vector shape; + for (int i = 0; i < tensor_shape.dim_size(); ++i) { + shape.push_back(tensor_shape.dim(i).dim_value()); + } + + if (kDefaultValueSwitchMap.find(tensor_typeproto.elem_type()) == kDefaultValueSwitchMap.end()) { + MS_LOG(ERROR) << "onnx TypeProto_Tensor elem_type is not support yet!"; + return false; + } + + tensor::TensorPtr tensor_info = + std::make_shared(kDefaultValueSwitchMap[tensor_typeproto.elem_type()], shape); + MS_EXCEPTION_IF_NULL(tensor_info); + auto tensor_abstract = tensor_info->ToAbstract(); + MS_EXCEPTION_IF_NULL(tensor_abstract); + node->set_abstract(tensor_abstract); + + if (default_para_map_.find(value_proto.name()) != default_para_map_.end()) { + const onnx::TensorProto initialize_proto = default_para_map_[value_proto.name()]; + std::string initial_data = initialize_proto.raw_data(); + auto *tensor_data_buf = reinterpret_cast(tensor_info->data_c(true)); + MS_EXCEPTION_IF_NULL(tensor_data_buf); + memcpy_s(tensor_data_buf, tensor_info->data().nbytes(), initial_data.data(), initial_data.size()); + + py::array array_data = tensor_info->data(); + ParamValuePyPtr para_value_ptr = std::make_shared(); + MS_EXCEPTION_IF_NULL(para_value_ptr); + para_value_ptr->set_value(array_data); + node->set_default_param(para_value_ptr); + } + anfnode_build_map_[value_proto.name()] = node; + return true; +} + +bool MSANFModelParser::ImportParametersForGraph(const FuncGraphPtr &outputFuncGraph, + const onnx::GraphProto &importProto) { + MS_EXCEPTION_IF_NULL(outputFuncGraph); + MS_LOG(INFO) << "Parameters had default paramerer size is: " << importProto.initializer_size(); + + for (int i = 0; i < importProto.initializer_size(); ++i) { + const onnx::TensorProto &initializer_proto = importProto.initializer(i); + if (!initializer_proto.has_name()) { + MS_LOG(ERROR) << "initializer vector of onnx GraphProto has no name at index: " << i; + return false; + } + 
default_para_map_[initializer_proto.name()] = initializer_proto; + } + + MS_LOG(INFO) << "all parameters size: " << importProto.input_size(); + for (int i = 0; i < importProto.input_size(); ++i) { + const onnx::ValueInfoProto &input_proto = importProto.input(i); + if (!BuildParameterForFuncGraph(outputFuncGraph->add_parameter(), input_proto)) { + MS_LOG(ERROR) << "Build parameter for funcgraph fail at index: " << i; + return false; + } + } + return true; +} + +bool MSANFModelParser::ObtainCNodeAttrInTypeForm(const PrimitivePtr &prim, const std::string &attr_name, + const onnx::TensorProto &attr_tensor) { + MS_EXCEPTION_IF_NULL(prim); + const int attr_tensor_type = attr_tensor.data_type(); + if (kDefaultValueSwitchMap.find(attr_tensor_type) == kDefaultValueSwitchMap.end()) { + MS_LOG(ERROR) << "Obtain attr in type-form has not support input type:" << attr_tensor_type; + return false; + } + prim->AddAttr(attr_name, TypeIdToType(kDefaultValueSwitchMap[attr_tensor_type])); + return true; +} + +bool MSANFModelParser::ObtainCNodeAttrInScalarForm(const PrimitivePtr &prim, const std::string &attr_name, + const onnx::TensorProto &attr_tensor) { + MS_EXCEPTION_IF_NULL(prim); + const int attr_tensor_type = attr_tensor.data_type(); + switch (attr_tensor_type) { + case onnx::TensorProto_DataType_STRING: { + ParseAttrInScalar_string_string(prim, attr_name, attr_tensor); + break; + } + case onnx::TensorProto_DataType_INT32: { + ParseAttrInScalar_int32_int32(prim, attr_name, attr_tensor); + break; + } + case onnx::TensorProto_DataType_INT64: { + ParseAttrInScalar_int64_int64(prim, attr_name, attr_tensor); + break; + } + case onnx::TensorProto_DataType_UINT64: { + ParseAttrInScalar_uint64_uint64(prim, attr_name, attr_tensor); + break; + } + case onnx::TensorProto_DataType_FLOAT: { + ParseAttrInScalar_float_float(prim, attr_name, attr_tensor); + break; + } + case onnx::TensorProto_DataType_DOUBLE: { + ParseAttrInScalar_double_double(prim, attr_name, attr_tensor); + break; + } + case 
onnx::TensorProto_DataType_BOOL: { + ParseAttrInScalar_int32_bool(prim, attr_name, attr_tensor); + auto value = prim->GetAttr(attr_name); + break; + } + default: + MS_LOG(ERROR) << "Obtain attr in scalar-form has not support input type: " << attr_tensor_type; + return false; + } + return true; +} + +bool MSANFModelParser::ObtainCNodeAttrInTensorForm(const PrimitivePtr &prim, const std::string &attr_name, + const onnx::TensorProto &attr_tensor) { + MS_EXCEPTION_IF_NULL(prim); + MS_LOG(ERROR) << "parse attr type don't support attr type is tensor"; + return false; +} + +bool MSANFModelParser::GetAttrValueForCNode(const PrimitivePtr &prim, const onnx::AttributeProto &attr_proto) { + MS_EXCEPTION_IF_NULL(prim); + const std::string &attr_name = attr_proto.name(); + if (!attr_proto.has_ref_attr_name()) { + MS_LOG(ERROR) << "CNode parse attr type has no ref_attr_name"; + return false; + } + const std::string &ref_attr_name = attr_proto.ref_attr_name(); + const onnx::TensorProto &attr_tensor = attr_proto.t(); + switch (kParseTypeSwitchMap[ref_attr_name]) { + case FORM_PARSE_TYPE: { + return ObtainCNodeAttrInTypeForm(prim, attr_name, attr_tensor); + } + case FORM_PARSE_SCALAR: { + return ObtainCNodeAttrInScalarForm(prim, attr_name, attr_tensor); + } + case FORM_PARSE_TENSOR: { + return ObtainCNodeAttrInTensorForm(prim, attr_name, attr_tensor); + } + default: + MS_LOG(ERROR) << "parse attr type don't support input of ref_attr_name"; + return false; + } +} +bool MSANFModelParser::ObtainValueNodeInTensorForm(const std::string &value_node_name, + const onnx::TensorProto &attr_tensor) { + const int attr_tensor_type = attr_tensor.data_type(); + std::vector shape; + for (int i = 0; i < attr_tensor.dims_size(); ++i) { + shape.push_back(attr_tensor.dims(i)); + } + tensor::TensorPtr tensor_info = std::make_shared(kDefaultValueSwitchMap[attr_tensor_type], shape); + const std::string &tensor_buf = attr_tensor.raw_data(); + auto *tensor_data_buf = 
reinterpret_cast(tensor_info->data_c(true)); + memcpy_s(tensor_data_buf, tensor_info->data().nbytes(), tensor_buf.data(), tensor_buf.size()); + auto new_value_node = NewValueNode(MakeValue(tensor_info)); + MS_EXCEPTION_IF_NULL(new_value_node); + auto tensor_abstract = tensor_info->ToAbstract(); + MS_EXCEPTION_IF_NULL(tensor_abstract); + new_value_node->set_abstract(tensor_abstract); + anfnode_build_map_[value_node_name] = new_value_node; + return true; +} + +bool MSANFModelParser::ObtainValueNodeInScalarForm(const std::string &value_node_name, + const onnx::TensorProto &attr_tensor) { + const int attr_tensor_type = attr_tensor.data_type(); + ValuePtr value_ptr = nullptr; + switch (attr_tensor_type) { + case onnx::TensorProto_DataType_INT32: { + std::vector add_data; + for (int i = 0; i < attr_tensor.int32_data_size(); ++i) { + add_data.push_back(attr_tensor.int32_data(i)); + } + if (add_data.size() == 1) { + value_ptr = MakeValue(add_data[0]); + } else if (!add_data.empty()) { + value_ptr = MakeValue>(add_data); + } + break; + } + case onnx::TensorProto_DataType_FLOAT: { + std::vector add_data; + for (int i = 0; i < attr_tensor.float_data_size(); ++i) { + add_data.push_back(attr_tensor.float_data(i)); + } + + if (add_data.size() == 1) { + value_ptr = MakeValue(add_data[0]); + } else if (!add_data.empty()) { + value_ptr = MakeValue>(add_data); + } + break; + } + case onnx::TensorProto_DataType_UNDEFINED: { + std::vector elems; + value_ptr = std::make_shared(elems); + break; + } + default: + MS_LOG(ERROR) << "Obtain attr in scalar-form has not support input type: " << attr_tensor_type; + return false; + } + auto new_value_node = NewValueNode(value_ptr); + MS_EXCEPTION_IF_NULL(new_value_node); + new_value_node->set_abstract(value_ptr->ToAbstract()); + anfnode_build_map_[value_node_name] = new_value_node; + + return true; +} + +bool MSANFModelParser::ObtainValueNodeInTypeForm(const std::string &value_node_name, + const onnx::TensorProto &attr_tensor) { + const int 
attr_tensor_type = attr_tensor.data_type(); + if (kDefaultValueSwitchMap.find(attr_tensor_type) == kDefaultValueSwitchMap.end()) { + MS_LOG(ERROR) << "Obtain ValueNode attr in type-form has not support input type: " << attr_tensor_type; + return false; + } + auto new_value_node = NewValueNode(TypeIdToType(kDefaultValueSwitchMap[attr_tensor_type])); + abstract::AbstractTypePtr abs_type = std::make_shared(std::make_shared()); + new_value_node->set_abstract(abs_type); + anfnode_build_map_[value_node_name] = new_value_node; + return true; +} + +bool MSANFModelParser::GetAttrValueForValueNode(const std::string &ref_attr_name, const std::string &value_node_name, + const onnx::TensorProto &attr_tensor) { + switch (kParseTypeSwitchMap[ref_attr_name]) { + case FORM_PARSE_SCALAR: { + return ObtainValueNodeInScalarForm(value_node_name, attr_tensor); + } + case FORM_PARSE_TENSOR: { + return ObtainValueNodeInTensorForm(value_node_name, attr_tensor); + } + case FORM_PARSE_TYPE: { + return ObtainValueNodeInTypeForm(value_node_name, attr_tensor); + } + default: + MS_LOG(ERROR) << "parse ValueNode value don't support input of ref_attr_name"; + return false; + } + return true; +} + +bool MSANFModelParser::BuildValueNodeForFuncGraph(const onnx::NodeProto &node_proto) { + const std::string &value_node_name = node_proto.output(0); + const onnx::AttributeProto &attr_proto = node_proto.attribute(0); + if (!attr_proto.has_ref_attr_name()) { + MS_LOG(ERROR) << "parse ValueNode don't have ref_attr_name"; + return false; + } + const std::string &ref_attr_name = attr_proto.ref_attr_name(); + const onnx::TensorProto &attr_tensor = attr_proto.t(); + + return GetAttrValueForValueNode(ref_attr_name, value_node_name, attr_tensor); +} + +AbstractBasePtr MSANFModelParser::GetAbstractForCNode(const onnx::AttributeProto &attr_proto) { + std::vector shape_vec; + const onnx::TensorProto &attr_tensor = attr_proto.t(); + for (int i = 0; i < attr_tensor.dims_size(); ++i) { + 
shape_vec.push_back(attr_tensor.dims(i)); + } + tensor::TensorPtr tensor_info = + std::make_shared(kDefaultValueSwitchMap[attr_tensor.data_type()], shape_vec); + MS_EXCEPTION_IF_NULL(tensor_info); + auto abstract = tensor_info->ToAbstract(); + MS_EXCEPTION_IF_NULL(abstract); + return abstract; +} + +CNodePtr MSANFModelParser::BuildCNodeForFuncGraph(const FuncGraphPtr &outputFuncGraph, + const onnx::NodeProto &node_proto) { + MS_EXCEPTION_IF_NULL(outputFuncGraph); + if (!node_proto.has_op_type()) { + MS_LOG(ERROR) << "Get CNode op_type failed!"; + return nullptr; + } + const std::string &node_name = node_proto.output(0); + const std::string &fullname_with_scope = node_proto.domain(); + const std::string &node_type = node_proto.op_type(); + PrimitivePtr prim = std::make_shared(node_type); + MS_EXCEPTION_IF_NULL(prim); + prim->set_instance_name(node_type); + + AbstractBasePtr abstract = nullptr; + AbstractBasePtr abstract_first = nullptr; + AbstractBasePtr abstract_second = nullptr; + for (int i = 0; i < node_proto.attribute_size(); ++i) { + const onnx::AttributeProto &attr_proto = node_proto.attribute(i); + if (attr_proto.name() == kCNodeShapeAttr) { + abstract = GetAbstractForCNode(attr_proto); + continue; + } + if (attr_proto.name() == kCNodeShape1Attr) { + abstract_first = GetAbstractForCNode(attr_proto); + continue; + } + if (attr_proto.name() == kCNodeShape2Attr) { + abstract_second = GetAbstractForCNode(attr_proto); + continue; + } + if (!GetAttrValueForCNode(prim, attr_proto)) { + MS_LOG(ERROR) << "Get CNode attr failed!"; + return nullptr; + } + } + + std::vector inputs; + inputs.clear(); + inputs.push_back(NewValueNode(prim)); + for (int i = 0; i < node_proto.input_size(); ++i) { + const std::string &input_name = node_proto.input(i); + if (anfnode_build_map_.find(input_name) == anfnode_build_map_.end()) { + MS_LOG(ERROR) << node_name << " input " << i << input_name << "can't find in nodes have parsed"; + return nullptr; + } + 
inputs.push_back(anfnode_build_map_[input_name]); + } + CNodePtr cnode_ptr = outputFuncGraph->NewCNode(inputs); + MS_EXCEPTION_IF_NULL(cnode_ptr); + if (node_type == "LayerNorm") { + AbstractBasePtrList elem; + elem.push_back(abstract); + elem.push_back(abstract_first); + elem.push_back(abstract_second); + cnode_ptr->set_abstract(std::make_shared(elem)); + } else if (node_type == "ArgMaxWithValue") { + AbstractBasePtrList elem; + elem.push_back(abstract); + elem.push_back(abstract_first); + cnode_ptr->set_abstract(std::make_shared(elem)); + } else if (nullptr == abstract) { + AbstractBasePtrList elem; + for (size_t index = 1; index < cnode_ptr->inputs().size(); ++index) { + elem.push_back(cnode_ptr->input(index)->abstract()); + } + cnode_ptr->set_abstract(std::make_shared(elem)); + } else { + cnode_ptr->set_abstract(abstract); + } + cnode_ptr->set_fullname_with_scope(fullname_with_scope); + anfnode_build_map_[node_name] = cnode_ptr; + return cnode_ptr; +} + +bool MSANFModelParser::BuildReturnForFuncGraph(const FuncGraphPtr &outputFuncGraph, const onnx::GraphProto &importProto, + const CNodePtr &cnode_ptr) { + MS_EXCEPTION_IF_NULL(outputFuncGraph); + MS_EXCEPTION_IF_NULL(cnode_ptr); + std::vector inputs; + if (importProto.output_size() > 1) { + inputs.clear(); + inputs.push_back(NewValueNode(prim::kPrimMakeTuple)); + AbstractBasePtrList elem; + for (int out_size = 0; out_size < importProto.output_size(); ++out_size) { + const onnx::ValueInfoProto &output_node = importProto.output(out_size); + const std::string &out_tuple = output_node.name(); + inputs.push_back(anfnode_build_map_[out_tuple]); + elem.push_back(anfnode_build_map_[out_tuple]->abstract()); + } + auto maketuple_ptr = outputFuncGraph->NewCNode(inputs); + maketuple_ptr->set_abstract(std::make_shared(elem)); + inputs.clear(); + inputs.push_back(NewValueNode(prim::kPrimReturn)); + inputs.push_back(maketuple_ptr); + auto return_node = outputFuncGraph->NewCNode(inputs); + MS_EXCEPTION_IF_NULL(return_node); + 
outputFuncGraph->set_return(return_node); + MS_LOG(INFO) << "Construct funcgraph finined, all success."; + } else { + const onnx::ValueInfoProto &output_node = importProto.output(0); + const onnx::TypeProto &output_typeproto = output_node.type(); + int output_type = output_typeproto.tensor_type().elem_type(); + std::vector output_shape; + for (int i = 0; i < output_typeproto.tensor_type().shape().dim_size(); ++i) { + output_shape.push_back(output_typeproto.tensor_type().shape().dim(i).dim_value()); + } + tensor::TensorPtr tensor_return = + std::make_shared(kDefaultValueSwitchMap[output_type], output_shape); + inputs.clear(); + inputs.push_back(NewValueNode(prim::kPrimReturn)); + inputs.push_back(cnode_ptr); + auto return_node = outputFuncGraph->NewCNode(inputs); + MS_EXCEPTION_IF_NULL(return_node); + return_node->set_abstract(tensor_return->ToAbstract()); + outputFuncGraph->set_return(return_node); + MS_LOG(INFO) << "Construct funcgraph finined, all success!"; + } + return true; +} + +bool MSANFModelParser::ImportNodesForGraph(const FuncGraphPtr &outputFuncGraph, const onnx::GraphProto &importProto) { + MS_EXCEPTION_IF_NULL(outputFuncGraph); + MS_LOG(INFO) << "The CNdoe size : " << importProto.node_size(); + CNodePtr cnode_ptr = nullptr; + for (int i = 0; i < importProto.node_size(); ++i) { + const onnx::NodeProto &node_proto = importProto.node(i); + const std::string &node_type = node_proto.op_type(); + if (node_type == kConstantValueNode) { + if (!BuildValueNodeForFuncGraph(node_proto)) { + MS_LOG(ERROR) << "Build ValueNode for funcgraph fail at index: : " << i; + return false; + } + continue; + } + cnode_ptr = BuildCNodeForFuncGraph(outputFuncGraph, node_proto); + if (cnode_ptr == nullptr) { + MS_LOG(ERROR) << "Build CNode for funcgraph fail at index: : " << i; + return false; + } + } + + BuildReturnForFuncGraph(outputFuncGraph, importProto, cnode_ptr); + return true; +} + +bool MSANFModelParser::BuildFuncGraph(const FuncGraphPtr &outputFuncGraph, const 
onnx::GraphProto &importProto) { + MS_EXCEPTION_IF_NULL(outputFuncGraph); + GraphDebugInfoPtr debug_info_ptr = outputFuncGraph->debug_info(); + MS_EXCEPTION_IF_NULL(debug_info_ptr); + if (importProto.has_name()) { + debug_info_ptr->set_name(importProto.name()); + } else { + MS_LOG(ERROR) << "FuncGraph under converting has not name!"; + } + + if (!ImportParametersForGraph(outputFuncGraph, importProto)) { + return false; + } + return ImportNodesForGraph(outputFuncGraph, importProto); +} + +bool MSANFModelParser::MSANFParseModelConfigureInfo(const onnx::ModelProto &model_proto) { + if (!model_proto.has_producer_name()) { + MS_LOG(ERROR) << "Parse model producer name from pb file failed!"; + return false; + } + producer_name_ = model_proto.producer_name(); + MS_LOG(INFO) << "producer_name :" << producer_name_; + + if (!model_proto.has_model_version()) { + MS_LOG(ERROR) << "Parse model producer version from pb file failed!"; + return false; + } + model_version_ = model_proto.model_version(); + MS_LOG(INFO) << "producer_version : " << model_version_; + + if (!model_proto.has_ir_version()) { + MS_LOG(ERROR) << "Parse model version from pb file failed!"; + return false; + } + ir_version_ = model_proto.ir_version(); + MS_LOG(INFO) << "ir_version :" << ir_version_; + return true; +} + +FuncGraphPtr MSANFModelParser::Parse(const onnx::ModelProto &model_proto) { + FuncGraphPtr dstGraph = std::make_shared(); + MS_EXCEPTION_IF_NULL(dstGraph); + if (!MSANFParseModelConfigureInfo(model_proto)) { + MS_LOG(ERROR) << "Parse configuration info for pb file failed!"; + } + const onnx::GraphProto &graphBuild = model_proto.graph(); + if (!BuildFuncGraph(dstGraph, graphBuild)) { + MS_LOG(ERROR) << "Build funcgraph failed!"; + return nullptr; + } + MS_LOG(INFO) << "Parse pb to build FuncGraph Success!"; + return dstGraph; +} +} // namespace lite +} // namespace mindspore diff --git a/mindspore/ccsrc/utils/load_onnx/anf_model_parser.h b/mindspore/ccsrc/utils/load_onnx/anf_model_parser.h new 
file mode 100644 index 0000000000..11b9cd101f --- /dev/null +++ b/mindspore/ccsrc/utils/load_onnx/anf_model_parser.h @@ -0,0 +1,78 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_UTILS_LOAD_ONNX_ANF_MODEL_PARSER_H +#define MINDSPORE_CCSRC_UTILS_LOAD_ONNX_ANF_MODEL_PARSER_H + +#include +#include +#include +#include "google/protobuf/io/zero_copy_stream_impl.h" +#include "ir/func_graph.h" +#include "proto/onnx.pb.h" + +namespace mindspore { +namespace lite { +using int32 = int32_t; +using int64 = int64_t; +using uint64 = uint64_t; +using float16 = Eigen::half; +class MSANFModelParser { + public: + MSANFModelParser() = default; + ~MSANFModelParser() = default; + + FuncGraphPtr Parse(const onnx::ModelProto &model_proto); + bool MSANFParseModelConfigureInfo(const onnx::ModelProto &model_proto); + + std::string GetProducerName() { return producer_name_; } + int GetProducerVersion() { return model_version_; } + int GetIrVersion() { return ir_version_; } + + private: + bool BuildFuncGraph(const FuncGraphPtr &outputFuncGraph, const onnx::GraphProto &importProto); + bool ImportParametersForGraph(const FuncGraphPtr &outputFuncGraph, const onnx::GraphProto &importProto); + bool ImportNodesForGraph(const FuncGraphPtr &outputFuncGraph, const onnx::GraphProto &importProto); + bool BuildParameterForFuncGraph(const ParameterPtr &node, const onnx::ValueInfoProto &value_proto); + CNodePtr 
BuildCNodeForFuncGraph(const FuncGraphPtr &outputFuncGraph, const onnx::NodeProto &node_proto); + bool BuildReturnForFuncGraph(const FuncGraphPtr &outputFuncGraph, const onnx::GraphProto &importProto, + const CNodePtr &cnode_ptr); + bool GetAttrValueForCNode(const PrimitivePtr &prim, const onnx::AttributeProto &attr_proto); + bool ObtainCNodeAttrInTypeForm(const PrimitivePtr &prim, const std::string &attr_name, + const onnx::TensorProto &attr_tensor); + bool ObtainCNodeAttrInScalarForm(const PrimitivePtr &prim, const std::string &attr_name, + const onnx::TensorProto &attr_tensor); + bool ObtainCNodeAttrInTensorForm(const PrimitivePtr &prim, const std::string &attr_name, + const onnx::TensorProto &attr_tensor); + bool BuildValueNodeForFuncGraph(const onnx::NodeProto &node_proto); + bool ObtainValueNodeInTensorForm(const string &value_node_name, const onnx::TensorProto &attr_tensor); + + bool ObtainValueNodeInScalarForm(const string &value_node_name, const onnx::TensorProto &attr_tensor); + bool GetAttrValueForValueNode(const string &ref_attr_name, const std::string &value_node_name, + const onnx::TensorProto &attr_tensor); + bool ObtainValueNodeInTypeForm(const string &value_node_name, const onnx::TensorProto &attr_tensor); + AbstractBasePtr GetAbstractForCNode(const onnx::AttributeProto &attr_proto); + + std::string producer_name_; + int model_version_; + int ir_version_; + std::unordered_map anfnode_build_map_; + std::map default_para_map_; +}; +} // namespace lite +} // namespace mindspore + +#endif // MINDSPORE_CCSRC_UTILS_LOAD_ONNX_ANF_MODEL_PARSER_H diff --git a/mindspore/ccsrc/utils/log_adapter.cc b/mindspore/ccsrc/utils/log_adapter.cc index 10f49c7036..d16fbead9b 100644 --- a/mindspore/ccsrc/utils/log_adapter.cc +++ b/mindspore/ccsrc/utils/log_adapter.cc @@ -289,7 +289,7 @@ class LogConfigLexer { return '\0'; } - LogConfigToken GetNext(std::string *ptr) { + LogConfigToken GetNext(std::string *const ptr) { #ifdef DEBUG std::string text; auto tok = 
GetNextInner(&text); diff --git a/mindspore/ccsrc/minnie/tensor_minnie.cc b/mindspore/ccsrc/utils/mpi/mpi_config.cc similarity index 60% rename from mindspore/ccsrc/minnie/tensor_minnie.cc rename to mindspore/ccsrc/utils/mpi/mpi_config.cc index 329bf228e6..e8d81cf843 100644 --- a/mindspore/ccsrc/minnie/tensor_minnie.cc +++ b/mindspore/ccsrc/utils/mpi/mpi_config.cc @@ -14,21 +14,16 @@ * limitations under the License. */ -#include "minnie/tensor_minnie.h" +#include "utils/mpi/mpi_config.h" namespace mindspore { -namespace tensor { -TensorMinnie &TensorMinnie::operator=(const TensorMinnie &tensor) { - if (&tensor == this) { - return *this; - } - this->tensor_addr_ = tensor.tensor_addr(); - this->tensor_size_ = tensor.tensor_size(); - return *this; -} +std::shared_ptr MpiConfig::instance_ = nullptr; -bool TensorMinnie::operator==(const TensorMinnie &tensor) { - return tensor_addr_ == tensor.tensor_addr() && tensor_size_ == tensor.tensor_size(); +std::shared_ptr MpiConfig::GetInstance() { + if (instance_ == nullptr) { + MS_LOG(DEBUG) << "Create new mpi config instance."; + instance_.reset(new (std::nothrow) MpiConfig()); + } + return instance_; } -} // namespace tensor } // namespace mindspore diff --git a/mindspore/ccsrc/utils/mpi/mpi_config.h b/mindspore/ccsrc/utils/mpi/mpi_config.h new file mode 100644 index 0000000000..044e767762 --- /dev/null +++ b/mindspore/ccsrc/utils/mpi/mpi_config.h @@ -0,0 +1,42 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_UTILS_MPI_MS_CONTEXT_H_ +#define MINDSPORE_CCSRC_UTILS_MPI_MS_CONTEXT_H_ +#include +#include "utils/log_adapter.h" + +namespace mindspore { +class MpiConfig { + public: + ~MpiConfig() = default; + MpiConfig(const MpiConfig &) = delete; + MpiConfig &operator=(const MpiConfig &) = delete; + + static std::shared_ptr GetInstance(); + + void set_enable_mpi(bool flag) { enable_mpi_ = flag; } + bool enable_mpi() const { return enable_mpi_; } + + private: + MpiConfig() : enable_mpi_(false) {} + + static std::shared_ptr instance_; + bool enable_mpi_; +}; +} // namespace mindspore + +#endif // MINDSPORE_CCSRC_UTILS_MPI_MS_CONTEXT_H_ diff --git a/mindspore/ccsrc/utils/primitive_utils.cc b/mindspore/ccsrc/utils/primitive_utils.cc index cfbfdebac7..97fa954e12 100644 --- a/mindspore/ccsrc/utils/primitive_utils.cc +++ b/mindspore/ccsrc/utils/primitive_utils.cc @@ -29,9 +29,6 @@ py::function GetBpropFunctionByObj(py::object obj) { py::function GetBpropFunction(std::string name) { auto fn = GetBpropFunctionByObj(py::str(name)); - if (fn.is_none()) { - MS_LOG(WARNING) << "Can't find bprop function for " << name; - } return fn; } @@ -41,7 +38,7 @@ py::function GetComputeFunction(std::string name) { if (!py::hasattr(mod, common::SafeCStr(name))) { PyErr_SetString(PyExc_NotImplementedError, common::SafeCStr(name)); // If raise AttributeError, user can't understand. This case need raise NotImplementedError. 
- throw py::error_already_set(); + throw(py::error_already_set()); } py::object fn = mod.attr(common::SafeCStr(name)); return fn; diff --git a/mindspore/ccsrc/utils/print.proto b/mindspore/ccsrc/utils/print.proto new file mode 100644 index 0000000000..a82791bccf --- /dev/null +++ b/mindspore/ccsrc/utils/print.proto @@ -0,0 +1,39 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +syntax = "proto2"; + +package mindspore.prntpb; + +message TensorProto { + // The shape of the tensor. + repeated int64 dims = 1; + // The type of the tensor. + required string tensor_type = 2; + // The data of the tensor. + required bytes tensor_content = 3; +} + + +message Print { + message Value { + oneof value { + string desc = 1; + TensorProto tensor = 2; + } + } + repeated Value value = 1; +} diff --git a/mindspore/ccsrc/utils/summary.proto b/mindspore/ccsrc/utils/summary.proto index 6ea6ce08b8..f4a2ce957b 100644 --- a/mindspore/ccsrc/utils/summary.proto +++ b/mindspore/ccsrc/utils/summary.proto @@ -32,7 +32,7 @@ message Event { oneof what { // An event file was started, with the specified version. - // Now version is "Mindspore.Event:1" + // Now version is "MindSpore.Event:1" string version = 3; // GraphDef. 
diff --git a/mindspore/ccsrc/utils/tensorprint_utils.cc b/mindspore/ccsrc/utils/tensorprint_utils.cc index f4715b22a8..0d464e88a8 100644 --- a/mindspore/ccsrc/utils/tensorprint_utils.cc +++ b/mindspore/ccsrc/utils/tensorprint_utils.cc @@ -47,6 +47,18 @@ static std::map type_size_map = { {"int64_t", sizeof(int64_t)}, {"uint64_t", sizeof(uint64_t)}, {"float16", sizeof(float) / 2}, {"float", sizeof(float)}, {"double", sizeof(double)}, {"bool", sizeof(bool)}}; +std::string GetParseType(const std::string &tensorType_) { + static const std::map print_parse_map = { + {"int8_t", "Int8"}, {"uint8_t", "Uint8"}, {"int16_t", "Int16"}, {"uint16_t", "Uint16"}, + {"int32_t", "Int32"}, {"uint32_t", "Uint32"}, {"int64_t", "Int64"}, {"uint64_t", "Uint64"}, + {"float16", "Float16"}, {"float", "Float32"}, {"double", "Float64"}, {"bool", "Bool"}}; + auto type_iter = print_parse_map.find(tensorType_); + if (type_iter == print_parse_map.end()) { + MS_LOG(EXCEPTION) << "type of tensor need to print is not support " << tensorType_; + } + return type_iter->second; +} + bool ParseTensorShape(const std::string &input_shape_str, std::vector *const tensor_shape, size_t *dims) { if (tensor_shape == nullptr) { return false; @@ -141,7 +153,7 @@ void convertDataItem2Scalar(const char *str_data_ptr, const string &tensor_type, } else { MS_LOG(EXCEPTION) << "Cannot print scalar because of unsupport data type: " << tensor_type << "."; } -} // namespace mindspore +} bool judgeLengthValid(const size_t str_len, const string &tensor_type) { auto type_iter = type_size_map.find(tensor_type); @@ -200,14 +212,84 @@ bool ConvertDataItem2Tensor(const std::vector &items) { return ret_end_sequence; } -void TensorPrint::operator()() { - while (true) { - std::vector bundle; - if (tdt::TdtHostPopData("_npu_log", bundle) != 0) { +bool SaveDataItem2File(const std::vector &items, const std::string &print_file_path, prntpb::Print print, + std::fstream *output) { + bool ret_end_sequence = false; + for (auto &item : items) 
{ + if (item.dataType_ == tdt::TDT_END_OF_SEQUENCE) { + ret_end_sequence = true; break; } - if (ConvertDataItem2Tensor(bundle)) { - break; + prntpb::Print_Value *value = print.add_value(); + std::shared_ptr str_data_ptr = std::static_pointer_cast(item.dataPtr_); + MS_EXCEPTION_IF_NULL(str_data_ptr); + if (item.tensorShape_ == kShapeScalar || item.tensorShape_ == kShapeNone) { + if (!judgeLengthValid(str_data_ptr->size(), item.tensorType_)) { + MS_LOG(EXCEPTION) << "Print op receive data length is invalid."; + } + } + + std::vector tensor_shape; + size_t totaldims = 1; + if (!ParseTensorShape(item.tensorShape_, &tensor_shape, &totaldims)) { + MS_LOG(EXCEPTION) << "Tensor print can not parse tensor shape, receive info" << item.tensorShape_; + } + + if (item.tensorType_ == "string") { + std::string data(reinterpret_cast(str_data_ptr->c_str()), item.dataLen_); + value->set_desc(data); + } else { + auto parse_type = GetParseType(item.tensorType_); + prntpb::TensorProto *tensor = value->mutable_tensor(); + if (!(item.tensorShape_ == kShapeScalar) && !(item.tensorShape_ == kShapeNone)) { + for (const auto &dim : tensor_shape) { + tensor->add_dims(static_cast<::google::protobuf::int64>(dim)); + } + } + tensor->set_tensor_type(parse_type); + std::string data(reinterpret_cast(str_data_ptr->c_str()), item.dataLen_); + tensor->set_tensor_content(data); + } + + if (!print.SerializeToOstream(output)) { + MS_LOG(EXCEPTION) << "Save print file:" << print_file_path << " fail."; + } + print.Clear(); + } + return ret_end_sequence; +} + +void TensorPrint::operator()() { + prntpb::Print print; + auto ms_context = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(ms_context); + std::string print_file_path = ms_context->print_file_path(); + if (print_file_path == "") { + while (true) { + std::vector bundle; + if (tdt::TdtHostPopData("_npu_log", bundle) != 0) { + break; + } + if (ConvertDataItem2Tensor(bundle)) { + break; + } + } + } else { + std::fstream output(print_file_path, 
std::ios::out | std::ios::trunc | std::ios::binary); + while (true) { + std::vector bundle; + if (tdt::TdtHostPopData("_npu_log", bundle) != 0) { + break; + } + if (SaveDataItem2File(bundle, print_file_path, print, &output)) { + break; + } + } + output.close(); + std::string path_string = print_file_path; + if (chmod(common::SafeCStr(path_string), S_IRUSR) == -1) { + MS_LOG(ERROR) << "Modify file:" << print_file_path << " to r fail."; + return; } } } diff --git a/mindspore/ccsrc/utils/tensorprint_utils.h b/mindspore/ccsrc/utils/tensorprint_utils.h index c8442e6291..4a40862ea3 100644 --- a/mindspore/ccsrc/utils/tensorprint_utils.h +++ b/mindspore/ccsrc/utils/tensorprint_utils.h @@ -23,6 +23,8 @@ #include "tdt/tsd_client.h" #include "tdt/tdt_host_interface.h" #include "tdt/data_common.h" +#include "proto/print.pb.h" +#include "utils/context/ms_context.h" #endif namespace mindspore { class TensorPrint { diff --git a/mindspore/ccsrc/utils/union_find_set.h b/mindspore/ccsrc/utils/union_find_set.h new file mode 100644 index 0000000000..81529c8bcf --- /dev/null +++ b/mindspore/ccsrc/utils/union_find_set.h @@ -0,0 +1,86 @@ +/** + * This is the C++ adaptation and derivative work of Myia (https://github.com/mila-iqia/myia/). + * + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_UTILS_UNION_FIND_SET_H_ +#define MINDSPORE_CCSRC_UTILS_UNION_FIND_SET_H_ + +#include +#include + +namespace mindspore { +template +class UnionFindSet { + public: + UnionFindSet() : union_find_set_() {} + ~UnionFindSet() = default; + void Add(const T &elem) { + if (union_find_set_.find(elem) != union_find_set_.end()) { + return; + } + + union_find_set_[elem] = elem; + } + + T Find(const T &key) { + T key_parent = key; + auto iter = union_find_set_.find(key_parent); + if (iter == union_find_set_.end()) { + MS_LOG(EXCEPTION) << "union_find_set_ cannot find key " << key_parent; + } + while (key_parent != iter->second) { + key_parent = iter->second; + iter = union_find_set_.find(key_parent); + if (iter == union_find_set_.end()) { + MS_LOG(EXCEPTION) << "union_find_set_ cannot find key " << key_parent; + } + } + + T tmp = key; + T tmp_parent; + while (tmp != key_parent) { + iter = union_find_set_.find(tmp); + if (iter == union_find_set_.end()) { + MS_LOG(EXCEPTION) << "union_find_set_ cannot find key " << tmp; + } + tmp_parent = iter->second; + union_find_set_[tmp] = key_parent; + tmp = tmp_parent; + } + return key_parent; + } + + void Union(const T &left, const T &right) { union_find_set_[Find(left)] = Find(right); } + + std::map> GetSets() { + std::map> result; + for (auto &iter : union_find_set_) { + (void)Find(iter.first); + } + for (auto &iter : union_find_set_) { + T parent = Find(iter.first); + result[parent].insert(iter.first); + } + return result; + } + + private: + std::map union_find_set_; +}; +} // namespace mindspore + +#endif // MINDSPORE_CCSRC_UTILS_UNION_FIND_SET_H_ diff --git a/mindspore/ccsrc/utils/utils.h b/mindspore/ccsrc/utils/utils.h index a63810ffaa..972d8df319 100644 --- a/mindspore/ccsrc/utils/utils.h +++ b/mindspore/ccsrc/utils/utils.h @@ -25,6 +25,7 @@ #include #include "utils/log_adapter.h" +#include "ir/dtype/type.h" namespace mindspore { // op name. 
Op which not exists in operator/ops.h, so define it's name here @@ -55,6 +56,7 @@ constexpr auto kExtractImagePatchesOpName = "ExtractImagePatches"; constexpr auto kBNTrainingReduceOpName = "BNTrainingReduce"; constexpr auto kBNTrainingUpdateOpName = "BNTrainingUpdate"; constexpr auto kBNTrainingUpdateV2OpName = "BNTrainingUpdateV2"; +constexpr auto kBNTrainingUpdateV3OpName = "BNTrainingUpdateV3"; constexpr auto kSimpleMeanGradOpName = "SimpleMeanGrad"; constexpr auto kMeanGradOpName = "MeanGrad"; constexpr auto kSliceOpName = "Slice"; @@ -64,11 +66,13 @@ constexpr auto kScatterNdOpName = "ScatterNd"; constexpr auto kStridedSliceAssignOpName = "StridedSliceAssign"; constexpr auto kStridedSliceOpName = "StridedSlice"; constexpr auto kStridedSliceGradOpName = "StridedSliceGrad"; +constexpr auto kSparseGatherV2 = "SparseGatherV2"; constexpr auto kUnsortedSegmentProdOpName = "UnsortedSegmentProd"; constexpr auto kUnsortedSegmentMinOpName = "UnsortedSegmentMin"; constexpr auto kFlattenGradOpName = "FlattenGrad"; constexpr auto kExpandDimsOpName = "ExpandDims"; constexpr auto kSplitOpName = "Split"; +constexpr auto kSplitVOpName = "SplitV"; constexpr auto kSparseApplyAdagradOpName = "SparseApplyAdagrad"; constexpr auto kMomentumOpName = "Momentum"; constexpr auto kApplyMomentumOpName = "ApplyMomentum"; @@ -131,6 +135,8 @@ constexpr auto kResizeNearestNeighborV2OpName = "ResizeNearestNeighborV2"; constexpr auto kResizeNearestNeighborV2GradOpName = "ResizeNearestNeighborV2Grad"; constexpr auto kApplyRMSPropOpname = "ApplyRMSProp"; constexpr auto kCumsumOpName = "Cumsum"; +constexpr auto kInplaceAddOpName = "InplaceAdd"; +constexpr auto kInplaceSubOpName = "InplaceSub"; constexpr auto kResizeBilinearV2OpName = "kResizeBilinearV2"; constexpr auto kReduceProdOpName = "ReduceProd"; constexpr auto kCumprodOpName = "Cumprod"; @@ -153,6 +159,8 @@ constexpr auto kLarsV2UpdateOpName = "LarsV2Update"; constexpr auto kSquareSumAllOpName = "SquareSumAll"; constexpr auto 
kNMSWithMaskOpName = "NMSWithMask"; constexpr auto kSoftmaxGradExtOpName = "SoftmaxGradExt"; +constexpr auto kStridedReadOpName = "StridedRead"; +constexpr auto kStridedWriteOpName = "StridedWrite"; // attr key name constexpr auto kAttrInputNames = "input_names"; @@ -172,9 +180,9 @@ constexpr auto kAttrKeepDims = "keep_dims"; constexpr auto kAttrShapeGamma = "shape_gamma"; constexpr auto kAttrPerm = "perm"; constexpr auto kAttrTransposeFirst = "transpose_first"; -constexpr auto kAttrAutomicAddMemSize = "automic_add_mem_size"; -constexpr auto kAttrAutomicOutputIndexs = "atomic_output_clean_indexs"; -constexpr auto kAttrAutomicWorkspaceSize = "atomic_workspace_clean_size"; +constexpr auto kAttrAtomicAddMemSize = "automic_add_mem_size"; +constexpr auto kAttrAtomicOutputIndexs = "atomic_output_clean_indexs"; +constexpr auto kAttrAtomicWorkspaceIndexs = "atomic_workspace_clean_indexs"; constexpr auto kAttrSwitchCondition = "switch_condition"; constexpr auto kAttrDataType = "data_type"; constexpr auto kAttrActiveTarget = "active_target"; @@ -184,6 +192,9 @@ constexpr auto kAttrEventId = "event_id"; constexpr auto kAttrDynInput = "dynamic"; constexpr auto kAttrDynInputSizes = "dyn_input_sizes"; constexpr auto kAttrSrcFormat = "src_format"; +constexpr auto kAttrMultiples = "multiples"; +constexpr auto kAttrFixPrecision = "fix_precision"; +constexpr auto kAttrOutputPrecision = "output_precision"; constexpr auto kAttrOutputUsedNum = "output_used_num"; constexpr auto kAttrHasBias = "has_bias"; constexpr auto kAttrN = "n"; @@ -197,6 +208,18 @@ constexpr auto kAttrLabelIndex = "label_index"; constexpr auto kAttrLabelSwitchList = "label_switch_list"; constexpr auto kAttrNewAxisMask = "new_axis_mask"; constexpr auto kAttrShrinkAxisMask = "shrink_axis_mask"; +constexpr auto kAttrDatadumpOriginalNames = "_datadump_original_names"; +constexpr auto kAttrStreamId = "stream_id"; +constexpr auto kAttrRecordEvent = "record_event"; +constexpr auto kAttrWaitEvent = "wait_event"; +constexpr 
auto kAttrRecordEventStream = "record_event_stream"; +constexpr auto kAttrWaitEventStream = "wait_event_stream"; +constexpr auto kAttrIndex = "index"; +constexpr auto kAttrSplitDim = "split_dim"; +constexpr auto kAttrNumSplit = "num_split"; +constexpr auto kAttrOutputNum = "output_num"; +constexpr auto kAttrSizeSplits = "size_splits"; +constexpr auto kAttrOutputDefault = "output_default"; // attr value constexpr auto kValueTargetSwitch = "target_switch"; @@ -204,7 +227,9 @@ constexpr auto kValueTargetOther = "target_other"; // some size const size_t kShape4dDims = 4; +const size_t kShape2dDims = 2; const size_t kShape5dDims = 5; +const size_t kShape1dDims = 1; const size_t kCubeSize = 16; const size_t kMemAlignSize = 512; const int kParameterDataTensorMask = 0; @@ -213,6 +238,7 @@ const int kValueNodeTensorMask = 2; // define special index in special node constexpr auto kAnfPrimitiveIndex = 0; +constexpr auto kFirstDataInputIndex = 1; constexpr auto kAnfPartialFuncGraphIndex = 1; constexpr auto kRealInputNodeIndexInTupleGetItem = 1; constexpr auto kInputNodeOutputIndexInTupleGetItem = 2; @@ -251,17 +277,19 @@ const std::set kOptOperatorSet = { kApplyRMSPropOpName, }; -const std::set kNeedTransFormatSet = {kOpFormat_FRAC_Z, kOpFormat_NC1KHKWHWC0, kOpFormat_NC1HWC0, +const std::set kHWSpecialFormatSet = {kOpFormat_FRAC_Z, kOpFormat_NC1KHKWHWC0, kOpFormat_NC1HWC0, kOpFormat_FRAC_NZ, kOpFormat_C1HWNCoC0, kOpFormat_NC1HWC0_C04, kOpFormat_FRACTAL_Z_C04}; +const std::set kFloatDataTypeSet = {kNumberTypeFloat16, kNumberTypeFloat32}; + static inline void ChangeFileMode(const std::string &file_name, mode_t mode) { - if (access(file_name.c_str(), F_OK) != 0) { - MS_LOG(DEBUG) << "File `" << file_name << "` does not exist."; - return; - } - if (chmod(file_name.c_str(), mode) != 0) { - MS_LOG(WARNING) << "Change file `" << file_name << "` to mode " << std::oct << mode << " fail."; + try { + if (chmod(file_name.c_str(), mode) != 0) { + MS_LOG(WARNING) << "Change file `" << 
file_name << "` to mode " << std::oct << mode << " fail."; + } + } catch (std::exception &e) { + MS_LOG(DEBUG) << "File `" << file_name << "` change mode failed! May be not exist."; } } } // namespace mindspore diff --git a/mindspore/ccsrc/vm/backend.cc b/mindspore/ccsrc/vm/backend.cc index 0fac84d901..3fde263c9d 100644 --- a/mindspore/ccsrc/vm/backend.cc +++ b/mindspore/ccsrc/vm/backend.cc @@ -39,14 +39,14 @@ LinConvertResult MsBackend::GetMultiGraphRun(const FuncGraphPtr &g) { multi_result_.inputs = g->parameters(); final_output_ = NewValueNode("fake_output"); multi_result_.outputs = {final_output_}; - GraphId final_g = sess_->GetFinalRunGraph(); + GraphId final_g = target_sess_->GetFinalRunGraph(); multi_result_.run = std::make_shared( - [final_g, this](const VectorRef &args) -> VectorRef { return MsRunGraph(final_g, args); }); + [final_g, this](const VectorRef &args) -> VectorRef { return MsRunGraph(final_g, args, ""); }); return multi_result_; } -LinConvertResult MsBackend::MsConvert(const AnfNodePtrList &lst) { +LinConvertResult MsBackend::MsConvert(const AnfNodePtrList &lst, const std::string &target) { MS_LOG(DEBUG) << "MsConvert"; MS_EXCEPTION_IF_NULL(MsContext::GetInstance()); auto cached = g_ConvertCache.find(lst); @@ -64,17 +64,25 @@ LinConvertResult MsBackend::MsConvert(const AnfNodePtrList &lst) { result.inputs = inputs; result.outputs = outputs; result.graph_id = kInvalidGraphId; - auto graph_id = sess_->CompileGraph(lst, outputs); - if (MsContext::GetInstance()->execution_mode() == kPynativeMode) { - sess_->BuildGraph(graph_id); + GraphId graph_id = kInvalidGraphId; + if (target != target_device_ && !target.empty()) { + CreateOtherSession(target); + graph_id = other_sess_->CompileGraph(lst, outputs); + } else { + graph_id = target_sess_->CompileGraph(lst, outputs); } + if (MsContext::GetInstance()->precompile_only()) { MS_LOG(INFO) << "PrecompileOnly, stop run graph"; return result; } - + if (target != target_device_ && !target.empty()) { + 
other_sess_->BuildGraph(graph_id); + } else if (!is_multi_graph_sink_) { + target_sess_->BuildGraph(graph_id); + } result.run = std::make_shared( - [graph_id, this](const VectorRef &args) -> VectorRef { return MsRunGraph(graph_id, args); }); + [graph_id, target, this](const VectorRef &args) -> VectorRef { return MsRunGraph(graph_id, args, target); }); MS_EXCEPTION_IF_NULL(result.run); result.simu_run = std::make_shared( @@ -92,7 +100,7 @@ void MsBackend::SetSwitchActive(const BaseRef &c, bool cond) { GraphId cond_g = kInvalidGraphId; if (utils::isa(c)) { - cond_g = sess_->GetGraphIdByNode(utils::cast(c)); + cond_g = target_sess_->GetGraphIdByNode(utils::cast(c)); } else { MS_LOG(EXCEPTION) << "cond not a anf node:" << c.ToString(); } @@ -116,7 +124,7 @@ void MsBackend::SetSwitchActive(const BaseRef &c, bool cond) { MS_LOG(DEBUG) << "invoke set active:" << active_g; } MS_LOG(DEBUG) << "switch set active:" << active_g << ", " << cond_g; - sess_->SetActive(active_g, cond_g); + target_sess_->SetActive(active_g, cond_g); } void MsBackend::SetSwitchGraph() { @@ -135,12 +143,12 @@ void MsBackend::SetSwitchGraph() { } GraphId cond_g = kInvalidGraphId; if (utils::isa(curr_switch_)) { - cond_g = sess_->GetGraphIdByNode(utils::cast(curr_switch_)); + cond_g = target_sess_->GetGraphIdByNode(utils::cast(curr_switch_)); } else { MS_LOG(EXCEPTION) << "cond not a anf node:" << curr_switch_.ToString(); } MS_LOG(DEBUG) << "switch compile:" << cond_g << ", " << true_g << ", " << false_g; - sess_->SwitchCompile(cond_g, true_g, false_g, utils::cast(curr_switch_)); + target_sess_->SwitchCompile(cond_g, true_g, false_g, utils::cast(curr_switch_)); } is_switch_call_ = false; MS_LOG(DEBUG) << "end SetSwitchGraph:" << curr_cond << ", " << is_switch_call_; @@ -202,7 +210,7 @@ void MsBackend::RecallGraphInput(const FuncGraphPtr &func_graph, const VectorRef old_args[i] = args[it->second]; } } - sess_->SetChildGraphInput(graph, old_args); + target_sess_->SetChildGraphInput(graph, old_args); } 
graph_inputs_.erase(c); } @@ -211,7 +219,7 @@ void MsBackend::RecallGraphInput(const FuncGraphPtr &func_graph, const VectorRef VectorRef MsBackend::MsSimuRunGraph(const GraphId &g, const VectorRef &args) { MS_LOG(DEBUG) << "set graph input:" << g; // switch maybe twice - sess_->SetChildGraphInput(g, args); + target_sess_->SetChildGraphInput(g, args); if (is_switch_call_) { if (!curr_switch_.is_null()) { @@ -236,7 +244,7 @@ VectorRef MsBackend::MsSimuRunGraph(const GraphId &g, const VectorRef &args) { return VectorRef(outputs); } -VectorRef MsBackend::MsRunGraph(const GraphId &g, const VectorRef &args) { +VectorRef MsBackend::MsRunGraph(const GraphId &g, const VectorRef &args, const std::string &target) { MS_LOG(DEBUG) << "start ms graph run:" << args.size() << ", g:" << g; // Run graph std::vector inputs; @@ -271,7 +279,12 @@ VectorRef MsBackend::MsRunGraph(const GraphId &g, const VectorRef &args) { VectorRef outputs; // call ms rungraph (graphId, input ,output) - sess_->RunGraph(g, inputs, &outputs); + if (target != target_device_ && !target.empty()) { + other_sess_->RunGraph(g, inputs, &outputs); + } else { + target_sess_->RunGraph(g, inputs, &outputs); + } + MS_LOG(DEBUG) << "RunGraph finished:" << outputs.size(); return outputs; } @@ -300,17 +313,17 @@ void MsBackend::SimulateRun(FinalVMPtr rt, FuncGraphPtr root) { (void)std::transform(parameters.begin(), parameters.end(), std::back_inserter(args), [](const AnfNodePtr &v) { return v; }); MS_LOG(DEBUG) << "Simulate start"; - (void)sess_->SetFinalGraphInput(parameters); + (void)target_sess_->SetFinalGraphInput(parameters); BaseRef output = rt->Eval(VectorRef(args)); - sess_->SetFinalGraphOutput(output); + target_sess_->SetFinalGraphOutput(output); MS_LOG(DEBUG) << "Simulate Eval end"; } void MsBackend::Link(GraphId graph_id) { if (graph_id == kInvalidGraphId) { - graph_id = sess_->GetFinalRunGraph(); + graph_id = target_sess_->GetFinalRunGraph(); } - sess_->BuildGraph(graph_id); + 
target_sess_->BuildGraph(graph_id); } Backend::Backend(const std::string &name) : name_(name) { @@ -322,16 +335,30 @@ Backend::Backend(const std::string &name) : name_(name) { } MsBackend::MsBackend(const std::string &name, const std::string &target, uint32_t device_id) : Backend(name) { - convert_fn_ = std::bind(&MsBackend::MsConvert, this, std::placeholders::_1); - sess_ = session::SessionFactory::Get().Create(target); - if (sess_ == nullptr) { + convert_fn_ = std::bind(&MsBackend::MsConvert, this, std::placeholders::_1, std::placeholders::_2); + target_sess_ = session::SessionFactory::Get().Create(target); + if (target_sess_ == nullptr) { + MS_LOG(EXCEPTION) << "Session create failed!, please make sure target device:" << target << " is available."; + } + target_sess_->Init(device_id); + target_sess_->RegisterSummaryCallBackFunc(callbacks::SummarySaveCallback); + target_device_ = target; +} + +void MsBackend::CreateOtherSession(const std::string &target) { + if (other_sess_ != nullptr && other_device_ == target) { + return; + } + other_sess_ = session::SessionFactory::Get().Create(target); + if (other_sess_ == nullptr) { MS_LOG(EXCEPTION) << "Session create failed!, please make sure target device:" << target << " is available."; } - sess_->Init(device_id); - sess_->RegisterSummaryCallBackFunc(callbacks::SummarySaveCallback); + other_sess_->Init(0); + other_sess_->RegisterSummaryCallBackFunc(callbacks::SummarySaveCallback); + other_device_ = target; } -GraphId MsBackend::CompileGraph(NotNull fg) { return sess_->CompileGraph(fg); } +GraphId MsBackend::CompileGraph(NotNull fg) { return target_sess_->CompileGraph(fg); } VectorRef MsBackend::RunGraph(GraphId graph_id, const VectorRef &args) { return MsRunGraph(graph_id, args); } diff --git a/mindspore/ccsrc/vm/backend.h b/mindspore/ccsrc/vm/backend.h index 94b7a500e2..0e0b02c055 100644 --- a/mindspore/ccsrc/vm/backend.h +++ b/mindspore/ccsrc/vm/backend.h @@ -91,8 +91,8 @@ class MsBackend : public Backend { 
MsBackend(const std::string &name, const std::string &target, uint32_t device_id); ~MsBackend() override = default; - LinConvertResult MsConvert(const AnfNodePtrList &lst); - VectorRef MsRunGraph(const GraphId &g, const VectorRef &args); + LinConvertResult MsConvert(const AnfNodePtrList &lst, const std::string &target = ""); + VectorRef MsRunGraph(const GraphId &g, const VectorRef &args, const std::string &target = ""); VectorRef MsSimuRunGraph(const GraphId &g, const VectorRef &args); void SimulateRun(FinalVMPtr rt, FuncGraphPtr root) override; @@ -107,9 +107,13 @@ class MsBackend : public Backend { LinConvertResult GetMultiGraphRun(const FuncGraphPtr &g) override; GraphId CompileGraph(NotNull fg) override; VectorRef RunGraph(GraphId graph_id, const VectorRef &args); + void CreateOtherSession(const std::string &target); private: - session::SessionPtr sess_; + session::SessionPtr target_sess_; + session::SessionPtr other_sess_; + std::string target_device_; + std::string other_device_; std::unordered_map simu_cond_map_; std::unordered_map graph_id_map_; std::unordered_map>, BaseRefHash> graph_inputs_; diff --git a/mindspore/ccsrc/vm/segment_runner.cc b/mindspore/ccsrc/vm/segment_runner.cc index ae052770ff..9b2ee51b3f 100644 --- a/mindspore/ccsrc/vm/segment_runner.cc +++ b/mindspore/ccsrc/vm/segment_runner.cc @@ -92,6 +92,8 @@ std::tuple TransformSegmentToAnfGr } else if (eqv.find(a) == eqv.end()) { inputs.push_back(a); eqv[a] = fg->add_parameter(); + eqv[a]->set_abstract(a->abstract()); + eqv[a]->set_kernel_info(a->kernel_info_ptr()); } return eqv[a]; @@ -107,15 +109,20 @@ std::tuple TransformSegmentToAnfGr if (inps.empty()) { MS_LOG(EXCEPTION) << "Input is empty"; } - if (!IsValueNode(inps[0])) { + if (!IsValueNode(inps[0]) && + !(IsValueNode(inps[0]) && + inps[0]->cast()->value()->cast()->has_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL))) { MS_LOG(EXCEPTION) << "Input[0] Must be a Primitive valuenode"; } + auto fn = inps[0]; std::vector args{fn}; 
(void)std::transform(std::begin(inps) + 1, std::end(inps), std::back_inserter(args), ref); eqv[n] = fg->NewCNode(args); + eqv[n]->set_abstract(n->abstract()); + eqv[n]->set_kernel_info(n->kernel_info_ptr()); } std::vector eqv_keys; @@ -123,15 +130,18 @@ std::tuple TransformSegmentToAnfGr [](const std::pair &elem) -> AnfNodePtr { return elem.first; }); auto outputs = GetOutput(lst, lst[0]->func_graph()->manager()->node_users(), eqv_keys); - std::vector output_args; - output_args.push_back(NewValueNode(prim::kPrimMakeTuple)); - (void)std::transform(std::begin(outputs), std::end(outputs), std::back_inserter(output_args), - [&eqv](const AnfNodePtr &o) -> AnfNodePtr { return eqv[o]; }); - - // Set output for AnfGraph - auto fg_output = fg->NewCNode(output_args); + AnfNodePtr fg_output; + if (outputs.size() > 1) { + std::vector output_args; + output_args.push_back(NewValueNode(prim::kPrimMakeTuple)); + (void)std::transform(std::begin(outputs), std::end(outputs), std::back_inserter(output_args), + [&eqv](const AnfNodePtr &o) -> AnfNodePtr { return eqv[o]; }); + // Set output for AnfGraph + fg_output = fg->NewCNode(output_args); + } else { + fg_output = eqv[outputs[0]]; + } fg->set_output(fg_output); - return std::make_tuple(fg, inputs, outputs); } @@ -148,7 +158,7 @@ std::tuple TransformSegmentToAnfGr // This implementation will convert the nodes into a subgraph // that will run using the MsVM. 
template -LinConvertResult Convert(const AnfNodePtrList &lst) { +LinConvertResult Convert(const AnfNodePtrList &lst, const std::string &) { auto cached = g_ConvertCache.find(lst); if (cached != g_ConvertCache.end()) { return cached->second; diff --git a/mindspore/ccsrc/vm/segment_runner.h b/mindspore/ccsrc/vm/segment_runner.h index 8ea87da50c..c4458d4148 100644 --- a/mindspore/ccsrc/vm/segment_runner.h +++ b/mindspore/ccsrc/vm/segment_runner.h @@ -43,7 +43,7 @@ struct LinConvertResult { uint32_t graph_id; }; -using LinkFuncType = std::function; +using LinkFuncType = std::function; using ConvertCache = std::unordered_map; extern LinkFuncType MsVmConvert; extern LinkFuncType GeVmConvert; diff --git a/mindspore/ccsrc/vm/transform.cc b/mindspore/ccsrc/vm/transform.cc index 636d36f931..c1fba78be8 100644 --- a/mindspore/ccsrc/vm/transform.cc +++ b/mindspore/ccsrc/vm/transform.cc @@ -20,6 +20,9 @@ #include #include +#include +#include +#include #include #include @@ -30,6 +33,7 @@ #include "utils/graph_utils.h" #include "utils/context/ms_context.h" #include "debug/trace.h" +#include "debug/anf_ir_dump.h" namespace mindspore { namespace compile { @@ -47,6 +51,200 @@ const std::vector &GetMsNonlinearOps() { return ms_nonlinear_ops; } +namespace { +std::string GetCNodeTarget(const AnfNodePtr &node) { + auto context_ptr = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(context_ptr); + std::string default_target = context_ptr->device_target(); + if (!node->isa()) { + return default_target; + } + auto cnode = node->cast(); + MS_EXCEPTION_IF_NULL(cnode); + auto attr_input = cnode->input(kAnfPrimitiveIndex); + if (attr_input == nullptr) { + return default_target; + } + auto value_node = attr_input->cast(); + if (value_node == nullptr) { + return default_target; + } + auto value = value_node->value(); + if (value == nullptr) { + return default_target; + } + if (!value->isa()) { + return default_target; + } + auto primitive = value->cast(); + auto att_target = 
primitive->GetAttr("primitive_target"); + if (att_target != nullptr) { + if (!att_target->isa()) { + MS_LOG(EXCEPTION) << "Only support string CPU|GPU|Ascend for primitive_target"; + } + auto target = GetValue(att_target); + if (kTargetSet.find(target) == kTargetSet.end()) { + MS_LOG(EXCEPTION) << "Only support string CPU|GPU|Ascend for primitive_target"; + } + return target; + } + return default_target; +} + +bool ContainMultiTarget(const std::vector &nodes) { + auto context_ptr = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(context_ptr); + std::string last_target = context_ptr->device_target(); + for (auto &node : nodes) { + if (node->isa()) { + std::string cur_target = GetCNodeTarget(node); + if (last_target != cur_target) { + return true; + } + last_target = cur_target; + } + } + return false; +} + +void CalcNodeRefCount(const FuncGraphPtr &graph, std::map *nodes_ref) { + std::queue queue; + queue.push(graph->get_return()); + std::set visited; + while (!queue.empty()) { + auto &node = queue.front(); + queue.pop(); + MS_EXCEPTION_IF_NULL(node); + if (!node->isa()) { + continue; + } + auto cnode = node->cast(); + MS_EXCEPTION_IF_NULL(cnode); + for (auto &input : cnode->inputs()) { + auto iter = nodes_ref->find(input); + if (iter != nodes_ref->end()) { + iter->second++; + } else { + (void)nodes_ref->insert(std::pair(input, 1)); + } + if (visited.find(input) != visited.end()) { + continue; + } + visited.insert(input); + queue.push(input); + } + } +} + +bool IsGetItemNode(const AnfNodePtr &node) { + MS_EXCEPTION_IF_NULL(node); + if (node->isa()) { + auto cnode = node->cast(); + auto &inputs = cnode->inputs(); + if (inputs.empty()) { + MS_LOG(EXCEPTION) << "Inputs of apply node is empty"; + } + if (!IsValueNode(inputs[0])) { + return true; + } + PrimitivePtr node_prim = GetValueNode(inputs[0]); + return node_prim->name() == prim::kPrimTupleGetItem->name(); + } + return false; +} + +std::vector ReorderGetItemNode(const std::vector &nodes) { + std::vector result; + 
std::map> insert_positions; + std::map node_positions; + for (auto &node : nodes) { + if (IsGetItemNode(node)) { + auto cnode = node->cast(); + MS_EXCEPTION_IF_NULL(cnode); + auto &inputs = cnode->inputs(); + if (inputs.size() < 2) { + MS_LOG(EXCEPTION) << "Invalid get item node"; + } + auto &parent = inputs[1]; + auto iter = node_positions.find(parent); + if (iter != node_positions.end()) { + size_t position = iter->second; + auto iter_nodes = insert_positions.find(position); + if (iter_nodes != insert_positions.end()) { + iter_nodes->second.push_back(node); + } else { + (void)insert_positions.insert( + std::pair>(position, std::vector{node})); + } + continue; + } + } + result.emplace_back(node); + node_positions[node] = result.size(); + } + + size_t insert_num = 0; + for (auto &item : insert_positions) { + size_t position = item.first + insert_num; + (void)result.insert(result.begin() + position, item.second.begin(), item.second.end()); + insert_num += item.second.size(); + } + return result; +} + +std::vector SplitSort(const FuncGraphPtr &graph, const std::string &default_target) { + std::vector result; + std::stack to_visit; + std::stack next_to_visit; + std::map nodes_ref; + CalcNodeRefCount(graph, &nodes_ref); + std::string handle_target = default_target; + std::string next_target = ""; + to_visit.push(graph->get_return()); + while (!to_visit.empty() || !next_to_visit.empty()) { + if (to_visit.empty()) { + to_visit.swap(next_to_visit); + handle_target = next_target; + } + auto &node = to_visit.top(); + MS_EXCEPTION_IF_NULL(node); + to_visit.pop(); + result.emplace_back(node); + if (!node->isa()) { + continue; + } + auto cnode = node->cast(); + MS_EXCEPTION_IF_NULL(cnode); + auto node_inputs = cnode->inputs(); + std::reverse(node_inputs.begin(), node_inputs.end()); + for (auto &input : node_inputs) { + auto iter = nodes_ref.find(input); + if (iter != nodes_ref.end()) { + iter->second--; + if (iter->second != 0) { + continue; + } + } + if (!input->isa()) { + 
to_visit.push(input); + continue; + } + std::string input_target = GetCNodeTarget(input); + if (input_target == handle_target) { + to_visit.push(input); + } else if (next_to_visit.empty() || input_target == next_target) { + next_to_visit.push(input); + next_target = input_target; + } else { + MS_LOG(EXCEPTION) << "only support two different target"; + } + } + } + std::reverse(result.begin(), result.end()); + return ReorderGetItemNode(result); +} +} // namespace + CompileGraph::CompileGraph(const BackendPtr &backend, const std::vector &cut_list) : backend_(backend), cut_list_(cut_list) { MS_EXCEPTION_IF_NULL(backend_); @@ -72,6 +270,14 @@ bool CompileGraph::IsCut(const AnfNodePtr &node) { } AnfNodePtr fn = inputs[0]; + MS_EXCEPTION_IF_NULL(fn); + if (IsValueNode(fn)) { + auto fg = GetValueNode(fn); + if (fg->has_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL)) { + return false; + } + } + if (!IsValueNode(fn)) { return true; } @@ -80,6 +286,11 @@ bool CompileGraph::IsCut(const AnfNodePtr &node) { for (auto &prim : cut_list_) { MS_EXCEPTION_IF_NULL(prim); if (prim->name() == node_prim->name()) { + if (prim->name() == prim::kPrimBpropCut->name()) { + auto ms_context = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(ms_context); + ms_context->set_enable_pynative_hook(true); + } return true; } } @@ -102,24 +313,33 @@ VectorRef CompileGraph::SplitNodes(const FuncGraphPtr &graph) { MS_EXCEPTION_IF_NULL(graph); VectorRef splits; VectorRef split; - std::vector nodes = TopoSort(graph->get_return()); - + auto nodes = TopoSort(graph->get_return()); + if (ContainMultiTarget(nodes)) { + auto context_ptr = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(context_ptr); + std::string default_target = context_ptr->device_target(); + nodes = SplitSort(graph, default_target); + } + std::string last_target; MS_LOG(DEBUG) << "Split all nodes size:" << nodes.size(); for (auto &node : nodes) { MS_EXCEPTION_IF_NULL(node); if (IsCut(node)) { - MS_LOG(DEBUG) << "Cut node:" << node->DebugString(10) << ", 
size:" << split.size(); if (split.size() != 0) { splits.push_back(split); } splits.push_back(node); split.clear(); - } else if (!(node->isa() || node->isa())) { + } else if (node->isa()) { + std::string cur_target = GetCNodeTarget(node); + if (cur_target != last_target && !last_target.empty() && split.size() != 0) { + splits.push_back(split); + split.clear(); + } + last_target = cur_target; split.push_back(node); - MS_LOG(DEBUG) << "Insert node:" << node->DebugString(10) << ", size:" << split.size(); } } - MS_LOG(DEBUG) << "Split node size :" << splits.size(); return splits; } @@ -200,14 +420,14 @@ void CompileGraph::PushParameters(const FuncGraphPtr &graph) { } } -int CompileGraph::LinConvert(const FuncGraphPtr &graph, const AnfNodePtrList &node_list) { +int CompileGraph::LinConvert(const FuncGraphPtr &graph, const AnfNodePtrList &node_list, const std::string &target) { MS_LOG(DEBUG) << "LinConvert start"; LinConvertResult result; if (backend_->simu_flag()) { result = backend_->GetMultiGraphRun(graph); } else { - result = lin_convert_(node_list); + result = lin_convert_(node_list, target); } if (result.run == nullptr) { @@ -316,7 +536,12 @@ bool CompileGraph::SplitGraph(const FuncGraphPtr &graph) { auto vec_ref = utils::cast(split); (void)std::transform(vec_ref.begin(), vec_ref.end(), std::back_inserter(args), [](const BaseRef &v) { return utils::cast(v); }); - ret = LinConvert(graph, args); + if (args.size() > 0) { + std::string cur_target = GetCNodeTarget(args[0]); + ret = LinConvert(graph, args, cur_target); + } else { + ret = LinConvert(graph, args); + } MS_LOG(DEBUG) << "End a extern LinConvert"; if (ret == RET_FAILED) { return false; @@ -348,7 +573,6 @@ InstSet CompileGraph::GenMultiGraphsSinkInst(const FuncGraphPtr &graph) { InstSet CompileGraph::Run(const FuncGraphPtr &graph) { MS_EXCEPTION_IF_NULL(graph); - MS_LOG(DEBUG) << "Compile start graph: " << graph->ToString(); Reset(); PushParameters(graph); @@ -574,16 +798,11 @@ 
CompileGraphs::CompileGraphs(const BackendPtr &backend, const std::vectormanager(); - MS_EXCEPTION_IF_NULL(graph_manager); - FuncGraphSet graphs = graph_manager->func_graphs(); - for (auto &g : graphs) { - mapping_[g] = static_cast(insts_.size()); - if (transform_ != nullptr) { - InstSet insts = transform_->Run(g); - if (!insts.empty()) { - (void)insts_.insert(insts_.end(), insts.begin(), insts.end()); - } + mapping_[graph] = static_cast(insts_.size()); + if (transform_ != nullptr) { + InstSet insts = transform_->Run(graph); + if (!insts.empty()) { + (void)insts_.insert(insts_.end(), insts.begin(), insts.end()); } } MS_LOG(DEBUG) << "End"; @@ -628,8 +847,15 @@ FinalVMPtr CompileGraphs::CompileAndLink(const FuncGraphPtr &graph) { Reset(); MS_LOG(DEBUG) << "Begin parameter:" << graph->parameters().size(); - (void)WrapPrimitives(graph); - Compile(graph); + FuncGraphPtr prim_graph = WrapPrimitives(graph); + Compile(prim_graph); + MS_EXCEPTION_IF_NULL(prim_graph); + FuncGraphSet graphs = prim_graph->manager()->func_graphs(); + for (auto g : graphs) { + if (g != graph && g != nullptr && !(g->has_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL))) { + Compile(g); + } + } FinalVMPtr rt = Link(graph); Reset(); @@ -637,6 +863,20 @@ FinalVMPtr CompileGraphs::CompileAndLink(const FuncGraphPtr &graph) { return rt; } +bool CompileGraphs::ContainMixedTarget(const FuncGraphPtr &graph) { + MS_EXCEPTION_IF_NULL(graph); + auto graph_manager = graph->manager(); + MS_EXCEPTION_IF_NULL(graph_manager); + FuncGraphSet graphs = graph_manager->func_graphs(); + for (auto &g : graphs) { + auto nodes = TopoSort(g->get_return()); + if (ContainMultiTarget(nodes)) { + return true; + } + } + return false; +} + BackendPtr CreateBackend() { auto context_ptr = MsContext::GetInstance(); MS_EXCEPTION_IF_NULL(context_ptr); diff --git a/mindspore/ccsrc/vm/transform.h b/mindspore/ccsrc/vm/transform.h index 711c1777ab..f2d54198d6 100644 --- a/mindspore/ccsrc/vm/transform.h +++ b/mindspore/ccsrc/vm/transform.h @@ -32,7 
+32,7 @@ #include "vm/segment_runner.h" #include "vm/backend.h" -// mindspore namespace is the top level namespace of Mindsporeession project. +// mindspore namespace is the top level namespace of MindSpore project. // Other namespace should be a sub namespace of mindspore namespace in the ME project. namespace mindspore { extern const char kMsVm[]; @@ -80,7 +80,7 @@ class CompileGraph { private: void PushParameters(const FuncGraphPtr &func_graph); bool SplitGraph(const FuncGraphPtr &func_graph); - int LinConvert(const FuncGraphPtr &func_graph, const AnfNodePtrList &node_list); + int LinConvert(const FuncGraphPtr &func_graph, const AnfNodePtrList &node_list, const std::string &target = ""); int InterpretNode(const FuncGraphPtr &func_graph, const CNodePtr &node); int AddCall(const FuncGraphPtr &graph, const CNodePtr &node); void AddSinkSwitch(const CNodePtr &node); @@ -124,6 +124,7 @@ class CompileGraphs { void Compile(const FuncGraphPtr &func_graph); FinalVMPtr Link(const FuncGraphPtr &func_graph); FinalVMPtr CompileAndLink(const FuncGraphPtr &func_graph); + static bool ContainMixedTarget(const FuncGraphPtr &graph); private: InstSet insts_; diff --git a/mindspore/ccsrc/vm/vm.cc b/mindspore/ccsrc/vm/vm.cc index 7107212b6c..c73d41df6c 100644 --- a/mindspore/ccsrc/vm/vm.cc +++ b/mindspore/ccsrc/vm/vm.cc @@ -585,8 +585,8 @@ void FinalVM::InstPushPrim(const VectorRef &args) { return; } - VectorRef tuple; auto prim = utils::cast(args[0]); + VectorRef tuple; for (size_t i = 1; i < args.size(); ++i) { auto index = utils::cast(args[i]); tuple.push_back(Ref(index)); @@ -618,8 +618,9 @@ void FinalVM::SyncData(const py::object &arg) { BaseRef FinalVM::RunHook(const PrimitivePtr &prim, const VectorRef &args) { MS_LOG(DEBUG) << "input for operation:"; + auto prim_py = dyn_cast(prim); std::size_t args_size = args.size(); - py::tuple py_args = py::tuple(args_size); + auto py_args = py::tuple(args_size); size_t i = 0; for (auto &arg : args) { py_args[i] = BaseRefToPyData(arg); @@ 
-631,7 +632,7 @@ BaseRef FinalVM::RunHook(const PrimitivePtr &prim, const VectorRef &args) { bool is_bprop = prim->HasAttr("bprop"); if (is_bprop) { SyncData(py_args); - py::function fn_bprop = prim->hook(); + py::function fn_bprop = prim_py->hook(); obj = fn_bprop(*py_args); return obj; } @@ -643,11 +644,11 @@ BaseRef FinalVM::RunHook(const PrimitivePtr &prim, const VectorRef &args) { std::string cell_id = GetValue(prim->GetAttr("cell_id")); if (_hook_grad.find(cell_id) != _hook_grad.end()) { std::size_t hook_args_size = 3; - py::tuple hook_args = py::tuple(hook_args_size); + auto hook_args = py::tuple(hook_args_size); hook_args[0] = cell_id; hook_args[1] = py::make_tuple(_hook_grad[cell_id]); hook_args[2] = py::make_tuple(py_args[2]); - py::function fn_hook = prim->hook(); + py::function fn_hook = prim_py->hook(); obj = fn_hook(*hook_args); if (py::isinstance(obj)) { obj = py_args[2]; @@ -659,7 +660,7 @@ BaseRef FinalVM::RunHook(const PrimitivePtr &prim, const VectorRef &args) { } } else { // Hook operator for execute variable hook function - py::function fn_hook = prim->hook(); + py::function fn_hook = prim_py->hook(); obj = fn_hook(py::make_tuple(py_args[2])); if (py::isinstance(obj)) { obj = py_args[2]; diff --git a/mindspore/common/_register_for_tensor.py b/mindspore/common/_register_for_tensor.py index da183d9549..8ba2ff7cc4 100644 --- a/mindspore/common/_register_for_tensor.py +++ b/mindspore/common/_register_for_tensor.py @@ -16,6 +16,7 @@ """Registry the relation.""" from collections import UserDict +from .. 
import context class Registry(UserDict): @@ -27,9 +28,16 @@ class Registry(UserDict): def get(self, obj_str): """Get the value by str.""" - if isinstance(obj_str, str): + if not isinstance(obj_str, str): + raise TypeError("key for tensor registry must be string.") + if context.get_context("enable_ge"): + def wrap(*args): + new_args = list(args) + new_args.append(obj_str) + return self["vm_compare"](*new_args) + obj = wrap + else: obj = self[obj_str] return obj - tensor_operator_registry = Registry() diff --git a/mindspore/common/api.py b/mindspore/common/api.py index 1a726f527e..4fad3e455b 100644 --- a/mindspore/common/api.py +++ b/mindspore/common/api.py @@ -20,7 +20,7 @@ from collections import OrderedDict from functools import wraps from mindspore import context from mindspore import log as logger -from .._c_expression import generate_key, Executor_, Tensor, MetaTensor +from .._c_expression import generate_key, Executor_, Tensor, MetaTensor, PynativeExecutor_ from .._c_expression import verify_inputs_signature, init_exec_dataset, _set_dataset_mode_config, init_backend from .tensor import Tensor as MsTensor @@ -273,6 +273,34 @@ def _generate_pip_args(obj, *args, method="construct"): obj.__parse_method__ = parse_method return args_names, args_list +class _PynativeExecutor: + """ + An pynative executor used to compile/manage/run graph. + + Returns: + Graph, return the result of pipeline running. 
+ """ + + def __init__(self): + self._executor = PynativeExecutor_.get_instance() + + def new_graph(self, obj, *args): + self._executor.new_graph(obj, *args) + + def end_graph(self, obj, output, *args): + self._executor.end_graph(obj, output, *args) + + def grad(self, grad, obj, weights, *args): + self._executor.grad_net(grad, obj, weights, *args) + + def clear(self, flag=""): + self._executor.clear(flag) + + def set_grad_flag(self, flag): + self._executor.set_grad_flag(flag) + + def __call__(self, *args): + return self._executor(args, "") class _Executor: """ @@ -334,7 +362,7 @@ class _Executor: if not auto_parallel_mode: param.init_data() elif key not in obj.parameter_layout_dict: - logger.info("Layout dict does not contain the key %s.", key) + logger.debug("Layout dict does not contain the key %s.", key) param.init_data(set_sliced=True) else: layout = obj.parameter_layout_dict[key] @@ -372,7 +400,7 @@ class _Executor: key = generate_key(phase, dic) self.phase_prefix = str(key[1]) if phase == 'export': - phase = phase + '.' + str(obj.create_time) + phase = phase + '.' + self.phase_prefix + '.' + str(obj.create_time) else: phase = self.phase_prefix + phase + '.' + str(obj.create_time) enable_debug_runtime = context.get_context("enable_debug_runtime") @@ -495,10 +523,16 @@ class _Executor: file_format (str): MindSpore currently support 'GEIR' and 'ONNX' format for exported model """ from .._c_expression import export_graph - phase = 'export' + '.' + str(net.create_time) + phase = 'export' + '.' + self.phase_prefix + '.' 
+ str(net.create_time) export_graph(file_name, file_format, phase) + def fetch_info_for_quant_export(self, exec_id): + """Get graph proto from pipeline.""" + if self._executor.has_compiled(exec_id) is False: + return None + return self._executor.fetch_info_for_quant_export(exec_id) _executor = _Executor() +_pynative_exec = _PynativeExecutor() __all__ = ['ms_function'] diff --git a/mindspore/common/dtype.py b/mindspore/common/dtype.py index 02a27591d4..46b111d2f6 100644 --- a/mindspore/common/dtype.py +++ b/mindspore/common/dtype.py @@ -170,8 +170,8 @@ def get_py_obj_dtype(obj): Type of MindSpore type. """ # Tensor - if hasattr(obj, 'dtype') and callable(obj.dtype) and isinstance(obj.dtype(), typing.Type): - return tensor_type(obj.dtype()) + if hasattr(obj, 'dtype') and isinstance(obj.dtype, typing.Type): + return tensor_type(obj.dtype) if hasattr(obj, '__primitive_flag__') or hasattr(obj, 'construct'): return function if isinstance(obj, (typing.Type, type)): diff --git a/mindspore/common/initializer.py b/mindspore/common/initializer.py index 54c0a1debe..83586272ee 100644 --- a/mindspore/common/initializer.py +++ b/mindspore/common/initializer.py @@ -41,7 +41,6 @@ class Initializer: self._kwargs = kwargs self.shape = None self.dtype = None - self._seed = None def _initialize(self, *kwargs): raise NotImplementedError('Must be overridden!') @@ -49,15 +48,6 @@ class Initializer: def __call__(self, arr): return self._initialize(arr) - @property - def seed(self): - return self._seed - - @seed.setter - def seed(self, seed_): - """set the random seed.""" - self._seed = seed_ - @property def shape(self): return self._shape @@ -74,19 +64,30 @@ class Initializer: def dtype(self, dtype): self._dtype = dtype - def to_tensor(self): - """Get the tensor format data of this Initializer.""" + def to_tensor(self, slice_index=None, shape=None): + """ + Get the tensor format data of this Initializer. + + Args: + slice_index (int): Slice index of a parameter's slices. 
+ Used when initialize a slice of a parameter, it guarantee that + devices use the same slice can generate the same tensor. + shape (list[int]): Shape of the slice, used when initialize a slice of the parameter. + """ arr = None + if shape is None: + shape = self.shape + try: - arr = np.ndarray(self.shape) + arr = np.ndarray(shape) except ValueError: - msg = "Error shape={}".format(self.shape) + msg = "Error shape={}".format(shape) logger.error(msg) raise ValueError(msg) - if self._seed is not None: - np.random.seed(self.seed) + + if slice_index is not None: + np.random.seed(slice_index) self.__call__(arr) - self._seed = None return Tensor(arr, dtype=self.dtype) def _register(*aliases): @@ -331,11 +332,11 @@ def initializer(init, shape=None, dtype=mstype.float32): raise TypeError("Unsupported init type '{}'.".format(type(init))) if isinstance(init, Tensor): - init_shape = init.shape() + init_shape = init.shape shape = shape if isinstance(shape, (tuple, list)) else [shape] if shape is not None and init_shape != tuple(shape): raise ValueError("The shape of init should be same as variable shape, but got the shape of init {} and " - "the variable shape {}.".format(list(init.shape()), shape)) + "the variable shape {}.".format(list(init.shape), shape)) return init if isinstance(shape, list): diff --git a/mindspore/common/parameter.py b/mindspore/common/parameter.py index 788c2d0307..773f6a99a6 100644 --- a/mindspore/common/parameter.py +++ b/mindspore/common/parameter.py @@ -15,13 +15,14 @@ """Parameter for cell.""" import numbers -from copy import copy, deepcopy +from copy import copy +from mindspore import context from . 
import dtype as mstype from .initializer import initializer, Initializer from .tensor import Tensor, MetaTensor from .._checkparam import _check_str_by_regular from ..parallel._utils import _set_clone_info, _CloneInfo -from ..parallel._tensor import _get_seed +from ..parallel._tensor import _get_slice_index __all__ = ['Parameter', 'ParameterTuple'] @@ -50,15 +51,19 @@ class Parameter: requires_grad (bool): True if the parameter requires gradient. Default: True. layerwise_parallel (bool): A kind of model parallel mode. When layerwise_parallel is true in paralle mode, broadcast and gradients communication would not be applied on parameters. Default: False. + sparse_grad (str): Set if the parameter's gradient is sparse. Default: empty. """ - def __init__(self, default_input, name, requires_grad=True, layerwise_parallel=False): + def __init__(self, default_input, name, requires_grad=True, layerwise_parallel=False, sparse_grad=""): self.set_parameter_data(default_input) self.name = name self.requires_grad = requires_grad self.layerwise_parallel = layerwise_parallel + self.sparse_grad = sparse_grad self._is_init = False self._sliced = False self.clone_info = _CloneInfo() + if context.get_context("mode") == context.PYNATIVE_MODE: + self.init_data() def __repr__(self): format_str = 'Parameter (name={name})' @@ -135,11 +140,13 @@ class Parameter: x.name = prefix + '.' 
+ x.name x.is_init = False if init != 'same': - shape = self.default_input.shape() - dtype = self.default_input.dtype() + shape = self.default_input.shape + dtype = self.default_input.dtype if isinstance(init, (str, Initializer, numbers.Number)): x.init_mode = initializer(init, shape=shape, dtype=dtype) x.default_input = MetaTensor(dtype, shape) + if context.get_context("mode") == context.PYNATIVE_MODE: + x.init_data() else: x.default_input = initializer(init, shape=shape, dtype=dtype) @@ -168,30 +175,37 @@ class Parameter: raise TypeError("`requires_grad` parameter must be bool type") self._requires_grad = value + @property + def sparse_grad(self): + """Return whether the parameter's gradient is sparse.""" + return self._sparse_grad + + @sparse_grad.setter + def sparse_grad(self, value=""): + if not isinstance(value, str): + raise TypeError("`sparse_grad` parameter must be str type") + self._sparse_grad = value + @property def data(self): return self.default_input def __add__(self, other): - res = deepcopy(self) - res.default_input = res.default_input + other - return res + return self.default_input + other def __sub__(self, other): - res = deepcopy(self) - res.default_input = res.default_input - other - return res + return self.default_input - other def __mul__(self, other): - res = deepcopy(self) - default_input = res.default_input * other - res.default_input = Tensor(default_input.asnumpy().copy()) - return res + return self.default_input * other def __truediv__(self, other): - res = deepcopy(self) - res.default_input = res.default_input / other - return res + return self.default_input / other + + def __setitem__(self, index, value): + default_input = self.default_input + default_input[index] = value + return self def set_parameter_data(self, data): """Set `default_input` of current `Parameter`.""" @@ -237,10 +251,11 @@ class Parameter: if len(layout) != 3: raise ValueError("The length of layout must be 3! layout is {}." 
.format(layout)) - self.init_mode.shape = layout[2] - self.init_mode.seed = int(_get_seed(layout[0], layout[1])) + slice_index = int(_get_slice_index(layout[0], layout[1])) + self.default_input = self.init_mode.to_tensor(slice_index, layout[2]) + else: + self.default_input = self.init_mode.to_tensor() - self.default_input = self.init_mode.to_tensor() self.init_mode = None if set_sliced: self.sliced = True diff --git a/mindspore/common/tensor.py b/mindspore/common/tensor.py index 864447c04d..0a631b954f 100644 --- a/mindspore/common/tensor.py +++ b/mindspore/common/tensor.py @@ -44,13 +44,13 @@ class Tensor(Tensor_): >>> # init a tensor with input data >>> t1 = Tensor(np.zeros([1, 2, 3]), mindspore.float32) >>> assert isinstance(t1, Tensor) - >>> assert t1.shape() == (1, 2, 3) - >>> assert t1.dtype() == mindspore.float32 + >>> assert t1.shape == (1, 2, 3) + >>> assert t1.dtype == mindspore.float32 >>> >>> # init a tensor with a float scalar >>> t2 = Tensor(0.1) >>> assert isinstance(t2, Tensor) - >>> assert t2.dtype() == mindspore.float64 + >>> assert t2.dtype == mindspore.float64 """ def __init__(self, input_data, dtype=None): @@ -71,38 +71,42 @@ class Tensor(Tensor_): return str(self.__str__()) def __add__(self, other): - check_type('tensor input_data', other, (Tensor, float, int)) out = tensor_operator_registry.get('__add__')(self, other) return out def __eq__(self, other): - if not isinstance(other, Tensor): + if not isinstance(other, (int, float, Tensor)): return False - return Tensor(np.array(self.asnumpy() == other.asnumpy())) + # bool type is not supported for `Equal` operator in backend. 
+ if self.dtype == mstype.bool_ or (isinstance(other, Tensor) and other.dtype == mstype.bool_): + return Tensor(np.array(self.asnumpy() == other.asnumpy())) + return tensor_operator_registry.get('__eq__')(self, other) def __ne__(self, other): - if not isinstance(other, Tensor): + if not isinstance(other, (int, float, Tensor)): return True - return Tensor(np.array(self.asnumpy() != other.asnumpy())) + # bool type is not supported for `NotEqual` operator in backend. + if self.dtype == mstype.bool_ or (isinstance(other, Tensor) and other.dtype == mstype.bool_): + return Tensor(np.array(self.asnumpy() != other.asnumpy())) + return tensor_operator_registry.get('__ne__')(self, other) def __hash__(self): return hash(id(self)) def __mul__(self, other): - check_type('tensor input_data', other, (Tensor, float, int)) out = tensor_operator_registry.get('__mul__')(self, other) return out def __neg__(self): - return Tensor(-self.asnumpy()) + out = tensor_operator_registry.get('__neg__')(self) + return out def __iadd__(self, other): out = self.__add__(other) return out def __radd__(self, other): - check_type('tensor operation input', other, (Tensor, float, int)) - out = tensor_operator_registry.get('__add__')(other, self) + out = tensor_operator_registry.get('__add__')(self, other) return out def __imul__(self, other): @@ -110,23 +114,19 @@ class Tensor(Tensor_): return out def __rmul__(self, other): - check_type('tensor operation input', other, (Tensor, float, int)) - out = tensor_operator_registry.get('__mul__')(other, self) + out = tensor_operator_registry.get('__mul__')(self, other) return out def __truediv__(self, other): - check_type('tensor operation input', other, (Tensor, float, int)) - out = tensor_operator_registry.get('__div__')(self, other) + out = tensor_operator_registry.get('__truediv__')(self, other) return out def __rtruediv__(self, other): - check_type('tensor operation input', other, (Tensor, float, int)) - out = tensor_operator_registry.get('__div__')(other, 
self) + out = tensor_operator_registry.get('__truediv__')(other, self) return out def __sub__(self, other): - check_type('tensor operation input', other, (Tensor, float, int)) - out = self.__add__(-other) + out = tensor_operator_registry.get('__sub__')(self, other) return out def __isub__(self, other): @@ -134,12 +134,42 @@ class Tensor(Tensor_): return out def __rsub__(self, other): - check_type('tensor operation input', other, (Tensor, float, int)) - out = tensor_operator_registry.get('__add__')(other, Tensor(-self.asnumpy())) + out = tensor_operator_registry.get('__sub__')(other, self) + return out + + def __lt__(self, other): + out = tensor_operator_registry.get('__lt__')(self, other) + return out + + def __le__(self, other): + out = tensor_operator_registry.get('__le__')(self, other) return out + def __getitem__(self, index): + out = tensor_operator_registry.get('__getitem__')(self, index) + return out + + def __setitem__(self, index, value): + out = tensor_operator_registry.get('__setitem__')(self, index, value) + self.assign_value(out) + return self + + def __gt__(self, other): + out = tensor_operator_registry.get('__gt__')(self, other) + return out + + def __ge__(self, other): + out = tensor_operator_registry.get('__ge__')(self, other) + return out + + def __len__(self): + out = tensor_operator_registry.get('shape')(self) + if not out: + return 1 + return out[0] + def __str__(self): - if self.dtype() == mstype.type_none: + if self.dtype == mstype.type_none: return "Unknown Tensor type!" 
return str(self.asnumpy()) diff --git a/mindspore/context.py b/mindspore/context.py index 89fb56b843..ad601f8fab 100644 --- a/mindspore/context.py +++ b/mindspore/context.py @@ -25,6 +25,7 @@ from mindspore._c_expression import MSContext from mindspore._checkparam import args_type_check from mindspore.parallel._auto_parallel_context import _set_auto_parallel_context, _get_auto_parallel_context, \ _reset_auto_parallel_context +from mindspore.parallel.mpi._mpi_config import _set_mpi_config, _get_mpi_config __all__ = ['GRAPH_MODE', 'PYNATIVE_MODE', 'set_context', 'get_context', 'set_auto_parallel_context', 'get_auto_parallel_context', 'reset_auto_parallel_context'] @@ -55,7 +56,8 @@ def _make_directory(path): os.makedirs(path) real_path = path except PermissionError as e: - logger.error(f"No write permission on the directory `{path}, error = {e}") + logger.error( + f"No write permission on the directory `{path}, error = {e}") raise ValueError(f"No write permission on the directory `{path}`.") return real_path @@ -78,11 +80,13 @@ class _ThreadLocalInfo(threading.local): def reserve_class_name_in_scope(self, reserve_class_name_in_scope): """Sets whether to save the network class name in the scope.""" if not isinstance(reserve_class_name_in_scope, bool): - raise ValueError("Set reserve_class_name_in_scope value must be bool!") + raise ValueError( + "Set reserve_class_name_in_scope value must be bool!") self._reserve_class_name_in_scope = reserve_class_name_in_scope -_ContextRecord = namedtuple("_ContextRecord", ["is_pynative_mode", "switch_context_fn"]) +_ContextRecord = namedtuple( + "_ContextRecord", ["is_pynative_mode", "switch_context_fn"]) class _ContextSwitchInfo(threading.local): @@ -109,7 +113,8 @@ class _ContextSwitchInfo(threading.local): """ if isinstance(switch_context_fn, FunctionType): switch_context_fn() - self.context_stack.append(_ContextRecord(is_pynative, switch_context_fn)) + self.context_stack.append( + _ContextRecord(is_pynative, switch_context_fn)) 
def pop(self): self.context_stack.pop() @@ -193,7 +198,8 @@ class _Context: @save_graphs_path.setter def save_graphs_path(self, save_graphs_path): - self._context_handle.set_save_graphs_path(_make_directory(save_graphs_path)) + self._context_handle.set_save_graphs_path( + _make_directory(save_graphs_path)) @property def device_target(self): @@ -212,7 +218,8 @@ class _Context: @device_id.setter def device_id(self, device_id): if device_id < 0 or device_id > 4095: - raise ValueError("Device id must be in [0, 4095], but got {}".format(device_id)) + raise ValueError( + "Device id must be in [0, 4095], but got {}".format(device_id)) success = self._context_handle.set_device_id(device_id) if not success: raise RuntimeError("Device id set failed!!!") @@ -239,7 +246,8 @@ class _Context: @enable_auto_mixed_precision.setter def enable_auto_mixed_precision(self, enable_auto_mixed_precision): - self._context_handle.set_auto_mixed_precision_flag(enable_auto_mixed_precision) + self._context_handle.set_auto_mixed_precision_flag( + enable_auto_mixed_precision) @property def enable_reduce_precision(self): @@ -247,7 +255,8 @@ class _Context: @enable_reduce_precision.setter def enable_reduce_precision(self, enable_reduce_precision): - self._context_handle.set_enable_reduce_precision_flag(enable_reduce_precision) + self._context_handle.set_enable_reduce_precision_flag( + enable_reduce_precision) @property def enable_dump(self): @@ -279,12 +288,21 @@ class _Context: @profiling_options.setter def profiling_options(self, option): - options = ["training_trace", "task_trace", "task_trace:training_trace", "training_trace:task_trace", "op_trace"] + options = ["training_trace", "task_trace", + "task_trace:training_trace", "training_trace:task_trace", "op_trace"] if option not in options: raise ValueError("Profiling options must be in 'training_trace' 'task_trace' " "'task_trace:training_trace' 'training_trace:task_trace' or 'op_trace'.") self._context_handle.set_profiling_options(option) + 
@property + def enable_graph_kernel(self): + return self._context_handle.get_enable_graph_kernel() + + @enable_graph_kernel.setter + def enable_graph_kernel(self, graph_kernel_switch_): + self._context_handle.set_enable_graph_kernel(graph_kernel_switch_) + @property def reserve_class_name_in_scope(self): """Gets whether to save the network class name in the scope.""" @@ -302,13 +320,19 @@ class _Context: @variable_memory_max_size.setter def variable_memory_max_size(self, variable_memory_max_size): if not check_input_format(variable_memory_max_size): - raise ValueError("Context param variable_memory_max_size should be in correct format! Such as \"5GB\"") + raise ValueError( + "Context param variable_memory_max_size should be in correct format! Such as \"5GB\"") if int(variable_memory_max_size[:-2]) >= _DEVICE_APP_MEMORY_SIZE: - raise ValueError("Context param variable_memory_max_size should be less than 31GB.") - variable_memory_max_size_ = variable_memory_max_size[:-2] + " * 1024 * 1024 * 1024" - graph_memory_max_size = _DEVICE_APP_MEMORY_SIZE - int(variable_memory_max_size[:-2]) - graph_memory_max_size_ = str(graph_memory_max_size) + " * 1024 * 1024 * 1024" - self._context_handle.set_variable_memory_max_size(variable_memory_max_size_) + raise ValueError( + "Context param variable_memory_max_size should be less than 31GB.") + variable_memory_max_size_ = variable_memory_max_size[:- + 2] + " * 1024 * 1024 * 1024" + graph_memory_max_size = _DEVICE_APP_MEMORY_SIZE - \ + int(variable_memory_max_size[:-2]) + graph_memory_max_size_ = str( + graph_memory_max_size) + " * 1024 * 1024 * 1024" + self._context_handle.set_variable_memory_max_size( + variable_memory_max_size_) self._context_handle.set_graph_memory_max_size(graph_memory_max_size_) @property @@ -332,6 +356,28 @@ class _Context: def check_bprop(self, check_bprop_flag): self._context_handle.set_check_bprop_flag(check_bprop_flag) + @property + def max_device_memory(self): + return 
self._context_handle.get_max_device_memory() + + @max_device_memory.setter + def max_device_memory(self, max_device_memory): + if not check_input_format(max_device_memory): + raise ValueError("Context param max_device_memory should be in correct format! Such as \"3.5GB\"") + max_device_memory_value = float(max_device_memory[:-2]) + if max_device_memory_value == 0: + raise ValueError("Context param max_device_memory should be in correct format! Such as \"3.5GB\"") + self._context_handle.set_max_device_memory(max_device_memory_value) + + @property + def print_file_path(self): + return None + + @print_file_path.setter + def print_file_path(self, file): + self._context_handle.set_print_file_path(file) + + def check_input_format(x): import re pattern = r'[1-9][0-9]*(\.)?[0-9]*GB|0\.[0-9]*GB' @@ -367,7 +413,8 @@ def _context(): @args_type_check(device_num=int, global_rank=int, mirror_mean=bool, cast_before_mirror=bool, parallel_mode=str, - parameter_broadcast=bool, strategy_ckpt_load_file=str, strategy_ckpt_save_file=str) + auto_parallel_search_mode=str, parameter_broadcast=bool, strategy_ckpt_load_file=str, + strategy_ckpt_save_file=str, full_batch=bool) def set_auto_parallel_context(**kwargs): """ Set auto parallel context. @@ -399,11 +446,18 @@ def set_auto_parallel_context(**kwargs): setting parallel strategies. - auto_parallel: Achieving parallelism automatically. + auto_parallel_search_mode (str): There are two kinds of search modes, "recursive_programming" + and "dynamic_programming". Default: "dynamic_programming". + + - recursive_programming: Recursive programming search mode. + + - dynamic_programming: Dynamic programming search mode. parameter_broadcast (bool): Indicating whether to broadcast parameters before training. "stand_alone", "semi_auto_parallel" and "auto_parallel" do not support parameter broadcast. Default: False. strategy_ckpt_load_file (str): The path to load parallel strategy checkpoint. 
Default: '' strategy_ckpt_save_file (str): The path to save parallel strategy checkpoint. Default: '' + full_batch (bool): Whether to load the whole batch on each device. Default: False. Raises: ValueError: If input key is not attribute in auto parallel context. @@ -453,13 +507,11 @@ def reset_auto_parallel_context(): _reset_auto_parallel_context() -@args_type_check(mode=int, precompile_only=bool, device_target=str, - device_id=int, enable_ir_fusion=bool, save_graphs=bool, - enable_task_sink=bool, save_graphs_path=str, enable_loop_sink=bool, - enable_mem_reuse=bool, save_ms_model=bool, save_ms_model_path=str, enable_gpu_summary=bool, - enable_auto_mixed_precision=bool, enable_dump=bool, save_dump_path=str, - enable_reduce_precision=bool, enable_dynamic_memory=bool, graph_memory_max_size=str, - variable_memory_max_size=str, enable_profiling=bool, profiling_options=str) +@args_type_check(mode=int, precompile_only=bool, device_target=str, device_id=int, save_graphs=bool, + save_graphs_path=str, save_ms_model=bool, save_ms_model_path=str, enable_dump=bool, + save_dump_path=str, enable_reduce_precision=bool, variable_memory_max_size=str, + enable_profiling=bool, profiling_options=str, enable_auto_mixed_precision=bool, + check_bprop=bool, max_device_memory=str, print_file_path=str) def set_context(**kwargs): """ Sets context for running environment. @@ -476,7 +528,6 @@ def set_context(**kwargs): Note: Attribute name is required for setting attributes. - If need to config graph max memory size and variable max memory size, one must make sure: Args: mode (int): Running in GRAPH_MODE(0) or PYNATIVE_MODE(1). Default: PYNATIVE_MODE. @@ -511,6 +562,8 @@ def set_context(**kwargs): separated by colons; single operator can choose op_trace, op_trace cannot be combined with training_trace and task_trace. Default: "training_trace". check_bprop (bool): Whether to check bprop. Default: False. 
+ max_device_memory (str): Sets the maximum memory available for device, currently only supported on GPU. + The format is "xxGB". Default: "1024GB". Raises: ValueError: If input key is not an attribute in context. @@ -530,6 +583,7 @@ def set_context(**kwargs): >>> device_target="Ascend",device_id=0, save_graphs=True, >>> save_graphs_path="/mindspore") >>> context.set_context(enable_profiling=True, profiling_options="training_trace") + >>> context.set_context(max_device_memory="3.5GB") """ for key, value in kwargs.items(): if not hasattr(_context(), key): @@ -551,5 +605,43 @@ def get_context(attr_key): ValueError: If input key is not an attribute in context. """ if not hasattr(_context(), attr_key): - raise ValueError("Get context keyword %s is not recognized!" % attr_key) + raise ValueError( + "Get context keyword %s is not recognized!" % attr_key) return getattr(_context(), attr_key) + +@args_type_check(enable_mpi=bool) +def set_mpi_config(**kwargs): + """ + Sets mpi config for running environment. + + mpi config should be configured before running your program. If there is no configuration, + mpi module will be disabled by default. + + Note: + Attribute name is required for setting attributes. + + Args: + enable_mpi (bool): Whether to enable mpi. Default: False. + + Raises: + ValueError: If input key is not an attribute in mpi config. + + Examples: + >>> context.set_mpi_config(enable_mpi=True) + """ + _set_mpi_config(**kwargs) + +def get_mpi_config(attr_key): + """ + Gets mpi config attribute value according to the input key. + + Args: + attr_key (str): The key of the attribute. + + Returns: + Object, The value of given attribute key. + + Raises: + ValueError: If input key is not an attribute in context.
+ """ + return _get_mpi_config(attr_key) diff --git a/mindspore/dataset/__init__.py b/mindspore/dataset/__init__.py index ceca188112..f0070b428d 100644 --- a/mindspore/dataset/__init__.py +++ b/mindspore/dataset/__init__.py @@ -19,16 +19,16 @@ can also create samplers with this module to sample data. """ from .core.configuration import config -from .engine.datasets import TFRecordDataset, ImageFolderDatasetV2, MnistDataset, MindDataset, \ - GeneratorDataset, ManifestDataset, Cifar10Dataset, Cifar100Dataset, VOCDataset, CelebADataset, TextFileDataset, \ - Schema, Shuffle, zip, RandomDataset +from .engine.datasets import TFRecordDataset, ImageFolderDatasetV2, MnistDataset, MindDataset, NumpySlicesDataset, \ + GeneratorDataset, ManifestDataset, Cifar10Dataset, Cifar100Dataset, VOCDataset, CocoDataset, CelebADataset,\ + TextFileDataset, CLUEDataset, Schema, Shuffle, zip, RandomDataset from .engine.samplers import DistributedSampler, PKSampler, RandomSampler, SequentialSampler, SubsetRandomSampler, \ - WeightedRandomSampler, SubsetSampler, Sampler + WeightedRandomSampler, Sampler from .engine.serializer_deserializer import serialize, deserialize, show from .engine.graphdata import GraphData __all__ = ["config", "ImageFolderDatasetV2", "MnistDataset", "MindDataset", "GeneratorDataset", "TFRecordDataset", - "ManifestDataset", "Cifar10Dataset", "Cifar100Dataset", "CelebADataset", - "VOCDataset", "TextFileDataset", "Schema", "DistributedSampler", "PKSampler", "RandomSampler", - "SequentialSampler", "SubsetRandomSampler", "WeightedRandomSampler", "zip", "GraphData"] + "ManifestDataset", "Cifar10Dataset", "Cifar100Dataset", "CelebADataset", "NumpySlicesDataset", "VOCDataset", + "CocoDataset", "TextFileDataset", "CLUEDataset", "Schema", "DistributedSampler", "PKSampler", + "RandomSampler", "SequentialSampler", "SubsetRandomSampler", "WeightedRandomSampler", "zip", "GraphData"] diff --git a/mindspore/dataset/core/configuration.py b/mindspore/dataset/core/configuration.py index 
38b25368b3..d3175cd181 100644 --- a/mindspore/dataset/core/configuration.py +++ b/mindspore/dataset/core/configuration.py @@ -125,6 +125,35 @@ class ConfigurationManager: """ return self.config.get_num_parallel_workers() + def set_monitor_sampling_interval(self, interval): + """ + Set the default interval(ms) of monitor sampling. + + Args: + interval: interval(ms) to be used to performance monitor sampling. + + Raises: + ValueError: If interval is invalid (<= 0 or > MAX_INT_32). + + Examples: + >>> import mindspore.dataset as ds + >>> con = ds.engine.ConfigurationManager() + >>> # sets the new interval value. + >>> con.set_monitor_sampling_interval(100) + """ + if interval <= 0 or interval > INT32_MAX: + raise ValueError("Interval given is not within the required range") + self.config.set_monitor_sampling_interval(interval) + + def get_monitor_sampling_interval(self): + """ + Get the default interval of performance monitor sampling. + + Returns: + Interval: interval(ms) of performance monitor sampling. + """ + return self.config.get_monitor_sampling_interval() + def __str__(self): """ String representation of the configurations. 
diff --git a/mindspore/dataset/engine/__init__.py b/mindspore/dataset/engine/__init__.py index 59dca2f681..674848f156 100644 --- a/mindspore/dataset/engine/__init__.py +++ b/mindspore/dataset/engine/__init__.py @@ -28,10 +28,9 @@ from .serializer_deserializer import serialize, deserialize, show, compare from .samplers import * from ..core.configuration import config, ConfigurationManager - __all__ = ["config", "ConfigurationManager", "zip", "ImageFolderDatasetV2", "MnistDataset", - "MindDataset", "GeneratorDataset", "TFRecordDataset", + "MindDataset", "GeneratorDataset", "TFRecordDataset", "CLUEDataset", "ManifestDataset", "Cifar10Dataset", "Cifar100Dataset", "CelebADataset", - "VOCDataset", "TextFileDataset", "Schema", "DistributedSampler", "PKSampler", - "RandomSampler", "SequentialSampler", "SubsetRandomSampler", "WeightedRandomSampler"] + "VOCDataset", "CocoDataset", "TextFileDataset", "Schema", "DistributedSampler", + "PKSampler", "RandomSampler", "SequentialSampler", "SubsetRandomSampler", "WeightedRandomSampler"] diff --git a/mindspore/dataset/engine/datasets.py b/mindspore/dataset/engine/datasets.py index 04d6a6e11d..ca6f7ca33e 100644 --- a/mindspore/dataset/engine/datasets.py +++ b/mindspore/dataset/engine/datasets.py @@ -33,17 +33,18 @@ import copy import numpy as np from mindspore._c_dataengine import DataType, TFReaderOp, ImageFolderOp, CifarOp, MnistOp, ManifestOp, \ - MindRecordOp, TextFileOp, VOCOp, CBatchInfo + MindRecordOp, TextFileOp, ClueOp, VOCOp, CocoOp, CBatchInfo from mindspore._c_expression import typing from mindspore import log as logger from . 
import samplers from .iterators import DictIterator, TupleIterator from .validators import check_batch, check_shuffle, check_map, check_filter, check_repeat, check_skip, check_zip, \ - check_rename, \ + check_rename, check_numpyslicesdataset, \ check_take, check_project, check_imagefolderdatasetv2, check_mnist_cifar_dataset, check_manifestdataset, \ - check_tfrecorddataset, check_vocdataset, check_celebadataset, check_minddataset, check_generatordataset, \ - check_sync_wait, check_zip_dataset, check_add_column, check_textfiledataset, check_concat, check_split + check_tfrecorddataset, check_vocdataset, check_cocodataset, check_celebadataset, check_minddataset, \ + check_generatordataset, check_sync_wait, check_zip_dataset, check_add_column, check_textfiledataset, check_concat, \ + check_split, check_bucket_batch_by_length, check_cluedataset from ..core.datatypes import mstype_to_detype, mstypelist_to_detypelist try: @@ -60,7 +61,7 @@ class Shuffle(str, Enum): @check_zip def zip(datasets): """ - Zips the datasets in the input tuple of datasets. + Zip the datasets in the input tuple of datasets. Args: datasets (tuple of class Dataset): A tuple of datasets to be zipped together. @@ -133,8 +134,8 @@ class Dataset: """ def __init__(self, num_parallel_workers=None): - self.input = [] - self.output = [] + self.children = [] + self.parent = [] self.num_parallel_workers = num_parallel_workers self._device_iter = 0 self._input_indexs = () @@ -151,7 +152,7 @@ class Dataset: def get_args(self): """ - Returns attributes (member variables) related to the current class. + Return attributes (member variables) related to the current class. Must include all arguments passed to the __init__() of the current class, excluding 'input_dataset'. 
@@ -164,11 +165,81 @@ class Dataset: args["num_parallel_workers"] = self.num_parallel_workers return args + @check_bucket_batch_by_length + def bucket_batch_by_length(self, column_names, bucket_boundaries, bucket_batch_sizes, + element_length_function=None, pad_info=None, + pad_to_bucket_boundary=False, drop_remainder=False): + """ + Bucket elements according to their lengths, and pad and batch the buckets when + they are full. + + A length function is called on each row in the dataset, the row is then + bucketed based on its length and bucket_boundaries. When a bucket reaches its + corresponding size specified in bucket_batch_sizes, the entire bucket will be + padded according to batch_info, and then batched. Each batch will be full, + except for maybe the last batch for each bucket. + + Args: + column_names (list of string): Columns passed to element_length_function. + bucket_boundaries (list of int): A list consisting of the upper boundaries + of the buckets. Must be strictly increasing. If there are n boundaries, + n+1 buckets are created: One bucket for [0, bucket_boundaries[0]), one + bucket for [bucket_boundaries[i], bucket_boundaries[i+1]) for each + 0>> import mindspore.dataset as ds + >>> # data is an instance of Dataset object. + >>> + >>> # creates a dataset where every 100 rows is combined into a batch + >>> # and drops the last incomplete batch if there is one. + >>> column_names = ["col1", "col2"] + >>> buket_boundaries = [5, 10] + >>> bucket_batch_sizes = [5, 1, 1] + >>> element_length_function = (lambda col1, col2: max(len(col1), len(col2))) + >>> + >>> # will pad col1 to shape [2, bucket_boundaries[i]] where i is the + >>> # index of the bucket that is currently being batched. + >>> # will pad col2 to a shape where each dimension is the longest in all + >>> # the elements currently being batched. 
+ >>> pad_info = {"col1", ([2, None], -1)} + >>> pad_to_bucket_boundary = True + >>> + >>> data = data.bucket_batch_by_length(column_names, bucket_boundaries, + >>> bucket_batch_sizes, + >>> element_length_function, pad_info, + >>> pad_to_bucket_boundary) + """ + return BucketBatchByLengthDataset(self, column_names, bucket_boundaries, bucket_batch_sizes, + element_length_function, pad_info, + pad_to_bucket_boundary, drop_remainder) + @check_batch def batch(self, batch_size, drop_remainder=False, num_parallel_workers=None, per_batch_map=None, input_columns=None, pad_info=None): """ - Combines batch_size number of consecutive rows into batches. + Combine batch_size number of consecutive rows into batches. For any child node, a batch is treated as a single row. For any column, all the elements within that column must have the same shape. @@ -269,7 +340,7 @@ class Dataset: def flat_map(self, func): """ - Maps `func` to each row in dataset and flatten the result. + Map `func` to each row in dataset and flatten the result. The specified `func` is a function that must take one 'Ndarray' as input and return a 'Dataset'. @@ -299,6 +370,7 @@ class Dataset: """ dataset = None if not hasattr(func, '__call__'): + logger.error("func must be a function.") raise TypeError("func must be a function.") for row_data in self: @@ -308,6 +380,7 @@ class Dataset: dataset += func(row_data) if not isinstance(dataset, Dataset): + logger.error("flat_map must return a Dataset object.") raise TypeError("flat_map must return a Dataset object.") return dataset @@ -315,7 +388,7 @@ class Dataset: def map(self, input_columns=None, operations=None, output_columns=None, columns_order=None, num_parallel_workers=None, python_multiprocessing=False): """ - Applies each operation in operations to this dataset. + Apply each operation in operations to this dataset. The order of operations is determined by the position of each operation in operations. 
operations[0] will be applied first, then operations[1], then operations[2], etc. @@ -499,7 +572,7 @@ class Dataset: @check_repeat def repeat(self, count=None): """ - Repeats this dataset count times. Repeat indefinitely if the count is None or -1. + Repeat this dataset count times. Repeat indefinitely if the count is None or -1. Note: The order of using repeat and batch reflects the number of batches. Recommend that @@ -591,13 +664,16 @@ class Dataset: dataset_size = self.get_dataset_size() if dataset_size is None or dataset_size <= 0: - raise RuntimeError("dataset size unknown, unable to split.") + raise RuntimeError("dataset_size is unknown, unable to split.") + + if not isinstance(sizes, list): + raise RuntimeError("sizes should be a list.") all_int = all(isinstance(item, int) for item in sizes) if all_int: sizes_sum = sum(sizes) if sizes_sum != dataset_size: - raise RuntimeError("sum of split sizes {} is not equal to dataset size {}." + raise RuntimeError("Sum of split sizes {} is not equal to dataset size {}." .format(sizes_sum, dataset_size)) return sizes @@ -605,7 +681,7 @@ class Dataset: for item in sizes: absolute_size = int(round(item * dataset_size)) if absolute_size == 0: - raise RuntimeError("split percentage {} is too small.".format(item)) + raise RuntimeError("Split percentage {} is too small.".format(item)) absolute_sizes.append(absolute_size) absolute_sizes_sum = sum(absolute_sizes) @@ -613,7 +689,7 @@ class Dataset: # if we still need more rows, give them to the first split. # if we have too many rows, remove the extras from the first split that has # enough rows. - size_difference = dataset_size - absolute_sizes_sum + size_difference = int(dataset_size - absolute_sizes_sum) if size_difference > 0: absolute_sizes[0] += size_difference else: @@ -623,7 +699,7 @@ class Dataset: break if sum(absolute_sizes) != dataset_size: - raise RuntimeError("sum of calculated split sizes {} is not equal to dataset size {}." 
+ raise RuntimeError("Sum of calculated split sizes {} is not equal to dataset size {}." .format(absolute_sizes_sum, dataset_size)) return absolute_sizes @@ -631,7 +707,7 @@ class Dataset: @check_split def split(self, sizes, randomize=True): """ - Splits the dataset into smaller, non-overlapping datasets. + Split the dataset into smaller, non-overlapping datasets. This is a general purpose split function which can be called from any operator in the pipeline. There is another, optimized split function, which will be called automatically if ds.split is @@ -647,10 +723,14 @@ class Dataset: Datasets of size round(f1*K), round(f2*K), …, round(fn*K) where K is the size of the original dataset. If after rounding: - -Any size equals 0, an error will occur. - -The sum of split sizes < K, the difference will be added to the first split. - -The sum of split sizes > K, the difference will be removed from the first large - enough split such that it will have atleast 1 row after removing the difference. + + - Any size equals 0, an error will occur. + + - The sum of split sizes < K, the difference will be added to the first split. + + - The sum of split sizes > K, the difference will be removed from the first large + enough split such that it will have atleast 1 row after removing the difference. + randomize (bool, optional): determines whether or not to split the data randomly (default=True). If true, the data will be randomly split. Otherwise, each split will be created with consecutive rows from the dataset. 
@@ -684,10 +764,10 @@ class Dataset: >>> train, test = data.split([0.9, 0.1]) """ if self.is_shuffled(): - logger.warning("dataset is shuffled before split.") + logger.warning("Dataset is shuffled before split.") if self.is_sharded(): - raise RuntimeError("dataset should not be sharded before split.") + raise RuntimeError("Dataset should not be sharded before split.") absolute_sizes = self._get_absolute_split_sizes(sizes) splits = [] @@ -713,7 +793,7 @@ class Dataset: @check_zip_dataset def zip(self, datasets): """ - Zips the datasets in the input tuple of datasets. Columns in the input datasets must not have the same name. + Zip the datasets in the input tuple of datasets. Columns in the input datasets must not have the same name. Args: datasets (tuple or class Dataset): A tuple of datasets or a single class Dataset @@ -770,7 +850,7 @@ class Dataset: @check_rename def rename(self, input_columns, output_columns): """ - Renames the columns in input datasets. + Rename the columns in input datasets. Args: input_columns (list[str]): list of names of the input columns. @@ -796,7 +876,7 @@ class Dataset: @check_project def project(self, columns): """ - Projects certain columns in input datasets. + Project certain columns in input datasets. The specified columns will be selected from the dataset and passed down the pipeline in the order specified. The other columns are discarded. @@ -819,6 +899,9 @@ class Dataset: return ProjectDataset(self, columns) + def build_vocab(self, vocab, columns, freq_range, top_k, special_tokens, special_first): + return BuildVocabDataset(self, vocab, columns, freq_range, top_k, special_tokens, special_first) + def apply(self, apply_func): """ Apply a function in this dataset. @@ -858,7 +941,7 @@ class Dataset: def device_que(self, prefetch_size=None): """ - Returns a transferredDataset that transfer data through device. + Return a transferredDataset that transfer data through device. 
Args: prefetch_size (int, optional): prefetch number of records ahead of the @@ -875,7 +958,7 @@ class Dataset: def to_device(self, num_batch=None): """ - Transfers data through CPU, GPU or Ascend devices. + Transfer data through CPU, GPU or Ascend devices. Args: num_batch (int, optional): limit the number of batch to be sent to device (default=None). @@ -910,29 +993,28 @@ class Dataset: raise TypeError("Please set device_type in context") if device_type not in ('Ascend', 'GPU', 'CPU'): - raise ValueError("only support CPU, Ascend, GPU") + raise ValueError("Only support CPU, Ascend, GPU") if num_batch is None or num_batch == 0: raise ValueError("num_batch is None or 0.") def get_distribution(output_dataset): dev_id = 0 - if isinstance(output_dataset, (MindDataset)): - return output_dataset.distribution, dev_id if isinstance(output_dataset, (Cifar10Dataset, Cifar100Dataset, GeneratorDataset, ImageFolderDatasetV2, - ManifestDataset, MnistDataset, VOCDataset, CelebADataset)): + ManifestDataset, MnistDataset, VOCDataset, CocoDataset, CelebADataset, + MindDataset)): sampler = output_dataset.sampler if isinstance(sampler, samplers.DistributedSampler): dev_id = sampler.shard_id return "", dev_id - if isinstance(output_dataset, TFRecordDataset): + if isinstance(output_dataset, (TFRecordDataset, TextFileDataset, CLUEDataset)): if output_dataset.shard_id is not None: dev_id = output_dataset.shard_id return "", dev_id - if not output_dataset.input: + if not output_dataset.children: raise RuntimeError("Unknown output_dataset: {}".format(type(output_dataset))) - input_dataset = output_dataset.input[0] + input_dataset = output_dataset.children[0] return get_distribution(input_dataset) distribution_path, device_id = get_distribution(self) @@ -1012,7 +1094,7 @@ class Dataset: def _get_pipeline_info(self): """ - Gets pipeline information. + Get pipeline information. 
""" device_iter = TupleIterator(self) self._output_shapes = device_iter.get_output_shapes() @@ -1053,8 +1135,8 @@ class Dataset: Return: Number, number of batches. """ - if self.input: - return self.input[0].get_dataset_size() + if self.children: + return self.children[0].get_dataset_size() return None def num_classes(self): @@ -1064,23 +1146,23 @@ class Dataset: Return: Number, number of classes. """ - if self.input: - return self.input[0].num_classes() + if self.children: + return self.children[0].num_classes() return None def get_sync_notifiers(self): - if self.input: - return self.input[0].get_sync_notifiers() + if self.children: + return self.children[0].get_sync_notifiers() return {} def disable_sync(self): - if self.input: - return self.input[0].disable_sync() + if self.children: + return self.children[0].disable_sync() return {} def is_sync(self): - if self.input: - return self.input[0].is_sync() + if self.children: + return self.children[0].is_sync() return False def sync_update(self, condition_name, num_batch=None, data=None): @@ -1114,8 +1196,8 @@ class Dataset: Return: Number, the number of data in a batch. """ - if self.input: - return self.input[0].get_batch_size() + if self.children: + return self.children[0].get_batch_size() return 1 def get_repeat_count(self): @@ -1125,8 +1207,8 @@ class Dataset: Return: Number, the count of repeat. """ - if self.input: - return self.input[0].get_repeat_count() + if self.children: + return self.children[0].get_repeat_count() return 1 def get_class_indexing(self): @@ -1136,22 +1218,22 @@ class Dataset: Return: Dict, A str-to-int mapping from label name to index. 
""" - if self.input: - return self.input[0].get_class_indexing() + if self.children: + return self.children[0].get_class_indexing() raise NotImplementedError("Dataset {} has not supported api get_class_indexing yet.".format(type(self))) def reset(self): """Reset the dataset for next epoch.""" def is_shuffled(self): - for input_dataset in self.input: + for input_dataset in self.children: if input_dataset.is_shuffled(): return True return False def is_sharded(self): - for input_dataset in self.input: + for input_dataset in self.children: if input_dataset.is_sharded(): return True @@ -1257,8 +1339,8 @@ class MappableDataset(SourceDataset): def _get_sampler_dataset_size(self): if self.sampler is not None: - if hasattr(self.sampler, 'get_dataset_size'): - return self.sampler.get_dataset_size() + if hasattr(self.sampler, 'get_num_samples'): + return self.sampler.get_num_samples() if hasattr(self.sampler, '__len__'): return len(self.sampler) @@ -1267,7 +1349,7 @@ class MappableDataset(SourceDataset): @check_split def split(self, sizes, randomize=True): """ - Splits the dataset into smaller, non-overlapping datasets. + Split the dataset into smaller, non-overlapping datasets. There is the optimized split function, which will be called automatically when the dataset that calls this function is a MappableDataset. @@ -1282,10 +1364,14 @@ class MappableDataset(SourceDataset): Datasets of size round(f1*K), round(f2*K), …, round(fn*K) where K is the size of the original dataset. If after rounding: - -Any size equals 0, an error will occur. - -The sum of split sizes < K, the difference will be added to the first split. - -The sum of split sizes > K, the difference will be removed from the first large - enough split such that it will have atleast 1 row after removing the difference. + + - Any size equals 0, an error will occur. + + - The sum of split sizes < K, the difference will be added to the first split. 
+ + - The sum of split sizes > K, the difference will be removed from the first large + enough split such that it will have atleast 1 row after removing the difference. + randomize (bool, optional): determines whether or not to split the data randomly (default=True). If true, the data will be randomly split. Otherwise, each split will be created with consecutive rows from the dataset. @@ -1330,10 +1416,10 @@ class MappableDataset(SourceDataset): >>> train.use_sampler(train_sampler) """ if self.is_shuffled(): - logger.warning("dataset is shuffled before split.") + logger.warning("Dataset is shuffled before split.") if self.is_sharded(): - raise RuntimeError("dataset should not be sharded before split.") + raise RuntimeError("Dataset should not be sharded before split.") absolute_sizes = self._get_absolute_split_sizes(sizes) splits = [] @@ -1347,7 +1433,7 @@ class MappableDataset(SourceDataset): random_sampler.reshuffle_each_epoch = False ds.add_sampler(random_sampler) - subset_sampler = samplers.SubsetSampler(current_split_start_index, size) + subset_sampler = samplers.SequentialSampler(current_split_start_index, size) ds.add_sampler(subset_sampler) # add sequential sampler, so that if user calls use_sampler, we will @@ -1369,6 +1455,48 @@ class DatasetOp(Dataset): # No need for __init__ since it is the same as the super's init +class BucketBatchByLengthDataset(DatasetOp): + """ + The result of applying BucketBatchByLength operator to the input dataset. 
+ """ + + def __init__(self, input_dataset, column_names, bucket_boundaries, bucket_batch_sizes, + element_length_function, pad_info, pad_to_bucket_boundary, drop_remainder): + super().__init__() + + self.column_names = column_names + self.bucket_boundaries = bucket_boundaries + self.bucket_batch_sizes = bucket_batch_sizes + self.element_length_function = element_length_function + self.pad_info = pad_info + self.pad_to_bucket_boundary = pad_to_bucket_boundary + self.drop_remainder = drop_remainder + + self.children.append(input_dataset) + input_dataset.parent.append(self) + self._input_indexs = input_dataset.input_indexs + + def get_args(self): + args = super().get_args() + args["length_dependent_columns"] = self.column_names + args["bucket_boundaries"] = self.bucket_boundaries + args["bucket_batch_sizes"] = self.bucket_batch_sizes + args["element_length_function"] = self.element_length_function + args["pad_info"] = self.pad_info + args["pad_to_bucket_boundary"] = self.pad_to_bucket_boundary + args["drop_remainder"] = self.drop_remainder + return args + + def get_dataset_size(self): + """ + Get the number of batches in an epoch. + + Return: + Number, number of batches. + """ + return None + + class BatchDataset(DatasetOp): """ The result of applying Batch operator to the input dataset. @@ -1407,8 +1535,8 @@ class BatchDataset(DatasetOp): self.per_batch_map = per_batch_map self.input_columns = input_columns self.pad_info = pad_info - self.input.append(input_dataset) - input_dataset.output.append(self) + self.children.append(input_dataset) + input_dataset.parent.append(self) self._input_indexs = input_dataset.input_indexs def get_args(self): @@ -1427,7 +1555,7 @@ class BatchDataset(DatasetOp): Return: Number, number of batches. 
""" - child_size = self.input[0].get_dataset_size() + child_size = self.children[0].get_dataset_size() if child_size is not None: if self.drop_remainder: return math.floor(child_size / self.batch_size) @@ -1456,7 +1584,7 @@ class BatchDataset(DatasetOp): if isinstance(dataset, RepeatDataset): return True flag = False - for input_dataset in dataset.input: + for input_dataset in dataset.children: flag = flag | BatchDataset._is_ancestor_of_repeat(input_dataset) return flag @@ -1467,13 +1595,14 @@ class BatchDataset(DatasetOp): Args: dataset (Dataset): dataset to be checked. - batchsize (int): batch size to notify. + batch_size (int): batch size to notify. """ if isinstance(dataset, SyncWaitDataset): dataset.update_sync_batch_size(batch_size) - for input_dataset in dataset.input: + for input_dataset in dataset.children: BatchDataset._update_batch_size_for_syncwait(input_dataset, batch_size) + class BatchInfo(CBatchInfo): """ The information object associates with the current batch of tensors. @@ -1497,17 +1626,19 @@ class BatchInfo(CBatchInfo): """ return + class BlockReleasePair: """ The blocking condition class used by SyncWaitDataset. Args: init_release_rows (int): Number of lines to allow through the pipeline. - callback (function): The callback funciton that will be called when release is called. + callback (function): The callback function that will be called when release is called. """ + def __init__(self, init_release_rows, callback=None): if isinstance(init_release_rows, int) and init_release_rows <= 0: - raise ValueError("release_rows need to be greater than 0.") + raise ValueError("release_rows need to be greater than 0.") self.row_count = -init_release_rows self.cv = threading.Condition() self.callback = callback @@ -1566,7 +1697,7 @@ class SyncWaitDataset(DatasetOp): input_dataset (Dataset): Input dataset to apply flow control. num_batch (int): the number of batches without blocking at the start of each epoch. 
condition_name (str): The condition name that is used to toggle sending next row. - callback (function): The callback funciton that will be invoked when sync_update is called. + callback (function): The callback function that will be invoked when sync_update is called. Raises: RuntimeError: If condition name already exists. @@ -1574,21 +1705,21 @@ class SyncWaitDataset(DatasetOp): def __init__(self, input_dataset, condition_name, num_batch, callback=None): super().__init__() - self.input.append(input_dataset) - input_dataset.output.append(self) + self.children.append(input_dataset) + input_dataset.parent.append(self) # set to the default value, waiting for the batch to update it self._condition_name = condition_name if isinstance(num_batch, int) and num_batch <= 0: raise ValueError("num_batch need to be greater than 0.") self._pair = BlockReleasePair(num_batch, callback) - if self._condition_name in self.input[0].get_sync_notifiers(): + if self._condition_name in self.children[0].get_sync_notifiers(): raise RuntimeError("Condition name is already in use") logger.warning("Please remember to add dataset.sync_update(condition=%s), otherwise will result in hanging", condition_name) def get_sync_notifiers(self): - return {**self.input[0].get_sync_notifiers(), **{self._condition_name: self._pair.release_func}} + return {**self.children[0].get_sync_notifiers(), **{self._condition_name: self._pair.release_func}} def is_sync(self): return True @@ -1621,7 +1752,7 @@ class SyncWaitDataset(DatasetOp): if isinstance(dataset, BatchDataset): return True flag = False - for input_dataset in dataset.input: + for input_dataset in dataset.children: flag = flag | SyncWaitDataset._is_ancestor_of_batch(input_dataset) return flag @@ -1641,9 +1772,9 @@ class ShuffleDataset(DatasetOp): def __init__(self, input_dataset, buffer_size): super().__init__() self.buffer_size = buffer_size - self.input.append(input_dataset) + self.children.append(input_dataset) self.reshuffle_each_epoch = None - 
input_dataset.output.append(self) + input_dataset.parent.append(self) self._input_indexs = input_dataset.input_indexs if self.is_sync(): raise RuntimeError("No shuffle after sync operators") @@ -1687,6 +1818,7 @@ class _PythonCallable: """ Internal python function wrapper for multiprocessing pyfunc. """ + def __init__(self, py_callable, idx, pool=None): # Original python callable from user. self.py_callable = py_callable @@ -1738,7 +1870,7 @@ class MapDataset(DatasetOp): def __init__(self, input_dataset, input_columns=None, operations=None, output_columns=None, columns_order=None, num_parallel_workers=None, python_multiprocessing=False): super().__init__(num_parallel_workers) - self.input.append(input_dataset) + self.children.append(input_dataset) if input_columns is not None and not isinstance(input_columns, list): input_columns = [input_columns] self.input_columns = input_columns @@ -1755,7 +1887,7 @@ class MapDataset(DatasetOp): and self.columns_order is None: raise ValueError("When (len(input_columns) != len(output_columns)), columns_order must be specified.") - input_dataset.output.append(self) + input_dataset.parent.append(self) self._input_indexs = input_dataset.input_indexs self.python_multiprocessing = python_multiprocessing self.process_pool = None @@ -1765,6 +1897,7 @@ class MapDataset(DatasetOp): args["input_columns"] = self.input_columns args["operations"] = self.operations args["output_columns"] = self.output_columns + args["columns_order"] = self.columns_order return args def get_dataset_size(self): @@ -1774,7 +1907,7 @@ class MapDataset(DatasetOp): Return: Number, number of batches. 
""" - return self.input[0].get_dataset_size() + return self.children[0].get_dataset_size() def __deepcopy__(self, memodict): if id(self) in memodict: @@ -1782,12 +1915,12 @@ class MapDataset(DatasetOp): cls = self.__class__ new_op = cls.__new__(cls) memodict[id(self)] = new_op - new_op.input = copy.deepcopy(self.input, memodict) + new_op.children = copy.deepcopy(self.children, memodict) new_op.input_columns = copy.deepcopy(self.input_columns, memodict) new_op.output_columns = copy.deepcopy(self.output_columns, memodict) new_op.columns_order = copy.deepcopy(self.columns_order, memodict) new_op.num_parallel_workers = copy.deepcopy(self.num_parallel_workers, memodict) - new_op.output = copy.deepcopy(self.output, memodict) + new_op.parent = copy.deepcopy(self.parent, memodict) new_op.input_indexs = copy.deepcopy(self._input_indexs, memodict) new_op.python_multiprocessing = copy.deepcopy(self.python_multiprocessing, memodict) new_op.operations = self.operations @@ -1848,8 +1981,8 @@ class FilterDataset(DatasetOp): def __init__(self, input_dataset, predicate, input_columns=None, num_parallel_workers=None): super().__init__(num_parallel_workers) self.predicate = lambda *args: bool(predicate(*args)) - self.input.append(input_dataset) - input_dataset.output.append(self) + self.children.append(input_dataset) + input_dataset.parent.append(self) if input_columns is not None and not isinstance(input_columns, list): input_columns = [input_columns] self.input_columns = input_columns @@ -1885,8 +2018,8 @@ class RepeatDataset(DatasetOp): self.count = -1 else: self.count = count - self.input.append(input_dataset) - input_dataset.output.append(self) + self.children.append(input_dataset) + input_dataset.parent.append(self) self._input_indexs = input_dataset.input_indexs def get_args(self): @@ -1901,7 +2034,7 @@ class RepeatDataset(DatasetOp): Return: Number, number of batches. 
""" - child_size = self.input[0].get_dataset_size() + child_size = self.children[0].get_dataset_size() if child_size is not None: return child_size return None @@ -1921,15 +2054,15 @@ class SkipDataset(DatasetOp): The result of applying Skip operator to the input Dataset. Args: - datasets (tuple): A tuple of datasets to be skipped. + input_dataset (tuple): A tuple of datasets to be skipped. count (int): Number of rows the dataset should be skipped. """ def __init__(self, input_dataset, count): super().__init__() self.count = count - self.input.append(input_dataset) - input_dataset.output.append(self) + self.children.append(input_dataset) + input_dataset.parent.append(self) self._input_indexs = input_dataset.input_indexs def get_args(self): @@ -1944,7 +2077,7 @@ class SkipDataset(DatasetOp): Return: Number, number of batches. """ - child_size = self.input[0].get_dataset_size() + child_size = self.children[0].get_dataset_size() output_size = 0 if self.count >= 0 and self.count < child_size: output_size = child_size - self.count @@ -1963,8 +2096,8 @@ class TakeDataset(DatasetOp): def __init__(self, input_dataset, count): super().__init__() self.count = count - self.input.append(input_dataset) - input_dataset.output.append(self) + self.children.append(input_dataset) + input_dataset.parent.append(self) self._input_indexs = input_dataset.input_indexs def get_args(self): @@ -1979,7 +2112,7 @@ class TakeDataset(DatasetOp): Return: Number, number of batches. """ - child_size = self.input[0].get_dataset_size() + child_size = self.children[0].get_dataset_size() if child_size < self.count: return child_size return self.count @@ -2003,8 +2136,8 @@ class ZipDataset(DatasetOp): raise TypeError("The parameter %s of zip has type error!" 
% (dataset)) self.datasets = datasets for data in datasets: - self.input.append(data) - data.output.append(self) + self.children.append(data) + data.parent.append(self) def get_dataset_size(self): """ @@ -2013,7 +2146,7 @@ class ZipDataset(DatasetOp): Return: Number, number of batches. """ - children_sizes = [c.get_dataset_size() for c in self.input] + children_sizes = [c.get_dataset_size() for c in self.children] if all(c is not None for c in children_sizes): return min(children_sizes) return None @@ -2028,7 +2161,7 @@ class ZipDataset(DatasetOp): return None def is_sync(self): - return any([c.is_sync() for c in self.input]) + return any([c.is_sync() for c in self.children]) def get_args(self): args = super().get_args() @@ -2053,8 +2186,8 @@ class ConcatDataset(DatasetOp): raise TypeError("The parameter %s of concat has type error!" % (dataset)) self.datasets = datasets for data in datasets: - self.input.append(data) - data.output.append(self) + self.children.append(data) + data.parent.append(self) def get_dataset_size(self): """ @@ -2063,8 +2196,8 @@ class ConcatDataset(DatasetOp): Return: Number, number of batches. """ - children_sizes = [c.get_dataset_size() for c in self.input] - dataset_size = np.sum(children_sizes) + children_sizes = [c.get_dataset_size() for c in self.children] + dataset_size = sum(children_sizes) return dataset_size @@ -2074,8 +2207,8 @@ class RenameDataset(DatasetOp): Args: input_dataset (Dataset): Input Dataset to be Renamed. - input_column_names (list[str]): list of names of the input columns. - output_column_names (list[str]): list of names of the output columns. + input_columns (list[str]): list of names of the input columns. + output_columns (list[str]): list of names of the output columns. 
""" def __init__(self, input_dataset, input_columns, output_columns): @@ -2086,8 +2219,8 @@ class RenameDataset(DatasetOp): output_columns = [output_columns] self.input_column_names = input_columns self.output_column_names = output_columns - self.input.append(input_dataset) - input_dataset.output.append(self) + self.children.append(input_dataset) + input_dataset.parent.append(self) self._input_indexs = input_dataset.input_indexs def get_args(self): @@ -2113,10 +2246,10 @@ class ProjectDataset(DatasetOp): if not isinstance(columns, list): columns = [columns] self.columns = columns - self.input.append(input_dataset) + self.children.append(input_dataset) self.prefetch_size = prefetch_size - input_dataset.output.append(self) + input_dataset.parent.append(self) self._input_indexs = input_dataset.input_indexs def get_args(self): @@ -2140,8 +2273,8 @@ class TransferDataset(DatasetOp): def __init__(self, input_dataset, queue_name, device_id, device_type, num_batch=None): super().__init__() - self.input.append(input_dataset) - input_dataset.output.append(self) + self.children.append(input_dataset) + input_dataset.parent.append(self) self.queue_name = queue_name self._input_indexs = input_dataset.input_indexs self._device_type = device_type @@ -2218,31 +2351,45 @@ def _select_sampler(num_samples, input_sampler, shuffle, num_shards, shard_id): num_shards (int): Number of shard for sharding. shard_id (int): Shard ID. """ + if input_sampler is not None: + # If the user provided a sampler, then it doesn't matter what the other args are because + # we are being asked specifically to use the given sampler. + # That means the following arguments: num_shards, shard_id, shuffle, num_samples should all + # be None. 
Consider this example: + # sampler = ds.DistributedSampler(num_shards=8, shard_id=3, shuffle=shuffle) + # data1 = ds.VOCDataset(voc_dir, decode=True, sampler=sampler, num_shards=4, shard_id=1) + # In this case, the user has given different sample-related arguments that contradict each other. + # To prevent this, only allow the user to manually specify the sampler if those arguments are all None + if (isinstance(input_sampler, (samplers.SequentialSampler, samplers.DistributedSampler, + samplers.RandomSampler, samplers.SubsetRandomSampler, + samplers.WeightedRandomSampler, samplers.Sampler)) and + (num_shards is not None or shard_id is not None or shuffle is not None or num_samples is not None)): + raise ValueError( + 'Conflicting arguments during sampler assignments. num_samples: {}, num_shards: {},' + ' shard_id: {}, shuffle: {})'.format(num_samples, num_shards, shard_id, shuffle)) + return input_sampler if shuffle is None: - if input_sampler is not None: - # If shuffle is not specified, user provided sampler, use user's sampler - return input_sampler if num_shards is not None: # If shuffle is not specified, sharding enabled, use distributed random sampler shuffle = True - return samplers.DistributedSampler(num_shards, shard_id, shuffle=shuffle) + return samplers.DistributedSampler(num_shards, shard_id, shuffle=shuffle, num_samples=num_samples) # If shuffle is not specified, sharding disabled, use random sampler if num_samples is not None: return samplers.RandomSampler(replacement=True, num_samples=num_samples) - return samplers.RandomSampler() + return samplers.RandomSampler(num_samples=num_samples) if shuffle is True: if num_shards is not None: # If shuffle enabled, sharding enabled, use distributed random sampler - return samplers.DistributedSampler(num_shards, shard_id, shuffle=shuffle) + return samplers.DistributedSampler(num_shards, shard_id, shuffle=shuffle, num_samples=num_samples) # If shuffle enabled, sharding disabled, use random sampler if num_samples 
is not None: return samplers.RandomSampler(replacement=True, num_samples=num_samples) - return samplers.RandomSampler() + return samplers.RandomSampler(num_samples=num_samples) if num_shards is not None: # If shuffle disabled, sharding enabled, use distributed sequential sampler - return samplers.DistributedSampler(num_shards, shard_id, shuffle=shuffle) + return samplers.DistributedSampler(num_shards, shard_id, shuffle=shuffle, num_samples=num_samples) # If shuffle disabled, sharding disabled, use sequential sampler - return samplers.SequentialSampler() + return samplers.SequentialSampler(num_samples=num_samples) class ImageFolderDatasetV2(MappableDataset): @@ -2362,11 +2509,7 @@ class ImageFolderDatasetV2(MappableDataset): Return: Number, number of batches. """ - if self.num_samples is None: - num_samples = 0 - else: - num_samples = self.num_samples - num_rows = ImageFolderOp.get_num_rows_and_classes(self.dataset_dir, num_samples)[0] + num_rows = ImageFolderOp.get_num_rows_and_classes(self.dataset_dir)[0] rows_per_shard = get_num_rows(num_rows, self.num_shards) rows_from_sampler = self._get_sampler_dataset_size() @@ -2382,11 +2525,7 @@ class ImageFolderDatasetV2(MappableDataset): Return: Number, number of classes. """ - if self.num_samples is None: - num_samples = 0 - else: - num_samples = self.num_samples - return ImageFolderOp.get_num_rows_and_classes(self.dataset_dir, num_samples)[1] + return ImageFolderOp.get_num_rows_and_classes(self.dataset_dir)[1] def is_shuffled(self): if self.shuffle_level is None: @@ -2495,12 +2634,7 @@ class MnistDataset(MappableDataset): Return: Number, number of batches. 
""" - if self.num_samples is None: - num_samples = 0 - else: - num_samples = self.num_samples - - num_rows = MnistOp.get_num_rows(self.dataset_dir, num_samples) + num_rows = MnistOp.get_num_rows(self.dataset_dir) rows_per_shard = get_num_rows(num_rows, self.num_shards) rows_from_sampler = self._get_sampler_dataset_size() @@ -2522,7 +2656,7 @@ class MnistDataset(MappableDataset): return self.sampler.is_sharded() -class MindDataset(SourceDataset): +class MindDataset(MappableDataset): """ A source dataset that reads from shard files and database. @@ -2539,7 +2673,13 @@ class MindDataset(SourceDataset): sampler (Sampler, optional): Object used to choose samples from the dataset (default=None, sampler is exclusive with shuffle and block_reader). Support list: SubsetRandomSampler, - PkSampler + PkSampler, RandomSampler, SequentialSampler, DistributedSampler. + padded_sample (dict, optional): Samples will be appended to dataset, which + keys are the same as column_list. + num_padded (int, optional): Number of padding samples.Dataset size + plus num_padded should be divisible by num_shards. + num_samples (int, optional): The number of samples to be included in the dataset + (default=None, all samples). Raises: ValueError: If num_shards is specified but shard_id is None. 
@@ -2550,7 +2690,8 @@ class MindDataset(SourceDataset): @check_minddataset def __init__(self, dataset_file, columns_list=None, num_parallel_workers=None, shuffle=None, num_shards=None, shard_id=None, - block_reader=False, sampler=None): + block_reader=False, sampler=None, padded_sample=None, + num_padded=None, num_samples=None): super().__init__(num_parallel_workers) if isinstance(dataset_file, list): self.load_dataset = False @@ -2558,53 +2699,57 @@ class MindDataset(SourceDataset): self.load_dataset = True self.dataset_file = dataset_file self.columns_list = columns_list - self.global_shuffle = shuffle - self.distribution = "" - self.sampler = sampler - - if num_shards is None or shard_id is None: - self.partitions = None - else: - self.partitions = [num_shards, shard_id] + self.shuffle_option = shuffle + self.num_shards = num_shards + self.shard_id = shard_id - if block_reader is True and self.partitions is not None: - raise ValueError("block reader not allowed true when use partitions") + if block_reader is True and num_shards is not None: + raise ValueError("block_reader not allowed true when use partitions") if block_reader is True and shuffle is True: - raise ValueError("block reader not allowed true when use shuffle") + raise ValueError("block_reader not allowed true when use shuffle") if block_reader is True: logger.warning("WARN: global shuffle is not used.") if sampler is not None: - if isinstance(sampler, samplers.SubsetRandomSampler) is False and \ - isinstance(sampler, samplers.PKSampler) is False: - raise ValueError("the sampler is not supported yet.") + if isinstance(sampler, (samplers.SubsetRandomSampler, samplers.PKSampler, + samplers.DistributedSampler, samplers.RandomSampler, + samplers.SequentialSampler)) is False: + raise ValueError("The sampler is not supported yet.") + + self.sampler = _select_sampler(num_samples, sampler, shuffle, num_shards, shard_id) + self.num_samples = num_samples # sampler exclusive if block_reader is True and sampler 
is not None: - raise ValueError("block reader not allowed true when use sampler") - - if shuffle is not None and sampler is not None: - raise ValueError("shuffle not allowed when use sampler") + raise ValueError("block_reader not allowed true when use sampler") - if block_reader is False and sampler is None: - self.global_shuffle = not bool(shuffle is False) + if num_padded is None: + num_padded = 0 - self.num_shards = num_shards - self.shard_id = shard_id self.block_reader = block_reader + self.padded_sample = padded_sample + self.num_padded = num_padded def get_args(self): args = super().get_args() + padded_sample = None + if self.padded_sample: + padded_sample = {} + for k, v in self.padded_sample.items(): + if isinstance(v, np.ndarray): + padded_sample[k] = v.tobytes() + else: + padded_sample[k] = v args["dataset_file"] = self.dataset_file args["load_dataset"] = self.load_dataset args["columns_list"] = self.columns_list - args["global_shuffle"] = self.global_shuffle - args["partitions"] = self.partitions + args["shuffle_option"] = self.shuffle_option + args["num_samples"] = self.num_samples args["block_reader"] = self.block_reader - args["num_shards"] = self.num_shards - args["shard_id"] = self.shard_id + args["num_padded"] = self.num_padded + args["padded_sample"] = padded_sample args["sampler"] = self.sampler return args @@ -2615,23 +2760,28 @@ class MindDataset(SourceDataset): Return: Number, number of batches. 
""" - if self.load_dataset: - dataset_file = [self.dataset_file] - else: - dataset_file = self.dataset_file - num_rows = MindRecordOp.get_num_rows(dataset_file, self.load_dataset, self.sampler) - if self.partitions is not None and self.partitions[0] > 0: - if num_rows % self.partitions[0] == 0: - num_rows = num_rows // self.partitions[0] + if self._dataset_size is None: + if self.load_dataset: + dataset_file = [self.dataset_file] else: - num_rows = num_rows // self.partitions[0] + 1 - return num_rows + dataset_file = self.dataset_file + num_rows = MindRecordOp.get_num_rows(dataset_file, self.load_dataset, self.sampler, self.num_padded) + return num_rows + return self._dataset_size + + # manually set dataset_size as a tempoary solution. + def set_dataset_size(self, value): + logger.warning("WARN_DEPRECATED: This method is deprecated. Please use get_dataset_size directly.") + if value >= 0: + self._dataset_size = value + else: + raise ValueError('Set dataset_size with negative value {}'.format(value)) def is_shuffled(self): - if self.global_shuffle is None: + if self.shuffle_option is None: return True - return self.global_shuffle or self.sampler.is_shuffled() + return self.shuffle_option or self.sampler.is_shuffled() def is_sharded(self): if self.num_shards is not None: @@ -2727,7 +2877,7 @@ def _py_sampler_fn_mp(sampler, num_samples, dataset, num_worker): def _fetch_py_sampler_indices(sampler, num_samples): """ - Indices fetcher for python sampler. + Indice fetcher for python sampler. """ if num_samples is not None: sampler_iter = iter(sampler) @@ -2827,6 +2977,7 @@ class _GeneratorWorker(multiprocessing.Process): """ Worker process for multiprocess Generator. """ + def __init__(self, dataset, eoe): self.idx_queue = multiprocessing.Queue(16) self.res_queue = multiprocessing.Queue(16) @@ -2892,7 +3043,7 @@ class GeneratorDataset(MappableDataset): provide either column_names or schema. 
column_types (list[mindspore.dtype], optional): List of column data types of the dataset (default=None). If provided, sanity check will be performed on generator output. - schema (Schema/String, optional): Path to the json schema file or schema object (default=None). Users are + schema (Schema/str, optional): Path to the json schema file or schema object (default=None). Users are required to provide either column_names or schema. If both are provided, schema will be used. num_samples (int, optional): The number of samples to be included in the dataset (default=None, all images). @@ -2948,11 +3099,8 @@ class GeneratorDataset(MappableDataset): if isinstance(self.sampler, (samplers.SequentialSampler, samplers.DistributedSampler, samplers.RandomSampler, samplers.SubsetRandomSampler, samplers.WeightedRandomSampler, samplers.Sampler)): - if num_samples is None: - num_samples = len(source) sampler_instance = self.sampler.create() sampler_instance.set_num_rows(len(source)) - sampler_instance.set_num_samples(num_samples) sampler_instance.initialize() if num_parallel_workers > 1: self.source = (lambda: _cpp_sampler_fn_mp(sampler_instance, source, num_parallel_workers)) @@ -3020,7 +3168,7 @@ class GeneratorDataset(MappableDataset): if value >= 0: self._dataset_size = value else: - raise ValueError('set dataset_size with negative value {}'.format(value)) + raise ValueError('Set dataset_size with negative value {}'.format(value)) def __deepcopy__(self, memodict): if id(self) in memodict: @@ -3028,8 +3176,8 @@ class GeneratorDataset(MappableDataset): cls = self.__class__ new_op = cls.__new__(cls) memodict[id(self)] = new_op - new_op.input = copy.deepcopy(self.input, memodict) - new_op.output = copy.deepcopy(self.output, memodict) + new_op.children = copy.deepcopy(self.children, memodict) + new_op.parent = copy.deepcopy(self.parent, memodict) new_op.num_parallel_workers = copy.deepcopy(self.num_parallel_workers, memodict) new_op.column_types = copy.deepcopy(self.column_types, 
memodict) new_op.column_names = copy.deepcopy(self.column_names, memodict) @@ -3139,6 +3287,7 @@ class TFRecordDataset(SourceDataset): args["num_samples"] = self.num_samples if self.shuffle_files is not None: args["shuffle_files"] = self.shuffle_files + args["shuffle_global"] = (self.shuffle_level == Shuffle.GLOBAL) args["shuffle"] = self.shuffle_level args["num_shards"] = self.num_shards args["shard_id"] = self.shard_id @@ -3169,7 +3318,7 @@ class TFRecordDataset(SourceDataset): if value >= 0: self._dataset_size = value else: - raise ValueError('set dataset_size with negative value {}'.format(value)) + raise ValueError('Set dataset_size with negative value {}'.format(value)) def is_shuffled(self): return self.shuffle_files @@ -3296,17 +3445,12 @@ class ManifestDataset(MappableDataset): Return: Number, number of batches. """ - if self.num_samples is None: - num_samples = 0 - else: - num_samples = self.num_samples - if self.class_indexing is None: class_indexing = dict() else: class_indexing = self.class_indexing - num_rows = ManifestOp.get_num_rows_and_classes(self.dataset_file, num_samples, class_indexing, self.usage)[0] + num_rows = ManifestOp.get_num_rows_and_classes(self.dataset_file, class_indexing, self.usage)[0] rows_per_shard = get_num_rows(num_rows, self.num_shards) rows_from_sampler = self._get_sampler_dataset_size() @@ -3322,17 +3466,12 @@ class ManifestDataset(MappableDataset): Return: Number, number of classes. 
""" - if self.num_samples is None: - num_samples = 0 - else: - num_samples = self.num_samples - if self.class_indexing is None: class_indexing = dict() else: class_indexing = self.class_indexing - return ManifestOp.get_num_rows_and_classes(self.dataset_file, num_samples, class_indexing, self.usage)[1] + return ManifestOp.get_num_rows_and_classes(self.dataset_file, class_indexing, self.usage)[1] def get_class_indexing(self): """ @@ -3341,17 +3480,12 @@ class ManifestDataset(MappableDataset): Return: Dict, A str-to-int mapping from label name to index. """ - if self.num_samples is None: - num_samples = 0 - else: - num_samples = self.num_samples - if self.class_indexing is None: class_indexing = dict() else: class_indexing = self.class_indexing - return ManifestOp.get_class_indexing(self.dataset_file, num_samples, class_indexing, self.usage) + return ManifestOp.get_class_indexing(self.dataset_file, class_indexing, self.usage) def is_shuffled(self): if self.shuffle_level is None: @@ -3465,12 +3599,8 @@ class Cifar10Dataset(MappableDataset): Return: Number, number of batches. """ - if self.num_samples is None: - num_samples = 0 - else: - num_samples = self.num_samples - num_rows = CifarOp.get_num_rows(self.dataset_dir, num_samples, True) + num_rows = CifarOp.get_num_rows(self.dataset_dir, True) rows_per_shard = get_num_rows(num_rows, self.num_shards) rows_from_sampler = self._get_sampler_dataset_size() @@ -3589,12 +3719,8 @@ class Cifar100Dataset(MappableDataset): Return: Number, number of batches. """ - if self.num_samples is None: - num_samples = 0 - else: - num_samples = self.num_samples - num_rows = CifarOp.get_num_rows(self.dataset_dir, num_samples, False) + num_rows = CifarOp.get_num_rows(self.dataset_dir, False) rows_per_shard = get_num_rows(num_rows, self.num_shards) rows_from_sampler = self._get_sampler_dataset_size() @@ -3623,7 +3749,7 @@ class RandomDataset(SourceDataset): Args: num_samples (int): number of samples to generate. 
schema (str or Schema, optional): Path to the json schema file or schema object (default=None). - If the schema is not provided, the meta data from the TFRecord file is considered the schema. + If the schema is not provided, the random dataset generates a random schema. columns_list (list[str], optional): List of columns to be read (default=None, read all columns) num_parallel_workers (int, optional): number of workers to read the data (default=None, number set in the config). @@ -3636,9 +3762,12 @@ class RandomDataset(SourceDataset): schema_obj = Schema(schema) # read the schema file and convert to schema object to validate it self.schema = schema self.columns_list = columns_list - self.num_samples = num_samples if schema_obj is not None and num_samples is None: self.num_samples = schema_obj.num_rows + elif num_samples is None: + self.num_samples = 0 + else: + self.num_samples = num_samples def get_args(self): args = super().get_args() @@ -3677,6 +3806,7 @@ class RandomDataset(SourceDataset): def is_sharded(self): return False + class Schema: """ Class to represent a schema of dataset. @@ -3859,10 +3989,14 @@ class VOCDataset(MappableDataset): """ A source dataset for reading and parsing VOC dataset. - The generated dataset has two columns ['image', 'target']. - The shape of both column is [image_size] if decode flag is False, or [H, W, C] + The generated dataset has two columns : + task='Detection' : ['image', 'annotation']. + task='Segmentation' : ['image', 'target'] + The shape of both column 'image' and 'target' is [image_size] if decode flag is False, or [H, W, C] otherwise. - The type of both tensor is uint8. + The type of both tensor 'image' and 'target' is uint8. + The type of tensor 'annotation' is uint32. + This dataset can take in a sampler. sampler and shuffle are mutually exclusive. Table below shows what input args are allowed and their expected behavior. 
@@ -4007,17 +4141,171 @@ class VOCDataset(MappableDataset): if self.task != "Detection": raise NotImplementedError() - if self.num_samples is None: - num_samples = 0 - else: - num_samples = self.num_samples - if self.class_indexing is None: class_indexing = dict() else: class_indexing = self.class_indexing - return VOCOp.get_class_indexing(self.dataset_dir, self.task, self.mode, class_indexing, num_samples) + return VOCOp.get_class_indexing(self.dataset_dir, self.task, self.mode, class_indexing) + + def is_shuffled(self): + if self.shuffle_level is None: + return True + + return self.shuffle_level or self.sampler.is_shuffled() + + def is_sharded(self): + if self.num_shards is not None: + return self.num_shards > 1 + + return self.sampler.is_sharded() + + +class CocoDataset(MappableDataset): + """ + A source dataset for reading and parsing COCO dataset. + + CocoDataset support four kinds of task: + 2017 Train/Val/Test Detection, Keypoints, Stuff, Panoptic. + + The generated dataset has multi-columns : + - task='Detection', column: [['image', dtype=uint8], ['bbox', dtype=float32], ['category_id', dtype=uint32], + ['iscrowd', dtype=uint32]]. + - task='Stuff', column: [['image', dtype=uint8], ['segmentation',dtype=float32], ['iscrowd',dtype=uint32]]. + - task='Keypoint', column: [['image', dtype=uint8], ['keypoints', dtype=float32], + ['num_keypoints', dtype=uint32]]. + - task='Panoptic', column: [['image', dtype=uint8], ['bbox', dtype=float32], ['category_id', dtype=uint32], + ['iscrowd', dtype=uint32], ['area', dtype=uint32]]. + + This dataset can take in a sampler. sampler and shuffle are mutually exclusive. CocoDataset doesn't support + PKSampler. Table below shows what input args are allowed and their expected behavior. + + .. 
list-table:: Expected Order Behavior of Using 'sampler' and 'shuffle' + :widths: 25 25 50 + :header-rows: 1 + + * - Parameter 'sampler' + - Parameter 'shuffle' + - Expected Order Behavior + * - None + - None + - random order + * - None + - True + - random order + * - None + - False + - sequential order + * - Sampler object + - None + - order defined by sampler + * - Sampler object + - True + - not allowed + * - Sampler object + - False + - not allowed + + Args: + dataset_dir (str): Path to the root directory that contains the dataset. + annotation_file (str): Path to the annotation json. + task (str): Set the task type of reading coco data, now support 'Detection'/'Stuff'/'Panoptic'/'Keypoint' + (default='Detection') + num_samples (int, optional): The number of images to be included in the dataset + (default=None, all images). + num_parallel_workers (int, optional): Number of workers to read the data + (default=None, number set in the config). + shuffle (bool, optional): Whether to perform shuffle on the dataset (default=None, expected + order behavior shown in the table). + decode (bool, optional): Decode the images after reading (default=False). + sampler (Sampler, optional): Object used to choose samples from the dataset + (default=None, expected order behavior shown in the table). + num_shards (int, optional): Number of shards that the dataset should be divided + into (default=None). + shard_id (int, optional): The shard ID within num_shards (default=None). This + argument should be specified only when num_shards is also specified. + + Raises: + RuntimeError: If sampler and shuffle are specified at the same time. + RuntimeError: If sampler and sharding are specified at the same time. + RuntimeError: If num_shards is specified but shard_id is None. + RuntimeError: If shard_id is specified but num_shards is None. + RuntimeError: If parse json file failed. + ValueError: If task is not in ['Detection', 'Stuff', 'Panoptic', 'Keypoint']. 
+ ValueError: If annotation_file is not exist. + ValueError: If dataset_dir is not exist. + ValueError: If shard_id is invalid (< 0 or >= num_shards). + + Examples: + >>> import mindspore.dataset as ds + >>> dataset_dir = "/path/to/coco_dataset_directory/image_folder" + >>> annotation_file = "/path/to/coco_dataset_directory/annotation_folder/annotation.json" + >>> # 1) read COCO data for Detection task + >>> coco_dataset = ds.CocoDataset(dataset_dir, annotation_file=annotation_file, task='Detection') + >>> # 2) read COCO data for Stuff task + >>> coco_dataset = ds.CocoDataset(dataset_dir, annotation_file=annotation_file, task='Stuff') + >>> # 3) read COCO data for Panoptic task + >>> coco_dataset = ds.CocoDataset(dataset_dir, annotation_file=annotation_file, task='Panoptic') + >>> # 4) read COCO data for Keypoint task + >>> coco_dataset = ds.CocoDataset(dataset_dir, annotation_file=annotation_file, task='Keypoint') + >>> # in COCO dataset, each dictionary has keys "image" and "annotation" + """ + + @check_cocodataset + def __init__(self, dataset_dir, annotation_file, task="Detection", num_samples=None, num_parallel_workers=None, + shuffle=None, decode=False, sampler=None, num_shards=None, shard_id=None): + super().__init__(num_parallel_workers) + self.dataset_dir = dataset_dir + self.annotation_file = annotation_file + self.task = task + self.sampler = _select_sampler(num_samples, sampler, shuffle, num_shards, shard_id) + self.num_samples = num_samples + self.decode = decode + self.shuffle_level = shuffle + self.num_shards = num_shards + self.shard_id = shard_id + + def get_args(self): + args = super().get_args() + args["dataset_dir"] = self.dataset_dir + args["annotation_file"] = self.annotation_file + args["task"] = self.task + args["num_samples"] = self.num_samples + args["sampler"] = self.sampler + args["decode"] = self.decode + args["shuffle"] = self.shuffle_level + args["num_shards"] = self.num_shards + args["shard_id"] = self.shard_id + return args + + def 
get_dataset_size(self): + """ + Get the number of batches in an epoch. + + Return: + Number, number of batches. + """ + num_rows = CocoOp.get_num_rows(self.dataset_dir, self.annotation_file, self.task) + rows_per_shard = get_num_rows(num_rows, self.num_shards) + rows_from_sampler = self._get_sampler_dataset_size() + + if rows_from_sampler is None: + return rows_per_shard + + return min(rows_from_sampler, rows_per_shard) + + def get_class_indexing(self): + """ + Get the class index. + + Return: + Dict, A str-to-int mapping from label name to index. + """ + if self.task not in {"Detection", "Panoptic"}: + raise NotImplementedError("Only 'Detection' and 'Panoptic' support get_class_indexing.") + + class_index = CocoOp.get_class_indexing(self.dataset_dir, self.annotation_file, self.task) + return dict(class_index) def is_shuffled(self): if self.shuffle_level is None: @@ -4044,7 +4332,7 @@ class CelebADataset(MappableDataset): dataset_dir (str): Path to the root directory that contains the dataset. num_parallel_workers (int, optional): Number of workers to read the data (default=value set in the config). shuffle (bool, optional): Whether to perform shuffle on the dataset (default=None). - dataset_type (string): one of 'all', 'train', 'valid' or 'test'. + dataset_type (str): one of 'all', 'train', 'valid' or 'test'. sampler (Sampler, optional): Object used to choose samples from the dataset (default=None). decode (bool, optional): decode the images after reading (default=False). 
extensions (list[str], optional): List of file extensions to be @@ -4099,7 +4387,9 @@ class CelebADataset(MappableDataset): try: with open(attr_file, 'r') as f: num_rows = int(f.readline()) - except Exception: + except FileNotFoundError: + raise RuntimeError("attr_file not found.") + except BaseException: raise RuntimeError("Get dataset size failed from attribution file.") rows_per_shard = get_num_rows(num_rows, self.num_shards) if self.num_samples is not None: @@ -4123,6 +4413,223 @@ class CelebADataset(MappableDataset): return self.sampler.is_sharded() +class CLUEDataset(SourceDataset): + """ + A source dataset that reads and parses CLUE datasets. + CLUE, the Chinese Language Understanding Evaluation Benchmark, a collection of datasets, baselines, pre-trained + models, corpus and leaderboard. Here we bring in classification task of CLUE, which are AFQMC, TNEWS, IFLYTEK, + CMNLI, WSC and CSL. + + Args: + dataset_files (str or list[str]): String or list of files to be read or glob strings to search for a pattern of + files. The list will be sorted in a lexicographical order. + task (str, optional): The kind of task, one of 'AFQMC', 'TNEWS', 'IFLYTEK', 'CMNLI', 'WSC' and 'CSL'. + (default=AFQMC). + usage (str, optional): Need train, test or eval data (default="train"). + num_samples (int, optional): number of samples(rows) to read (default=None, reads the full dataset). + num_parallel_workers (int, optional): number of workers to read the data + (default=None, number set in the config). + shuffle (bool, Shuffle level, optional): perform reshuffling of the data every epoch (default=Shuffle.GLOBAL). + If shuffle is False, no shuffling will be performed; + If shuffle is True, the behavior is the same as setting shuffle to be Shuffle.GLOBAL + Otherwise, there are two levels of shuffling: + + - Shuffle.GLOBAL: Shuffle both the files and samples. + + - Shuffle.FILES: Shuffle files only. 
+ + num_shards (int, optional): Number of shards that the dataset should be divided into (default=None). + shard_id (int, optional): The shard ID within num_shards (default=None). This + argument should be specified only when num_shards is also specified. + + Examples: + >>> import mindspore.dataset as ds + >>> dataset_files = ["/path/to/1", "/path/to/2"] # contains 1 or multiple text files + >>> dataset = ds.CLUEDataset(dataset_files=dataset_files, task='AFQMC', usage='train') + + """ + + @check_cluedataset + def __init__(self, dataset_files, task='AFQMC', usage='train', num_samples=None, + num_parallel_workers=None, shuffle=Shuffle.GLOBAL, num_shards=None, shard_id=None): + super().__init__(num_parallel_workers) + self.dataset_files = self._find_files(dataset_files) + self.dataset_files.sort() + self.num_samples = num_samples + self.task_dict = { + 'AFQMC': { + 'train': { + 'sentence1': 'sentence1', + 'sentence2': 'sentence2', + 'label': 'label' + }, + 'test': { + 'id': 'id', + 'sentence1': 'sentence1', + 'sentence2': 'sentence2' + }, + 'eval': { + 'sentence1': 'sentence1', + 'sentence2': 'sentence2', + 'label': 'label' + } + }, + 'CMNLI': { + 'train': { + 'sentence1': 'sentence1', + 'sentence2': 'sentence2', + 'label': 'label' + }, + 'test': { + 'id': 'id', + 'sentence1': 'sentence1', + 'sentence2': 'sentence2' + }, + 'eval': { + 'sentence1': 'sentence1', + 'sentence2': 'sentence2', + 'label': 'label' + } + }, + 'CSL': { + 'train': { + 'id': 'id', + 'abst': 'abst', + 'keyword': 'keyword', + 'label': 'label' + }, + 'test': { + 'id': 'id', + 'abst': 'abst', + 'keyword': 'keyword' + }, + 'eval': { + 'id': 'id', + 'abst': 'abst', + 'keyword': 'keyword', + 'label': 'label' + } + }, + 'IFLYTEK': { + 'train': { + 'label': 'label', + 'label_des': 'label_des', + 'sentence': 'sentence' + }, + 'test': { + 'id': 'id', + 'sentence': 'sentence', + }, + 'eval': { + 'label': 'label', + 'label_des': 'label_des', + 'sentence': 'sentence' + } + }, + 'TNEWS': { + 'train': { + 
'label': 'label', + 'label_desc': 'label_desc', + 'sentence': 'sentence', + 'keywords': 'keywords' + }, + 'test': { + 'id': 'id', + 'sentence': 'sentence', + 'keywords': 'keywords' + }, + 'eval': { + 'label': 'label', + 'label_desc': 'label_desc', + 'sentence': 'sentence', + 'keywords': 'keywords' + } + }, + 'WSC': { + 'train': { + 'span1_index': 'target/span1_index', + 'span2_index': 'target/span2_index', + 'span1_text': 'target/span1_text', + 'span2_text': 'target/span2_text', + 'idx': 'idx', + 'label': 'label', + 'text': 'text' + }, + 'test': { + 'span1_index': 'target/span1_index', + 'span2_index': 'target/span2_index', + 'span1_text': 'target/span1_text', + 'span2_text': 'target/span2_text', + 'idx': 'idx', + 'text': 'text' + }, + 'eval': { + 'span1_index': 'target/span1_index', + 'span2_index': 'target/span2_index', + 'span1_text': 'target/span1_text', + 'span2_text': 'target/span2_text', + 'idx': 'idx', + 'label': 'label', + 'text': 'text' + } + } + } + self.cols_to_keyword = self.task_dict[task][usage] + + if not isinstance(shuffle, (bool, Shuffle)): + raise TypeError("shuffle should be of boolean or enum 'Shuffle'.") + if not isinstance(shuffle, Shuffle): + if shuffle: + self.shuffle_level = Shuffle.GLOBAL + self.shuffle_files = True + else: + self.shuffle_level = None + self.shuffle_files = False + else: + self.shuffle_level = shuffle + self.shuffle_files = True + + self.num_shards = num_shards + self.shard_id = shard_id + + def get_args(self): + args = super().get_args() + args["dataset_files"] = self.dataset_files + args["num_samples"] = self.num_samples + if self.shuffle_files is not None: + args["shuffle_files"] = self.shuffle_files + args["shuffle_global"] = (self.shuffle_level == Shuffle.GLOBAL) + args["shuffle"] = self.shuffle_level + args["num_shards"] = self.num_shards + args["shard_id"] = self.shard_id + args["cols_to_keyword"] = self.cols_to_keyword + return args + + def get_dataset_size(self): + """ + Get the number of batches in an epoch. 
+ + Return: + Number, number of batches. + """ + if self._dataset_size is None: + num_rows = ClueOp.get_num_rows(self.dataset_files) + num_rows = get_num_rows(num_rows, self.num_shards) + if self.num_samples is None: + return num_rows + return min(self.num_samples, num_rows) + return self._dataset_size + + def is_shuffled(self): + return self.shuffle_files + + def is_sharded(self): + if self.num_shards is not None: + return self.num_shards > 1 + + return False + + class TextFileDataset(SourceDataset): """ A source dataset that reads and parses datasets stored on disk in text format. @@ -4182,6 +4689,7 @@ class TextFileDataset(SourceDataset): args["num_samples"] = self.num_samples if self.shuffle_files is not None: args["shuffle_files"] = self.shuffle_files + args["shuffle_global"] = (self.shuffle_level == Shuffle.GLOBAL) args["shuffle"] = self.shuffle_level args["num_shards"] = self.num_shards args["shard_id"] = self.shard_id @@ -4197,9 +4705,11 @@ class TextFileDataset(SourceDataset): if self._dataset_size is None: num_rows = TextFileOp.get_num_rows(self.dataset_files) num_rows = get_num_rows(num_rows, self.num_shards) - if self.num_samples is None: - return num_rows - return min(self.num_samples, num_rows) + # If the user gave a num samples in the dataset, then the sampler will limit the rows returned + # to that amount. Account for that here in the row count + if self.num_samples is not None and self.num_samples > 0 and num_rows > self.num_samples: + num_rows = self.num_samples + return num_rows return self._dataset_size def is_shuffled(self): @@ -4210,3 +4720,208 @@ class TextFileDataset(SourceDataset): return self.num_shards > 1 return False + + +class _NumpySlicesDataset: + """ + Mainly for dealing with several kinds of format of python data, and return one row each time. 
+ """ + + def __init__(self, data, column_list=None): + self.column_list = None + # Convert dict data into tuple + if isinstance(data, dict): + data = self.process_dict(data) + + if isinstance(data, tuple): + self.data = () + data_len = len(data) + for i in range(data_len): + self.data = self.data + (np.array(data[i]),) + else: + self.data = (np.array(data),) + + # Init column_name + if column_list is not None: + self.column_list = column_list + elif self.column_list is None: + self.column_list = [] + column_num = len(self.data) + for i in range(column_num): + self.column_list.append("column_" + str(i)) + + def __getitem__(self, index): + data_row = [d[index, ...] for d in self.data] + data_res = tuple(data_row) + return data_res + + def __len__(self): + return len(self.data[0]) + + def process_dict(self, input_data): + """ + Convert the dict like data into tuple format, when input is a tuple of dict then compose it into a dict first. + """ + # Convert pandas like dict(has "values" column) into General dict + data_keys = list(input_data.keys()) + data_col = input_data[data_keys[0]] + if hasattr(data_col, "values"): + new_dict = {} + for key in data_keys: + item1 = input_data.pop(key) + new_dict[key] = item1.values + input_data = new_dict + + # Convert the data in dict into tuple + data = () + keys = list(input_data.keys()) + self.column_list = keys + for key in keys: + value = input_data[key] + data = data + (list(value),) + + return data + + +class NumpySlicesDataset(GeneratorDataset): + """ + Create a dataset with given data slices, mainly for loading python data into dataset. + + This dataset can take in a sampler. sampler and shuffle are mutually exclusive. Table + below shows what input args are allowed and their expected behavior. + + .. 
list-table:: Expected Order Behavior of Using 'sampler' and 'shuffle' + :widths: 25 25 50 + :header-rows: 1 + + * - Parameter 'sampler' + - Parameter 'shuffle' + - Expected Order Behavior + * - None + - None + - random order + * - None + - True + - random order + * - None + - False + - sequential order + * - Sampler object + - None + - order defined by sampler + * - Sampler object + - True + - not allowed + * - Sampler object + - False + - not allowed + + Args: + data (list, tuple or dict) Input of Given data, supported data type includes list, tuple, dict and other numpy + format. Input data will be sliced in first dimension and generate many rows, large data is not recommend to + load in this way as data is loading into memory. + column_names (list[str], optional): List of column names of the dataset (default=None). If column_names not + provided, when data is dict, column_names will be its key, otherwise it will be like column_1, column_2 ... + num_samples (int, optional): The number of samples to be included in the dataset (default=None, all images). + num_parallel_workers (int, optional): Number of subprocesses used to fetch the dataset in parallel (default=1). + shuffle (bool, optional): Whether or not to perform shuffle on the dataset. Random accessible input is required. + (default=None, expected order behavior shown in the table). + sampler (Sampler/Iterable, optional): Object used to choose samples from the dataset. Random accessible input is + required (default=None, expected order behavior shown in the table). + num_shards (int, optional): Number of shards that the dataset should be divided into (default=None). + This argument should be specified only when 'num_samples' is "None". Random accessible input is required. + shard_id (int, optional): The shard ID within num_shards (default=None). This argument should be specified only + when num_shards is also specified. Random accessible input is required. 
+ + Examples: + >>> import mindspore.dataset as ds + >>> # 1) Input data can be a list + >>> data = [1, 2, 3] + >>> dataset1 = ds.NumpySlicesDataset(data, column_names=["column_1"]) + >>> # 2) Input data can be a dict, and column_names will be its key + >>> data = {"a": [1, 2], "b": [3, 4]} + >>> dataset2 = ds.NumpySlicesDataset(data) + >>> # 3) Input data can be a tuple of lists (or numpy arrays), each tuple element refers to data in each column + >>> data = ([1, 2], [3, 4], [5, 6]) + >>> dataset3 = ds.NumpySlicesDataset(data, column_names=["column_1", "column_2", "column_3"]) + >>> # 4) Load data from csv file + >>> import pandas as pd + >>> df = pd.read_csv("file.csv") + >>> dataset4 = ds.NumpySlicesDataset(dict(df), shuffle=False) + """ + + @check_numpyslicesdataset + def __init__(self, data, column_names=None, num_samples=None, num_parallel_workers=1, shuffle=None, + sampler=None, num_shards=None, shard_id=None): + dataset = _NumpySlicesDataset(data, column_names) + super().__init__(dataset, column_names=dataset.column_list, num_samples=num_samples, + num_parallel_workers=num_parallel_workers, shuffle=shuffle, sampler=sampler, + num_shards=num_shards, shard_id=shard_id) + + +class BuildVocabDataset(DatasetOp): + """ + Build a vocab from a dataset. This would collect all the unique words in a dataset and return a vocab + which contains top_k most frequent words (if top_k is specified) + This function is not meant to be called directly by user. To build vocab, please use the function + text.Vocab.from_dataset() + + Args: + vocab(Vocab): text.vocab object. + columns(str or list, optional): column names to get words from. It can be a list of column names (Default is + None, all columns are used, return error if any column isn't string). + freq_range(tuple, optional): A tuple of integers (min_frequency, max_frequency). Words within the frequency + range would be kept. 0 <= min_frequency <= max_frequency <= total_words. 
min_frequency/max_frequency + can be None, which corresponds to 0/total_words separately (default=None, all words are included). + top_k(int, optional): top_k > 0. Number of words to be built into vocab. top_k most frequent words are + taken. The top_k is taken after freq_range. If not enough top_k, all words will be taken (default=None, + all words are included). + special_tokens(list, optional): a list of strings, each one is a special token. for example + special_tokens=["",""] (default=None, no special tokens will be added). + special_first(bool, optional): whether special_tokens will be prepended/appended to vocab, If special_tokens + is specified and special_first is set to None, special_tokens will be prepended. (default=None). + prefetch_size (int, optional): prefetch number of records ahead of the user's request (default=None). + """ + + def __init__(self, input_dataset, vocab, columns, freq_range, top_k, special_tokens, special_first, + prefetch_size=None): + super().__init__() + self.columns = columns + self.children.append(input_dataset) + self.prefetch_size = prefetch_size + self.vocab = vocab + self.freq_range = freq_range + self.top_k = top_k + self.special_tokens = special_tokens + self.special_first = special_first + input_dataset.parent.append(self) + + def get_args(self): + args = super().get_args() + args["columns"] = self.columns + args["vocab"] = self.vocab + args["freq_range"] = self.freq_range + args["prefetch_size"] = self.prefetch_size + args["top_k"] = self.top_k + args["special_tokens"] = self.special_tokens + args["special_first"] = self.special_first + return args + + def __deepcopy__(self, memodict): + if id(self) in memodict: + return memodict[id(self)] + cls = self.__class__ + new_op = cls.__new__(cls) + memodict[id(self)] = new_op + new_op.children = copy.deepcopy(self.children, memodict) + new_op.columns = copy.deepcopy(self.columns, memodict) + new_op.num_parallel_workers = copy.deepcopy(self.num_parallel_workers, memodict) + 
new_op.prefetch_size = copy.deepcopy(self.prefetch_size, memodict) + new_op.parent = copy.deepcopy(self.parent, memodict) + new_op.freq_range = copy.deepcopy(self.freq_range, memodict) + new_op.top_k = copy.deepcopy(self.top_k, memodict) + new_op.vocab = self.vocab + new_op.special_tokens = copy.deepcopy(self.special_tokens) + new_op.special_first = copy.deepcopy(self.special_first) + + return new_op diff --git a/mindspore/dataset/engine/graphdata.py b/mindspore/dataset/engine/graphdata.py index 23f8dbda6a..472819784e 100644 --- a/mindspore/dataset/engine/graphdata.py +++ b/mindspore/dataset/engine/graphdata.py @@ -20,8 +20,9 @@ import numpy as np from mindspore._c_dataengine import Graph from mindspore._c_dataengine import Tensor -from .validators import check_gnn_graphdata, check_gnn_get_all_nodes, check_gnn_get_all_neighbors, \ - check_gnn_get_node_feature +from .validators import check_gnn_graphdata, check_gnn_get_all_nodes, check_gnn_get_all_edges, \ + check_gnn_get_nodes_from_edges, check_gnn_get_all_neighbors, check_gnn_get_sampled_neighbors, \ + check_gnn_get_neg_sampled_neighbors, check_gnn_get_node_feature, check_gnn_random_walk class GraphData: @@ -60,7 +61,44 @@ class GraphData: Raises: TypeError: If `node_type` is not integer. """ - return self._graph.get_nodes(node_type, -1).as_array() + return self._graph.get_all_nodes(node_type).as_array() + + @check_gnn_get_all_edges + def get_all_edges(self, edge_type): + """ + Get all edges in the graph. + + Args: + edge_type (int): Specify the type of edge. + + Returns: + numpy.ndarray: array of edges. + + Examples: + >>> import mindspore.dataset as ds + >>> data_graph = ds.GraphData('dataset_file', 2) + >>> nodes = data_graph.get_all_edges(0) + + Raises: + TypeError: If `edge_type` is not integer. + """ + return self._graph.get_all_edges(edge_type).as_array() + + @check_gnn_get_nodes_from_edges + def get_nodes_from_edges(self, edge_list): + """ + Get nodes from the edges. 
+ + Args: + edge_list (list or numpy.ndarray): The given list of edges. + + Returns: + numpy.ndarray: array of nodes. + + Raises: + TypeError: If `edge_list` is not list or ndarray. + """ + return self._graph.get_nodes_from_edges(edge_list).as_array() @check_gnn_get_all_neighbors def get_all_neighbors(self, node_list, neighbor_type): @@ -86,6 +124,60 @@ class GraphData: """ return self._graph.get_all_neighbors(node_list, neighbor_type).as_array() + @check_gnn_get_sampled_neighbors + def get_sampled_neighbors(self, node_list, neighbor_nums, neighbor_types): + """ + Get sampled neighbor information, maximum support 6-hop sampling. + + Args: + node_list (list or numpy.ndarray): The given list of nodes. + neighbor_nums (list or numpy.ndarray): Number of neighbors sampled per hop. + neighbor_types (list or numpy.ndarray): Neighbor type sampled per hop. + + Returns: + numpy.ndarray: array of nodes. + + Examples: + >>> import mindspore.dataset as ds + >>> data_graph = ds.GraphData('dataset_file', 2) + >>> nodes = data_graph.get_all_nodes(0) + >>> neighbors = data_graph.get_all_neighbors(nodes, [2, 2], [0, 0]) + + Raises: + TypeError: If `node_list` is not list or ndarray. + TypeError: If `neighbor_nums` is not list or ndarray. + TypeError: If `neighbor_types` is not list or ndarray. + """ + return self._graph.get_sampled_neighbors( + node_list, neighbor_nums, neighbor_types).as_array() + + @check_gnn_get_neg_sampled_neighbors + def get_neg_sampled_neighbors(self, node_list, neg_neighbor_num, neg_neighbor_type): + """ + Get `neg_neighbor_type` negative sampled neighbors of the nodes in `node_list`. + + Args: + node_list (list or numpy.ndarray): The given list of nodes. + neg_neighbor_num (int): Number of neighbors sampled. + neg_neighbor_type (int): Specify the type of negative neighbor. + + Returns: + numpy.ndarray: array of nodes. 
+ + Examples: + >>> import mindspore.dataset as ds + >>> data_graph = ds.GraphData('dataset_file', 2) + >>> nodes = data_graph.get_all_nodes(0) + >>> neg_neighbors = data_graph.get_neg_sampled_neighbors(nodes, 5, 0) + + Raises: + TypeError: If `node_list` is not list or ndarray. + TypeError: If `neg_neighbor_num` is not integer. + TypeError: If `neg_neighbor_type` is not integer. + """ + return self._graph.get_neg_sampled_neighbors( + node_list, neg_neighbor_num, neg_neighbor_type).as_array() + @check_gnn_get_node_feature def get_node_feature(self, node_list, feature_types): """ @@ -110,4 +202,51 @@ class GraphData: """ if isinstance(node_list, list): node_list = np.array(node_list, dtype=np.int32) - return [t.as_array() for t in self._graph.get_node_feature(Tensor(node_list), feature_types)] + return [ + t.as_array() for t in self._graph.get_node_feature( + Tensor(node_list), + feature_types)] + + def graph_info(self): + """ + Get the meta information of the graph, including the number of nodes, the type of nodes, + the feature information of nodes, the number of edges, the type of edges, and the feature information of edges. + + Returns: + Dict: Meta information of the graph. The key is node_type, edge_type, node_num, edge_num, + node_feature_type and edge_feature_type. + """ + return self._graph.graph_info() + + @check_gnn_random_walk + def random_walk( + self, + target_nodes, + meta_path, + step_home_param=1.0, + step_away_param=1.0, + default_node=-1): + """ + Random walk in nodes. + + Args: + target_nodes (list[int]): Start node list in random walk + meta_path (list[int]): node type for each walk step + step_home_param (float): return hyper parameter in node2vec algorithm + step_away_param (float): inout hyper parameter in node2vec algorithm + default_node (int): default node if no more neighbors found + + Returns: + numpy.ndarray: array of nodes. 
+ + Examples: + >>> import mindspore.dataset as ds + >>> data_graph = ds.GraphData('dataset_file', 2) + >>> nodes = data_graph.random_walk([1,2], [1,2,1,2,1]) + + Raises: + TypeError: If `target_nodes` is not list or ndarray. + TypeError: If `meta_path` is not list or ndarray. + """ + return self._graph.random_walk(target_nodes, meta_path, step_home_param, step_away_param, + default_node).as_array() diff --git a/mindspore/dataset/engine/iterators.py b/mindspore/dataset/engine/iterators.py index f58db32094..4946fb3252 100644 --- a/mindspore/dataset/engine/iterators.py +++ b/mindspore/dataset/engine/iterators.py @@ -38,43 +38,24 @@ def _cleanup(): def alter_tree(node): """Traversing the python Dataset tree/graph to perform some alteration to some specific nodes.""" - if not node.input: + if not node.children: return _alter_node(node) converted_children = [] - for input_op in node.input: + for input_op in node.children: converted_children.append(alter_tree(input_op)) - node.input = converted_children + node.children = converted_children return _alter_node(node) def _alter_node(node): - """Performing some alteration to a dataset node. A common alteration is to insert a node.""" - if isinstance(node, (de.TFRecordDataset, de.TextFileDataset)) and node.shuffle_level == de.Shuffle.GLOBAL: - # Remove the connection between the parent's node to the current node because we are inserting a node. - if node.output: - node.output.pop() - # Perform a fast scan for average rows per file - if isinstance(node, de.TFRecordDataset): - avg_rows_per_file = node.get_dataset_size(True) // len(node.dataset_files) - else: - avg_rows_per_file = node.get_dataset_size() // len(node.dataset_files) - - # Shuffle between 4 files with a minimum size of 10000 rows - new_shuffle = node.shuffle(max(avg_rows_per_file * 4, 10000)) - return new_shuffle - + """DEPRECATED""" + # Please check ccsrc/dataset/engine/opt for tree transformation. 
if isinstance(node, de.MapDataset): if node.python_multiprocessing: # Bootstrap can only be performed on a copy of the original dataset node. # Bootstrap on original dataset node will make all iterators share the same process pool node.iterator_bootstrap() - if node.columns_order is not None: - # Remove the connection between the parent's node to the current node because we are inserting a node. - if node.output: - node.output.pop() - - return node.project(node.columns_order) return node @@ -105,14 +86,14 @@ class Iterator: def __is_tree_node(self, node): """Check if a node is tree node.""" - if not node.input: - if len(node.output) > 1: + if not node.children: + if len(node.parent) > 1: return False - if len(node.output) > 1: + if len(node.parent) > 1: return False - for input_node in node.input: + for input_node in node.children: cls = self.__is_tree_node(input_node) if not cls: return False @@ -131,6 +112,8 @@ class Iterator: op_type = OpName.MINDRECORD elif isinstance(dataset, de.BatchDataset): op_type = OpName.BATCH + elif isinstance(dataset, de.BucketBatchByLengthDataset): + op_type = OpName.BUCKETBATCH elif isinstance(dataset, de.SyncWaitDataset): op_type = OpName.BARRIER elif isinstance(dataset, de.ZipDataset): @@ -165,6 +148,8 @@ class Iterator: op_type = OpName.MANIFEST elif isinstance(dataset, de.VOCDataset): op_type = OpName.VOC + elif isinstance(dataset, de.CocoDataset): + op_type = OpName.COCO elif isinstance(dataset, de.Cifar10Dataset): op_type = OpName.CIFAR10 elif isinstance(dataset, de.Cifar100Dataset): @@ -175,6 +160,10 @@ class Iterator: op_type = OpName.RANDOMDATA elif isinstance(dataset, de.TextFileDataset): op_type = OpName.TEXTFILE + elif isinstance(dataset, de.BuildVocabDataset): + op_type = OpName.BUILDVOCAB + elif isinstance(dataset, de.CLUEDataset): + op_type = OpName.CLUE else: raise ValueError("Unsupported DatasetOp") @@ -185,7 +174,7 @@ class Iterator: op_type = self.__get_dataset_type(node) c_node = 
self.depipeline.AddNodeToTree(op_type, node.get_args()) - for py_child in node.input: + for py_child in node.children: c_child = self.__convert_node_postorder(py_child) self.depipeline.AddChildToParentNode(c_child, c_node) @@ -195,7 +184,7 @@ class Iterator: """Recursively get batch node in the dataset tree.""" if isinstance(dataset, de.BatchDataset): return - for input_op in dataset.input: + for input_op in dataset.children: self.__batch_node(input_op, level + 1) @staticmethod @@ -205,11 +194,11 @@ class Iterator: ptr = hex(id(dataset)) for _ in range(level): logger.info("\t", end='') - if not dataset.input: + if not dataset.children: logger.info("-%s (%s)", name, ptr) else: logger.info("+%s (%s)", name, ptr) - for input_op in dataset.input: + for input_op in dataset.children: Iterator.__print_local(input_op, level + 1) def print(self): diff --git a/mindspore/dataset/engine/samplers.py b/mindspore/dataset/engine/samplers.py index 8951a1c4a0..b74874f9cf 100644 --- a/mindspore/dataset/engine/samplers.py +++ b/mindspore/dataset/engine/samplers.py @@ -22,7 +22,6 @@ User can also define custom sampler by extending from Sampler class. import numpy as np import mindspore._c_dataengine as cde - class Sampler: """ Base class for user defined sampler. @@ -44,10 +43,10 @@ class Sampler: >>> ds = ds.ImageFolderDatasetV2(path, sampler=ReverseSampler()) """ - def __init__(self): + def __init__(self, num_samples=None): self.dataset_size = 0 - self.num_samples = 0 self.child_sampler = None + self.num_samples = num_samples def __iter__(self): """ @@ -84,7 +83,8 @@ class Sampler: # Instance fetcher # Do not override this method! 
def create(self): - c_sampler = cde.PythonSampler(self) + num_samples = self.num_samples if self.num_samples is not None else 0 + c_sampler = cde.PythonSampler(num_samples, self) c_child_sampler = self.create_child() c_sampler.add_child(c_child_sampler) return c_sampler @@ -114,7 +114,9 @@ class Sampler: return self.child_sampler.is_sharded() - def get_dataset_size(self): + def get_num_samples(self): + if self.num_samples is None: + return None return self._get_indices().size @@ -124,8 +126,9 @@ class BuiltinSampler: User should not extend this class. """ - def __init__(self): + def __init__(self, num_samples=None): self.child_sampler = None + self.num_samples = num_samples def create(self): pass @@ -140,7 +143,12 @@ class BuiltinSampler: c_child_sampler = None if self.child_sampler is not None: c_child_sampler = self.child_sampler.create() + return c_child_sampler + def create_child_for_minddataset(self): + c_child_sampler = None + if self.child_sampler is not None: + c_child_sampler = self.child_sampler.create_for_minddataset() return c_child_sampler def is_shuffled(self): @@ -149,11 +157,61 @@ class BuiltinSampler: def is_sharded(self): raise NotImplementedError("Sampler must implement is_sharded.") - def get_dataset_size(self): + def get_num_samples(self): + """ + All samplers can contain a numeric num_samples value (or it could be set to None). + Child sampler can exist or be None. + if child sampler exists, then the child sampler count can be a numeric value or None. + Given these conditions, we need to output what the sampler count is for this sampler. + The following table shows the possible results from calling this function. + + .. 
list-table:: + :widths: 25 25 25 25 + :header-rows: 1 + + * - child sampler + - num_samples + - child_samples + - result + * - T + - x + - y + - min(x, y) + * - T + - x + - None + - x + * - T + - None + - y + - y + * - T + - None + - None + - None + * - None + - x + - n/a + - x + * - None + - None + - n/a + - None + + Returns: + int, The number of samples, or None + """ if self.child_sampler is not None: - return self.child_sampler.get_dataset_size() + child_samples = self.child_sampler.get_num_samples() + if self.num_samples is not None: + if child_samples is not None: + return min(self.num_samples, child_samples) + + return self.num_samples - return None + return child_samples + + return self.num_samples class DistributedSampler(BuiltinSampler): @@ -164,6 +222,7 @@ class DistributedSampler(BuiltinSampler): num_shards (int): Number of shards to divide the dataset into. shard_id (int): Shard ID of the current shard within num_shards. shuffle (bool, optional): If true, the indices are shuffled (default=True). + num_samples (int, optional): The number of samples to draw (default=None, all elements). Examples: >>> import mindspore.dataset as ds @@ -180,7 +239,7 @@ class DistributedSampler(BuiltinSampler): ValueError: If shuffle is not a boolean value. 
""" - def __init__(self, num_shards, shard_id, shuffle=True): + def __init__(self, num_shards, shard_id, shuffle=True, num_samples=None): if num_shards <= 0: raise ValueError("num_shards should be a positive integer value, but got num_shards={}".format(num_shards)) @@ -190,20 +249,32 @@ class DistributedSampler(BuiltinSampler): if not isinstance(shuffle, bool): raise ValueError("shuffle should be a boolean value, but got shuffle={}".format(shuffle)) + if num_samples is not None: + if num_samples <= 0: + raise ValueError("num_samples should be a positive integer " + "value, but got num_samples={}".format(num_samples)) + self.num_shards = num_shards self.shard_id = shard_id self.shuffle = shuffle self.seed = 0 - super().__init__() + super().__init__(num_samples) def create(self): + num_samples = self.num_samples if self.num_samples is not None else 0 # each time user calls create_dict_iterator() (to do repeat) sampler would get a different seed to shuffle self.seed += 1 - c_sampler = cde.DistributedSampler(self.num_shards, self.shard_id, self.shuffle, self.seed) + c_sampler = cde.DistributedSampler(num_samples, self.num_shards, self.shard_id, self.shuffle, self.seed) c_child_sampler = self.create_child() c_sampler.add_child(c_child_sampler) return c_sampler + def create_for_minddataset(self): + c_sampler = cde.MindrecordDistributedSampler(self.num_shards, self.shard_id, self.shuffle, self.seed) + c_child_sampler = self.create_child_for_minddataset() + c_sampler.add_child(c_child_sampler) + return c_sampler + def is_shuffled(self): if self.child_sampler is None: return self.shuffle @@ -226,6 +297,7 @@ class PKSampler(BuiltinSampler): num_class (int, optional): Number of classes to sample (default=None, all classes). shuffle (bool, optional): If true, the class IDs are shuffled (default=False). class_column (str, optional): Name of column to classify dataset(default='label'), for MindDataset. 
+ num_samples (int, optional): The number of samples to draw (default=None, all elements). Examples: >>> import mindspore.dataset as ds @@ -242,23 +314,29 @@ class PKSampler(BuiltinSampler): ValueError: If shuffle is not boolean. """ - def __init__(self, num_val, num_class=None, shuffle=False, class_column='label'): + def __init__(self, num_val, num_class=None, shuffle=False, class_column='label', num_samples=None): if num_val <= 0: raise ValueError("num_val should be a positive integer value, but got num_val={}".format(num_val)) if num_class is not None: - raise NotImplementedError + raise NotImplementedError("Not support specify num_class") if not isinstance(shuffle, bool): raise ValueError("shuffle should be a boolean value, but got shuffle={}".format(shuffle)) + if num_samples is not None: + if num_samples <= 0: + raise ValueError("num_samples should be a positive integer " + "value, but got num_samples={}".format(num_samples)) + self.num_val = num_val self.shuffle = shuffle - self.class_column = class_column # work for minddataset - super().__init__() + self.class_column = class_column # work for minddataset + super().__init__(num_samples) def create(self): - c_sampler = cde.PKSampler(self.num_val, self.shuffle) + num_samples = self.num_samples if self.num_samples is not None else 0 + c_sampler = cde.PKSampler(num_samples, self.num_val, self.shuffle) c_child_sampler = self.create_child() c_sampler.add_child(c_child_sampler) return c_sampler @@ -275,12 +353,14 @@ class PKSampler(BuiltinSampler): return self.child_sampler.is_sharded() - def _create_for_minddataset(self): + def create_for_minddataset(self): if not self.class_column or not isinstance(self.class_column, str): raise ValueError("class_column should be a not empty string value, \ but got class_column={}".format(class_column)) - return cde.MindrecordPkSampler(self.num_val, self.class_column, self.shuffle) - + c_sampler = cde.MindrecordPkSampler(self.num_val, self.class_column, self.shuffle) + 
c_child_sampler = self.create_child_for_minddataset() + c_sampler.add_child(c_child_sampler) + return c_sampler class RandomSampler(BuiltinSampler): """ @@ -315,59 +395,25 @@ class RandomSampler(BuiltinSampler): self.deterministic = False self.replacement = replacement - self.num_samples = num_samples self.reshuffle_each_epoch = True - super().__init__() + super().__init__(num_samples) def create(self): - c_sampler = None - if self.num_samples is None: - c_sampler = cde.RandomSampler(self.replacement, self.reshuffle_each_epoch) - else: - c_sampler = cde.RandomSampler(self.replacement, self.reshuffle_each_epoch, self.num_samples) - + num_samples = self.num_samples if self.num_samples is not None else 0 + c_sampler = cde.RandomSampler(num_samples, self.replacement, self.reshuffle_each_epoch) c_child_sampler = self.create_child() c_sampler.add_child(c_child_sampler) return c_sampler - def is_shuffled(self): - return True - - def is_sharded(self): - if self.child_sampler is None: - return False - - return self.child_sampler.is_sharded() - - def get_dataset_size(self): - return self.num_samples - - -class SequentialSampler(BuiltinSampler): - """ - Samples the dataset elements sequentially, same as not having a sampler. 
- - Examples: - >>> import mindspore.dataset as ds - >>> - >>> dataset_dir = "path/to/imagefolder_directory" - >>> - >>> # creates a SequentialSampler - >>> sampler = ds.SequentialSampler() - >>> data = ds.ImageFolderDatasetV2(dataset_dir, num_parallel_workers=8, sampler=sampler) - """ - - def create(self): - c_sampler = cde.SequentialSampler() - c_child_sampler = self.create_child() + def create_for_minddataset(self): + num_samples = self.num_samples if self.num_samples is not None else 0 + c_sampler = cde.MindrecordRandomSampler(num_samples, self.replacement, self.reshuffle_each_epoch) + c_child_sampler = self.create_child_for_minddataset() c_sampler.add_child(c_child_sampler) return c_sampler def is_shuffled(self): - if self.child_sampler is None: - return False - - return self.child_sampler.is_shuffled() + return True def is_sharded(self): if self.child_sampler is None: @@ -376,51 +422,54 @@ class SequentialSampler(BuiltinSampler): return self.child_sampler.is_sharded() -class SubsetSampler(BuiltinSampler): +class SequentialSampler(BuiltinSampler): """ - Samples a subset of elements consecutively from a given index. + Samples the dataset elements sequentially, same as not having a sampler. Args: - start_index (int): Index to start sampling at. - subset_size (int): How many samples to include in this subset. + start_index (int, optional): Index to start sampling at. (dafault=None starts at first id) + num_samples (int, optional): Number of elements to sample (default=None, all elements). Examples: >>> import mindspore.dataset as ds >>> >>> dataset_dir = "path/to/imagefolder_directory" >>> - >>> # creates a SubsetSampler, will sample the next 5 images from the 100th image. - >>> sampler = ds.SubsetSampler(100, 5) + >>> # creates a SequentialSampler + >>> sampler = ds.SequentialSampler() >>> data = ds.ImageFolderDatasetV2(dataset_dir, num_parallel_workers=8, sampler=sampler) - - Raises: - ValueError: If start_index is not a positive int. 
- ValueError: If subset_size is not a positive int. """ - def __init__(self, start_index, subset_size): - if not isinstance(start_index, int): - raise ValueError("start_index should be an int.") - - if start_index < 0: - raise ValueError("start_index should not be negative.") - - if not isinstance(subset_size, int): - raise ValueError("start_index should be an int") + def __init__(self, start_index=None, num_samples=None): + if num_samples is not None: + if num_samples <= 0: + raise ValueError("num_samples should be a positive integer " + "value, but got num_samples={}".format(num_samples)) - if subset_size < 0: - raise ValueError("subset_size should not be negative.") + if start_index is not None: + if start_index < 0: + raise ValueError("start_index should be a positive integer " + "value or 0, but got start_index={}".format(start_index)) self.start_index = start_index - self.subset_size = subset_size - super().__init__() + super().__init__(num_samples) def create(self): - c_sampler = cde.SubsetSampler(self.start_index, self.subset_size) + start_index = self.start_index if self.start_index is not None else 0 + num_samples = self.num_samples if self.num_samples is not None else 0 + c_sampler = cde.SequentialSampler(num_samples, start_index) c_child_sampler = self.create_child() c_sampler.add_child(c_child_sampler) return c_sampler + def create_for_minddataset(self): + start_index = self.start_index if self.start_index is not None else 0 + num_samples = self.num_samples if self.num_samples is not None else 0 + c_sampler = cde.MindrecordSequentialSampler(num_samples, start_index) + c_child_sampler = self.create_child_for_minddataset() + c_sampler.add_child(c_child_sampler) + return c_sampler + def is_shuffled(self): if self.child_sampler is None: return False @@ -433,9 +482,6 @@ class SubsetSampler(BuiltinSampler): return self.child_sampler.is_sharded() - def get_dataset_size(self): - return self.subset_size - class SubsetRandomSampler(BuiltinSampler): """ @@ -443,6 
+489,7 @@ class SubsetRandomSampler(BuiltinSampler): Args: indices (list[int]): A sequence of indices. + num_samples (int, optional): Number of elements to sample (default=None, all elements). Examples: >>> import mindspore.dataset as ds @@ -456,15 +503,21 @@ class SubsetRandomSampler(BuiltinSampler): >>> data = ds.ImageFolderDatasetV2(dataset_dir, num_parallel_workers=8, sampler=sampler) """ - def __init__(self, indices): + def __init__(self, indices, num_samples=None): + if num_samples is not None: + if num_samples <= 0: + raise ValueError("num_samples should be a positive integer " + "value, but got num_samples={}".format(num_samples)) + if not isinstance(indices, list): indices = [indices] self.indices = indices - super().__init__() + super().__init__(num_samples) def create(self): - c_sampler = cde.SubsetRandomSampler(self.indices) + num_samples = self.num_samples if self.num_samples is not None else 0 + c_sampler = cde.SubsetRandomSampler(num_samples, self.indices) c_child_sampler = self.create_child() c_sampler.add_child(c_child_sampler) return c_sampler @@ -478,12 +531,18 @@ class SubsetRandomSampler(BuiltinSampler): return self.child_sampler.is_sharded() - def _create_for_minddataset(self): - return cde.MindrecordSubsetRandomSampler(self.indices) + def create_for_minddataset(self): + c_sampler = cde.MindrecordSubsetRandomSampler(self.indices) + c_child_sampler = self.create_child_for_minddataset() + c_sampler.add_child(c_child_sampler) + return c_sampler + def get_num_samples(self): + num_samples = super().get_num_samples() + if num_samples is None: + return len(self.indices) - def get_dataset_size(self): - return len(self.indices) + return min(len(self.indices), num_samples) class WeightedRandomSampler(BuiltinSampler): @@ -492,8 +551,8 @@ class WeightedRandomSampler(BuiltinSampler): Args: weights (list[float]): A sequence of weights, not necessarily summing up to 1. - num_samples (int): Number of elements to sample. 
- replacement (bool, optional): If True, put the sample ID back for the next draw (default=True). + num_samples (int, optional): Number of elements to sample (default=None, all elements). + replacement (bool): If True, put the sample ID back for the next draw (default=True). Examples: >>> import mindspore.dataset as ds @@ -511,24 +570,25 @@ class WeightedRandomSampler(BuiltinSampler): ValueError: If replacement is not boolean. """ - def __init__(self, weights, num_samples, replacement=True): + def __init__(self, weights, num_samples=None, replacement=True): if not isinstance(weights, list): weights = [weights] - if num_samples <= 0: - raise ValueError("num_samples should be a positive integer " - "value, but got num_samples={}".format(num_samples)) + if num_samples is not None: + if num_samples <= 0: + raise ValueError("num_samples should be a positive integer " + "value, but got num_samples={}".format(num_samples)) if not isinstance(replacement, bool): raise ValueError("replacement should be a boolean value, but got replacement={}".format(replacement)) self.weights = weights - self.num_samples = num_samples self.replacement = replacement - super().__init__() + super().__init__(num_samples) def create(self): - c_sampler = cde.WeightedRandomSampler(self.weights, self.num_samples, self.replacement) + num_samples = self.num_samples if self.num_samples is not None else 0 + c_sampler = cde.WeightedRandomSampler(num_samples, self.weights, self.replacement) c_child_sampler = self.create_child() c_sampler.add_child(c_child_sampler) return c_sampler @@ -541,6 +601,3 @@ class WeightedRandomSampler(BuiltinSampler): return False return self.child_sampler.is_sharded() - - def get_dataset_size(self): - return self.num_samples diff --git a/mindspore/dataset/engine/serializer_deserializer.py b/mindspore/dataset/engine/serializer_deserializer.py index 688ef16753..833f660f16 100644 --- a/mindspore/dataset/engine/serializer_deserializer.py +++ 
b/mindspore/dataset/engine/serializer_deserializer.py @@ -156,17 +156,37 @@ def traverse(node): serialize_operations(node_repr, k, v) elif k == 'sampler': serialize_sampler(node_repr, v) + elif k == 'padded_sample' and v: + v1 = {key: value for key, value in v.items() if not isinstance(value, bytes)} + node_repr[k] = json.dumps(v1, indent=2) + # return schema json str if its type is mindspore.dataset.Schema + elif k == 'schema' and isinstance(v, de.Schema): + node_repr[k] = v.to_json() elif k in set(['schema', 'dataset_files', 'dataset_dir', 'schema_file_path']): expand_path(node_repr, k, v) else: node_repr[k] = v + # If a sampler exists in this node, then the following 4 arguments must be set to None: + # num_samples, shard_id, num_shards, shuffle + # These arguments get moved into the sampler itself, so they are no longer needed to + # be set at the dataset level. + if 'sampler' in node_args.keys(): + if 'num_samples' in node_repr.keys(): + node_repr['num_samples'] = None + if 'shuffle' in node_repr.keys(): + node_repr['shuffle'] = None + if 'num_shards' in node_repr.keys(): + node_repr['num_shards'] = None + if 'shard_id' in node_repr.keys(): + node_repr['shard_id'] = None + # Leaf node doesn't have input attribute. - if not node.input: + if not node.children: return node_repr # Recursively traverse the child and assign it to the current node_repr['children']. - for child in node.input: + for child in node.children: node_repr["children"].append(traverse(child)) return node_repr @@ -206,11 +226,11 @@ def construct_pipeline(node): # Instantiate python Dataset object based on the current dictionary element dataset = create_node(node) # Initially it is not connected to any other object. - dataset.input = [] + dataset.children = [] # Construct the children too and add edge between the children and parent. 
for child in node['children']: - dataset.input.append(construct_pipeline(child)) + dataset.children.append(construct_pipeline(child)) return dataset @@ -285,6 +305,12 @@ def create_node(node): node.get('num_samples'), node.get('num_parallel_workers'), node.get('shuffle'), node.get('decode'), sampler, node.get('num_shards'), node.get('shard_id')) + elif dataset_op == 'CocoDataset': + sampler = construct_sampler(node.get('sampler')) + pyobj = pyclass(node['dataset_dir'], node.get('annotation_file'), node.get('task'), node.get('num_samples'), + node.get('num_parallel_workers'), node.get('shuffle'), node.get('decode'), sampler, + node.get('num_shards'), node.get('shard_id')) + elif dataset_op == 'CelebADataset': sampler = construct_sampler(node.get('sampler')) pyobj = pyclass(node['dataset_dir'], node.get('num_parallel_workers'), node.get('shuffle'), diff --git a/mindspore/dataset/engine/validators.py b/mindspore/dataset/engine/validators.py index 049931c80e..005f7072aa 100644 --- a/mindspore/dataset/engine/validators.py +++ b/mindspore/dataset/engine/validators.py @@ -19,10 +19,12 @@ import inspect as ins import os from functools import wraps from multiprocessing import cpu_count + import numpy as np from mindspore._c_expression import typing -from . import samplers + from . import datasets +from . 
import samplers INT32_MAX = 2147483647 valid_detype = [ @@ -31,169 +33,6 @@ valid_detype = [ ] -def check(method): - """Check the function parameters and return the function .""" - func_name = method.__name__ - # Required parameter - req_param_int = [] - req_param_bool = [] - # Non-required parameter - nreq_param_int = [] - nreq_param_bool = [] - - if func_name in 'repeat': - nreq_param_int = ['count', 'prefetch_size'] - - if func_name in 'take': - req_param_int = ['count'] - nreq_param_int = ['prefetch_size'] - - elif func_name in 'shuffle': - req_param_int = ['buffer_size'] - nreq_param_bool = ['reshuffle_each_iteration'] - nreq_param_int = ['prefetch_size', 'seed'] - - elif func_name in 'batch': - req_param_int = ['batch_size'] - nreq_param_int = ['num_parallel_workers', 'prefetch_size'] - nreq_param_bool = ['drop_remainder'] - - elif func_name in ('zip', 'filter', 'cache', 'rename', 'project'): - nreq_param_int = ['prefetch_size'] - - elif func_name in ('map', '__init__'): - nreq_param_int = ['num_parallel_workers', 'prefetch_size', 'seed'] - nreq_param_bool = ['block_reader'] - - @wraps(method) - def wrapper(*args, **kwargs): - - def _make_key(): - sig = ins.signature(method) - params = sig.parameters - keys = list(params.keys()) - param_dic = dict() - for name, value in enumerate(args): - param_dic[keys[name]] = value - param_dic.update(zip(params.keys(), args)) - param_dic.update(kwargs) - - for name, value in params.items(): - if name not in param_dic: - param_dic[name] = value.default - return param_dic - - # check type - def _check_param_type(arg, param_name, param_type=None): - if param_type is not None and not isinstance(arg, param_type): - raise ValueError( - "The %s function %s type error!" % (func_name, param_name)) - - # check range - def _check_param_range(arg, param_name): - if isinstance(arg, int) and param_name == "seed" and ( - arg < 0 or arg > 2147483647): - raise ValueError( - "The %s function %s exceeds the boundary!" 
% ( - func_name, param_name)) - if isinstance(arg, int) and param_name == "count" and ((arg <= 0 and arg != -1) or arg > 2147483647): - raise ValueError( - "The %s function %s exceeds the boundary!" % ( - func_name, param_name)) - if isinstance(arg, int) and param_name == "prefetch_size" and ( - arg <= 0 or arg > 1024): - raise ValueError( - "The %s function %s exceeds the boundary!" % ( - func_name, param_name)) - if isinstance(arg, int) and param_name == "num_parallel_workers" and ( - arg < 1 or arg > cpu_count()): - raise ValueError( - "The %s function %s exceeds the boundary(%s)!" % ( - func_name, param_name, cpu_count())) - if isinstance(arg, int) and param_name != "seed" \ - and param_name != "count" and param_name != "prefetch_size" \ - and param_name != "num_parallel_workers" and (arg < 1 or arg > 2147483647): - raise ValueError( - "The %s function %s exceeds the boundary!" % ( - func_name, param_name)) - - key = _make_key() - # check integer - for karg in req_param_int: - _check_param_type(key[karg], karg, int) - _check_param_range(key[karg], karg) - for karg in nreq_param_int: - if karg in key: - if key[karg] is not None: - _check_param_type(key[karg], karg, int) - _check_param_range(key[karg], karg) - # check bool - for karg in req_param_bool: - _check_param_type(key[karg], karg, bool) - for karg in nreq_param_bool: - if karg in key: - if key[karg] is not None: - _check_param_type(key[karg], karg, bool) - - if func_name in '__init__': - if 'columns_list' in key.keys(): - columns_list = key['columns_list'] - if columns_list is not None: - _check_param_type(columns_list, 'columns_list', list) - - if 'columns' in key.keys(): - columns = key['columns'] - if columns is not None: - _check_param_type(columns, 'columns', list) - - if 'partitions' in key.keys(): - partitions = key['partitions'] - if partitions is not None: - _check_param_type(partitions, 'partitions', list) - - if 'schema' in key.keys(): - schema = key['schema'] - if schema is not None: - 
check_filename(schema) - if not os.path.isfile(schema) or not os.access(schema, os.R_OK): - raise ValueError( - "The file %s does not exist or permission denied!" % schema) - - if 'dataset_dir' in key.keys(): - dataset_dir = key['dataset_dir'] - if dataset_dir is not None: - if not os.path.isdir(dataset_dir) or not os.access(dataset_dir, os.R_OK): - raise ValueError( - "The folder %s does not exist or permission denied!" % dataset_dir) - - if 'dataset_files' in key.keys(): - dataset_files = key['dataset_files'] - if not dataset_files: - raise ValueError( - "The dataset file does not exists!") - if dataset_files is not None: - _check_param_type(dataset_files, 'dataset_files', list) - for file in dataset_files: - if not os.path.isfile(file) or not os.access(file, os.R_OK): - raise ValueError( - "The file %s does not exist or permission denied!" % file) - - if 'dataset_file' in key.keys(): - dataset_file = key['dataset_file'] - if not dataset_file: - raise ValueError( - "The dataset file does not exists!") - check_filename(dataset_file) - if dataset_file is not None: - if not os.path.isfile(dataset_file) or not os.access(dataset_file, os.R_OK): - raise ValueError( - "The file %s does not exist or permission denied!" 
% dataset_file) - - return method(*args, **kwargs) - - return wrapper - - def check_valid_detype(type_): if type_ not in valid_detype: raise ValueError("Unknown column type") @@ -211,7 +50,7 @@ def check_filename(path): Exception: when error """ if not isinstance(path, str): - raise ValueError("path: {} is not string".format(path)) + raise TypeError("path: {} is not string".format(path)) filename = os.path.basename(path) # '#', ':', '|', ' ', '}', '"', '+', '!', ']', '[', '\\', '`', @@ -283,8 +122,8 @@ def check_num_parallel_workers(value): def check_num_samples(value): check_type(value, 'num_samples', int) - if value <= 0: - raise ValueError("num_samples must be greater than 0!") + if value < 0: + raise ValueError("num_samples cannot be less than 0!") def check_dataset_dir(dataset_dir): @@ -304,7 +143,7 @@ def check_sampler_shuffle_shard_options(param_dict): num_shards, shard_id = param_dict.get('num_shards'), param_dict.get('shard_id') if sampler is not None and not isinstance(sampler, (samplers.BuiltinSampler, samplers.Sampler)): - raise ValueError("sampler is not a valid Sampler type.") + raise TypeError("sampler is not a valid Sampler type.") if sampler is not None: if shuffle is not None: @@ -323,6 +162,27 @@ def check_sampler_shuffle_shard_options(param_dict): raise RuntimeError("shard_id is specified but num_shards is not.") +def check_padding_options(param_dict): + """ check for valid padded_sample and num_padded of padded samples""" + columns_list = param_dict.get('columns_list') + block_reader = param_dict.get('block_reader') + padded_sample, num_padded = param_dict.get('padded_sample'), param_dict.get('num_padded') + if padded_sample is not None: + if num_padded is None: + raise RuntimeError("padded_sample is specified and requires num_padded as well.") + if num_padded < 0: + raise ValueError("num_padded is invalid, num_padded={}.".format(num_padded)) + if columns_list is None: + raise RuntimeError("padded_sample is specified and requires columns_list 
as well.") + for column in columns_list: + if column not in padded_sample: + raise ValueError("padded_sample cannot match columns_list.") + if block_reader: + raise RuntimeError("block_reader and padded_sample cannot be specified at the same time.") + + if padded_sample is None and num_padded is not None: + raise RuntimeError("num_padded is specified but padded_sample is not.") + def check_imagefolderdatasetv2(method): """A wrapper that wrap a parameter checker to the original Dataset(ImageFolderDatasetV2).""" @@ -468,13 +328,13 @@ def check_vocdataset(method): if task is None: raise ValueError("task is not provided.") if not isinstance(task, str): - raise ValueError("task is not str type.") + raise TypeError("task is not str type.") # check mode; required argument mode = param_dict.get('mode') if mode is None: raise ValueError("mode is not provided.") if not isinstance(mode, str): - raise ValueError("mode is not str type.") + raise TypeError("mode is not str type.") imagesets_file = "" if task == "Segmentation": @@ -501,6 +361,52 @@ def check_vocdataset(method): return new_method +def check_cocodataset(method): + """A wrapper that wrap a parameter checker to the original Dataset(CocoDataset).""" + + @wraps(method) + def new_method(*args, **kwargs): + param_dict = make_param_dict(method, args, kwargs) + + nreq_param_int = ['num_samples', 'num_parallel_workers', 'num_shards', 'shard_id'] + nreq_param_bool = ['shuffle', 'decode'] + + # check dataset_dir; required argument + dataset_dir = param_dict.get('dataset_dir') + if dataset_dir is None: + raise ValueError("dataset_dir is not provided.") + check_dataset_dir(dataset_dir) + + # check annotation_file; required argument + annotation_file = param_dict.get('annotation_file') + if annotation_file is None: + raise ValueError("annotation_file is not provided.") + check_dataset_file(annotation_file) + + # check task; required argument + task = param_dict.get('task') + if task is None: + raise ValueError("task is not 
provided.") + if not isinstance(task, str): + raise TypeError("task is not str type.") + + if task not in {'Detection', 'Stuff', 'Panoptic', 'Keypoint'}: + raise ValueError("Invalid task type") + + check_param_type(nreq_param_int, param_dict, int) + + check_param_type(nreq_param_bool, param_dict, bool) + + sampler = param_dict.get('sampler') + if sampler is not None and isinstance(sampler, samplers.PKSampler): + raise ValueError("CocoDataset doesn't support PKSampler") + check_sampler_shuffle_shard_options(param_dict) + + return method(*args, **kwargs) + + return new_method + + def check_celebadataset(method): """A wrapper that wrap a parameter checker to the original Dataset(CelebADataset).""" @@ -549,9 +455,10 @@ def check_minddataset(method): def new_method(*args, **kwargs): param_dict = make_param_dict(method, args, kwargs) - nreq_param_int = ['num_samples', 'num_parallel_workers', 'seed', 'num_shards', 'shard_id'] + nreq_param_int = ['num_samples', 'num_parallel_workers', 'seed', 'num_shards', 'shard_id', 'num_padded'] nreq_param_list = ['columns_list'] nreq_param_bool = ['block_reader'] + nreq_param_dict = ['padded_sample'] # check dataset_file; required argument dataset_file = param_dict.get('dataset_file') @@ -569,12 +476,11 @@ def check_minddataset(method): check_param_type(nreq_param_bool, param_dict, bool) - num_shards, shard_id = param_dict.get('num_shards'), param_dict.get('shard_id') - if (num_shards is not None and shard_id is None) or (num_shards is None and shard_id is not None): - raise ValueError("num_shards and shard_id need to be set or not set at the same time") + check_param_type(nreq_param_dict, param_dict, dict) check_sampler_shuffle_shard_options(param_dict) + check_padding_options(param_dict) return method(*args, **kwargs) return new_method @@ -599,6 +505,8 @@ def check_generatordataset(method): # check column_names or schema; required argument column_names = param_dict.get('column_names') + if column_names is not None: + 
check_columns(column_names, "column_names") schema = param_dict.get('schema') if column_names is None and schema is None: raise ValueError("Neither columns_names not schema are provided.") @@ -648,7 +556,7 @@ def check_generatordataset(method): def check_batch_size(batch_size): if not (isinstance(batch_size, int) or (callable(batch_size))): - raise ValueError("batch_size should either be an int or a callable.") + raise TypeError("batch_size should either be an int or a callable.") if callable(batch_size): sig = ins.signature(batch_size) if len(sig.parameters) != 1: @@ -683,7 +591,68 @@ def check_pad_info(key, val): check_type(dim, "dim in pad_shape", int) assert dim > 0, "pad shape should be positive integers" if val[1] is not None: - check_type(val[1], "pad_value", (int, float)) + check_type(val[1], "pad_value", (int, float, str, bytes)) + + +def check_bucket_batch_by_length(method): + """check the input arguments of bucket_batch_by_length.""" + + @wraps(method) + def new_method(*args, **kwargs): + param_dict = make_param_dict(method, args, kwargs) + + nreq_param_list = ['column_names', 'bucket_boundaries', 'bucket_batch_sizes'] + check_param_type(nreq_param_list, param_dict, list) + + # check column_names: must be list of string. 
+ column_names = param_dict.get("column_names") + all_string = all(isinstance(item, str) for item in column_names) + if not all_string: + raise TypeError("column_names should be a list of str.") + + element_length_function = param_dict.get("element_length_function") + if element_length_function is None and len(column_names) != 1: + raise ValueError("If element_length_function is not specified, exactly one column name should be passed.") + + # check bucket_boundaries: must be list of int, positive and strictly increasing + bucket_boundaries = param_dict.get('bucket_boundaries') + + if not bucket_boundaries: + raise ValueError("bucket_boundaries cannot be empty.") + + all_int = all(isinstance(item, int) for item in bucket_boundaries) + if not all_int: + raise TypeError("bucket_boundaries should be a list of int.") + + all_non_negative = all(item >= 0 for item in bucket_boundaries) + if not all_non_negative: + raise ValueError("bucket_boundaries cannot contain any negative numbers.") + + for i in range(len(bucket_boundaries) - 1): + if not bucket_boundaries[i + 1] > bucket_boundaries[i]: + raise ValueError("bucket_boundaries should be strictly increasing.") + + # check bucket_batch_sizes: must be list of int and positive + bucket_batch_sizes = param_dict.get('bucket_batch_sizes') + if len(bucket_batch_sizes) != len(bucket_boundaries) + 1: + raise ValueError("bucket_batch_sizes must contain one element more than bucket_boundaries.") + + all_int = all(isinstance(item, int) for item in bucket_batch_sizes) + if not all_int: + raise TypeError("bucket_batch_sizes should be a list of int.") + + all_non_negative = all(item >= 0 for item in bucket_batch_sizes) + if not all_non_negative: + raise ValueError("bucket_batch_sizes cannot contain any negative numbers.") + + if param_dict.get('pad_info') is not None: + check_type(param_dict["pad_info"], "pad_info", dict) + for k, v in param_dict.get('pad_info').items(): + check_pad_info(k, v) + + return method(*args, **kwargs) + + 
return new_method def check_batch(method): @@ -737,6 +706,7 @@ def check_batch(method): def check_sync_wait(method): """check the input arguments of sync_wait.""" + @wraps(method) def new_method(*args, **kwargs): param_dict = make_param_dict(method, args, kwargs) @@ -804,7 +774,7 @@ def check_filter(method): param_dict = make_param_dict(method, args, kwargs) predicate = param_dict.get("predicate") if not callable(predicate): - raise ValueError("Predicate should be a python function or a callable python object.") + raise TypeError("Predicate should be a python function or a callable python object.") nreq_param_int = ['num_parallel_workers'] check_param_type(nreq_param_int, param_dict, int) @@ -896,7 +866,7 @@ def check_zip_dataset(method): raise ValueError("datasets is not provided.") if not isinstance(ds, (tuple, datasets.Dataset)): - raise ValueError("datasets is not tuple or of type Dataset.") + raise TypeError("datasets is not tuple or of type Dataset.") return method(*args, **kwargs) @@ -916,7 +886,7 @@ def check_concat(method): raise ValueError("datasets is not provided.") if not isinstance(ds, (list, datasets.Dataset)): - raise ValueError("datasets is not list or of type Dataset.") + raise TypeError("datasets is not list or of type Dataset.") return method(*args, **kwargs) @@ -995,7 +965,7 @@ def check_add_column(method): de_type = param_dict.get("de_type") if de_type is not None: if not isinstance(de_type, typing.Type) and not check_valid_detype(de_type): - raise ValueError("Unknown column type.") + raise TypeError("Unknown column type.") else: raise TypeError("Expected non-empty string.") @@ -1009,6 +979,41 @@ def check_add_column(method): return new_method +def check_cluedataset(method): + """A wrapper that wrap a parameter checker to the original Dataset(CLUEDataset).""" + + @wraps(method) + def new_method(*args, **kwargs): + param_dict = make_param_dict(method, args, kwargs) + + nreq_param_int = ['num_samples', 'num_parallel_workers', 'num_shards', 
'shard_id'] + + # check dataset_files; required argument + dataset_files = param_dict.get('dataset_files') + if dataset_files is None: + raise ValueError("dataset_files is not provided.") + if not isinstance(dataset_files, (str, list)): + raise TypeError("dataset_files should be of type str or a list of strings.") + + # check task + task_param = param_dict.get('task') + if task_param not in ['AFQMC', 'TNEWS', 'IFLYTEK', 'CMNLI', 'WSC', 'CSL']: + raise ValueError("task should be AFQMC, TNEWS, IFLYTEK, CMNLI, WSC or CSL") + + # check usage + usage_param = param_dict.get('usage') + if usage_param not in ['train', 'test', 'eval']: + raise ValueError("usage should be train, test or eval") + + check_param_type(nreq_param_int, param_dict, int) + + check_sampler_shuffle_shard_options(param_dict) + + return method(*args, **kwargs) + + return new_method + + def check_textfiledataset(method): """A wrapper that wrap a parameter checker to the original Dataset(TextFileDataset).""" @@ -1130,6 +1135,36 @@ def check_gnn_get_all_nodes(method): return new_method +def check_gnn_get_all_edges(method): + """A wrapper that wrap a parameter checker to the GNN `get_all_edges` function.""" + + @wraps(method) + def new_method(*args, **kwargs): + param_dict = make_param_dict(method, args, kwargs) + + # check node_type; required argument + check_type(param_dict.get("edge_type"), 'edge_type', int) + + return method(*args, **kwargs) + + return new_method + + +def check_gnn_get_nodes_from_edges(method): + """A wrapper that wrap a parameter checker to the GNN `get_nodes_from_edges` function.""" + + @wraps(method) + def new_method(*args, **kwargs): + param_dict = make_param_dict(method, args, kwargs) + + # check edge_list; required argument + check_gnn_list_or_ndarray(param_dict.get("edge_list"), 'edge_list') + + return method(*args, **kwargs) + + return new_method + + def check_gnn_get_all_neighbors(method): """A wrapper that wrap a parameter checker to the GNN `get_all_neighbors` function.""" @@ 
-1148,6 +1183,79 @@ def check_gnn_get_all_neighbors(method): return new_method +def check_gnn_get_sampled_neighbors(method): + """A wrapper that wrap a parameter checker to the GNN `get_sampled_neighbors` function.""" + + @wraps(method) + def new_method(*args, **kwargs): + param_dict = make_param_dict(method, args, kwargs) + + # check node_list; required argument + check_gnn_list_or_ndarray(param_dict.get("node_list"), 'node_list') + + # check neighbor_nums; required argument + neighbor_nums = param_dict.get("neighbor_nums") + check_gnn_list_or_ndarray(neighbor_nums, 'neighbor_nums') + if len(neighbor_nums) > 6: + raise ValueError("Wrong number of input members for {0}, should be less than or equal to 6, got {1}".format( + 'neighbor_nums', len(neighbor_nums))) + + # check neighbor_types; required argument + neighbor_types = param_dict.get("neighbor_types") + check_gnn_list_or_ndarray(neighbor_types, 'neighbor_types') + if len(neighbor_nums) > 6: + raise ValueError("Wrong number of input members for {0}, should be less than or equal to 6, got {1}".format( + 'neighbor_types', len(neighbor_types))) + + if len(neighbor_nums) != len(neighbor_types): + raise ValueError( + "The number of members of neighbor_nums and neighbor_types is inconsistent") + + return method(*args, **kwargs) + + return new_method + + +def check_gnn_get_neg_sampled_neighbors(method): + """A wrapper that wrap a parameter checker to the GNN `get_neg_sampled_neighbors` function.""" + + @wraps(method) + def new_method(*args, **kwargs): + param_dict = make_param_dict(method, args, kwargs) + + # check node_list; required argument + check_gnn_list_or_ndarray(param_dict.get("node_list"), 'node_list') + + # check neg_neighbor_num; required argument + check_type(param_dict.get("neg_neighbor_num"), 'neg_neighbor_num', int) + + # check neg_neighbor_type; required argument + check_type(param_dict.get("neg_neighbor_type"), + 'neg_neighbor_type', int) + + return method(*args, **kwargs) + + return new_method + + 
+def check_gnn_random_walk(method): + """A wrapper that wrap a parameter checker to the GNN `random_walk` function.""" + + @wraps(method) + def new_method(*args, **kwargs): + param_dict = make_param_dict(method, args, kwargs) + + # check node_list; required argument + check_gnn_list_or_ndarray(param_dict.get("target_nodes"), 'target_nodes') + + # check meta_path; required argument + check_gnn_list_or_ndarray(param_dict.get("meta_path"), 'meta_path') + + return method(*args, **kwargs) + + return new_method + + def check_aligned_list(param, param_name, membor_type): """Check whether the structure of each member of the list is the same.""" @@ -1205,3 +1313,48 @@ def check_gnn_get_node_feature(method): return method(*args, **kwargs) return new_method + + +def check_numpyslicesdataset(method): + """A wrapper that wrap a parameter checker to the original Dataset(NumpySlicesDataset).""" + + @wraps(method) + def new_method(*args, **kwargs): + param_dict = make_param_dict(method, args, kwargs) + + # check data; required argument + data = param_dict.get('data') + if not isinstance(data, (list, tuple, dict, np.ndarray)): + raise TypeError("Unsupported data type: {}, only support some common python data type, " + "like list, tuple, dict, and numpy array.".format(type(data))) + if isinstance(data, tuple) and not isinstance(data[0], (list, np.ndarray)): + raise TypeError("Unsupported data type: when input is tuple, only support some common python " + "data type, like tuple of lists and tuple of numpy arrays.") + if not data: + raise ValueError("Input data is empty.") + + # check column_names + column_names = param_dict.get('column_names') + if column_names is not None: + check_columns(column_names, "column_names") + + # check num of input column in column_names + column_num = 1 if isinstance(column_names, str) else len(column_names) + if isinstance(data, dict): + data_column = len(list(data.keys())) + if column_num != data_column: + raise ValueError("Num of input column names is 
{0}, but required is {1}." + .format(column_num, data_column)) + + elif isinstance(data, tuple): + if column_num != len(data): + raise ValueError("Num of input column names is {0}, but required is {1}." + .format(column_num, len(data))) + else: + if column_num != 1: + raise ValueError("Num of input column names is {0}, but required is {1} as data is list." + .format(column_num, 1)) + + return method(*args, **kwargs) + + return new_method diff --git a/mindspore/dataset/text/__init__.py b/mindspore/dataset/text/__init__.py index b90f912a98..7c43a2888c 100644 --- a/mindspore/dataset/text/__init__.py +++ b/mindspore/dataset/text/__init__.py @@ -11,9 +11,26 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - """ -mindspore.dataset.text +This module is to support text processing for nlp. It includes two parts: +transforms and utils. transforms is a high performance +nlp text processing module which is developed with icu4c and cppjieba. +utils provides some general methods for nlp text processing. 
""" -from .transforms import Lookup, JiebaTokenizer, UnicodeCharTokenizer -from .utils import to_str, to_bytes, JiebaMode, Vocab +import platform +from .transforms import Lookup, JiebaTokenizer, UnicodeCharTokenizer, Ngram, WordpieceTokenizer, TruncateSequencePair, \ + ToNumber +from .utils import to_str, to_bytes, JiebaMode, Vocab, NormalizeForm + +__all__ = [ + "Lookup", "JiebaTokenizer", "UnicodeCharTokenizer", "Ngram", + "to_str", "to_bytes", "JiebaMode", "Vocab", "WordpieceTokenizer", "TruncateSequencePair", "ToNumber", + "PythonTokenizer" +] + +if platform.system().lower() != 'windows': + from .transforms import UnicodeScriptTokenizer, WhitespaceTokenizer, CaseFold, NormalizeUTF8, \ + RegexReplace, RegexTokenizer, BasicTokenizer, BertTokenizer, PythonTokenizer + + __all__.append(["UnicodeScriptTokenizer", "WhitespaceTokenizer", "CaseFold", "NormalizeUTF8", + "RegexReplace", "RegexTokenizer", "BasicTokenizer", "BertTokenizer", "NormalizeForm"]) diff --git a/mindspore/dataset/text/transforms.py b/mindspore/dataset/text/transforms.py index 79a5b744c9..fe970e06cc 100644 --- a/mindspore/dataset/text/transforms.py +++ b/mindspore/dataset/text/transforms.py @@ -12,25 +12,60 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -c transforms for all text related operators -""" +The module text.transforms is inheritted from _c_dataengine +which is implemented basing on icu4c and cppjieba in C++. +It's a high performance module to process nlp text. +Users can use Vocab to build their own dictionary, +use appropriate tokenizers to split sentences into different tokens, +and use Lookup to find the index of tokens in Vocab. + +.. Note:: + Constructor's arguments for every class in this module must be saved into the + class attributes (self.xxx) to support save() and load(). 
+Examples: + >>> import mindspore.dataset as ds + >>> import mindspore.dataset.text as text + >>> dataset_file = "path/to/text_file_path" + >>> # sentences as line data saved in a file + >>> dataset = ds.TextFileDataset(dataset_file, shuffle=False) + >>> # tokenize sentence to unicode characters + >>> tokenizer = text.UnicodeCharTokenizer() + >>> # load vocabulary form list + >>> vocab = text.Vocab.from_list(['深', '圳', '欢', '迎', '您']) + >>> # lookup is an operation for mapping tokens to ids + >>> lookup = text.Lookup(vocab) + >>> dataset = dataset.map(operations=[tokenizer, lookup]) + >>> for i in dataset.create_dict_iterator(): + >>> print(i) + >>> # if text line in dataset_file is: + >>> # 深圳欢迎您 + >>> # then the output will be: + >>> # {'text': array([0, 1, 2, 3, 4], dtype=int32)} +""" import os import re +import platform +import numpy as np import mindspore._c_dataengine as cde -from .utils import JiebaMode +from .utils import JiebaMode, NormalizeForm, to_str from .validators import check_lookup, check_jieba_add_dict, \ - check_jieba_add_word, check_jieba_init + check_jieba_add_word, check_jieba_init, check_ngram, check_pair_truncate, \ + check_to_number, check_python_tokenizer +from ..core.datatypes import mstype_to_detype class Lookup(cde.LookupOp): """ - Lookup operator that looks up a word to an id + Lookup operator that looks up a word to an id. + Args: - vocab(Vocab): a Vocab object - unknown(None,int): default id to lookup a word that is out of vocab + vocab(Vocab): a Vocab object. + unknown(int, optional): default id to lookup a word that is out of vocab. If no argument is passed, 1 will be + used to be the default id which is the convention for unknown_token . Otherwise, user is strongly + encouraged to pass in the id for (default=None). """ @check_lookup @@ -41,6 +76,33 @@ class Lookup(cde.LookupOp): super().__init__(vocab, unknown) +class Ngram(cde.NgramOp): + """ + TensorOp to generate n-gram from a 1-D string Tensor. 
+ + Refer to https://en.wikipedia.org/wiki/N-gram#Examples for an overview of what n-gram is and how it works. + + Args: + n (list of int): n in n-gram, n >= 1. n is a list of positive integers, for e.g. n=[4,3], The result + would be a 4-gram followed by a 3-gram in the same tensor. If number of words is not enough to make up for + a n-gram, an empty string would be returned. For e.g. 3 grams on ["mindspore","best"] would result in an + empty string be produced. + left_pad (tuple, optional): ("pad_token", pad_width). Padding performed on left side of the sequence. pad_width + will be capped at n-1. left_pad=("_",2) would pad left side of the sequence with "__" (default=None). + right_pad (tuple, optional): ("pad_token", pad_width). Padding performed on right side of the sequence. + pad_width will be capped at n-1. right_pad=("-":2) would pad right side of the sequence with "--" + (default=None). + separator (str, optional): symbol used to join strings together. for e.g. if 2-gram the ["mindspore", "amazing"] + with separator="-" the result would be ["mindspore-amazing"] (default=None, which means whitespace is + used). + """ + + @check_ngram + def __init__(self, n, left_pad=None, right_pad=None, separator=None): + super().__init__(ngrams=n, l_pad_len=left_pad[1], r_pad_len=right_pad[1], l_pad_token=left_pad[0], + r_pad_token=right_pad[0], separator=separator) + + DE_C_INTER_JIEBA_MODE = { JiebaMode.MIX: cde.JiebaMode.DE_JIEBA_MIX, JiebaMode.MP: cde.JiebaMode.DE_JIEBA_MP, @@ -55,11 +117,12 @@ class JiebaTokenizer(cde.JiebaTokenizerOp): Args: hmm_path (str): the dictionary file is used by HMMSegment algorithm, the dictionary can be obtained on the official website of cppjieba. - mp_path(str): the dictionary file is used by MPSegment algorithm, + mp_path (str): the dictionary file is used by MPSegment algorithm, the dictionary can be obtained on the official website of cppjieba. 
- mode (Enum): [Default "MIX"], "MP" model will tokenize with MPSegment algorithm, + mode (JiebaMode, optional): "MP" model will tokenize with MPSegment algorithm, "HMM" mode will tokenize with Hiddel Markov Model Segment algorithm, - "MIX" model will tokenize with a mix of MPSegment and HMMSegment algorithm. + "MIX" model will tokenize with a mix of MPSegment and HMMSegment algorithm + (default="MIX"). """ @check_jieba_init @@ -73,13 +136,15 @@ class JiebaTokenizer(cde.JiebaTokenizerOp): @check_jieba_add_word def add_word(self, word, freq=None): """ - Add user defined word to JiebaTokenizer's dictionary + Add user defined word to JiebaTokenizer's dictionary. + Args: - word(required, string): The word to be added to the JiebaTokenizer instance. + word (str): The word to be added to the JiebaTokenizer instance. The added word will not be written into the built-in dictionary on disk. - freq(optional, int): The frequency of the word to be added, The higher the frequency, - the better change the word will be tokenized(default None, use default frequency). + freq (int, optional): The frequency of the word to be added, The higher the frequency, + the better chance the word will be tokenized(default=None, use default frequency). """ + if freq is None: super().add_word(word, 0) else: @@ -88,15 +153,20 @@ class JiebaTokenizer(cde.JiebaTokenizerOp): @check_jieba_add_dict def add_dict(self, user_dict): """ - Add user defined word to JiebaTokenizer's dictionary + Add user defined word to JiebaTokenizer's dictionary. + Args: - user_dict(path/dict):Dictionary to be added, file path or Python dictionary, - Python Dict format: {word1:freq1, word2:freq2,...} - Jieba dictionary format : word(required), freq(optional), such as: - word1 freq1 - word2 - word3 freq3 + user_dict (str or dict): Dictionary to be added, file path or Python dictionary, + Python Dict format: {word1:freq1, word2:freq2,...}. + Jieba dictionary format : word(required), freq(optional), such as: + + ..
code-block:: + + word1 freq1 + word2 + word3 freq3 """ + if isinstance(user_dict, str): self.__add_dict_py_file(user_dict) elif isinstance(user_dict, dict): @@ -153,3 +223,249 @@ class UnicodeCharTokenizer(cde.UnicodeCharTokenizerOp): """ Tokenize a scalar tensor of UTF-8 string to Unicode characters. """ + + +class WordpieceTokenizer(cde.WordpieceTokenizerOp): + """ + Tokenize scalar token or 1-D tokens to 1-D subword tokens. + + Args: + vocab (Vocab): a Vocab object. + suffix_indicator (str, optional): Used to show that the subword is the last part of a word(default='##'). + max_bytes_per_token (int, optional): Tokens exceeding this length will not be further split(default=100). + unknown_token (str, optional): When we can not found the token: if 'unknown_token' is empty string, + return the token directly, else return 'unknown_token'(default='[UNK]'). + """ + + def __init__(self, vocab, suffix_indicator='##', max_bytes_per_token=100, unknown_token='[UNK]'): + self.vocab = vocab + self.suffix_indicator = suffix_indicator + self.max_bytes_per_token = max_bytes_per_token + self.unknown_token = unknown_token + super().__init__(self.vocab, self.suffix_indicator, self.max_bytes_per_token, self.unknown_token) + + +if platform.system().lower() != 'windows': + class WhitespaceTokenizer(cde.WhitespaceTokenizerOp): + """ + Tokenize a scalar tensor of UTF-8 string on ICU defined whitespaces(such as: ' ', '\\\\t', '\\\\r', '\\\\n'). + """ + + + class UnicodeScriptTokenizer(cde.UnicodeScriptTokenizerOp): + """ + Tokenize a scalar tensor of UTF-8 string on Unicode script boundaries. + + Args: + keep_whitespace (bool, optional): If or not emit whitespace tokens (default=False). + """ + + def __init__(self, keep_whitespace=False): + self.keep_whitespace = keep_whitespace + super().__init__(self.keep_whitespace) + + + class CaseFold(cde.CaseFoldOp): + """ + Apply case fold operation on utf-8 string tensor. 
+ """ + + + DE_C_INTER_NORMALIZE_FORM = { + NormalizeForm.NONE: cde.NormalizeForm.DE_NORMALIZE_NONE, + NormalizeForm.NFC: cde.NormalizeForm.DE_NORMALIZE_NFC, + NormalizeForm.NFKC: cde.NormalizeForm.DE_NORMALIZE_NFKC, + NormalizeForm.NFD: cde.NormalizeForm.DE_NORMALIZE_NFD, + NormalizeForm.NFKD: cde.NormalizeForm.DE_NORMALIZE_NFKD + } + + + class NormalizeUTF8(cde.NormalizeUTF8Op): + """ + Apply normalize operation on utf-8 string tensor. + + Args: + normalize_form (NormalizeForm, optional): Valid values are "NONE", "NFC", "NFKC", "NFD", "NFKD". + If set "NONE", will do nothing for input string tensor. + If set to any of "NFC", "NFKC", "NFD", "NFKD", will apply normalize operation(default="NFKC"). + See http://unicode.org/reports/tr15/ for details. + """ + + def __init__(self, normalize_form=NormalizeForm.NFKC): + self.normalize_form = DE_C_INTER_NORMALIZE_FORM[normalize_form] + super().__init__(self.normalize_form) + + + class RegexReplace(cde.RegexReplaceOp): + """ + Replace utf-8 string tensor with 'replace' according to regular expression 'pattern'. + + See http://userguide.icu-project.org/strings/regexp for support regex pattern. + + Args: + pattern(str): the regex expression patterns. + replace(str): the string to replace matched element. + replace_all(bool, optional): If False, only replace first matched element; + if True, replace all matched elements(default=True). + """ + + def __init__(self, pattern, replace, replace_all=True): + self.pattern = pattern + self.replace = replace + self.replace_all = replace_all + super().__init__(self.pattern, self.replace, self.replace_all) + + + class RegexTokenizer(cde.RegexTokenizerOp): + """ + Tokenize a scalar tensor of UTF-8 string by regex expression pattern. + + See http://userguide.icu-project.org/strings/regexp for support regex pattern. + + Args: + delim_pattern(str): The pattern of regex delimiters. + The original string will be split by matched elements. 
+ keep_delim_pattern(str, optional): The string matched by 'delim_pattern' can be kept as a token + if it can be matched by 'keep_delim_pattern'. And the default value is empty str(''), + in this situation, delimiters will not be kept as an output token(default=''). + """ + + def __init__(self, delim_pattern, keep_delim_pattern=''): + self.delim_pattern = delim_pattern + self.keep_delim_pattern = keep_delim_pattern + super().__init__(self.delim_pattern, self.keep_delim_pattern) + + + class BasicTokenizer(cde.BasicTokenizerOp): + """ + Tokenize a scalar tensor of UTF-8 string by specific rules. + + Args: + lower_case(bool, optional): If True, apply CaseFold, NormalizeUTF8(NFD mode), RegexReplace operation + on input text to make the text to lower case and strip accents characters; If False, only apply + NormalizeUTF8('normalization_form' mode) operation on input text(default=False). + keep_whitespace(bool, optional): If True, the whitespace will be kept in out tokens(default=False). + normalization_form(NormalizeForm, optional): Used to specify a specific normalize mode, + only effective when 'lower_case' is False. See NormalizeUTF8 for details(default='NONE'). + preserve_unused_token(bool, optional): If True, do not split special tokens like + '[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]'(default=True). + """ + + def __init__(self, lower_case=False, keep_whitespace=False, + normalization_form=NormalizeForm.NONE, preserve_unused_token=True): + self.lower_case = lower_case + self.keep_whitespace = keep_whitespace + self.normalization_form = DE_C_INTER_NORMALIZE_FORM[normalization_form] + self.preserve_unused_token = preserve_unused_token + super().__init__(self.lower_case, self.keep_whitespace, + self.normalization_form, self.preserve_unused_token) + + + class BertTokenizer(cde.BertTokenizerOp): + """ + Tokenizer used for Bert text process. + + Args: + vocab(Vocab): a Vocab object.
+ suffix_indicator(str, optional): Used to show that the subword is the last part of a word(default='##'). + max_bytes_per_token(int, optional): Tokens exceeding this length will not be further split(default=100). + unknown_token(str, optional): When we can not found the token: if 'unknown_token' is empty string, + return the token directly, else return 'unknown_token'(default='[UNK]'). + lower_case(bool, optional): If True, apply CaseFold, NormalizeUTF8(NFD mode), RegexReplace operation + on input text to make the text to lower case and strip accents characters; If False, only apply + NormalizeUTF8('normalization_form' mode) operation on input text(default=False). + keep_whitespace(bool, optional): If True, the whitespace will be kept in out tokens(default=False). + normalization_form(NormalizeForm, optional): Used to specify a specific normlaize mode, + only effective when 'lower_case' is False. See NormalizeUTF8 for details(default='NONE'). + preserve_unused_token(bool, optional): If True, do not split special tokens like + '[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]'(default=True). 
+ """ + + def __init__(self, vocab, suffix_indicator='##', max_bytes_per_token=100, + unknown_token='[UNK]', lower_case=False, keep_whitespace=False, + normalization_form=NormalizeForm.NONE, preserve_unused_token=True): + self.vocab = vocab + self.suffix_indicator = suffix_indicator + self.max_bytes_per_token = max_bytes_per_token + self.unknown_token = unknown_token + self.lower_case = lower_case + self.keep_whitespace = keep_whitespace + self.normalization_form = DE_C_INTER_NORMALIZE_FORM[normalization_form] + self.preserve_unused_token = preserve_unused_token + super().__init__(self.vocab, self.suffix_indicator, self.max_bytes_per_token, self.unknown_token, + self.lower_case, self.keep_whitespace, self.normalization_form, self.preserve_unused_token) + + +class TruncateSequencePair(cde.TruncateSequencePairOp): + """ + Truncate a pair of rank-1 tensors such that the total length is less than max_length. + + This operation takes two input tensors and returns two output Tenors. + + Args: + max_length(int): Maximum length required. + + Examples: + >>> # Data before + >>> # | col1 | col2 | + >>> # +---------+---------| + >>> # | [1,2,3] | [4,5] | + >>> # +---------+---------+ + >>> data = data.map(operations=TruncateSequencePair(4)) + >>> # Data after + >>> # | col1 | col2 | + >>> # +---------+---------+ + >>> # | [1,2] | [4,5] | + >>> # +---------+---------+ + """ + + @check_pair_truncate + def __init__(self, max_length): + super().__init__(max_length) + + +class ToNumber(cde.ToNumberOp): + """ + Tensor operation to convert every element of a string tensor to a number. + + Strings are casted according to the rules specified in the following links: + https://en.cppreference.com/w/cpp/string/basic_string/stof, + https://en.cppreference.com/w/cpp/string/basic_string/stoul, + except that any strings which represent negative numbers cannot be casted to an + unsigned integer type. + + Args: + data_type (mindspore.dtype): mindspore.dtype to be casted to. 
Must be + a numeric type. + + Raises: + RuntimeError: If strings are invalid to cast, or are out of range after being casted. + """ + + @check_to_number + def __init__(self, data_type): + data_type = mstype_to_detype(data_type) + self.data_type = str(data_type) + super().__init__(data_type) + + +class PythonTokenizer: + """ + Callable class to be used for user-defined string tokenizer. + Args: + tokenizer (Callable): Python function that takes a `str` and returns a list of `str` as tokens. + + Examples: + >>> def my_tokenizer(line): + >>> return line.split() + >>> data = data.map(operations=PythonTokenizer(my_tokenizer)) + """ + + @check_python_tokenizer + def __init__(self, tokenizer): + self.tokenizer = np.vectorize(lambda x: np.array(tokenizer(x), dtype='U'), signature='()->(n)') + + def __call__(self, in_array): + in_array = to_str(in_array) + tokens = self.tokenizer(in_array) + return tokens diff --git a/mindspore/dataset/text/utils.py b/mindspore/dataset/text/utils.py index f3f442f238..766de76e01 100644 --- a/mindspore/dataset/text/utils.py +++ b/mindspore/dataset/text/utils.py @@ -12,55 +12,113 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Some basic function for nlp +The module text.utils provides some general methods for nlp text processing. +For example, you can use Vocab to build a dictionary, +use to_bytes and to_str to encode and decode strings into a specified format. """ from enum import IntEnum -import mindspore._c_dataengine as cde +import copy import numpy as np +import mindspore._c_dataengine as cde -from .validators import check_from_file, check_from_list, check_from_dict +from .validators import check_from_file, check_from_list, check_from_dict, check_from_dataset class Vocab(cde.Vocab): """ - Vocab object that is used for lookup word - Args: + Vocab object that is used to lookup a word. + + It contains a map that maps each word(str) to an id (int). 
""" - def __init__(self): - pass + @classmethod + @check_from_dataset + def from_dataset(cls, dataset, columns=None, freq_range=None, top_k=None, special_tokens=None, + special_first=None): + """ + Build a vocab from a dataset. + + This would collect all unique words in a dataset and return a vocab within + the frequency range specified by user in freq_range. User would be warned if no words fall into the frequency. + Words in vocab are ordered from highest frequency to lowest frequency. Words with the same frequency would be + ordered lexicographically. + + Args: + dataset(Dataset): dataset to build vocab from. + columns(list of str, optional): column names to get words from. It can be a list of column names. + (default=None, where all columns will be used. If any column isn't string type, will return error). + freq_range(tuple, optional): A tuple of integers (min_frequency, max_frequency). Words within the frequency + range would be kept. 0 <= min_frequency <= max_frequency <= total_words. min_frequency=0 is the same as + min_frequency=1. max_frequency > total_words is the same as max_frequency = total_words. + min_frequency/max_frequency can be None, which corresponds to 0/total_words separately + (default=None, all words are included). + top_k(int, optional): top_k > 0. Number of words to be built into vocab. top_k most frequent words are + taken. top_k is taken after freq_range. If not enough top_k, all words will be taken (default=None, + all words are included). + special_tokens(list, optional): a list of strings, each one is a special token. for example + special_tokens=["",""] (default=None, no special tokens will be added). + special_first(bool, optional): whether special_tokens will be prepended/appended to vocab. If special_tokens + is specified and special_first is set to None, special_tokens will be prepended (default=None). + + Returns: + Vocab, Vocab object built from dataset. 
+ """ + + vocab = Vocab() + root = copy.deepcopy(dataset).build_vocab(vocab, columns, freq_range, top_k, special_tokens, special_first) + for d in root.create_dict_iterator(): + if d is not None: + raise ValueError("from_dataset should receive data other than None.") + return vocab @classmethod @check_from_list - def from_list(cls, word_list): + def from_list(cls, word_list, special_tokens=None, special_first=None): """ - build a vocab object from a list of word + Build a vocab object from a list of word. + Args: - word_list(list): a list of string where each element is a word + word_list(list): a list of string where each element is a word of type string. + special_tokens(list, optional): a list of strings, each one is a special token. for example + special_tokens=["",""] (default=None, no special tokens will be added). + special_first(bool, optional): whether special_tokens will be prepended/appended to vocab, If special_tokens + is specified and special_first is set to None, special_tokens will be prepended (default=None). """ - return super().from_list(word_list) + + return super().from_list(word_list, special_tokens, special_first) @classmethod @check_from_file - def from_file(cls, file_path, delimiter=None, vocab_size=None): + def from_file(cls, file_path, delimiter=None, vocab_size=None, special_tokens=None, special_first=None): """ - build a vocab object from a list of word + Build a vocab object from a list of word. + Args: - file_path(str): path to the file which contains the vocab list - delimiter(None, str): a delimiter to break up each line in file, the first element is taken to be the word - vocab_size(None, int): number of words to read from file_path + file_path (str): path to the file which contains the vocab list. + delimiter (str, optional): a delimiter to break up each line in file, the first element is taken to be + the word (default=None). + vocab_size (int, optional): number of words to read from file_path (default=None, all words are taken). 
+ special_tokens (list, optional): a list of strings, each one is a special token. for example + special_tokens=["",""] (default=None, no special tokens will be added). + special_first (bool, optional): whether special_tokens will be prepended/appended to vocab, + If special_tokens is specified and special_first is set to None, + special_tokens will be prepended (default=None). """ - return super().from_file(file_path, delimiter, vocab_size) + + return super().from_file(file_path, delimiter, vocab_size, special_tokens, special_first) @classmethod @check_from_dict def from_dict(cls, word_dict): """ - build a vocab object from a dict. + Build a vocab object from a dict. + Args: - word_dict(dict): dict contains word, id pairs. id should start from 2 and continuous + word_dict (dict): dict contains word, id pairs where word should be str and id int. id is recommended to + start from 0 and be continuous. ValueError will be raised if id is negative. """ + return super().from_dict(word_dict) @@ -69,15 +127,15 @@ def to_str(array, encoding='utf8'): Convert numpy array of `bytes` to array of `str` by decoding each element based on charset `encoding`. Args: - array (numpy array): Array of type `bytes` representing strings. + array (numpy.ndarray): Array of type `bytes` representing strings. encoding (string): Indicating the charset for decoding. - Returns: - Numpy array of `str`. + Returns: + numpy.ndarray, numpy array of `str`. """ if not isinstance(array, np.ndarray): - raise ValueError('input should be a numpy array') + raise ValueError('input should be a numpy array.') return np.char.decode(array, encoding) @@ -87,20 +145,30 @@ def to_bytes(array, encoding='utf8'): Convert numpy array of `str` to array of `bytes` by encoding each element based on charset `encoding`. Args: - array (numpy array): Array of type `str` representing strings. - encoding (string): Indicating the charset for encoding. - Returns: - Numpy array of `bytes`. 
+ array (numpy.ndarray): Array of type `str` representing strings. + encoding (str): Indicating the charset for encoding. + Returns: + numpy.ndarray, numpy array of `bytes`. """ if not isinstance(array, np.ndarray): - raise ValueError('input should be a numpy array') + raise ValueError('input should be a numpy array.') return np.char.encode(array, encoding) class JiebaMode(IntEnum): + """An enumeration for JiebaTokenizer, effective enumeration types are MIX, MP, HMM.""" MIX = 0 MP = 1 HMM = 2 + + +class NormalizeForm(IntEnum): + """An enumeration for NormalizeUTF8, effective enumeration types are NONE, NFC, NFKC, NFD, NFKD.""" + NONE = 0 + NFC = 1 + NFKC = 2 + NFD = 3 + NFKD = 4 diff --git a/mindspore/dataset/text/validators.py b/mindspore/dataset/text/validators.py index 479b90d1f0..afab8665cd 100644 --- a/mindspore/dataset/text/validators.py +++ b/mindspore/dataset/text/validators.py @@ -19,12 +19,29 @@ validators for text ops from functools import wraps import mindspore._c_dataengine as cde +import mindspore.common.dtype as mstype -from ..transforms.validators import check_uint32 +from mindspore._c_expression import typing +from ..transforms.validators import check_uint32, check_pos_int64 + + +def check_unique_list_of_words(words, arg_name): + """Check that words is a list and each element is a str without any duplication""" + + if not isinstance(words, list): + raise ValueError(arg_name + " needs to be a list of words of type string.") + words_set = set() + for word in words: + if not isinstance(word, str): + raise ValueError("each word in " + arg_name + " needs to be type str.") + if word in words_set: + raise ValueError(arg_name + " contains duplicate word: " + word + ".") + words_set.add(word) + return words_set def check_lookup(method): - """A wrapper that wrap a parameter checker to the original function(crop operation).""" + """A wrapper that wrap a parameter checker to the original function.""" @wraps(method) def new_method(self, *args, **kwargs): @@ 
-34,9 +51,11 @@ def check_lookup(method): if "unknown" in kwargs: unknown = kwargs.get("unknown") if unknown is not None: - assert isinstance(unknown, int) and unknown >= 0, "unknown needs to be a non-negative integer" + if not (isinstance(unknown, int) and unknown >= 0): + raise ValueError("unknown needs to be a non-negative integer.") - assert isinstance(vocab, cde.Vocab), "vocab is not an instance of cde.Vocab" + if not isinstance(vocab, cde.Vocab): + raise ValueError("vocab is not an instance of cde.Vocab.") kwargs["vocab"] = vocab kwargs["unknown"] = unknown @@ -46,65 +65,109 @@ def check_lookup(method): def check_from_file(method): - """A wrapper that wrap a parameter checker to the original function(crop operation).""" + """A wrapper that wrap a parameter checker to the original function.""" @wraps(method) def new_method(self, *args, **kwargs): - file_path, delimiter, vocab_size = (list(args) + 3 * [None])[:3] + file_path, delimiter, vocab_size, special_tokens, special_first = (list(args) + 5 * [None])[:5] if "file_path" in kwargs: file_path = kwargs.get("file_path") if "delimiter" in kwargs: delimiter = kwargs.get("delimiter") if "vocab_size" in kwargs: vocab_size = kwargs.get("vocab_size") + if "special_tokens" in kwargs: + special_tokens = kwargs.get("special_tokens") + if "special_first" in kwargs: + special_first = kwargs.get("special_first") + + if not isinstance(file_path, str): + raise ValueError("file_path needs to be str.") - assert isinstance(file_path, str), "file_path needs to be str" if delimiter is not None: - assert isinstance(delimiter, str), "delimiter needs to be str" + if not isinstance(delimiter, str): + raise ValueError("delimiter needs to be str.") else: delimiter = "" if vocab_size is not None: - assert isinstance(vocab_size, int) and vocab_size > 0, "vocab size needs to be a positive integer" + if not (isinstance(vocab_size, int) and vocab_size > 0): + raise ValueError("vocab size needs to be a positive integer.") else: vocab_size = 
-1 + + if special_first is None: + special_first = True + + if not isinstance(special_first, bool): + raise ValueError("special_first needs to be a boolean value") + + if special_tokens is None: + special_tokens = [] + + check_unique_list_of_words(special_tokens, "special_tokens") + kwargs["file_path"] = file_path kwargs["delimiter"] = delimiter kwargs["vocab_size"] = vocab_size + kwargs["special_tokens"] = special_tokens + kwargs["special_first"] = special_first + return method(self, **kwargs) return new_method def check_from_list(method): - """A wrapper that wrap a parameter checker to the original function(crop operation).""" + """A wrapper that wrap a parameter checker to the original function.""" @wraps(method) def new_method(self, *args, **kwargs): - word_list, = (list(args) + [None])[:1] + word_list, special_tokens, special_first = (list(args) + 3 * [None])[:3] if "word_list" in kwargs: word_list = kwargs.get("word_list") - assert isinstance(word_list, list), "word_list needs to be a list of words" - for word in word_list: - assert isinstance(word, str), "each word in word list needs to be type str" + if "special_tokens" in kwargs: + special_tokens = kwargs.get("special_tokens") + if "special_first" in kwargs: + special_first = kwargs.get("special_first") + if special_tokens is None: + special_tokens = [] + word_set = check_unique_list_of_words(word_list, "word_list") + token_set = check_unique_list_of_words(special_tokens, "special_tokens") + + intersect = word_set.intersection(token_set) + + if intersect != set(): + raise ValueError("special_tokens and word_list contain duplicate word :" + str(intersect) + ".") + + if special_first is None: + special_first = True + + if not isinstance(special_first, bool): + raise ValueError("special_first needs to be a boolean value.") kwargs["word_list"] = word_list + kwargs["special_tokens"] = special_tokens + kwargs["special_first"] = special_first return method(self, **kwargs) return new_method def 
check_from_dict(method): - """A wrapper that wrap a parameter checker to the original function(crop operation).""" + """A wrapper that wrap a parameter checker to the original function.""" @wraps(method) def new_method(self, *args, **kwargs): word_dict, = (list(args) + [None])[:1] if "word_dict" in kwargs: word_dict = kwargs.get("word_dict") - assert isinstance(word_dict, dict), "word_dict needs to be a list of word,id pairs" + if not isinstance(word_dict, dict): + raise ValueError("word_dict needs to be a list of word,id pairs.") for word, word_id in word_dict.items(): - assert isinstance(word, str), "each word in word_dict needs to be type str" - assert isinstance(word_id, int) and word_id >= 0, "each word id needs to be positive integer" + if not isinstance(word, str): + raise ValueError("Each word in word_dict needs to be type string.") + if not (isinstance(word_id, int) and word_id >= 0): + raise ValueError("Each word id needs to be positive integer.") kwargs["word_dict"] = word_dict return method(self, **kwargs) @@ -124,11 +187,11 @@ def check_jieba_init(method): mp_path = kwargs.get("mp_path") if hmm_path is None: raise ValueError( - "the dict of HMMSegment in cppjieba is not provided") + "The dict of HMMSegment in cppjieba is not provided.") kwargs["hmm_path"] = hmm_path if mp_path is None: raise ValueError( - "the dict of MPSegment in cppjieba is not provided") + "The dict of MPSegment in cppjieba is not provided.") kwargs["mp_path"] = mp_path if model is not None: kwargs["model"] = model @@ -149,7 +212,7 @@ def check_jieba_add_word(method): if "freq" in kwargs: freq = kwargs.get("freq") if word is None: - raise ValueError("word is not provided") + raise ValueError("word is not provided.") kwargs["word"] = word if freq is not None: check_uint32(freq) @@ -160,7 +223,7 @@ def check_jieba_add_word(method): def check_jieba_add_dict(method): - """Wrapper method to check the parameters of add dict""" + """Wrapper method to check the parameters of add dict.""" 
@wraps(method) def new_method(self, *args, **kwargs): @@ -168,8 +231,205 @@ def check_jieba_add_dict(method): if "user_dict" in kwargs: user_dict = kwargs.get("user_dict") if user_dict is None: - raise ValueError("user_dict is not provided") + raise ValueError("user_dict is not provided.") kwargs["user_dict"] = user_dict return method(self, **kwargs) return new_method + + +def check_from_dataset(method): + """A wrapper that wrap a parameter checker to the original function.""" + + @wraps(method) + def new_method(self, *args, **kwargs): + + dataset, columns, freq_range, top_k, special_tokens, special_first = (list(args) + 6 * [None])[:6] + if "dataset" in kwargs: + dataset = kwargs.get("dataset") + if "columns" in kwargs: + columns = kwargs.get("columns") + if "freq_range" in kwargs: + freq_range = kwargs.get("freq_range") + if "top_k" in kwargs: + top_k = kwargs.get("top_k") + if "special_tokens" in kwargs: + special_tokens = kwargs.get("special_tokens") + if "special_first" in kwargs: + special_first = kwargs.get("special_first") + + if columns is None: + columns = [] + + if not isinstance(columns, list): + columns = [columns] + + for column in columns: + if not isinstance(column, str): + raise ValueError("columns need to be a list of strings.") + + if freq_range is None: + freq_range = (None, None) + + if not isinstance(freq_range, tuple) or len(freq_range) != 2: + raise ValueError("freq_range needs to be either None or a tuple of 2 integers or an int and a None.") + + for num in freq_range: + if num is not None and (not isinstance(num, int)): + raise ValueError("freq_range needs to be either None or a tuple of 2 integers or an int and a None.") + + if isinstance(freq_range[0], int) and isinstance(freq_range[1], int): + if freq_range[0] > freq_range[1] or freq_range[0] < 0: + raise ValueError("frequency range [a,b] should be 0 <= a <= b (a,b are inclusive).") + + if top_k is not None and (not isinstance(top_k, int)): + raise ValueError("top_k needs to be a 
positive integer.") + + if isinstance(top_k, int) and top_k <= 0: + raise ValueError("top_k needs to be a positive integer.") + + if special_first is None: + special_first = True + + if special_tokens is None: + special_tokens = [] + + if not isinstance(special_first, bool): + raise ValueError("special_first needs to be a boolean value.") + + check_unique_list_of_words(special_tokens, "special_tokens") + + kwargs["dataset"] = dataset + kwargs["columns"] = columns + kwargs["freq_range"] = freq_range + kwargs["top_k"] = top_k + kwargs["special_tokens"] = special_tokens + kwargs["special_first"] = special_first + + return method(self, **kwargs) + + return new_method + + +def check_ngram(method): + """A wrapper that wrap a parameter checker to the original function.""" + + @wraps(method) + def new_method(self, *args, **kwargs): + n, left_pad, right_pad, separator = (list(args) + 4 * [None])[:4] + if "n" in kwargs: + n = kwargs.get("n") + if "left_pad" in kwargs: + left_pad = kwargs.get("left_pad") + if "right_pad" in kwargs: + right_pad = kwargs.get("right_pad") + if "separator" in kwargs: + separator = kwargs.get("separator") + + if isinstance(n, int): + n = [n] + + if not (isinstance(n, list) and n != []): + raise ValueError("n needs to be a non-empty list of positive integers.") + + for gram in n: + if not (isinstance(gram, int) and gram > 0): + raise ValueError("n in ngram needs to be a positive number.") + + if left_pad is None: + left_pad = ("", 0) + + if right_pad is None: + right_pad = ("", 0) + + if not (isinstance(left_pad, tuple) and len(left_pad) == 2 and isinstance(left_pad[0], str) and isinstance( + left_pad[1], int)): + raise ValueError("left_pad needs to be a tuple of (str, int) str is pad token and int is pad_width.") + + if not (isinstance(right_pad, tuple) and len(right_pad) == 2 and isinstance(right_pad[0], str) and isinstance( + right_pad[1], int)): + raise ValueError("right_pad needs to be a tuple of (str, int) str is pad token and int is 
pad_width.") + + if not (left_pad[1] >= 0 and right_pad[1] >= 0): + raise ValueError("padding width need to be positive numbers.") + + if separator is None: + separator = " " + + if not isinstance(separator, str): + raise ValueError("separator needs to be a string.") + + kwargs["n"] = n + kwargs["left_pad"] = left_pad + kwargs["right_pad"] = right_pad + kwargs["separator"] = separator + + return method(self, **kwargs) + + return new_method + + +def check_pair_truncate(method): + """Wrapper method to check the parameters of number of pair truncate.""" + + @wraps(method) + def new_method(self, *args, **kwargs): + max_length = (list(args) + [None])[0] + if "max_length" in kwargs: + max_length = kwargs.get("max_length") + if max_length is None: + raise ValueError("max_length is not provided.") + + check_pos_int64(max_length) + kwargs["max_length"] = max_length + + return method(self, **kwargs) + + return new_method + + +def check_to_number(method): + """A wrapper that wraps a parameter check to the original function (ToNumber).""" + + @wraps(method) + def new_method(self, *args, **kwargs): + data_type = (list(args) + [None])[0] + if "data_type" in kwargs: + data_type = kwargs.get("data_type") + + if data_type is None: + raise ValueError("data_type is a mandatory parameter but was not provided.") + + if not isinstance(data_type, typing.Type): + raise TypeError("data_type is not a MindSpore data type.") + + if data_type not in mstype.number_type: + raise TypeError("data_type is not numeric data type.") + + kwargs["data_type"] = data_type + + return method(self, **kwargs) + + return new_method + + +def check_python_tokenizer(method): + """A wrapper that wraps a parameter check to the original function (PythonTokenizer).""" + + @wraps(method) + def new_method(self, *args, **kwargs): + tokenizer = (list(args) + [None])[0] + if "tokenizer" in kwargs: + tokenizer = kwargs.get("tokenizer") + + if tokenizer is None: + raise ValueError("tokenizer is a mandatory parameter.") + + 
if not callable(tokenizer): + raise TypeError("tokenizer is not a callable python function") + + kwargs["tokenizer"] = tokenizer + + return method(self, **kwargs) + + return new_method diff --git a/mindspore/dataset/transforms/c_transforms.py b/mindspore/dataset/transforms/c_transforms.py index 91fb486531..ffe711b106 100644 --- a/mindspore/dataset/transforms/c_transforms.py +++ b/mindspore/dataset/transforms/c_transforms.py @@ -15,9 +15,14 @@ """ This module c_transforms provides common operations, including OneHotOp and TypeCast. """ +from enum import IntEnum +import numpy as np + +import mindspore.common.dtype as mstype import mindspore._c_dataengine as cde -from .validators import check_num_classes, check_de_type +from .validators import check_num_classes, check_de_type, check_fill_value, check_slice_op, check_mask_op, \ + check_pad_end, check_concat_type from ..core.datatypes import mstype_to_detype @@ -35,6 +40,21 @@ class OneHot(cde.OneHotOp): super().__init__(num_classes) +class Fill(cde.FillOp): + """ + Tensor operation to create a tensor filled with passed scalar value. + The output tensor will have the same shape and type as the input tensor. + + Args: + fill_value (python types (str, bytes, int, float, or bool)) : scalar value + to fill created tensor with. + """ + + @check_fill_value + def __init__(self, fill_value): + super().__init__(cde.Tensor(np.array(fill_value))) + + class TypeCast(cde.TypeCastOp): """ Tensor operation to cast to a given MindSpore data type. @@ -48,3 +68,165 @@ class TypeCast(cde.TypeCastOp): data_type = mstype_to_detype(data_type) self.data_type = str(data_type) super().__init__(data_type) + + +class Slice(cde.SliceOp): + """ + Slice operation to extract a tensor out using the given n slices. + + The functionality of Slice is similar to NumPy indexing feature. + (Currently only rank 1 Tensors are supported) + + Args: + *slices(Variable length argument list): Maximum `n` number of arguments to slice a tensor of rank `n`. 
+ One object in slices can be one of: + 1. int: slice this index only. Negative index is supported. + 2. slice object: slice the generated indices from the slice object. Similar to `start:stop:step`. + 3. None: slice the whole dimension. Similar to `:` in python indexing. + 4. Ellipses ...: slice all dimensions between the two slices. + + Examples: + >>> # Data before + >>> # | col | + >>> # +---------+ + >>> # | [1,2,3] | + >>> # +---------| + >>> data = data.map(operations=Slice(slice(1,3))) # slice indices 1 and 2 only + >>> # Data after + >>> # | col | + >>> # +------------+ + >>> # | [1,2] | + >>> # +------------| + """ + + @check_slice_op + def __init__(self, *slices): + dim0 = slices[0] + if isinstance(dim0, int): + dim0 = [dim0] + elif dim0 is None: + dim0 = True + elif isinstance(dim0, slice): + dim0 = (dim0.start, dim0.stop, dim0.step) + elif dim0 is Ellipsis: + dim0 = True + super().__init__(dim0) + + +class Relational(IntEnum): + EQ = 0 + NE = 1 + GT = 2 + GE = 3 + LT = 4 + LE = 5 + + +DE_C_RELATIONAL = {Relational.EQ: cde.RelationalOp.EQ, + Relational.NE: cde.RelationalOp.NE, + Relational.GT: cde.RelationalOp.GT, + Relational.GE: cde.RelationalOp.GE, + Relational.LT: cde.RelationalOp.LT, + Relational.LE: cde.RelationalOp.LE} + + +class Mask(cde.MaskOp): + """ + Mask content of the input tensor with the given predicate. + Any element of the tensor that matches the predicate will be evaluated to True, otherwise False. + + Args: + operator (Relational): One of the relational operator EQ, NE LT, GT, LE or GE + constant (python types (str, int, float, or bool): constant to be compared to. + Constant will be casted to the type of the input tensor + dtype (optional, mindspore.dtype): type of the generated mask. 
Default to bool + + Examples: + >>> # Data before + >>> # | col1 | + >>> # +---------+ + >>> # | [1,2,3] | + >>> # +---------+ + >>> data = data.map(operations=Mask(Relational.EQ, 2)) + >>> # Data after + >>> # | col1 | + >>> # +--------------------+ + >>> # | [False,True,False] | + >>> # +--------------------+ + """ + + @check_mask_op + def __init__(self, operator, constant, dtype=mstype.bool_): + dtype = mstype_to_detype(dtype) + constant = cde.Tensor(np.array(constant)) + super().__init__(DE_C_RELATIONAL[operator], constant, dtype) + + +class PadEnd(cde.PadEndOp): + """ + Pad input tensor according to `pad_shape`, need to have same rank. + + Args: + pad_shape (list of `int`): list on integers representing the shape needed. Dimensions that set to `None` will + not be padded (i.e., original dim will be used). Shorter dimensions will truncate the values. + pad_value (python types (str, bytes, int, float, or bool), optional): value used to pad. Default to 0 or empty + string in case of Tensors of strings. + + Examples: + >>> # Data before + >>> # | col | + >>> # +---------+ + >>> # | [1,2,3] | + >>> # +---------| + >>> data = data.map(operations=PadEnd(pad_shape=[4], pad_value=10)) + >>> # Data after + >>> # | col | + >>> # +------------+ + >>> # | [1,2,3,10] | + >>> # +------------| + """ + + @check_pad_end + def __init__(self, pad_shape, pad_value=None): + if pad_value is not None: + pad_value = cde.Tensor(np.array(pad_value)) + super().__init__(cde.TensorShape(pad_shape), pad_value) + + +class Concatenate(cde.ConcatenateOp): + """ + Tensor operation to prepend and append to a tensor. + + Args: + axis (int, optional): axis to concatenate the tensors along (Default=0). + prepend (np.array, optional): numpy array to be prepended to the already concatenated tensors (Default=None). + append (np.array, optional): numpy array to be appended to the already concatenated tensors (Default=None). 
+ """ + + @check_concat_type + def __init__(self, axis=0, prepend=None, append=None): + if prepend is not None: + prepend = cde.Tensor(np.array(prepend)) + if append is not None: + append = cde.Tensor(np.array(append)) + super().__init__(axis, prepend, append) + + +class Duplicate(cde.DuplicateOp): + """ + Duplicate the input tensor to a new output tensor. The input tensor is carried over to the output list. + + Examples: + >>> # Data before + >>> # | x | + >>> # +---------+ + >>> # | [1,2,3] | + >>> # +---------+ + >>> data = data.map(input_columns=["x"], operations=Duplicate(), + >>> output_columns=["x", "y"], columns_order=["x", "y"]) + >>> # Data after + >>> # | x | y | + >>> # +---------+---------+ + >>> # | [1,2,3] | [1,2,3] | + >>> # +---------+---------+ + """ diff --git a/mindspore/dataset/transforms/validators.py b/mindspore/dataset/transforms/validators.py index 5572e5285e..6b5760e0c5 100644 --- a/mindspore/dataset/transforms/validators.py +++ b/mindspore/dataset/transforms/validators.py @@ -15,8 +15,9 @@ """Validators for TensorOps. 
""" from functools import wraps -from mindspore._c_expression import typing +import numpy as np +from mindspore._c_expression import typing # POS_INT_MIN is used to limit values from starting from 0 POS_INT_MIN = 1 @@ -159,6 +160,25 @@ def check_num_classes(method): return new_method +def check_fill_value(method): + """Wrapper method to check the parameters of fill value.""" + + @wraps(method) + def new_method(self, *args, **kwargs): + fill_value = (list(args) + [None])[0] + if "fill_value" in kwargs: + fill_value = kwargs.get("fill_value") + if fill_value is None: + raise ValueError("fill_value is not provided.") + if not isinstance(fill_value, (str, float, bool, int, bytes)): + raise TypeError("fill_value must be either a primitive python str, float, bool, bytes or int") + kwargs["fill_value"] = fill_value + + return method(self, **kwargs) + + return new_method + + def check_de_type(method): """Wrapper method to check the parameters of data type.""" @@ -177,3 +197,130 @@ def check_de_type(method): return method(self, **kwargs) return new_method + + +def check_slice_op(method): + """Wrapper method to check the parameters of slice.""" + + @wraps(method) + def new_method(self, *args): + for i, arg in enumerate(args): + if arg is not None and arg is not Ellipsis and not isinstance(arg, (int, slice, list)): + raise TypeError("Indexing of dim " + str(i) + "is not of valid type") + if isinstance(arg, list): + for a in arg: + if not isinstance(a, int): + raise TypeError("Index " + a + " is not an int") + return method(self, *args) + + return new_method + + +def check_mask_op(method): + """Wrapper method to check the parameters of mask.""" + + @wraps(method) + def new_method(self, *args, **kwargs): + operator, constant, dtype = (list(args) + 3 * [None])[:3] + if "operator" in kwargs: + operator = kwargs.get("operator") + if "constant" in kwargs: + constant = kwargs.get("constant") + if "dtype" in kwargs: + dtype = kwargs.get("dtype") + + if operator is None: + raise 
ValueError("operator is not provided.") + + if constant is None: + raise ValueError("constant is not provided.") + + from .c_transforms import Relational + if not isinstance(operator, Relational): + raise TypeError("operator is not a Relational operator enum.") + + if not isinstance(constant, (str, float, bool, int, bytes)): + raise TypeError("constant must be either a primitive python str, float, bool, bytes or int") + + if dtype is not None: + if not isinstance(dtype, typing.Type): + raise TypeError("dtype is not a MindSpore data type.") + kwargs["dtype"] = dtype + + kwargs["operator"] = operator + kwargs["constant"] = constant + + return method(self, **kwargs) + + return new_method + + +def check_pad_end(method): + """Wrapper method to check the parameters of PadEnd.""" + + @wraps(method) + def new_method(self, *args, **kwargs): + pad_shape, pad_value = (list(args) + 2 * [None])[:2] + if "pad_shape" in kwargs: + pad_shape = kwargs.get("pad_shape") + if "pad_value" in kwargs: + pad_value = kwargs.get("pad_value") + + if pad_shape is None: + raise ValueError("pad_shape is not provided.") + + if pad_value is not None: + if not isinstance(pad_value, (str, float, bool, int, bytes)): + raise TypeError("pad_value must be either a primitive python str, float, bool, int or bytes") + kwargs["pad_value"] = pad_value + + if not isinstance(pad_shape, list): + raise TypeError("pad_shape must be a list") + + for dim in pad_shape: + if dim is not None: + if isinstance(dim, int): + check_pos_int64(dim) + else: + raise TypeError("a value in the list is not an integer.") + + kwargs["pad_shape"] = pad_shape + + return method(self, **kwargs) + + return new_method + + +def check_concat_type(method): + """Wrapper method to check the parameters of concatenation op.""" + + @wraps(method) + def new_method(self, *args, **kwargs): + axis, prepend, append = (list(args) + 3 * [None])[:3] + if "prepend" in kwargs: + prepend = kwargs.get("prepend") + if "append" in kwargs: + append = 
kwargs.get("append") + if "axis" in kwargs: + axis = kwargs.get("axis") + + if axis is not None: + if not isinstance(axis, int): + raise TypeError("axis type is not valid, must be an integer.") + if axis not in (0, -1): + raise ValueError("only 1D concatenation supported.") + kwargs["axis"] = axis + + if prepend is not None: + if not isinstance(prepend, (type(None), np.ndarray)): + raise ValueError("prepend type is not valid, must be None for no prepend tensor or a numpy array.") + kwargs["prepend"] = prepend + + if append is not None: + if not isinstance(append, (type(None), np.ndarray)): + raise ValueError("append type is not valid, must be None for no append tensor or a numpy array.") + kwargs["append"] = append + + return method(self, **kwargs) + + return new_method diff --git a/mindspore/dataset/transforms/vision/c_transforms.py b/mindspore/dataset/transforms/vision/c_transforms.py index 5676a8408c..c2497f9629 100644 --- a/mindspore/dataset/transforms/vision/c_transforms.py +++ b/mindspore/dataset/transforms/vision/c_transforms.py @@ -45,7 +45,7 @@ import mindspore._c_dataengine as cde from .utils import Inter, Border from .validators import check_prob, check_crop, check_resize_interpolation, check_random_resize_crop, \ check_normalize_c, check_random_crop, check_random_color_adjust, check_random_rotation, \ - check_resize, check_rescale, check_pad, check_cutout, check_uniform_augment_cpp + check_resize, check_rescale, check_pad, check_cutout, check_uniform_augment_cpp, check_bounding_box_augment_cpp DE_C_INTER_MODE = {Inter.NEAREST: cde.InterpolationMode.DE_INTER_NEAREST_NEIGHBOUR, Inter.LINEAR: cde.InterpolationMode.DE_INTER_LINEAR, @@ -149,6 +149,54 @@ class RandomCrop(cde.RandomCropOp): super().__init__(*size, *padding, border_type, pad_if_needed, *fill_value) +class RandomCropWithBBox(cde.RandomCropWithBBoxOp): + """ + Crop the input image at a random location and adjust bounding boxes for crop area + + Args: + size (int or sequence): The output size of 
the cropped image. + If size is an int, a square crop of size (size, size) is returned. + If size is a sequence of length 2, it should be (height, width). + padding (int or sequence, optional): The number of pixels to pad the image (default=None). + If padding is not None, pad image firstly with padding values. + If a single number is provided, it pads all borders with this value. + If a tuple or list of 2 values are provided, it pads the (left and top) + with the first value and (right and bottom) with the second value. + If 4 values are provided as a list or tuple,it pads the left, top, right and bottom respectively. + pad_if_needed (bool, optional): Pad the image if either side is smaller than + the given output size (default=False). + fill_value (int or tuple, optional): The pixel intensity of the borders if + the padding_mode is Border.CONSTANT (default=0). If it is a 3-tuple, it is used to + fill R, G, B channels respectively. + padding_mode (Border mode, optional): The method of padding (default=Border.CONSTANT). Can be any of + [Border.CONSTANT, Border.EDGE, Border.REFLECT, Border.SYMMETRIC]. + + - Border.CONSTANT, means it fills the border with constant values. + + - Border.EDGE, means it pads with the last value on the edge. + + - Border.REFLECT, means it reflects the values on the edge omitting the last + value of edge. + + - Border.SYMMETRIC, means it reflects the values on the edge repeating the last + value of edge. 
+ """ + + @check_random_crop + def __init__(self, size, padding=None, pad_if_needed=False, fill_value=0, padding_mode=Border.CONSTANT): + self.size = size + self.padding = padding + self.pad_if_needed = pad_if_needed + self.fill_value = fill_value + self.padding_mode = padding_mode.value + if padding is None: + padding = (0, 0, 0, 0) + if isinstance(fill_value, int): # temporary fix + fill_value = tuple([fill_value] * 3) + border_type = DE_C_BORDER_TYPE[padding_mode] + super().__init__(*size, *padding, border_type, pad_if_needed, *fill_value) + + class RandomHorizontalFlip(cde.RandomHorizontalFlipOp): """ Flip the input image horizontally, randomly with a given probability. @@ -163,6 +211,21 @@ class RandomHorizontalFlip(cde.RandomHorizontalFlipOp): super().__init__(prob) +class RandomHorizontalFlipWithBBox(cde.RandomHorizontalFlipWithBBoxOp): + """ + Flip the input image horizontally, randomly with a given probability. + Maintains data integrity by also flipping bounding boxes in an object detection pipeline. + + Args: + prob (float): Probability of the image being flipped (default=0.5). + """ + + @check_prob + def __init__(self, prob=0.5): + self.prob = prob + super().__init__(prob) + + class RandomVerticalFlip(cde.RandomVerticalFlipOp): """ Flip the input image vertically, randomly with a given probability. @@ -177,6 +240,38 @@ class RandomVerticalFlip(cde.RandomVerticalFlipOp): super().__init__(prob) +class RandomVerticalFlipWithBBox(cde.RandomVerticalFlipWithBBoxOp): + """ + Flip the input image vertically, randomly with a given probability and adjust bounding boxes as well + + Args: + prob (float, optional): Probability of the image being flipped (default=0.5). + """ + + @check_prob + def __init__(self, prob=0.5): + self.prob = prob + super().__init__(prob) + + +class BoundingBoxAugment(cde.BoundingBoxAugmentOp): + """ + Apply a given image transform on a random selection of bounding box regions + of a given image. 
+ + Args: + transform: C++ transformation function to be applied on random selection + of bounding box regions of a given image. + ratio (float, optional): Ratio of bounding boxes to apply augmentation on. + Range: [0,1] (default=0.3). + """ + @check_bounding_box_augment_cpp + def __init__(self, transform, ratio=0.3): + self.ratio = ratio + self.transform = transform + super().__init__(transform, ratio) + + class Resize(cde.ResizeOp): """ Resize the input image to the given size. @@ -207,6 +302,42 @@ class Resize(cde.ResizeOp): super().__init__(*size, interpoltn) +class RandomResizedCropWithBBox(cde.RandomCropAndResizeWithBBoxOp): + """ + Crop the input image to a random size and aspect ratio and adjust the Bounding Boxes accordingly + + Args: + size (int or sequence): The size of the output image. + If size is an int, a square crop of size (size, size) is returned. + If size is a sequence of length 2, it should be (height, width). + scale (tuple, optional): Range (min, max) of respective size of the original + size to be cropped (default=(0.08, 1.0)). + ratio (tuple, optional): Range (min, max) of aspect ratio to be cropped + (default=(3. / 4., 4. / 3.)). + interpolation (Inter mode, optional): Image interpolation mode (default=Inter.BILINEAR). + It can be any of [Inter.BILINEAR, Inter.NEAREST, Inter.BICUBIC]. + + - Inter.BILINEAR, means interpolation method is bilinear interpolation. + + - Inter.NEAREST, means interpolation method is nearest-neighbor interpolation. + + - Inter.BICUBIC, means interpolation method is bicubic interpolation. + + max_attempts (int, optional): The maximum number of attempts to propose a valid + crop_area (default=10). If exceeded, fall back to use center_crop instead. + """ + @check_random_resize_crop + def __init__(self, size, scale=(0.08, 1.0), ratio=(3. / 4., 4. 
/ 3.), + interpolation=Inter.BILINEAR, max_attempts=10): + self.size = size + self.scale = scale + self.ratio = ratio + self.interpolation = interpolation + self.max_attempts = max_attempts + interpoltn = DE_C_INTER_MODE[interpolation] + super().__init__(*size, *scale, *ratio, interpoltn, max_attempts) + + class RandomResizedCrop(cde.RandomCropAndResizeOp): """ Crop the input image to a random size and aspect ratio. diff --git a/mindspore/dataset/transforms/vision/py_transforms.py b/mindspore/dataset/transforms/vision/py_transforms.py index ee5a4b09fd..b252c3434b 100644 --- a/mindspore/dataset/transforms/vision/py_transforms.py +++ b/mindspore/dataset/transforms/vision/py_transforms.py @@ -606,7 +606,7 @@ class RandomRotation: class RandomOrder: """ - Perform a series of transforms to the input PIL image in a random oreder. + Perform a series of transforms to the input PIL image in a random order. Args: transforms (list): List of the transformations to be applied. @@ -1087,7 +1087,7 @@ class RandomAffine: The horizontal and vertical shift is selected randomly from the range: (-tx*width, tx*width) and (-ty*height, ty*height), respectively. If None, no translations gets applied. - scale (sequence, optional): Scaling factor interval (default=None, riginal scale is used). + scale (sequence, optional): Scaling factor interval (default=None, original scale is used). shear (int or float or sequence, optional): Range of shear factor (default=None). If a number 'shear', then a shear parallel to the x axis in the range of (-shear, +shear) is applied. 
If a tuple or list of size 2, then a shear parallel to the x axis in the range of (shear[0], shear[1]) diff --git a/mindspore/dataset/transforms/vision/py_transforms_util.py b/mindspore/dataset/transforms/vision/py_transforms_util.py index ac77624bf8..d076109ff4 100644 --- a/mindspore/dataset/transforms/vision/py_transforms_util.py +++ b/mindspore/dataset/transforms/vision/py_transforms_util.py @@ -455,6 +455,9 @@ def random_crop(img, size, padding, pad_if_needed, fill_value, padding_mode): def _input_to_factor(img, size): img_width, img_height = img.size height, width = size + if height > img_height or width > img_width: + raise ValueError("Crop size {} is larger than input image size {}".format(size, (img_height, img_width))) + if width == img_width and height == img_height: return 0, 0, img_height, img_width @@ -551,26 +554,28 @@ def adjust_hue(img, hue_factor): Returns: img (PIL Image), Hue adjusted image. """ - if not -0.5 <= hue_factor <= 0.5: - raise ValueError('hue_factor {} is not in [-0.5, 0.5].'.format(hue_factor)) + image = img + image_hue_factor = hue_factor + if not -0.5 <= image_hue_factor <= 0.5: + raise ValueError('image_hue_factor {} is not in [-0.5, 0.5].'.format(image_hue_factor)) - if not is_pil(img): - raise TypeError(augment_error_message.format(type(img))) + if not is_pil(image): + raise TypeError(augment_error_message.format(type(image))) - input_mode = img.mode - if input_mode in {'L', '1', 'I', 'F'}: - return img + mode = image.mode + if mode in {'L', '1', 'I', 'F'}: + return image - h, s, v = img.convert('HSV').split() + hue, saturation, value = img.convert('HSV').split() - np_h = np.array(h, dtype=np.uint8) + np_hue = np.array(hue, dtype=np.uint8) with np.errstate(over='ignore'): - np_h += np.uint8(hue_factor * 255) - h = Image.fromarray(np_h, 'L') + np_hue += np.uint8(image_hue_factor * 255) + hue = Image.fromarray(np_hue, 'L') - img = Image.merge('HSV', (h, s, v)).convert(input_mode) - return img + image = Image.merge('HSV', (hue, 
saturation, value)).convert(mode) + return image def to_type(img, output_type): diff --git a/mindspore/dataset/transforms/vision/validators.py b/mindspore/dataset/transforms/vision/validators.py index 20239232b5..b49116349b 100644 --- a/mindspore/dataset/transforms/vision/validators.py +++ b/mindspore/dataset/transforms/vision/validators.py @@ -852,6 +852,32 @@ def check_uniform_augment_cpp(method): return new_method +def check_bounding_box_augment_cpp(method): + """Wrapper method to check the parameters of BoundingBoxAugment cpp op.""" + + @wraps(method) + def new_method(self, *args, **kwargs): + transform, ratio = (list(args) + 2 * [None])[:2] + if "transform" in kwargs: + transform = kwargs.get("transform") + if "ratio" in kwargs: + ratio = kwargs.get("ratio") + if not isinstance(ratio, float) and not isinstance(ratio, int): + raise ValueError("Ratio should be an int or float.") + if ratio is not None: + check_value(ratio, [0., 1.]) + kwargs["ratio"] = ratio + else: + ratio = 0.3 + if not isinstance(transform, TensorOp): + raise ValueError("Transform can only be a C++ operation.") + kwargs["transform"] = transform + kwargs["ratio"] = ratio + return method(self, **kwargs) + + return new_method + + def check_uniform_augment_py(method): """Wrapper method to check the parameters of python UniformAugment op.""" diff --git a/mindspore/mindrecord/__init__.py b/mindspore/mindrecord/__init__.py index 31fb801c46..ee23b68cb6 100644 --- a/mindspore/mindrecord/__init__.py +++ b/mindspore/mindrecord/__init__.py @@ -29,9 +29,11 @@ from .common.exceptions import * from .shardutils import SUCCESS, FAILED from .tools.cifar10_to_mr import Cifar10ToMR from .tools.cifar100_to_mr import Cifar100ToMR +from .tools.csv_to_mr import CsvToMR from .tools.imagenet_to_mr import ImageNetToMR from .tools.mnist_to_mr import MnistToMR +from .tools.tfrecord_to_mr import TFRecordToMR __all__ = ['FileWriter', 'FileReader', 'MindPage', - 'Cifar10ToMR', 'Cifar100ToMR', 'ImageNetToMR', 'MnistToMR', + 
'Cifar10ToMR', 'Cifar100ToMR', 'CsvToMR', 'ImageNetToMR', 'MnistToMR', 'TFRecordToMR', 'SUCCESS', 'FAILED'] diff --git a/mindspore/mindrecord/tools/csv_to_mr.py b/mindspore/mindrecord/tools/csv_to_mr.py new file mode 100644 index 0000000000..4bc8f37b47 --- /dev/null +++ b/mindspore/mindrecord/tools/csv_to_mr.py @@ -0,0 +1,168 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +""" +Csv format convert tool for MindRecord. +""" +from importlib import import_module +import os + +from mindspore import log as logger +from ..filewriter import FileWriter +from ..shardutils import check_filename + +try: + pd = import_module("pandas") +except ModuleNotFoundError: + pd = None + +__all__ = ['CsvToMR'] + +class CsvToMR: + """ + Class is for transformation from csv to MindRecord. + + Args: + source (str): the file path of csv. + destination (str): the MindRecord file path to transform into. + columns_list(list[str], optional): List of columns to be read(default=None). + partition_number (int, optional): partition size (default=1). + + Raises: + ValueError: If source, destination, partition_number is invalid. + RuntimeError: If columns_list is invalid. 
+ """ + + def __init__(self, source, destination, columns_list=None, partition_number=1): + if not pd: + raise Exception("Module pandas is not found, please use pip install it.") + if isinstance(source, str): + check_filename(source) + self.source = source + else: + raise ValueError("The parameter source must be str.") + + self._check_columns(columns_list, "columns_list") + self.columns_list = columns_list + + if isinstance(destination, str): + check_filename(destination) + self.destination = destination + else: + raise ValueError("The parameter destination must be str.") + + if partition_number is not None: + if not isinstance(partition_number, int): + raise ValueError("The parameter partition_number must be int") + self.partition_number = partition_number + else: + raise ValueError("The parameter partition_number must be int") + + self.writer = FileWriter(self.destination, self.partition_number) + + def _check_columns(self, columns, columns_name): + if columns: + if isinstance(columns, list): + for col in columns: + if not isinstance(col, str): + raise ValueError("The parameter {} must be list of str.".format(columns_name)) + else: + raise ValueError("The parameter {} must be list of str.".format(columns_name)) + + def _get_schema(self, df): + """ + Construct schema from df columns + """ + if self.columns_list: + for col in self.columns_list: + if col not in df.columns: + raise RuntimeError("The parameter columns_list is illegal, column {} does not exist.".format(col)) + else: + self.columns_list = df.columns + + schema = {} + for col in self.columns_list: + if str(df[col].dtype) == 'int64': + schema[col] = {"type": "int64"} + elif str(df[col].dtype) == 'float64': + schema[col] = {"type": "float64"} + elif str(df[col].dtype) == 'bool': + schema[col] = {"type": "int32"} + else: + schema[col] = {"type": "string"} + if not schema: + raise RuntimeError("Failed to generate schema from csv file.") + return schema + + def _get_row_of_csv(self, df): + """Get row data 
from csv file.""" + for _, r in df.iterrows(): + row = {} + for col in self.columns_list: + if str(df[col].dtype) == 'bool': + row[col] = int(r[col]) + else: + row[col] = r[col] + yield row + + def transform(self): + """ + Executes transformation from csv to MindRecord. + + Returns: + SUCCESS/FAILED, whether successfully written into MindRecord. + """ + if not os.path.exists(self.source): + raise IOError("Csv file {} does not exist.".format(self.source)) + + pd.set_option('display.max_columns', None) + df = pd.read_csv(self.source) + + csv_schema = self._get_schema(df) + + logger.info("transformed MindRecord schema is: {}".format(csv_schema)) + + # set the header size + self.writer.set_header_size(1 << 24) + + # set the page size + self.writer.set_page_size(1 << 26) + + # create the schema + self.writer.add_schema(csv_schema, "csv_schema") + + # add the index + self.writer.add_index(list(self.columns_list)) + + csv_iter = self._get_row_of_csv(df) + batch_size = 256 + transform_count = 0 + while True: + data_list = [] + try: + for _ in range(batch_size): + data_list.append(csv_iter.__next__()) + transform_count += 1 + self.writer.write_raw_data(data_list) + logger.info("transformed {} record...".format(transform_count)) + except StopIteration: + if data_list: + self.writer.write_raw_data(data_list) + logger.info( + "transformed {} record...".format(transform_count)) + break + + ret = self.writer.commit() + + return ret diff --git a/mindspore/mindrecord/tools/tfrecord_to_mr.py b/mindspore/mindrecord/tools/tfrecord_to_mr.py new file mode 100644 index 0000000000..e8c52001fd --- /dev/null +++ b/mindspore/mindrecord/tools/tfrecord_to_mr.py @@ -0,0 +1,266 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +TFRecord convert tool for MindRecord +""" + +from importlib import import_module +from string import punctuation +import numpy as np + +from mindspore import log as logger +from ..filewriter import FileWriter +from ..shardutils import check_filename + +try: + tf = import_module("tensorflow") # just used to convert tfrecord to mindrecord +except ModuleNotFoundError: + tf = None + +__all__ = ['TFRecordToMR'] + +SupportedTensorFlowVersion = '2.1.0' + +def _cast_type(value): + """ + Cast complex data type to basic datatype for MindRecord to recognize. + + Args: + value: the TFRecord data type + + Returns: + str, which is MindRecord field type. 
+ """ + tf_type_to_mr_type = {tf.string: "string", + tf.int8: "int32", + tf.int16: "int32", + tf.int32: "int32", + tf.int64: "int64", + tf.uint8: "int32", + tf.uint16: "int32", + tf.uint32: "int64", + tf.uint64: "int64", + tf.float16: "float32", + tf.float32: "float32", + tf.float64: "float64", + tf.double: "float64", + tf.bool: "int32"} + unsupport_tf_type_to_mr_type = {tf.complex64: "None", + tf.complex128: "None"} + + if value in tf_type_to_mr_type: + return tf_type_to_mr_type[value] + + raise ValueError("Type " + value + " is not supported in MindRecord.") + +def _cast_string_type_to_np_type(value): + """Cast string type like: int32/int64/float32/float64 to np.int32/np.int64/np.float32/np.float64""" + string_type_to_np_type = {"int32": np.int32, + "int64": np.int64, + "float32": np.float32, + "float64": np.float64} + + if value in string_type_to_np_type: + return string_type_to_np_type[value] + + raise ValueError("Type " + value + " is not supported cast to numpy type in MindRecord.") + +def _cast_name(key): + """ + Cast schema names which contain special characters to valid names. + + Here special characters means any characters in + '!"#$%&\'()*+,./:;<=>?@[\\]^`{|}~ + Valid names can only contain a-z, A-Z, and 0-9 and _ + + Args: + key (str): original key that might contain special characters. + + Returns: + str, casted key that replaces the special characters with "_". i.e. if + key is "a b" then returns "a_b". + """ + special_symbols = set('{}{}'.format(punctuation, ' ')) + special_symbols.remove('_') + new_key = ['_' if x in special_symbols else x for x in key] + casted_key = ''.join(new_key) + return casted_key + +class TFRecordToMR: + """ + Class is for transformation from TFRecord to MindRecord. + + Args: + source (str): the TFRecord file to be transformed. + destination (str): the MindRecord file path to transform into. + feature_dict (dict): a dictionary that states the feature type, i.e.
+ feature_dict = {"xxxx": tf.io.FixedLenFeature([], tf.string), \ + "yyyy": tf.io.FixedLenFeature([], tf.int64)} + + **Follow case which uses VarLenFeature not support** + + feature_dict = {"context": {"xxxx": tf.io.FixedLenFeature([], tf.string), \ + "yyyy": tf.io.VarLenFeature(tf.int64)}, \ + "sequence": {"zzzz": tf.io.FixedLenSequenceFeature([], tf.float32)}} + bytes_fields (list): the bytes fields which are in feature_dict. + + Raises: + ValueError: If parameter is invalid. + Exception: when tensorflow module not found or version is not correct. + """ + def __init__(self, source, destination, feature_dict, bytes_fields=None): + if not tf: + raise Exception("Module tensorflow is not found, please use pip install it.") + + if tf.__version__ < SupportedTensorFlowVersion: + raise Exception("Module tensorflow version must be greater or equal {}.".format(SupportedTensorFlowVersion)) + + if not isinstance(source, str): + raise ValueError("Parameter source must be string.") + check_filename(source) + + if not isinstance(destination, str): + raise ValueError("Parameter destination must be string.") + check_filename(destination) + + self.source = source + self.destination = destination + + if feature_dict is None or not isinstance(feature_dict, dict): + raise ValueError("Parameter feature_dict is None or not dict.") + + for key, val in feature_dict.items(): + if not isinstance(val, tf.io.FixedLenFeature): + raise ValueError("Parameter feature_dict: {} only support FixedLenFeature.".format(feature_dict)) + + self.feature_dict = feature_dict + + bytes_fields_list = [] + if bytes_fields: + if not isinstance(bytes_fields, list): + raise ValueError("Parameter bytes_fields: {} must be list(str).".format(bytes_fields)) + for item in bytes_fields: + if not isinstance(item, str): + raise ValueError("Parameter bytes_fields's item: {} is not str.".format(item)) + + if item not in self.feature_dict: + raise ValueError("Parameter bytes_fields's item: {} is not in feature_dict: {}." 
+ .format(item, self.feature_dict)) + + if not isinstance(self.feature_dict[item].shape, list): + raise ValueError("Parameter feature_dict[{}].shape should be a list.".format(item)) + + casted_bytes_field = _cast_name(item) + bytes_fields_list.append(casted_bytes_field) + + self.bytes_fields_list = bytes_fields_list + self.scalar_set = set() + self.list_set = set() + + mindrecord_schema = {} + for key, val in self.feature_dict.items(): + if not val.shape: + self.scalar_set.add(_cast_name(key)) + if key in self.bytes_fields_list: + mindrecord_schema[_cast_name(key)] = {"type": "bytes"} + else: + mindrecord_schema[_cast_name(key)] = {"type": _cast_type(val.dtype)} + else: + if len(val.shape) != 1: + raise ValueError("Parameter len(feature_dict[{}].shape) should be 1.") + if val.shape[0] < 1: + raise ValueError("Parameter feature_dict[{}].shape[0] should > 0".format(key)) + if val.dtype == tf.string: + raise ValueError("Parameter feature_dict[{}].dtype is tf.string which shape[0] \ + is not None.
It is not supported.".format(key)) + self.list_set.add(_cast_name(key)) + mindrecord_schema[_cast_name(key)] = {"type": _cast_type(val.dtype), "shape": [val.shape[0]]} + self.mindrecord_schema = mindrecord_schema + + def _parse_record(self, example): + """Returns features for a single example""" + features = tf.io.parse_single_example(example, features=self.feature_dict) + return features + + def _get_data_when_scalar_field(self, ms_dict, cast_key, key, val): + """put data in ms_dict when field type is string""" + if isinstance(val.numpy(), (np.ndarray, list)): + raise ValueError("The response key: {}, value: {} from TFRecord should be a scalar.".format(key, val)) + if self.feature_dict[key].dtype == tf.string: + if cast_key in self.bytes_fields_list: + ms_dict[cast_key] = val.numpy() + else: + ms_dict[cast_key] = str(val.numpy(), encoding="utf-8") + elif _cast_type(self.feature_dict[key].dtype).startswith("int"): + ms_dict[cast_key] = int(val.numpy()) + else: + ms_dict[cast_key] = float(val.numpy()) + + def tfrecord_iterator(self): + """Yield a dict with key to be fields in schema, and value to be data.""" + dataset = tf.data.TFRecordDataset(self.source) + dataset = dataset.map(self._parse_record) + iterator = dataset.__iter__() + index_id = 0 + try: + for features in iterator: + ms_dict = {} + index_id = index_id + 1 + for key, val in features.items(): + cast_key = _cast_name(key) + if key in self.scalar_set: + self._get_data_when_scalar_field(ms_dict, cast_key, key, val) + else: + if not isinstance(val.numpy(), np.ndarray) and not isinstance(val.numpy(), list): + raise ValueError("The response key: {}, value: {} from TFRecord should be a ndarray or list."
+ .format(key, val)) + # list set + ms_dict[cast_key] = \ + np.asarray(val, _cast_string_type_to_np_type(self.mindrecord_schema[cast_key]["type"])) + yield ms_dict + except tf.errors.InvalidArgumentError: + raise ValueError("TFRecord feature_dict parameter error.") + + def transform(self): + """ + Executes transformation from TFRecord to MindRecord. + + Returns: + SUCCESS/FAILED, whether successfully written into MindRecord. + """ + writer = FileWriter(self.destination) + logger.info("Transformed MindRecord schema is: {}, TFRecord feature dict is: {}" + .format(self.mindrecord_schema, self.feature_dict)) + + writer.add_schema(self.mindrecord_schema, "TFRecord to MindRecord") + + tf_iter = self.tfrecord_iterator() + batch_size = 256 + + transform_count = 0 + while True: + data_list = [] + try: + for _ in range(batch_size): + data_list.append(tf_iter.__next__()) + transform_count += 1 + + writer.write_raw_data(data_list) + logger.info("Transformed {} records...".format(transform_count)) + except StopIteration: + if data_list: + writer.write_raw_data(data_list) + logger.info("Transformed {} records...".format(transform_count)) + break + return writer.commit() diff --git a/mindspore/model_zoo/mobilenetV2.py b/mindspore/model_zoo/mobilenetV2.py deleted file mode 100644 index df35c5f369..0000000000 --- a/mindspore/model_zoo/mobilenetV2.py +++ /dev/null @@ -1,291 +0,0 @@ -# Copyright 2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.
-# ============================================================================ -"""MobileNetV2 model define""" -import numpy as np -import mindspore.nn as nn -from mindspore.ops import operations as P -from mindspore.ops.operations import TensorAdd -from mindspore import Parameter, Tensor -from mindspore.common.initializer import initializer - -__all__ = ['mobilenet_v2'] - - -def _make_divisible(v, divisor, min_value=None): - if min_value is None: - min_value = divisor - new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) - # Make sure that round down does not go down by more than 10%. - if new_v < 0.9 * v: - new_v += divisor - return new_v - - -class GlobalAvgPooling(nn.Cell): - """ - Global avg pooling definition. - - Args: - - Returns: - Tensor, output tensor. - - Examples: - >>> GlobalAvgPooling() - """ - - def __init__(self): - super(GlobalAvgPooling, self).__init__() - self.mean = P.ReduceMean(keep_dims=False) - - def construct(self, x): - x = self.mean(x, (2, 3)) - return x - - -class DepthwiseConv(nn.Cell): - """ - Depthwise Convolution warpper definition. - - Args: - in_planes (int): Input channel. - kernel_size (int): Input kernel size. - stride (int): Stride size. - pad_mode (str): pad mode in (pad, same, valid) - channel_multiplier (int): Output channel multiplier - has_bias (bool): has bias or not - - Returns: - Tensor, output tensor. 
- - Examples: - >>> DepthwiseConv(16, 3, 1, 'pad', 1, channel_multiplier=1) - """ - - def __init__(self, in_planes, kernel_size, stride, pad_mode, pad, channel_multiplier=1, has_bias=False): - super(DepthwiseConv, self).__init__() - self.has_bias = has_bias - self.in_channels = in_planes - self.channel_multiplier = channel_multiplier - self.out_channels = in_planes * channel_multiplier - self.kernel_size = (kernel_size, kernel_size) - self.depthwise_conv = P.DepthwiseConv2dNative(channel_multiplier=channel_multiplier, - kernel_size=self.kernel_size, - stride=stride, pad_mode=pad_mode, pad=pad) - self.bias_add = P.BiasAdd() - weight_shape = [channel_multiplier, in_planes, *self.kernel_size] - self.weight = Parameter(initializer('ones', weight_shape), name='weight') - - if has_bias: - bias_shape = [channel_multiplier * in_planes] - self.bias = Parameter(initializer('zeros', bias_shape), name='bias') - else: - self.bias = None - - def construct(self, x): - output = self.depthwise_conv(x, self.weight) - if self.has_bias: - output = self.bias_add(output, self.bias) - return output - - -class ConvBNReLU(nn.Cell): - """ - Convolution/Depthwise fused with Batchnorm and ReLU block definition. - - Args: - in_planes (int): Input channel. - out_planes (int): Output channel. - kernel_size (int): Input kernel size. - stride (int): Stride size for the first convolutional layer. Default: 1. - groups (int): channel group. Convolution is 1 while Depthiwse is input channel. Default: 1. - - Returns: - Tensor, output tensor. 
- - Examples: - >>> ConvBNReLU(16, 256, kernel_size=1, stride=1, groups=1) - """ - - def __init__(self, platform, in_planes, out_planes, kernel_size=3, stride=1, groups=1): - super(ConvBNReLU, self).__init__() - padding = (kernel_size - 1) // 2 - if groups == 1: - conv = nn.Conv2d(in_planes, out_planes, kernel_size, stride, pad_mode='pad', padding=padding) - else: - if platform == "Ascend": - conv = DepthwiseConv(in_planes, kernel_size, stride, pad_mode='pad', pad=padding) - elif platform == "GPU": - conv = nn.Conv2d(in_planes, out_planes, kernel_size, stride, - group=in_planes, pad_mode='pad', padding=padding) - - layers = [conv, nn.BatchNorm2d(out_planes), nn.ReLU6()] - self.features = nn.SequentialCell(layers) - - def construct(self, x): - output = self.features(x) - return output - - -class InvertedResidual(nn.Cell): - """ - Mobilenetv2 residual block definition. - - Args: - inp (int): Input channel. - oup (int): Output channel. - stride (int): Stride size for the first convolutional layer. Default: 1. - expand_ratio (int): expand ration of input channel - - Returns: - Tensor, output tensor. 
- - Examples: - >>> ResidualBlock(3, 256, 1, 1) - """ - - def __init__(self, platform, inp, oup, stride, expand_ratio): - super(InvertedResidual, self).__init__() - assert stride in [1, 2] - - hidden_dim = int(round(inp * expand_ratio)) - self.use_res_connect = stride == 1 and inp == oup - - layers = [] - if expand_ratio != 1: - layers.append(ConvBNReLU(platform, inp, hidden_dim, kernel_size=1)) - layers.extend([ - # dw - ConvBNReLU(platform, hidden_dim, hidden_dim, - stride=stride, groups=hidden_dim), - # pw-linear - nn.Conv2d(hidden_dim, oup, kernel_size=1, - stride=1, has_bias=False), - nn.BatchNorm2d(oup), - ]) - self.conv = nn.SequentialCell(layers) - self.add = TensorAdd() - self.cast = P.Cast() - - def construct(self, x): - identity = x - x = self.conv(x) - if self.use_res_connect: - return self.add(identity, x) - return x - - -class MobileNetV2(nn.Cell): - """ - MobileNetV2 architecture. - - Args: - class_num (Cell): number of classes. - width_mult (int): Channels multiplier for round to 8/16 and others. Default is 1. - has_dropout (bool): Is dropout used. Default is false - inverted_residual_setting (list): Inverted residual settings. Default is None - round_nearest (list): Channel round to . Default is 8 - Returns: - Tensor, output tensor. 
- - Examples: - >>> MobileNetV2(num_classes=1000) - """ - - def __init__(self, platform, num_classes=1000, width_mult=1., - has_dropout=False, inverted_residual_setting=None, round_nearest=8): - super(MobileNetV2, self).__init__() - block = InvertedResidual - input_channel = 32 - last_channel = 1280 - # setting of inverted residual blocks - self.cfgs = inverted_residual_setting - if inverted_residual_setting is None: - self.cfgs = [ - # t, c, n, s - [1, 16, 1, 1], - [6, 24, 2, 2], - [6, 32, 3, 2], - [6, 64, 4, 2], - [6, 96, 3, 1], - [6, 160, 3, 2], - [6, 320, 1, 1], - ] - - # building first layer - input_channel = _make_divisible(input_channel * width_mult, round_nearest) - self.out_channels = _make_divisible(last_channel * max(1.0, width_mult), round_nearest) - features = [ConvBNReLU(platform, 3, input_channel, stride=2)] - # building inverted residual blocks - for t, c, n, s in self.cfgs: - output_channel = _make_divisible(c * width_mult, round_nearest) - for i in range(n): - stride = s if i == 0 else 1 - features.append(block(platform, input_channel, output_channel, stride, expand_ratio=t)) - input_channel = output_channel - # building last several layers - features.append(ConvBNReLU(platform, input_channel, self.out_channels, kernel_size=1)) - # make it nn.CellList - self.features = nn.SequentialCell(features) - # mobilenet head - head = ([GlobalAvgPooling(), nn.Dense(self.out_channels, num_classes, has_bias=True)] if not has_dropout else - [GlobalAvgPooling(), nn.Dropout(0.2), nn.Dense(self.out_channels, num_classes, has_bias=True)]) - self.head = nn.SequentialCell(head) - - self._initialize_weights() - - def construct(self, x): - x = self.features(x) - x = self.head(x) - return x - - def _initialize_weights(self): - """ - Initialize weights. - - Args: - - Returns: - None. 
- - Examples: - >>> _initialize_weights() - """ - for _, m in self.cells_and_names(): - if isinstance(m, (nn.Conv2d, DepthwiseConv)): - n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels - m.weight.set_parameter_data(Tensor(np.random.normal(0, np.sqrt(2. / n), - m.weight.data.shape()).astype("float32"))) - if m.bias is not None: - m.bias.set_parameter_data( - Tensor(np.zeros(m.bias.data.shape(), dtype="float32"))) - elif isinstance(m, nn.BatchNorm2d): - m.gamma.set_parameter_data( - Tensor(np.ones(m.gamma.data.shape(), dtype="float32"))) - m.beta.set_parameter_data( - Tensor(np.zeros(m.beta.data.shape(), dtype="float32"))) - elif isinstance(m, nn.Dense): - m.weight.set_parameter_data(Tensor(np.random.normal( - 0, 0.01, m.weight.data.shape()).astype("float32"))) - if m.bias is not None: - m.bias.set_parameter_data( - Tensor(np.zeros(m.bias.data.shape(), dtype="float32"))) - - -def mobilenet_v2(**kwargs): - """ - Constructs a MobileNet V2 model - """ - return MobileNetV2(**kwargs) diff --git a/mindspore/model_zoo/mobilenetV3.py b/mindspore/model_zoo/mobilenetV3.py deleted file mode 100644 index 820e60493f..0000000000 --- a/mindspore/model_zoo/mobilenetV3.py +++ /dev/null @@ -1,390 +0,0 @@ -# Copyright 2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================ -"""MobileNetV3 model define""" -from functools import partial -import numpy as np -import mindspore.nn as nn -from mindspore.ops import operations as P -from mindspore import Tensor - - -__all__ = ['mobilenet_v3_large', - 'mobilenet_v3_small'] - - -def _make_divisible(x, divisor=8): - return int(np.ceil(x * 1. / divisor) * divisor) - - -class Activation(nn.Cell): - """ - Activation definition. - - Args: - act_func(string): activation name. - - Returns: - Tensor, output tensor. - """ - - def __init__(self, act_func): - super(Activation, self).__init__() - if act_func == 'relu': - self.act = nn.ReLU() - elif act_func == 'relu6': - self.act = nn.ReLU6() - elif act_func in ('hsigmoid', 'hard_sigmoid'): - self.act = nn.HSigmoid() - elif act_func in ('hswish', 'hard_swish'): - self.act = nn.HSwish() - else: - raise NotImplementedError - - def construct(self, x): - return self.act(x) - - -class GlobalAvgPooling(nn.Cell): - """ - Global avg pooling definition. - - Args: - - Returns: - Tensor, output tensor. - - Examples: - >>> GlobalAvgPooling() - """ - - def __init__(self, keep_dims=False): - super(GlobalAvgPooling, self).__init__() - self.mean = P.ReduceMean(keep_dims=keep_dims) - - def construct(self, x): - x = self.mean(x, (2, 3)) - return x - - -class SE(nn.Cell): - """ - SE warpper definition. - - Args: - num_out (int): Output channel. - ratio (int): middle output ratio. - - Returns: - Tensor, output tensor. 
- - Examples: - >>> SE(4) - """ - - def __init__(self, num_out, ratio=4): - super(SE, self).__init__() - num_mid = _make_divisible(num_out // ratio) - self.pool = GlobalAvgPooling(keep_dims=True) - self.conv1 = nn.Conv2d(in_channels=num_out, out_channels=num_mid, - kernel_size=1, has_bias=True, pad_mode='pad') - self.act1 = Activation('relu') - self.conv2 = nn.Conv2d(in_channels=num_mid, out_channels=num_out, - kernel_size=1, has_bias=True, pad_mode='pad') - self.act2 = Activation('hsigmoid') - self.mul = P.Mul() - - def construct(self, x): - out = self.pool(x) - out = self.conv1(out) - out = self.act1(out) - out = self.conv2(out) - out = self.act2(out) - out = self.mul(x, out) - return out - - -class Unit(nn.Cell): - """ - Unit warpper definition. - - Args: - num_in (int): Input channel. - num_out (int): Output channel. - kernel_size (int): Input kernel size. - stride (int): Stride size. - padding (int): Padding number. - num_groups (int): Output num group. - use_act (bool): Used activation or not. - act_type (string): Activation type. - - Returns: - Tensor, output tensor. - - Examples: - >>> Unit(3, 3) - """ - - def __init__(self, num_in, num_out, kernel_size=1, stride=1, padding=0, num_groups=1, - use_act=True, act_type='relu'): - super(Unit, self).__init__() - self.conv = nn.Conv2d(in_channels=num_in, - out_channels=num_out, - kernel_size=kernel_size, - stride=stride, - padding=padding, - group=num_groups, - has_bias=False, - pad_mode='pad') - self.bn = nn.BatchNorm2d(num_out) - self.use_act = use_act - self.act = Activation(act_type) if use_act else None - - def construct(self, x): - out = self.conv(x) - out = self.bn(out) - if self.use_act: - out = self.act(out) - return out - - -class ResUnit(nn.Cell): - """ - ResUnit warpper definition. - - Args: - num_in (int): Input channel. - num_mid (int): Middle channel. - num_out (int): Output channel. - kernel_size (int): Input kernel size. - stride (int): Stride size. - act_type (str): Activation type. 
- use_se (bool): Use SE warpper or not. - - Returns: - Tensor, output tensor. - - Examples: - >>> ResUnit(16, 3, 1, 1) - """ - def __init__(self, num_in, num_mid, num_out, kernel_size, stride=1, act_type='relu', use_se=False): - super(ResUnit, self).__init__() - self.use_se = use_se - self.first_conv = (num_out != num_mid) - self.use_short_cut_conv = True - - if self.first_conv: - self.expand = Unit(num_in, num_mid, kernel_size=1, - stride=1, padding=0, act_type=act_type) - else: - self.expand = None - self.conv1 = Unit(num_mid, num_mid, kernel_size=kernel_size, stride=stride, - padding=self._get_pad(kernel_size), act_type=act_type, num_groups=num_mid) - if use_se: - self.se = SE(num_mid) - self.conv2 = Unit(num_mid, num_out, kernel_size=1, stride=1, - padding=0, act_type=act_type, use_act=False) - if num_in != num_out or stride != 1: - self.use_short_cut_conv = False - self.add = P.TensorAdd() if self.use_short_cut_conv else None - - def construct(self, x): - if self.first_conv: - out = self.expand(x) - else: - out = x - out = self.conv1(out) - if self.use_se: - out = self.se(out) - out = self.conv2(out) - if self.use_short_cut_conv: - out = self.add(x, out) - return out - - def _get_pad(self, kernel_size): - """set the padding number""" - pad = 0 - if kernel_size == 1: - pad = 0 - elif kernel_size == 3: - pad = 1 - elif kernel_size == 5: - pad = 2 - elif kernel_size == 7: - pad = 3 - else: - raise NotImplementedError - return pad - - -class MobileNetV3(nn.Cell): - """ - MobileNetV3 architecture. - - Args: - model_cfgs (Cell): number of classes. - num_classes (int): Output number classes. - multiplier (int): Channels multiplier for round to 8/16 and others. Default is 1. - final_drop (float): Dropout number. - round_nearest (list): Channel round to . Default is 8. - Returns: - Tensor, output tensor. 
- - Examples: - >>> MobileNetV3(num_classes=1000) - """ - - def __init__(self, model_cfgs, num_classes=1000, multiplier=1., final_drop=0., round_nearest=8): - super(MobileNetV3, self).__init__() - self.cfgs = model_cfgs['cfg'] - self.inplanes = 16 - self.features = [] - first_conv_in_channel = 3 - first_conv_out_channel = _make_divisible(multiplier * self.inplanes) - - self.features.append(nn.Conv2d(in_channels=first_conv_in_channel, - out_channels=first_conv_out_channel, - kernel_size=3, padding=1, stride=2, - has_bias=False, pad_mode='pad')) - self.features.append(nn.BatchNorm2d(first_conv_out_channel)) - self.features.append(Activation('hswish')) - for layer_cfg in self.cfgs: - self.features.append(self._make_layer(kernel_size=layer_cfg[0], - exp_ch=_make_divisible(multiplier * layer_cfg[1]), - out_channel=_make_divisible(multiplier * layer_cfg[2]), - use_se=layer_cfg[3], - act_func=layer_cfg[4], - stride=layer_cfg[5])) - output_channel = _make_divisible(multiplier * model_cfgs["cls_ch_squeeze"]) - self.features.append(nn.Conv2d(in_channels=_make_divisible(multiplier * self.cfgs[-1][2]), - out_channels=output_channel, - kernel_size=1, padding=0, stride=1, - has_bias=False, pad_mode='pad')) - self.features.append(nn.BatchNorm2d(output_channel)) - self.features.append(Activation('hswish')) - self.features.append(GlobalAvgPooling(keep_dims=True)) - self.features.append(nn.Conv2d(in_channels=output_channel, - out_channels=model_cfgs['cls_ch_expand'], - kernel_size=1, padding=0, stride=1, - has_bias=False, pad_mode='pad')) - self.features.append(Activation('hswish')) - if final_drop > 0: - self.features.append((nn.Dropout(final_drop))) - - # make it nn.CellList - self.features = nn.SequentialCell(self.features) - self.output = nn.Conv2d(in_channels=model_cfgs['cls_ch_expand'], - out_channels=num_classes, - kernel_size=1, has_bias=True, pad_mode='pad') - self.squeeze = P.Squeeze(axis=(2, 3)) - - self._initialize_weights() - - def construct(self, x): - x = 
self.features(x) - x = self.output(x) - x = self.squeeze(x) - return x - - def _make_layer(self, kernel_size, exp_ch, out_channel, use_se, act_func, stride=1): - mid_planes = exp_ch - out_planes = out_channel - #num_in, num_mid, num_out, kernel_size, stride=1, act_type='relu', use_se=False): - layer = ResUnit(self.inplanes, mid_planes, out_planes, - kernel_size, stride=stride, act_type=act_func, use_se=use_se) - self.inplanes = out_planes - return layer - - def _initialize_weights(self): - """ - Initialize weights. - - Args: - - Returns: - None. - - Examples: - >>> _initialize_weights() - """ - for _, m in self.cells_and_names(): - if isinstance(m, (nn.Conv2d)): - n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels - m.weight.set_parameter_data(Tensor(np.random.normal(0, np.sqrt(2. / n), - m.weight.data.shape()).astype("float32"))) - if m.bias is not None: - m.bias.set_parameter_data( - Tensor(np.zeros(m.bias.data.shape(), dtype="float32"))) - elif isinstance(m, nn.BatchNorm2d): - m.gamma.set_parameter_data( - Tensor(np.ones(m.gamma.data.shape(), dtype="float32"))) - m.beta.set_parameter_data( - Tensor(np.zeros(m.beta.data.shape(), dtype="float32"))) - elif isinstance(m, nn.Dense): - m.weight.set_parameter_data(Tensor(np.random.normal( - 0, 0.01, m.weight.data.shape()).astype("float32"))) - if m.bias is not None: - m.bias.set_parameter_data( - Tensor(np.zeros(m.bias.data.shape(), dtype="float32"))) - - -def mobilenet_v3(model_name, **kwargs): - """ - Constructs a MobileNet V2 model - """ - model_cfgs = { - "large": { - "cfg": [ - # k, exp, c, se, nl, s, - [3, 16, 16, False, 'relu', 1], - [3, 64, 24, False, 'relu', 2], - [3, 72, 24, False, 'relu', 1], - [5, 72, 40, True, 'relu', 2], - [5, 120, 40, True, 'relu', 1], - [5, 120, 40, True, 'relu', 1], - [3, 240, 80, False, 'hswish', 2], - [3, 200, 80, False, 'hswish', 1], - [3, 184, 80, False, 'hswish', 1], - [3, 184, 80, False, 'hswish', 1], - [3, 480, 112, True, 'hswish', 1], - [3, 672, 112, True, 'hswish', 1], - 
[5, 672, 160, True, 'hswish', 2], - [5, 960, 160, True, 'hswish', 1], - [5, 960, 160, True, 'hswish', 1]], - "cls_ch_squeeze": 960, - "cls_ch_expand": 1280, - }, - "small": { - "cfg": [ - # k, exp, c, se, nl, s, - [3, 16, 16, True, 'relu', 2], - [3, 72, 24, False, 'relu', 2], - [3, 88, 24, False, 'relu', 1], - [5, 96, 40, True, 'hswish', 2], - [5, 240, 40, True, 'hswish', 1], - [5, 240, 40, True, 'hswish', 1], - [5, 120, 48, True, 'hswish', 1], - [5, 144, 48, True, 'hswish', 1], - [5, 288, 96, True, 'hswish', 2], - [5, 576, 96, True, 'hswish', 1], - [5, 576, 96, True, 'hswish', 1]], - "cls_ch_squeeze": 576, - "cls_ch_expand": 1280, - } - } - return MobileNetV3(model_cfgs[model_name], **kwargs) - - -mobilenet_v3_large = partial(mobilenet_v3, model_name="large") -mobilenet_v3_small = partial(mobilenet_v3, model_name="small") diff --git a/mindspore/nn/__init__.py b/mindspore/nn/__init__.py index f3f59edcbf..8d5e7d3b0a 100644 --- a/mindspore/nn/__init__.py +++ b/mindspore/nn/__init__.py @@ -18,14 +18,14 @@ Neural Networks Cells. Pre-defined building blocks or computing units to construct Neural Networks. """ from . import layer, loss, optim, metrics, wrap -from .cell import Cell +from .cell import Cell, GraphKernel from .layer import * from .loss import * from .optim import * from .metrics import * from .wrap import * -__all__ = ["Cell"] +__all__ = ["Cell", "GraphKernel"] __all__.extend(layer.__all__) __all__.extend(loss.__all__) __all__.extend(optim.__all__) diff --git a/mindspore/nn/cell.py b/mindspore/nn/cell.py index dd8c4dac27..c046c2e1bf 100755 --- a/mindspore/nn/cell.py +++ b/mindspore/nn/cell.py @@ -19,7 +19,7 @@ from collections import OrderedDict from mindspore import log as logger from .. 
import context from ..common import dtype as mstype -from ..common.api import _executor +from ..common.api import _executor, _pynative_exec from .._checkparam import _check_str_by_regular from ..common.parameter import Parameter, ParameterTuple from .._c_expression import init_backend @@ -60,6 +60,7 @@ class Cell: self._params = OrderedDict() self._cells = OrderedDict() self.training = False + self.requires_grad = False self.pynative = False self._param_prefix = '' self._auto_prefix = auto_prefix @@ -79,6 +80,15 @@ class Cell: self._backward_hook = None self.enable_hook = False self._bprop_debug = False + self._is_run = False + + @property + def is_run(self): + return self._is_run + + @is_run.setter + def is_run(self, value): + self._is_run = value @property def create_time(self): @@ -176,6 +186,7 @@ class Cell: raise AttributeError("'{}' object has no attribute '{}'.".format(type(self).__name__, name)) def __del__(self): + _pynative_exec.clear("resource") if hasattr(self, "_create_time"): _executor.del_net_res(str(self._create_time)) @@ -192,9 +203,26 @@ class Cell: out = self.compile_and_run(*inputs) return out self.init_parameters_data() - output = self.construct(*inputs) + orign_grad = [] + if self.requires_grad is True: + _pynative_exec.set_grad_flag(True) + _pynative_exec.new_graph(self, *inputs) + for cell in self.cells(): + orign_grad.append(cell.requires_grad) + cell.set_grad(True) + else: + _pynative_exec.set_grad_flag(False) + if self.enable_hook: + output = self._hook_construct(*inputs) + else: + output = self.construct(*inputs) if isinstance(output, Parameter): output = output.data + if self.requires_grad is True: + _pynative_exec.end_graph(self, output, *inputs) + for i, cell in enumerate(self.cells()): + cell.set_grad(orign_grad[i]) + self._is_run = True return output def __setattr__(self, name, value): @@ -227,9 +255,12 @@ class Cell: value.update_parameters_name(name + '.') cells[name] = value elif params and name in params: - if value is not None: 
+ if isinstance(value, Tensor) and self._params[name] is not None: + self._params[name].set_parameter_data(value) + elif value is not None: raise TypeError("Expected type in (Parameter, ParameterTuple), but got {}.".format(type(value))) - self.insert_param_to_cell(name, None) + else: + self.insert_param_to_cell(name, None) elif cells and name in cells: if value is not None: raise TypeError("Expected type is cell, but got {}.".format(type(value))) @@ -278,7 +309,7 @@ class Cell: logger.info("layout dict does not contain the key %s", key) continue if self.parameters_dict()[key].sliced: - logger.info("Param %s is already sliced.", key) + logger.debug("Param %s is already sliced.", key) continue layout = self.parameter_layout_dict[key] new_tensor = _load_tensor_by_layout(tensor, layout) @@ -291,7 +322,7 @@ class Cell: logger.info("layout dict does not contain the key %s", key) continue if params[key].sliced: - logger.info("Param %s is already sliced.", key) + logger.debug("Param %s is already sliced.", key) continue layout = self.parameter_layout_dict[key] new_tensor = _load_tensor_by_layout(tensor, layout) @@ -457,7 +488,7 @@ class Cell: if not auto_parallel_mode: param.init_data() elif param.name not in self.parameter_layout_dict: - logger.info("Layout dict does not contain the key %s.", param.name) + logger.debug("Layout dict does not contain the key %s.", param.name) param.init_data(set_sliced=True) else: layout = self.parameter_layout_dict[param.name] @@ -676,9 +707,6 @@ class Cell: return cells def add_flags(self, **flags): - for x in flags: - if not isinstance(flags[x], bool): - raise TypeError(f"Flags (f{x}) must be bool but {type(flags[x])}.") if not hasattr(self, "_mindspore_flags"): self._mindspore_flags = {} self._mindspore_flags.update({**flags}) @@ -722,6 +750,10 @@ class Cell: self.add_flags_recursive(**flags) return self + def set_grad(self, mode=True): + self.requires_grad = mode + return self + def set_train(self, mode=True): """ Sets the cell to 
training mode. @@ -762,9 +794,9 @@ class Cell: self.add_flags(auto_parallel=True) self._get_construct_inputs_number_and_name() - def _hook_construct(self, inputs): + def _hook_construct(self, *inputs): """Hook construct method to replace original construct method when hook function enabled.""" - inputs = self._backward_hook(inputs) + inputs = self._backward_hook(*inputs) inputs = self.construct(inputs) outputs = self._backward_hook(inputs) return outputs @@ -784,4 +816,28 @@ class Cell: """ self._backward_hook = HookBackward(fn, self.cls_name + "(" + str(id(self)) + ")") - self._enable_hook = True + self.enable_hook = True + +class GraphKernel(Cell): + """ + Base class for GraphKernel. + + A `GraphKernel` a composite of basic primitives and can be compiled into a fused kernel automaticly when + context.set_context(enable_graph_kernel=True). + + Examples: + >>> class Relu(GraphKernel): + >>> def __init__(self): + >>> super(Relu, self).__init__() + >>> self.max = P.Maximum() + >>> + >>> def construct(self, x): + >>> return self.max(P.Fill()(P.DType()(x), P.Shape()(x), 0.0), x) + """ + def __init__(self, auto_prefix=True, pips=None): + super(GraphKernel, self).__init__(auto_prefix, pips) + class_name = self.__class__.__name__ + self.add_flags(graph_kernel=class_name) + + def construct(self): + raise NotImplementedError diff --git a/mindspore/nn/graph_kernels/__init__.py b/mindspore/nn/graph_kernels/__init__.py new file mode 100644 index 0000000000..8128f2db60 --- /dev/null +++ b/mindspore/nn/graph_kernels/__init__.py @@ -0,0 +1,30 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +""" +GraphKernel. + +GraphKernel provides a unified style to express graph and kernel for user. +It breaks the boundary between graph and kernel and provides more opportunities to do compile optimization. +""" +from .graph_kernels import MaximumGrad, MinimumGrad, AbsGrad, ApplyMomentum, BiasAdd, EqualCount, \ + ReduceMean, ReLU, SoftmaxCrossEntropyWithLogits, LayerNorm, LayerNormXBackprop, \ + LayerNormBetaGammaBackprop, LogSoftmax, Tanh, TanhGrad, Gelu, Softmax, BiasAddGrad, \ + LambUpdateWithLR, LambNextMV + +__all__ = ['MaximumGrad', 'MinimumGrad', 'AbsGrad', 'ApplyMomentum', 'BiasAdd', 'EqualCount', + 'ReduceMean', 'ReLU', 'SoftmaxCrossEntropyWithLogits', 'LayerNorm', + 'LayerNormXBackprop', 'LayerNormBetaGammaBackprop', 'LogSoftmax', 'Tanh', 'TanhGrad', + 'Gelu', 'Softmax', 'BiasAddGrad', 'LambUpdateWithLR', 'LambNextMV' + ] diff --git a/mindspore/nn/graph_kernels/graph_kernels.py b/mindspore/nn/graph_kernels/graph_kernels.py new file mode 100644 index 0000000000..21cc4f8710 --- /dev/null +++ b/mindspore/nn/graph_kernels/graph_kernels.py @@ -0,0 +1,1201 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +""" +Graph kernels. They are composites of basic primitives and can be compiled into +a fused kernel automaticly when context.set_context(enable_graph_kernel=True). +""" +from ...common import dtype as mstype +from ...ops import operations as P +from ...ops.primitive import PrimitiveWithInfer, prim_attr_register +from ...ops.composite import multitype_ops as C +from ...ops.operations import _grad_ops as G +from ..._checkparam import ParamValidator as validator +from ..cell import Cell, GraphKernel + + +class InplaceAssign(PrimitiveWithInfer): + """ + Inplace assign `Parameter` with a value. + + This primitive can only use in graph kernel. + + Inputs: + - **variable** (Parameter) - The `Parameter`. + - **value** (Tensor) - The value to assign. + - **depend** (Tensor) - The dependent tensor to keep this op connected in graph. + + Outputs: + Tensor, has the same type as original `variable`. 
+ + Examples: + >>> def construct(self, x): + >>> val = x - 1.0 + >>> ret = x + 2.0 + >>> return InplaceAssign()(x, val, ret) + >>> x = Tensor([2.0], mindspore.float32) + >>> net = Net() + >>> net(x) + """ + @prim_attr_register + def __init__(self): + self.init_prim_io_names(inputs=['x', 'y', 'z'], outputs=['output']) + + def infer_shape(self, x, y, z): + return z + + def infer_dtype(self, x, y, z): + return z + + def get_bprop(self): + def bprop(x, y, z, out, dout): + return (x, C.zeros_like(y), dout) + return bprop + + +class MaximumGrad(GraphKernel): + """ + + Backprop function for Maximum operator. + + Inputs: + - **x** (Tensor) - The first input tensor of maximum. + - **y** (Tensor) - The second input tensor of maximum. + - **dout** (Tensor) - has the same shape as x and y, next operator's backprop output. + + Outputs: + dx (Tensor): has the same shape as x and y, returns dout element if + `x >= y` returns true at the same position, or returns zero at that + position + dy (Tensor): has the same shape as x and y, dy = dout - dx + + Examples: + >>> layer = MaximumGrad() + >>> output = layer(Tensor([1,2,3], [3, 2, 1], [4, 5, 6])) + """ + + def __init__(self, grad_x=True, grad_y=True): + super(MaximumGrad, self).__init__() + self.grad_x = grad_x + self.grad_y = grad_y + self.select = P.Select() + self.greater_equal = P.GreaterEqual() + self.zeros_like = P.ZerosLike() + self.sub = P.Sub() + + def construct(self, x, y, dout): + cmp_result = self.greater_equal(x, y) + dx = self.select(cmp_result, dout, self.zeros_like(dout)) + dy = dout - dx + + return dx, dy + + +class MinimumGrad(GraphKernel): + """ + Backprop function for Minimum operator. + + Compares x and y elementwise, dout should has the same shape with x and y. 
+ + Inputs: + - **x** (Tensor) - The first input + - **y** (Tensor) - x and y should have same shape + - **dout** (Tensor) - Has the same shape as x and y, next operator's backprop output + + Outputs: + - dx (Tensor) - Has the same shape as x and y, returns dout element if + `x <= y` returns true at the same position, or returns zero at that + position + - dy (Tensor) - Has the same shape as x and y, dy = dout - dx + + Examples: + >>> layer = MinimumGrad() + >>> output = layer(Tensor([1,2,3], [3, 2, 1], [4, 5, 6])) + """ + + def __init__(self, grad_x=True, grad_y=True): + super(MinimumGrad, self).__init__() + self.grad_x = grad_x + self.grad_y = grad_y + self.select = P.Select() + self.less_equal = P.LessEqual() + self.zeros_like = P.ZerosLike() + self.sub = P.Sub() + + def construct(self, x, y, dout): + cmp_result = self.less_equal(x, y) + dx = self.select(cmp_result, dout, self.zeros_like(dout)) + # dy = self.select(cmp_result, self.zeros_like(dout), dout) + dy = dout - dx + + return dx, dy + + +class AbsGrad(GraphKernel): + """ + Abs's backprop function. + + Inputs: + **input_x** (Tensor) - input data of this operator. + **dout** (Tensor) - output of the next operator's backprop function. + + Outputs: + Tensor, has the same shape as input_x. + + Examples: + >>> back = AbsGrad() + >>> output = back(Tensor([1, 2, 3]), Tensor([4, 5, 6])) + """ + + def __init__(self): + super(AbsGrad, self).__init__() + self.mul = P.Mul() + self.abs = P.Abs() + self.add = P.TensorAdd() + self.div = P.RealDiv() + self.round = P.Round() + + def construct(self, input_x, dout): + NUM_MAX = 32768 + mul_max = self.mul(input_x, P.Fill()(P.DType()(input_x), (1,), NUM_MAX)) + res_abs = self.abs(mul_max) + res_div = self.div(mul_max, res_abs) + res_round = self.round(res_div) + res = self.mul(res_round, dout) + return res + + +class ApplyMomentum(GraphKernel): + """ + Update parameter according to the ApplyMomentum algorithm. 
+ + Inputs: + variable (Tensor): mutable tensor var + accumulation (Tensor): mutable tensor accum + learning_rate (float32): learning rate + gradient (float32): The gradient + momentum (float32): Momentum + + Outputs: updated accumulation and variable + """ + + def __init__(self, + use_nesterov=False, + use_locking=False, + gradient_scale=1.0): + super(ApplyMomentum, self).__init__() + self.gradient_scale = validator.check_type('gradient_scale', gradient_scale, [float]) + self.fake_output_assign_1 = InplaceAssign() + self.fake_output_assign_1.add_prim_attr("fake_output", True) + self.fake_output_assign_2 = InplaceAssign() + self.fake_output_assign_2.add_prim_attr("fake_output", True) + + def construct(self, variable, accumulation, learning_rate, gradient, momentum): + gradient = gradient * self.gradient_scale + momt_accumulation = accumulation * momentum + accumulation_inplace = momt_accumulation + gradient + + sum_gradient = accumulation_inplace * learning_rate + variable_inplace = variable - sum_gradient + + accumulation_inplace = self.fake_output_assign_1(accumulation, accumulation_inplace, accumulation_inplace) + variable_inplace = self.fake_output_assign_2(variable, variable_inplace, variable_inplace) + return accumulation_inplace, variable_inplace + + +class BiasAdd(GraphKernel): + """ + Return the sum of x and bias. + + Inputs: + x (Tensor): Tensor of input data. + bias (Tensor): The bias tensor. + + Output: + Tensor, the sum of x and bias. + + Example: + >>> layer = BiasGrad() + >>> output = BiasAdd(Tensor([1, 2, 3]), Tensor([1,])) + """ + + def __init__(self): + super(BiasAdd, self).__init__() + + def construct(self, x, bias): + shape = P.Shape()(x) + if len(shape) == 4: + bias_shape = (1, P.Shape()(bias)[0], 1, 1) # NCHW + else: + bias_shape = (1, P.Shape()(bias)[0]) + res = x + P.Reshape()(bias, bias_shape) + return res + +class BiasAddGrad(GraphKernel): + """ + Computes gradients of BiasAdd. + + Inputs: + x (Tensor): the gradients of bias add output. 
+ + Output: + Tensor, the gradients of bias add input. + + Examples: + >>> dout = Tensor(np.ones(shape=[1, 2, 3, 4]), mindspore.float32) + >>> bias_add_grad = BiasAddGrad() + >>> dx = bias_add_grad(dout) + """ + def __init__(self): + super(BiasAddGrad, self).__init__() + + def construct(self, x): + shape_x = P.Shape()(x) + reduce_axis = [0] + for i in range(2, len(shape_x)): + reduce_axis.append(i) + + res = P.ReduceSum()(x, reduce_axis) + return res + + +class EqualCount(GraphKernel): + """ + Computes the number of the same elements of two tensors. + + The two input tensors should have same shape and data type. + + Inputs: + x (Tensor): the first input tensor. + y (Tensor): the second input tensor. + + Outputs: + Tensor, the type is same as input tensor and size as (1,). + + Examples: + >>> x = Tensor(np.array([1, 2, 3]), mindspore.int32) + >>> y = Tensor(np.array([1, 2, 4]), mindspore.int32) + >>> equal_count = EqualCount() + >>> equal_count(x, y) + """ + def __init__(self): + super(EqualCount, self).__init__() + + def construct(self, x, y): + equal_bool = P.Equal()(P.Cast()(x, mstype.float32), P.Cast()(y, mstype.float32)) + equal_count = P.Cast()(equal_bool, mstype.float16) + + axes = (0,) + res = P.ReduceSum()(equal_count, axes) + res = P.Cast()(res, P.DType()(x)) + return res + + +class ReduceMean(GraphKernel): + """ + Reduce a dimension of a tensor by averaging all elements in the dimension. + + The dtype of the tensor to be reduced is number. + + Args: + keep_dims (bool): If True, keep these reduced dimensions and the length is 1. + If False, don't keep these dimensions. Default : False. + + Inputs: + - **input_x** (Tensor[Number]) - The input tensor. + - **axis** (Union[int, tuple(int), list(int)]) - The dimensions to reduce. Default: (), reduce all dimensions. + Only constant value is allowed. + + Outputs: + Tensor, has the same dtype as the 'input_x'. 
+ + - If axis is (), and keep_dims is false, + the output is a 0-D tensor representing the sum of all elements in the input tensor. + - If axis is int, set as 2, and keep_dims is false, + the shape of output is :math:`(x_1, x_3, ..., x_R)`. + - If axis is tuple(int), set as (2, 3), and keep_dims is false, + the shape of output is :math:`(x_1, x_4, ..., x_R)`. + + Examples: + >>> input_x = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32)) + >>> op = ReduceMean(keep_dims=True) + >>> output = op(input_x, 1) + """ + + def __init__(self, keep_dims=True): + super(ReduceMean, self).__init__() + self.keep_dims = validator.check_type('keep_dims', keep_dims, [bool]) + self.sum = P.ReduceSum(self.keep_dims) + + def construct(self, x, axis): + shape = P.Shape()(x) + value_num = 1 + for i in axis: + value_num *= shape[i] + + data_sum = self.sum(x, axis) + avg = 1.0 / P.Fill()(P.DType()(x), (1,), value_num) + res = data_sum * avg + return res + + +class ReLU(GraphKernel): + r""" + Computes ReLU(Rectified Linear Unit) of input tensor element-wise. + + It returns :math:`\max(x,\ 0)` element-wise. + + Inputs: + - **input_x** (Tensor) - The input tensor. + + Outputs: + Tensor, with the same type and shape as the `input_x`. + + Examples: + >>> input_x = Tensor(np.array([[-1.0, 4.0, -8.0], [2.0, -5.0, 9.0]]), mindspore.float32) + >>> relu = ReLU() + >>> result = relu(input_x) + [[0, 4.0, 0.0], [2.0, 0.0, 9.0]] + """ + def __init__(self): + super(ReLU, self).__init__() + self.max = P.Maximum() + + def construct(self, x): + return self.max(P.Fill()(P.DType()(x), P.Shape()(x), 0.0), x) + + +class SoftmaxCrossEntropyWithLogits(GraphKernel): + r""" + Gets the softmax cross-entropy value between logits and labels which shoule be one-hot encoding. + + Note: + Sets input logits as `X`, input label as `Y`, output as `loss`. Then, + + .. math:: + p_{ij} = softmax(X_{ij}) = \frac{exp(x_i)}{\sum_{j = 0}^{N-1}\exp(x_j)} + + .. 
math:: + loss_{ij} = -\sum_j{Y_{ij} * ln(p_{ij})} + + Inputs: + - **logits** (Tensor) - Input logits, with shape :math:`(N, C)`. + - **labels** (Tensor) - Ground truth labels, with shape :math:`(N, C)`. + + Outputs: + Tuple of 2 Tensor, the loss shape is `(N,)`, and the dlogits with the same shape as `logits`. + + Examples: + >>> logits = Tensor([[2, 4, 1, 4, 5], [2, 1, 2, 4, 3]], mindspore.float32) + >>> labels = Tensor([[0, 0, 0, 0, 1], [0, 0, 0, 1, 0]], mindspore.float32) + >>> softmax_cross = SoftmaxCrossEntropyWithLogits() + >>> loss, backprop = softmax_cross(logits, labels) + """ + + def __init__(self): + super(SoftmaxCrossEntropyWithLogits, self).__init__() + self.max = P.ReduceMax(keep_dims=True) + self.sum_keep_dims = P.ReduceSum(keep_dims=True) + + def construct(self, features, labels): + data_max = self.max(features, (1,)) + data_sub = features - data_max + data_exp = P.Exp()(data_sub) + data_sum = self.sum_keep_dims(data_exp, (1,)) + data_div = data_exp / data_sum + data_log_tmp = P.Log()(data_sum) + data_log = data_sub - data_log_tmp + data_mul = labels * data_log + data_muls = P.Neg()(data_mul) + loss = P.ReduceSum()(data_muls, (1,)) + backprop = data_div - labels + return loss, backprop + + def bprop(self, features, labels, out, dout): + grad = out[1] + grad = grad * P.ExpandDims()(dout[0], -1) + return grad, P.ZerosLike()(labels) + + +class LayerNormForward(GraphKernel): + """ Forward function of the LayerNorm operator. 
""" + def __init__(self, begin_norm_axis=1, begin_params_axis=1): + super(LayerNormForward, self).__init__() + self.begin_norm_axis = validator.check_type('begin_norm_axis', begin_norm_axis, [int]) + self.begin_params_axis = validator.check_type('begin_params_axis', begin_params_axis, [int]) + self.mul = P.Mul() + self.sum_keep_dims = P.ReduceSum(keep_dims=True) + self.sub = P.Sub() + self.add = P.TensorAdd() + self.log = P.Log() + self.exp = P.Exp() + self.eps = P.Eps() + + def construct(self, input_x, input_gamma, input_beta): + shape_x = P.Shape()(input_x) + + # Calculate the scaling ratio of the average + begin_norm_axis = self.begin_norm_axis + if begin_norm_axis < 0: + begin_norm_axis += len(shape_x) + reduce_axis = () + for i in range(len(shape_x)): + if i > begin_norm_axis or i == begin_norm_axis: + reduce_axis = reduce_axis + (i,) + + reduce_elts = 1.0 + for i in reduce_axis: + reduce_elts *= shape_x[i] + mean_cof = 1.0 / reduce_elts + + # Calculate mean + mean_muls = self.mul(input_x, mean_cof) + mean = self.sum_keep_dims(mean_muls, reduce_axis) + + # Calculate variance + variance_sub = self.sub(input_x, mean) + variance_mul = self.mul(variance_sub, variance_sub) + variance_muls = self.mul(variance_mul, mean_cof) + variance = self.sum_keep_dims(variance_muls, reduce_axis) + + # Calculate normalize + normalize_sub = self.sub(input_x, mean) + epsilon = self.eps(input_x) + normalize_add = self.add(variance, epsilon) + normalize_log = self.log(normalize_add) + normalize_log_mul = self.mul(normalize_log, -0.5) + normalize_exp = self.exp(normalize_log_mul) + normalize_mul = self.mul(normalize_sub, normalize_exp) + + # Calculate scale and translate + if self.begin_params_axis == 0: + scale_mul = self.mul(input_gamma, normalize_mul) + res = self.add(scale_mul, input_beta) + else: + scale_mul = self.mul(input_gamma, normalize_mul) + res = self.add(scale_mul, input_beta) + + return res, mean, variance + + +class LayerNormXBackprop(GraphKernel): + r""" + Together 
with LayerNormBetaGammaBackprop, to supply the backprop + functionality for LayerNorm. + + Note: + Sets input_x as :math:`x_i`, variance as :math:`\sigma^2`, mean as :math:`\mu`, + input_gamma as :math:`\gamma`. Then, + .. math:: + \begin{array}{ll} \\ + \hat{x_i} = \frac{x_i - \mu}{\sqrt{\sigma^2 + \epsilon}} \\ + \frac {\partial L} {\partial x_i} = + \frac{\gamma}{\sqrt{\sigma^2+\epsilon}} + ( \frac{\partial L}{\partial y_i} + - \frac{1}{m} \cdot \frac{\partial L}{\partial \beta} + - \frac{\hat{x_i}}{m} \cdot \frac{\partial L}{\partial \gamma}) + \end{array} + + Inputs: + - **dy**(Tensor) - The first item of the next operator's backprop's output. + - **input_x**(Tensor) - The first input of the forward function of LayerNorm. + - **variance**(Tensor) - The second input of the forward function of LayerNorm. + - **mean**(Tensor) - The third input of the forward function of LayerNorm. + - **input_gamma**(Tensor) - The fourth input of the forward function of LayerNorm. + + Outputs: + Tensor, the output of this operator, will be used as the first item of the result of + LayerNorm's backprop function, has the same shape and data type as 'input_x'. 
+ + Examples: + >>> dy = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32)) + >>> input_x = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32)) + >>> variance = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32)) + >>> mean = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32)) + >>> input_gamma = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32)) + >>> op = LayerNormXBackprop(keep_dims=False) + >>> output = op(dy, input_x, variance, mean, input_gamma) + """ + + def __init__(self): + super(LayerNormXBackprop, self).__init__() + self.sum_keep_dims = P.ReduceSum(keep_dims=True) + self.log = P.Log() + self.exp = P.Exp() + self.eps = P.Eps() + + def construct(self, dy, input_x, variance, mean, input_gamma): + shape_x = P.Shape()(input_x) + shape_mean = P.Shape()(mean) + reduce_axis = () + flag = -1 + min_l = 0 + if len(shape_x) > len(shape_mean): + min_l = len(shape_x) + else: + min_l = len(shape_mean) + for i in range(min_l): + if (shape_x[i] != shape_mean[i]) and (flag == -1): + flag = i + if flag != -1: + for i in range(flag, len(shape_x)): + reduce_axis = reduce_axis + (i,) + else: + reduce_axis = reduce_axis + (len(shape_x) - 1,) + mean_num = 1.0 + for i in reduce_axis: + mean_num *= shape_x[i] + pd_xl = input_gamma * dy + epsilon = self.eps(input_x) + var_elta = variance + epsilon + var_elta_log = self.log(var_elta) + var_elta_mul = var_elta_log * -0.5 + var_elta_2 = P.Exp()(var_elta_mul) + pdvar1_mul = var_elta_2 * var_elta_2 + pd_var_1 = pdvar1_mul * var_elta_2 + sub_x_mean = input_x - mean + pdvar_mul1 = pd_xl * sub_x_mean + pdvar_sum = self.sum_keep_dims(pdvar_mul1, reduce_axis) + pdvar_mul3 = pdvar_sum * pd_var_1 + pd_var = pdvar_mul3 * -0.5 + pdmean1_sum = self.sum_keep_dims(pd_xl, reduce_axis) + pdmean1_mul = pdmean1_sum * var_elta_2 + pd_mean_1 = pdmean1_mul * -1.0 + pdmean2_mul1 = sub_x_mean * -2.0 + pdmean2_sum = self.sum_keep_dims(pdmean2_mul1, reduce_axis) + pdmean2_mul3 = pdmean2_sum * (1.0 / mean_num) + pd_mean_2 = pd_var * 
pdmean2_mul3 + pd_mean = pd_mean_2 + pd_mean_1 + pd_x_1 = var_elta_2 * pd_xl + pdx2_mul = pd_var * sub_x_mean + pd_x_2 = pdx2_mul * (2.0 * (1.0 / mean_num)) + pd_x_3 = pd_mean * (1.0 / mean_num) + pdx_add = pd_x_1 + pd_x_2 + pd_x = pdx_add + pd_x_3 + return pd_x + + +class LayerNormBetaGammaBackprop(GraphKernel): + r""" + Together with LayerNormXBackprop, to supply the backprop functionality for + LayerNorm. + Note: + Sets input_x as :math:`x_i`, variance as :math:`\sigma^2`, mean as :math:`\mu`, + input_gamma as :math:`\gamma`. Then, + .. math:: + \begin{array}{ll} \\ + \hat{x_i} = \frac{x_i - \mu}{\sqrt{\sigma^2 + \epsilon}} \\ + \frac {\partial L} {\partial \beta} = + \sum_{i=1}^m \\frac{\\partial L}{\partial y_i} \\ + \frac {\partial L} {\partial \gamma} = + \sum_{i=1}^m \\frac{\partial L}{\partial y_i} \cdot \hat{x_i} + \end{array} + + Inputs: + - **dy**(Tensor) - The first item of the next operator's backprop's output. + - **input_x**(Tensor) - The first input of the forward function of LayerNorm. + - **variance**(Tensor) - The second input of the forward function of LayerNorm. + - **mean**(Tensor) - The third input of the forward function of LayerNorm. + - **input_gamma**(Tensor) - The fourth input of the forward function of LayerNorm. + + Outputs: + Tuple of 2 Tensor, the backprop outputs. + + - **pd_beta**(Tensor) - The first item of return value of this operator, will be used as + the second item of the LayerNorm's backprop function. + - **pd_gamma**(Tensor) - The second item of return value of this operator, will be used as + the third item of the LayerNorm's backprop function. 
+ + Examples: + >>> dy = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32)) + >>> input_x = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32)) + >>> variance = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32)) + >>> mean = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32)) + >>> input_gamma = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32)) + >>> op = LayerNormBetaGammaBackprop(keep_dims=False) + >>> pd_beta, pd_gamma = op(dy, input_x, variance, mean, input_gamma) + """ + def __init__(self): + super(LayerNormBetaGammaBackprop, self).__init__() + self.sum_not_keep_dims = P.ReduceSum(keep_dims=False) + self.log = P.Log() + self.exp = P.Exp() + self.eps = P.Eps() + + def construct(self, dy, input_x, variance, mean, shape_gamma): + shape_x = P.Shape()(input_x) + params_axis = () + + if len(shape_x) != len(shape_gamma): + sub = len(shape_x) - len(shape_gamma) + for i in range(sub): + params_axis = params_axis + (i,) + + pd_beta = self.sum_not_keep_dims(dy, params_axis) + epsilon = self.eps(input_x) + var_elta = variance + epsilon + var_elta_log = self.log(var_elta) + var_elta_mul = var_elta_log * -0.5 + var_elta_2 = P.Exp()(var_elta_mul) + sub_x_mean = input_x - mean + var_elta_2_cast = var_elta_2 + xl_mul = var_elta_2_cast * sub_x_mean + pdga_mul = dy * xl_mul + pd_gamma = self.sum_not_keep_dims(pdga_mul, params_axis) + return pd_beta, pd_gamma + + +class LogSoftmax(GraphKernel): + r""" + Log Softmax activation function. + + Applies the Log Softmax function to the input tensor on the specified axis. + Suppose a slice along the given aixs :math:`x` then for each element :math:`x_i` + the Log Softmax function is shown as follows: + + .. math:: + \text{output}(x_i) = \log \left(\frac{exp(x_i)} {\sum_{j = 0}^{N-1}\exp(x_j)}\right), + + where :math:`N` is the length of the Tensor. + + Args: + axis (int): The axis to do the Log softmax operation. Default: -1. + + Inputs: + logits (Tensor): The input of Log Softmax. 
+ + Outputs: + Tensor, with the same type and shape as the logits. + + Examples: + >>> input_x = Tensor(np.array([1, 2, 3, 4, 5]), mindspore.float32) + >>> log_softmax = LogSoftmax() + >>> log_softmax(input_x) + [-4.4519143, -3.4519143, -2.4519143, -1.4519144, -0.4519144] + """ + + def __init__(self, axis=-1): + super(LogSoftmax, self).__init__() + self.axis = validator.check_type('axis', axis, [int]) + self.max_keep_dims = P.ReduceMax(keep_dims=True) + self.sub = P.Sub() + self.exp = P.Exp() + self.sum_keep_dims = P.ReduceSum(keep_dims=True) + self.log = P.Log() + self.mul = P.Mul() + + def construct(self, input_x): + data_max = self.max_keep_dims(input_x, (self.axis,)) + data_sub = self.sub(input_x, data_max) + + data_exp = self.exp(data_sub) + data_sum = self.sum_keep_dims(data_exp, (self.axis,)) + data_log = self.log(data_sum) + + res = self.sub(data_sub, data_log) + return res + + def bprop(self, input_x, out, dout): + input_x = out + input_dy = dout + + data_exp = self.exp(input_x) + data_sum = self.sum_keep_dims(input_dy, (self.axis,)) + data_softmax = self.mul(data_exp, data_sum) + + res = self.sub(input_dy, data_softmax) + return (res,) + + +class Tanh(GraphKernel): + r""" + Tanh activation function. + + Computes hyperbolic tangent of input element-wise. The Tanh function is defined as: + + .. math:: + tanh(x_i) = \frac{\exp(x_i) - \exp(-x_i)}{\exp(x_i) + \exp(-x_i)} = \frac{\exp(2x_i) - 1}{\exp(2x_i) + 1}, + + where :math:`x_i` is an element of the input Tensor. + + Inputs: + - **input_x** (Tensor) - The input of Tanh. + + Outputs: + Tensor, with the same type and shape as the input_x. 
+ + Examples: + >>> input_x = Tensor(np.array([1, 2, 3, 4, 5]), mindspore.float32) + >>> tanh = Tanh() + >>> tanh(input_x) + [0.7615941, 0.9640276, 0.9950548, 0.9993293, 0.99990916] + """ + def __init__(self): + super(Tanh, self).__init__() + self.abs = P.Abs() + self.add = P.TensorAdd() + self.div = P.RealDiv() + self.mul = P.Mul() + self.mul_fp16 = P.Mul() + self.mul_fp16.add_prim_attr("output_precision", "float16") + self.exp = P.Exp() + + def construct(self, input_x): + input_abs = self.abs(input_x) + sign_flag = self.div(input_x, input_abs) + sign_flag_neg = self.mul(sign_flag, -1.0) + + power_val = self.mul(input_abs, -2.0) + exp_val = self.exp(power_val) + up_val = self.add(exp_val, -1.0) + down_val = self.add(exp_val, 1.0) + + div_val = self.div(up_val, down_val) + res = self.mul(sign_flag_neg, div_val) + return res + + def bprop(self, input_x, out, dout): + input_y = out + input_dy = dout + + data_square = self.mul(input_y, input_y) + data_mul = self.mul(data_square, -1.0) + anuminate = self.add(data_mul, 1.0) + res = self.mul_fp16(anuminate, input_dy) + + return (res,) + +class TanhGrad(GraphKernel): + """ + Backprop function of Tanh + + Mathematical calculating: + result = Tanh(out) + result = 1 - result * result + result = result * dout + Inputs: + out (Tensor): Tanh's output + dout (Tensor): next layer's backward function's output, has same shape as out + + Outputs: + result (Tensor): result of (1 - tanh(out)^2) * dout + + Examples: + >>> x_np = np.random.randn(5, 3, 6).astype(np.float16) + >>> dy_np = np.random.randn(5, 3, 6).astype(np.float16) + >>> x_ms = Tensor(x_np) + >>> dy_ms = Tensor(dy_np) + >>> tanh_grad = TanhGrad() + >>> out = tanh_grad(x_np, dy_np) + """ + def __init__(self): + super(TanhGrad, self).__init__() + self.add = P.TensorAdd() + self.mul = P.Mul() + self.mul_fp16 = P.Mul() + self.mul_fp16.add_prim_attr("output_precision", "float16") + + def construct(self, out, dout): + input_y = out + input_dy = dout + + data_square = 
self.mul(input_y, input_y) + data_mul = self.mul(data_square, -1.0) + anuminate = self.add(data_mul, 1.0) + res = self.mul_fp16(anuminate, input_dy) + + return res + +class Gelu(GraphKernel): + r""" + Gaussian Error Linear Units activation function. + + GeLU is described in the paper `Gaussian Error Linear Units (GELUs) `_. + And also please refer to `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. + `_. + + Defined as follows: + + .. math:: + \text{output} = 0.5 * x * (1 + erf(x / \sqrt{2})), + + where :math:`erf` is the "Gauss error function" . + + Inputs: + - **input_x** (Tensor) - Input to compute the Gelu. + + Outputs: + Tensor, with the same type and shape as input. + + Examples: + >>> tensor = Tensor(np.array([1.0, 2.0, 3.0]), mindspore.float32) + >>> gelu = Gelu() + >>> result = gelu(tensor) + """ + + def __init__(self): + super(Gelu, self).__init__() + self.add = P.TensorAdd() + self.abs = P.Abs() + self.exp = P.Exp() + self.neg = P.Neg() + self.minimum = P.Minimum() + self.div = P.RealDiv() + self.mul = P.Mul() + self.CSVALUE = 0.044715 + self.CSVALUE_A = 1.59576912 + self.CSVALUE_5 = 0.3989422804 + self.CSVALUE_3B = 0.2140644488 + + def construct(self, input_x): + def _tanh_parameter_compute(data_x): + """ + compute the parameter of tanh: + return: result equal (x+0.044715*tf.pow(x,3)) + """ + mul_0 = self.mul(data_x, data_x) + pow_0 = self.mul(mul_0, data_x) + mul_1 = self.mul(pow_0, self.CSVALUE) + result = self.add(data_x, mul_1) + + return result + + tanh_parameter = _tanh_parameter_compute(input_x) + mul_0 = self.mul(tanh_parameter, 1.5957691) + + mul_0_min = self.minimum(mul_0, 0.0) + right_mul = self.exp(mul_0_min) + + mul_0_abs = self.abs(mul_0) + mul_0_abs_neg = self.mul(mul_0_abs, -1.0) + mul_0_abs_neg_exp = self.exp(mul_0_abs_neg) + + mul_0_abs_neg_exp_add = self.add(mul_0_abs_neg_exp, 1.0) + left_mul = self.div(input_x, mul_0_abs_neg_exp_add) + + result = self.mul(left_mul, right_mul) + return result + + def 
bprop(self, input_x, out, dout): + """ register backprop function for Gelu """ + data_x = input_x + data_gelu = out + data_dy = dout + + def _math_four_compute(data_x): + """ + return: math_four equal 2*(np(sqrt(2 / np.pi)*(x + 0.044715*tf.pow(x, 3))) + """ + datax_pow = data_x * data_x * data_x + datax_muls_c = self.mul(datax_pow, self.CSVALUE) + datax_addx = self.add(datax_muls_c, data_x) + datax_muls_s = self.mul(datax_addx, self.CSVALUE_A) + + return datax_muls_s + + # common part + math_four = _math_four_compute(data_x) + math_four_abs = self.abs(math_four) + math_four_abs_neg = self.mul(math_four_abs, -1.0) + math_four_abs_neg_exp = self.exp(math_four_abs_neg) + math_four_min = self.minimum(math_four, 0.0) + + # dividend part + datax_pow = self.mul(data_x, data_x) + datax_pow_mul = self.mul(datax_pow, self.CSVALUE_3B) + datax_pow_mul_add = self.add(datax_pow_mul, self.CSVALUE_A) + data_gelu_mul = self.mul(data_gelu, datax_pow_mul_add) + math_four_min_2 = self.mul(math_four_min, 2.0) + div_right = self.mul(data_gelu_mul, math_four_abs_neg_exp) + div_left = self.exp(math_four_min_2) + dividend = self.add(div_left, div_right) + + # divisor part + div_0 = self.add(math_four_abs_neg_exp, 1.0) + div_1 = self.exp(math_four_min) + divisor = self.mul(div_1, div_0) + res_grad = self.div(dividend, divisor) + + result = self.mul(res_grad, data_dy) + return (result,) + + +class Softmax(GraphKernel): + """ + Operator Softmax + .. 
math: `exp(x-max(x)) / sum(exp(x-max(x)))` + + Args: + axis (int, tuple): Axis along which the softmax normalization is applied + + Inputs: + x (Tensor): input data for softmax + + Outputs: + output (Tensor): a tensor with the same shape of the input + + Examples: + >>> layer = Softmax(1) + >>> x = Tensor(np.array([1.2, 2.1], [2.2, 3.2]), mindspore.float32) + >>> output = layer(x) + """ + + def __init__(self, axis): + super(Softmax, self).__init__() + validator.check_type("axis", axis, [int, tuple]) + if isinstance(axis, int): + self.axis = (axis,) + else: + self.axis = axis + for item in self.axis: + validator.check_type("item of axis", item, [int]) + self.max = P.ReduceMax(keep_dims=True) + self.sub = P.Sub() + self.exp = P.Exp() + self.sum = P.ReduceSum(keep_dims=True) + self.mul = P.Mul() + + def construct(self, x): + max_x = self.max(x, self.axis) + data_sub = self.sub(x, max_x) + data_exp = self.exp(data_sub) + data_expsum = self.sum(data_exp, self.axis) + output = data_exp / data_expsum + return output + + def bprop(self, x, out, dout): + mul_res = self.mul(dout, out) + sum_res = self.sum(mul_res, self.axis) + sub_res = self.sub(dout, sum_res) + res = self.mul(sub_res, out) + return (res,) + + +class LayerNorm(Cell): + r""" + Applies Layer Normalization over a mini-batch of inputs. + + Layer normalization is widely used in recurrent neural networks. It applies + normalization over a mini-batch of inputs for each single training case as described + in the paper `Layer Normalization `_. Unlike batch + normalization, layer normalization performs exactly the same computation at training and + testing times. It can be described using the following formula. It is applied across all channels + and pixel but only one batch size. + + .. math:: + y = \frac{x - \mathrm{E}[x]}{\sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta + + Args: + normalized_shape (Union(tuple[int], list[int]): The normalization is performed over axis + `begin_norm_axis ... R - 1`. 
+ begin_norm_axis (int): It first normalization dimension: normalization will be performed along dimensions + `begin_norm_axis: rank(inputs)`, the value should be in [-1, rank(input)). Default: -1. + begin_params_axis (int): The first parameter(beta, gamma)dimension: scale and centering parameters + will have dimensions `begin_params_axis: rank(inputs)` and will be broadcast with + the normalized inputs accordingly, the value should be in [-1, rank(input)). Default: -1. + gamma_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the gamma weight. + The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform', + 'he_uniform', etc. Default: 'ones'. + beta_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the beta weight. + The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform', + 'he_uniform', etc. Default: 'zeros'. + + Inputs: + - **input_x** (Tensor) - The shape of 'input_x' is :math:`(x_1, x_2, ..., x_R)`, + and `input_shape[begin_norm_axis:]` is equal to `normalized_shape`. + + Outputs: + Tensor, the normalized and scaled offset tensor, has the same shape and data type as the `input_x`. 
+ + Examples: + >>> x = Tensor(np.ones([20, 5, 10, 10]), mindspore.float32) + >>> shape1 = x.shape()[1:] + >>> m = G.LayerNorm(shape1, begin_norm_axis=1, begin_params_axis=1) + >>> m(x) + """ + + def __init__(self, + begin_norm_axis=-1, + begin_params_axis=-1 + ): + super(LayerNorm, self).__init__() + self.begin_norm_axis = begin_norm_axis + self.begin_params_axis = begin_params_axis + self.layer_norm = LayerNormForward(begin_norm_axis, begin_params_axis) + self.layer_norm_x_grad = LayerNormXBackprop() + self.layer_norm_beta_gamma = LayerNormBetaGammaBackprop() + self.layer_norm_grad = G.LayerNormGrad(self.begin_norm_axis, self.begin_params_axis) + + def construct(self, input_x, input_gamma, input_beta): + return self.layer_norm(input_x, input_gamma, input_beta) + + # case 1 + def bprop(self, input_x, input_gamma, input_beta, out, dout): + dx, d_gamma, d_beta = self.layer_norm_grad(input_x, dout[0], out[2], dout[1], input_gamma) + return dx, d_gamma, d_beta + + +class LambUpdateWithLR(GraphKernel): + r""" + Part of Lamb optimizer. + + .. math:: + s_1 = select(i_1 \gt y_g, select(i_0 \gt y_g, \frac{i_1}{i_2}, se), se) + i_5 = i_5 - max(min(s_1, y_m), y_g) \times i_3 \times i_4 + + Inputs: + - **input0** (Tensor) - The first tensor to be computed. + - **input1** (Tensor) - The second tensor to be computed. + - **input2** (Tensor) - The third tensor to be computed. + - **input3** (Tensor) - The fourth tensor to be computed. + - **input4** (Tensor) - The fifth tensor to be computed. + - **input5** (Tensor) - The sixth tensor to be computed. It will be updated by result. + - **greater_y** (Tensor) - The seventh tensor to be computed. + - **select_e** (Tensor) - The eighth tensor to be computed. + - **minimum_y** (Tensor) - The ninth tensor to be computed. + + Outputs: + A fake output tensor. 
+ + Examples: + >>> lamb_update = LambUpdateWithLR() + >>> i0 = np.random.normal(0, 1, [1, 16]).astype(np.float32) + >>> i1 = np.random.normal(0, 1, [1]).astype(np.float32) + >>> i2 = np.random.normal(0, 1, [1]).astype(np.float32) + >>> i3 = np.random.normal(0, 1, [1]).astype(np.float32) + >>> i4 = np.random.normal(0, 1, [1, 16]).astype(np.float32) + >>> i5 = np.random.normal(0, 1, [1, 16]).astype(np.float32) + >>> yg = np.random.normal(0, 1, [1]).astype(np.float32) + >>> se = np.random.normal(0, 1, [1]).astype(np.float32) + >>> ym = np.random.normal(0, 1, [1]).astype(np.float32) + >>> lamb_update(i0, i1, i2, i3, i4, i5, yg, se, ym) + + """ + + def __init__(self): + super(LambUpdateWithLR, self).__init__() + self.greater = P.Greater() + self.select = P.Select() + self.div = P.RealDiv() + self.min = P.Minimum() + self.max = P.Maximum() + self.mul = P.Mul() + self.sub = P.Sub() + self.fake_output_assign = InplaceAssign() + self.fake_output_assign.add_prim_attr("fake_output", True) + + def construct(self, input0, input1, input2, input3, input4, input5, greater_y, select_e, minimum_y): + greater0 = self.greater(input0, greater_y) + greater1 = self.greater(input1, greater_y) + real_div0 = self.div(input1, input2) + select0 = self.select(greater0, real_div0, select_e) + select1 = self.select(greater1, select0, select_e) + min0 = self.min(select1, minimum_y) + max0 = self.max(min0, greater_y) + mul0 = self.mul(max0, input3) + mul1 = self.mul(mul0, input4) + sub0 = self.sub(input5, mul1) + sub0 = self.fake_output_assign(input5, sub0, sub0) + return sub0 + +class LambNextMV(GraphKernel): + r""" + Part of Lamb optimizer. + + .. 
math:: + rd_0 = \frac{i_8 \times i_5 + i_9 \times i_4}{i6} + rd_1 = \frac{x_0 \times i_2 + x_1 \times i_1}{i3} + y_2 = \frac{rd_0}{\sqrt{rd_1 + x3}} + x_2 \times i_7 + y_3 = \frac{rd_0}{\sqrt{rd_1} + x3} + i5 = i_8 \times i_5 + i_9 \times i_4 + i2 = x_0 \times i_2 + x_1 \times i_1 + + Inputs: + - **inputs1** (Tensor) - The first input tensor to be computed. + - **inputs2** (Tensor) - The second input tensor to be computed. It will be updated by result. + - **inputs3** (Tensor) - The third input tensor to be computed. + - **inputs4** (Tensor) - The fourth input tensor to be computed. + - **inputs5** (Tensor) - The fifth input tensor to be computed. It will be updated by result. + - **inputs6** (Tensor) - The sixth input tensor to be computed. + - **inputs7** (Tensor) - The seventh input tensor to be computed. + - **inputs8** (Tensor) - The eighth input tensor to be computed. + - **inputs9** (Tensor) - The ninth input tensor to be computed. + - **inputsx0** (Tensor) - The tenth input tensor to be computed. + - **inputsx1** (Tensor) - The eleventh input tensor to be computed. + - **inputsx2** (Tensor) - The twelfth input tensor to be computed. + - **inputsx3** (Tensor) - The thirteenth input tensor to be computed. + + Outputs: + Tuple of 2 Tensor. + + - **add3** (Tensor) - The shape is same as the shape after broadcasting, and the data type is + the one with high precision or high digits among the inputs. + - **realdiv4** (Tensor) - The shape is same as the shape after broadcasting, and the data type is + the one with high precision or high digits among the inputs. 
+ + Examples: + >>> lamb_next_mv = LambNextMV() + >>> i1 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32)) + >>> i2 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32)) + >>> i3 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32)) + >>> i4 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32)) + >>> i5 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32)) + >>> i6 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32)) + >>> i7 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32)) + >>> i8 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32)) + >>> i9 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32)) + >>> x0 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32)) + >>> x1 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32)) + >>> x2 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32)) + >>> x3 = Tensor(np.ones([1, 16]).astype(np.float32) * 1e-6) + >>> lamb_next_mv(i1, i2, i3, i4, i5, i6, i7, i8, i9, x0, x1, x2, x3) + + """ + + def __init__(self): + super(LambNextMV, self).__init__() + self.mul = P.Mul() + self.add = P.TensorAdd() + self.div = P.RealDiv() + self.sqrt = P.Sqrt() + self.rsqrt = P.Rsqrt() + self.fake_output_assign_1 = InplaceAssign() + self.fake_output_assign_1.add_prim_attr("fake_output", False) + self.fake_output_assign_2 = InplaceAssign() + self.fake_output_assign_2.add_prim_attr("fake_output", False) + + + def construct(self, input1, input2, input3, input4, input5, input6, input7, + input8, input9, inputx0, inputx1, inputx2, inputx3): + mul3 = self.mul(inputx1, input1) + mul2 = self.mul(inputx0, input2) + add1 = self.add(mul2, mul3) + realdiv1 = self.div(add1, input3) + add2 = self.add(realdiv1, inputx3) + sqrt0 = self.rsqrt(add2) + sqrt1 = self.sqrt(realdiv1) + add4 = self.add(sqrt1, inputx3) + mul1 = self.mul(input9, input4) + mul0 = self.mul(input8, input5) + add0 = self.add(mul0, mul1) + realdiv0 = self.div(add0, input6) + realdiv2 = self.mul(realdiv0, 
sqrt0) + realdiv4 = self.div(realdiv0, add4) + mul4 = self.mul(inputx2, input7) + add3 = self.add(realdiv2, mul4) + + add3 = self.fake_output_assign_1(input5, add0, add3) + add3 = self.fake_output_assign_2(input2, add1, add3) + + return add3, realdiv4 diff --git a/mindspore/nn/layer/activation.py b/mindspore/nn/layer/activation.py index 3a754e4c03..14a1aa8554 100644 --- a/mindspore/nn/layer/activation.py +++ b/mindspore/nn/layer/activation.py @@ -20,8 +20,10 @@ from mindspore.common.parameter import Parameter from mindspore.common.initializer import initializer from mindspore.common.tensor import Tensor from mindspore._extends import cell_attr_register +from mindspore.ops import _selected_ops from ..cell import Cell + __all__ = ['Softmax', 'LogSoftmax', 'ReLU', @@ -73,7 +75,7 @@ class Softmax(Cell): def __init__(self, axis=-1): super(Softmax, self).__init__() - self.softmax = P.Softmax(axis) + self.softmax = _selected_ops.Softmax(axis) def construct(self, x): return self.softmax(x) @@ -110,7 +112,7 @@ class LogSoftmax(Cell): def __init__(self, axis=-1): super(LogSoftmax, self).__init__() - self.log_softmax = P.LogSoftmax(axis) + self.log_softmax = _selected_ops.LogSoftmax(axis) def construct(self, x): return self.log_softmax(x) @@ -249,11 +251,11 @@ class LeakyReLU(Cell): self.alpha = alpha def construct(self, x): - alpha = P.Cast()(F.scalar_to_array(self.alpha), P.DType()(x)) + alpha_array = P.Cast()(F.scalar_to_array(self.alpha), P.DType()(x)) if self.alpha <= 1: - out = P.Maximum()(alpha * x, x) + out = P.Maximum()(alpha_array * x, x) else: - out = P.Minimum()(alpha * x, x) + out = P.Minimum()(alpha_array * x, x) return out @@ -286,7 +288,7 @@ class Tanh(Cell): def __init__(self): super(Tanh, self).__init__() - self.tanh = P.Tanh() + self.tanh = _selected_ops.Tanh() def construct(self, x): return self.tanh(x) @@ -318,7 +320,7 @@ class GELU(Cell): def __init__(self): super(GELU, self).__init__() - self.gelu = P.Gelu() + self.gelu = _selected_ops.Gelu() def 
construct(self, x): return self.gelu(x) @@ -378,7 +380,7 @@ class PReLU(Cell): Tensor, with the same type and shape as the `input_data`. Examples: - >>> input_x = Tensor(np.array([-1, -2, 0, 2, 1]), mindspore.float32) + >>> input_x = Tensor(np.random.rand(1, 10, 4, 4), mindspore.float32) >>> prelu = nn.PReLU() >>> prelu(input_x) @@ -503,6 +505,7 @@ class LogSigmoid(Cell): [-3.1326166e-01, -1.2692806e-01, -4.8587345e-02] """ + def __init__(self): super(LogSigmoid, self).__init__() self.mul = P.Mul() @@ -549,9 +552,9 @@ def get_activation(name): Examples: >>> sigmoid = nn.get_activation('sigmoid') """ - if not name: + if name is None: return None if name not in _activation: - raise KeyError("Unknown activation type") + raise KeyError(f"Unknown activation type '{name}'") return _activation[name]() diff --git a/mindspore/nn/layer/basic.py b/mindspore/nn/layer/basic.py index 8f4e468e0b..b1d5af48c9 100644 --- a/mindspore/nn/layer/basic.py +++ b/mindspore/nn/layer/basic.py @@ -22,15 +22,21 @@ from mindspore.ops import operations as P from mindspore.ops import functional as F from mindspore.ops.functional import identity from mindspore.ops.operations import _inner_ops as inner +from mindspore.ops.primitive import constexpr from mindspore.common.parameter import Parameter from mindspore._extends import cell_attr_register from mindspore.common.api import ms_function from mindspore import context +from mindspore.ops import _selected_ops from ..cell import Cell from .activation import get_activation from ..._checkparam import Validator as validator +from ..._checkparam import Rel + + +__all__ = ['Dropout', 'Flatten', 'Dense', 'ClipByNorm', 'Norm', 'OneHot', 'Pad', 'Unfold', + 'MatrixDiag', 'MatrixDiagPart', 'MatrixSetDiag'] -__all__ = ['Dropout', 'Flatten', 'Dense', 'ClipByNorm', 'Norm', 'OneHot', 'Pad', 'Unfold'] class Dropout(Cell): r""" @@ -73,6 +79,7 @@ class Dropout(Cell): >>> net = nn.Dropout(keep_prob=0.8) >>> net(x) """ + def __init__(self, keep_prob=0.5, seed0=0, 
seed1=0, dtype=mstype.float32): super(Dropout, self).__init__() if keep_prob <= 0 or keep_prob > 1: @@ -130,12 +137,13 @@ class Flatten(Cell): Examples: >>> net = nn.Flatten() >>> input = Tensor(np.array([[[1.2, 1.2], [2.1, 2.1]], [[2.2, 2.2], [3.2, 3.2]]]), mindspore.float32) - >>> input.shape() + >>> input.shape (2, 2, 2) >>> net(input) [[1.2 1.2 2.1 2.1] [2.2 2.2 3.2 3.2]] """ + def __init__(self): super(Flatten, self).__init__() @@ -197,21 +205,21 @@ class Dense(Cell): self.has_bias = check_bool(has_bias) if isinstance(weight_init, Tensor): - if weight_init.dim() != 2 or weight_init.shape()[0] != out_channels or \ - weight_init.shape()[1] != in_channels: + if weight_init.dim() != 2 or weight_init.shape[0] != out_channels or \ + weight_init.shape[1] != in_channels: raise ValueError("weight_init shape error") self.weight = Parameter(initializer(weight_init, [out_channels, in_channels]), name="weight") if self.has_bias: if isinstance(bias_init, Tensor): - if bias_init.dim() != 1 or bias_init.shape()[0] != out_channels: + if bias_init.dim() != 1 or bias_init.shape[0] != out_channels: raise ValueError("bias_init shape error") self.bias = Parameter(initializer(bias_init, [out_channels]), name="bias") self.matmul = P.MatMul(transpose_b=True) - self.bias_add = P.BiasAdd() + self.bias_add = _selected_ops.BiasAdd() self.activation = get_activation(activation) self.activation_flag = self.activation is not None @@ -236,6 +244,13 @@ class Dense(Cell): return str_info +@constexpr +def _is_equal_one(x): + if x is None: + return False + return bool(x.asnumpy().mean() == 1.0) + + class ClipByNorm(Cell): r""" Clips tensor values to a maximum :math:`L_2`-norm. 
@@ -263,6 +278,7 @@ class ClipByNorm(Cell): >>> net(input, clip_norm) """ + def __init__(self): super(ClipByNorm, self).__init__() self.reduce_sum = P.ReduceSum(keep_dims=True) @@ -290,7 +306,11 @@ class ClipByNorm(Cell): l2sum_safe = self.select_(cond, l2sum, self.cast(ones_, self.dtype(l2sum))) l2norm = self.select_(cond, self.sqrt(l2sum_safe), l2sum) - intermediate = x * clip_norm + if _is_equal_one(clip_norm): + intermediate = x + else: + intermediate = x * clip_norm + max_norm = self.max_op(l2norm, clip_norm) values_clip = self.cast(intermediate, mstype.float32) / self.expand_dims(max_norm, -1) values_clip = self.reshape(values_clip, self.shape(x)) @@ -319,6 +339,7 @@ class Norm(Cell): >>> input = Tensor(np.random.randint(0, 10, [4, 16]), mindspore.float32) >>> net(input) """ + def __init__(self, axis=(), keep_dims=False): super(Norm, self).__init__() self.axis = axis @@ -381,6 +402,7 @@ class OneHot(Cell): [0. 1.] [0. 0.]]] """ + def __init__(self, axis=-1, depth=1, on_value=1.0, off_value=0.0, dtype=mstype.float32): super(OneHot, self).__init__() self.onehot = P.OneHot(axis) @@ -495,6 +517,7 @@ class Unfold(Cell): Tensor ([[[[1, 1] [1, 1]] [[1, 1], [1, 1]] [[1, 1] [1, 1]], [[1, 1], [1, 1]]]], shape=(1, 4, 2, 2), dtype=mstype.float16) """ + def __init__(self, ksizes, strides, rates, padding="valid"): super(Unfold, self).__init__() self.extract_image_patches = inner.ExtractImagePatches(ksizes, strides, rates, padding) @@ -507,3 +530,112 @@ class Unfold(Cell): ret = self.extract_image_patches(x_transpose) ret_transpose = self.transpose(ret, self.format_NCHW) return ret_transpose + + +@constexpr +def _get_matrix_diag_assist(x_shape, x_dtype): + validator.check_integer("x rank", len(x_shape), 1, Rel.GE, "_get_matrix_diag_assist") + base_eye = np.eye(x_shape[-1], x_shape[-1]).reshape(-1) + assist = np.tile(base_eye, x_shape[:-1]).reshape(x_shape + (x_shape[-1],)) + return Tensor(assist, x_dtype) + + +@constexpr +def _get_matrix_diag_part_assist(x_shape, x_dtype): 
+ validator.check_integer("x rank", len(x_shape), 2, Rel.GE, "_get_matrix_diag_part_assist") + base_eye = np.eye(x_shape[-2], x_shape[-1]).reshape(-1) + assist = np.tile(base_eye, x_shape[:-2]).reshape(x_shape) + return Tensor(assist, x_dtype) + + +class MatrixDiag(Cell): + """ + Returns a batched diagonal tensor with a given batched diagonal values. + + Inputs: + - **x** (Tensor) - The diagonal values. It can be of the following data types: + float32, float16, int32, int8, uint8. + + Outputs: + Tensor, same type as input `x`. The shape should be x.shape + (x.shape[-1], ). + + Examples: + >>> x = Tensor(np.array([1, -1]), mstype.float32) + >>> matrix_diag = nn.MatrixDiag() + >>> result = matrix_diag(x) + [[1. 0.] + [0. -1.]] + """ + def __init__(self): + super(MatrixDiag, self).__init__() + self.matrix_diag = inner.MatrixDiag() + self.dtype = P.DType() + + def construct(self, input_x): + x_shape = F.shape(input_x) + x_dtype = self.dtype(input_x) + assist = _get_matrix_diag_assist(x_shape, x_dtype) + out_matrix_diag = self.matrix_diag(input_x, assist) + return out_matrix_diag + + +class MatrixDiagPart(Cell): + r""" + Returns the batched diagonal part of a batched tensor. + + Inputs: + - **x** (Tensor) - The batched tensor. It can be of the following data types: + float32, float16, int32, int8, uint8. + + Outputs: + Tensor, same type as input `x`. The shape should be x.shape[:-2] + [min(x.shape[-2:])]. 
+ + Examples: + >>> x = Tensor([[[-1, 0], [0, 1]], [-1, 0], [0, 1]], [[-1, 0], [0, 1]]], mindspore.float32) + >>> matrix_diag_part = nn.MatrixDiagPart() + >>> result = matrix_diag_part(x) + [[-1., 1.], [-1., 1.], [-1., 1.]] + """ + def __init__(self): + super(MatrixDiagPart, self).__init__() + self.matrix_diag_part = inner.MatrixDiagPart() + self.dtype = P.DType() + + def construct(self, input_x): + x_shape = F.shape(input_x) + x_dtype = self.dtype(input_x) + assist = _get_matrix_diag_part_assist(x_shape, x_dtype) + out_matrix_diag_part = self.matrix_diag_part(input_x, assist) + return out_matrix_diag_part + + +class MatrixSetDiag(Cell): + r""" + Modify the batched diagonal part of a batched tensor. + + Inputs: + - **x** (Tensor) - The batched tensor. It can be of the following data types: + float32, float16, int32, int8, uint8. + - **diagonal** (Tensor) - The diagonal values. + + Outputs: + Tensor, same type as input `x`. The shape same as `x`. + + Examples: + >>> x = Tensor([[[-1, 0], [0, 1]], [-1, 0], [0, 1]], [[-1, 0], [0, 1]]], mindspore.float32) + >>> diagonal = Tensor([[-1., 2.], [-1., 1.], [-1., 1.]], mindspore.float32) + >>> matrix_set_diag = nn.MatrixSetDiag() + >>> result = matrix_set_diag(x, diagonal) + [[[-1, 0], [0, 2]], [-1, 0], [0, 1]], [[-1, 0], [0, 1]]] + """ + def __init__(self): + super(MatrixSetDiag, self).__init__() + self.matrix_set_diag = inner.MatrixSetDiag() + self.dtype = P.DType() + + def construct(self, input_x, diagonal): + x_shape = F.shape(input_x) + x_dtype = self.dtype(input_x) + assist = _get_matrix_diag_part_assist(x_shape, x_dtype) + out_matrix_set_diag = self.matrix_set_diag(input_x, diagonal, assist) + return out_matrix_set_diag diff --git a/mindspore/nn/layer/combined.py b/mindspore/nn/layer/combined.py deleted file mode 100644 index 671365e393..0000000000 --- a/mindspore/nn/layer/combined.py +++ /dev/null @@ -1,182 +0,0 @@ -# Copyright 2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 
(the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""Use combination of Conv, Dense, Relu, Batchnorm.""" - -from .normalization import BatchNorm2d -from .activation import get_activation -from ..cell import Cell -from . import conv, basic -from ..._checkparam import ParamValidator as validator - - -__all__ = ['Conv2d', 'Dense'] - -class Conv2d(Cell): - r""" - A combination of convolution, Batchnorm, activation layer. - - For a more Detailed overview of Conv2d op. - - Args: - in_channels (int): The number of input channel :math:`C_{in}`. - out_channels (int): The number of output channel :math:`C_{out}`. - kernel_size (Union[int, tuple]): The data type is int or tuple with 2 integers. Specifies the height - and width of the 2D convolution window. Single int means the value if for both height and width of - the kernel. A tuple of 2 ints means the first value is for the height and the other is for the - width of the kernel. - stride (int): Specifies stride for all spatial dimensions with the same value. Value of stride should be - greater or equal to 1 but bounded by the height and width of the input. Default: 1. - pad_mode (str): Specifies padding mode. The optional values are "same", "valid", "pad". Default: "same". - padding (int): Implicit paddings on both sides of the input. Default: 0. - dilation (int): Specifying the dilation rate to use for dilated convolution. 
If set to be :math:`k > 1`, - there will be :math:`k - 1` pixels skipped for each sampling location. Its value should be greater - or equal to 1 and bounded by the height and width of the input. Default: 1. - group (int): Split filter into groups, `in_ channels` and `out_channels` should be - divisible by the number of groups. Default: 1. - has_bias (bool): Specifies whether the layer uses a bias vector. Default: False. - weight_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the convolution kernel. - It can be a Tensor, a string, an Initializer or a numbers.Number. When a string is specified, - values from 'TruncatedNormal', 'Normal', 'Uniform', 'HeUniform' and 'XavierUniform' distributions as well - as constant 'One' and 'Zero' distributions are possible. Alias 'xavier_uniform', 'he_uniform', 'ones' - and 'zeros' are acceptable. Uppercase and lowercase are both acceptable. Refer to the values of - Initializer for more details. Default: 'normal'. - bias_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the bias vector. Possible - Initializer and string are the same as 'weight_init'. Refer to the values of - Initializer for more details. Default: 'zeros'. - batchnorm (bool): Specifies to used batchnorm or not. Default: None. - activation (string): Specifies activation type. The optional values are as following: - 'softmax', 'logsoftmax', 'relu', 'relu6', 'tanh', 'gelu', 'sigmoid', - 'prelu', 'leakyrelu', 'hswish', 'hsigmoid'. Default: None. - - Inputs: - - **input** (Tensor) - Tensor of shape :math:`(N, C_{in}, H_{in}, W_{in})`. - - Outputs: - Tensor of shape :math:`(N, C_{out}, H_{out}, W_{out})`. 
- - Examples: - >>> net = combined.Conv2d(120, 240, 4, batchnorm=True, activation='ReLU') - >>> input = Tensor(np.ones([1, 120, 1024, 640]), mindspore.float32) - >>> net(input).shape() - (1, 240, 1024, 640) - """ - - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride=1, - pad_mode='same', - padding=0, - dilation=1, - group=1, - has_bias=False, - weight_init='normal', - bias_init='zeros', - batchnorm=None, - activation=None): - super(Conv2d, self).__init__() - self.conv = conv.Conv2d( - in_channels, - out_channels, - kernel_size, - stride, - pad_mode, - padding, - dilation, - group, - has_bias, - weight_init, - bias_init) - self.has_bn = batchnorm is not None - self.has_act = activation is not None - self.batchnorm = batchnorm - if batchnorm is True: - self.batchnorm = BatchNorm2d(out_channels) - elif batchnorm is not None: - validator.check_isinstance('batchnorm', batchnorm, (BatchNorm2d,)) - self.activation = get_activation(activation) - - def construct(self, x): - x = self.conv(x) - if self.has_bn: - x = self.batchnorm(x) - if self.has_act: - x = self.activation(x) - return x - - -class Dense(Cell): - r""" - A combination of Dense, Batchnorm, activation layer. - - For a more Detailed overview of Dense op. - - Args: - in_channels (int): The number of channels in the input space. - out_channels (int): The number of channels in the output space. - weight_init (Union[Tensor, str, Initializer, numbers.Number]): The trainable weight_init parameter. The dtype - is same as input x. The values of str refer to the function `initializer`. Default: 'normal'. - bias_init (Union[Tensor, str, Initializer, numbers.Number]): The trainable bias_init parameter. The dtype is - same as input x. The values of str refer to the function `initializer`. Default: 'zeros'. - has_bias (bool): Specifies whether the layer uses a bias vector. Default: True. - activation (str): Regularizer function applied to the output of the layer, eg. 'relu'. Default: None. 
- batchnorm (bool): Specifies to used batchnorm or not. Default: None. - activation (string): Specifies activation type. The optional values are as following: - 'softmax', 'logsoftmax', 'relu', 'relu6', 'tanh', 'gelu', 'sigmoid', - 'prelu', 'leakyrelu', 'hswish', 'hsigmoid'. Default: None. - - Inputs: - - **input** (Tensor) - Tensor of shape :math:`(N, in\_channels)`. - - Outputs: - Tensor of shape :math:`(N, out\_channels)`. - - Examples: - >>> net = nn.Dense(3, 4) - >>> input = Tensor(np.random.randint(0, 255, [2, 3]), mindspore.float32) - >>> net(input) - """ - - def __init__(self, - in_channels, - out_channels, - weight_init='normal', - bias_init='zeros', - has_bias=True, - batchnorm=None, - activation=None): - super(Dense, self).__init__() - self.dense = basic.Dense( - in_channels, - out_channels, - weight_init, - bias_init, - has_bias) - self.has_bn = batchnorm is not None - self.has_act = activation is not None - if batchnorm is True: - self.batchnorm = BatchNorm2d(out_channels) - elif batchnorm is not None: - validator.check_isinstance('batchnorm', batchnorm, (BatchNorm2d,)) - self.activation = get_activation(activation) - - def construct(self, x): - x = self.dense(x) - if self.has_bn: - x = self.batchnorm(x) - if self.has_act: - x = self.activation(x) - return x diff --git a/mindspore/nn/layer/container.py b/mindspore/nn/layer/container.py index b9ce230aec..48871401bf 100644 --- a/mindspore/nn/layer/container.py +++ b/mindspore/nn/layer/container.py @@ -140,6 +140,11 @@ class SequentialCell(Cell): def __len__(self): return len(self._cells) + def set_grad(self, flag=True): + self.requires_grad = flag + for cell in self._cells.values(): + cell.set_grad(flag) + def construct(self, input_data): for cell in self.cell_list: input_data = cell(input_data) @@ -150,8 +155,9 @@ class CellList(_CellListBase, Cell): """ Holds Cells in a list. 
- CellList can be indexed like a regular Python list, but cells it - contains are properly registered, and will be visible by all Cell methods. + CellList can be used like a regular Python list, support + '__getitem__', '__setitem__', '__delitem__', '__len__', '__iter__' and '__iadd__', + but cells it contains are properly registered, and will be visible by all Cell methods. Args: args (list, optional): List of subclass of Cell. @@ -245,5 +251,10 @@ class CellList(_CellListBase, Cell): self._cells[str(len(self))] = cell return self + def set_grad(self, flag=True): + self.requires_grad = flag + for cell in self._cells.values(): + cell.set_grad(flag) + def construct(self, *inputs): raise NotImplementedError diff --git a/mindspore/nn/layer/conv.py b/mindspore/nn/layer/conv.py index e02908aed3..b2a0de9cbe 100644 --- a/mindspore/nn/layer/conv.py +++ b/mindspore/nn/layer/conv.py @@ -168,7 +168,7 @@ class Conv2d(_Conv): Examples: >>> net = nn.Conv2d(120, 240, 4, has_bias=False, weight_init='normal') >>> input = Tensor(np.ones([1, 120, 1024, 640]), mindspore.float32) - >>> net(input).shape() + >>> net(input).shape (1, 240, 1024, 640) """ @cell_attr_register diff --git a/mindspore/nn/layer/embedding.py b/mindspore/nn/layer/embedding.py index 5df38b6845..c8873039ab 100755 --- a/mindspore/nn/layer/embedding.py +++ b/mindspore/nn/layer/embedding.py @@ -44,10 +44,12 @@ class Embedding(Cell): dtype (:class:`mindspore.dtype`): Data type of input. Default: mindspore.float32. Inputs: - - **input** (Tensor) - Tensor of shape :math:`(\text{vocab_size})`. + - **input** (Tensor) - Tensor of shape :math:`(\text{batch_size}, \text{input_length})`. The element of + the Tensor should be integer and not larger than vocab_size. else the corresponding embedding vector is zero + if larger than vocab_size. Outputs: - Tensor of shape :math:`(\text{vocab_size}, \text{embedding_size})`. + Tensor of shape :math:`(\text{batch_size}, \text{input_length}, \text{embedding_size})`. 
Examples: >>> net = nn.Embedding(20000, 768, True) @@ -55,12 +57,13 @@ class Embedding(Cell): >>> >>> # Maps the input word IDs to word embedding. >>> output = net(input_data) - >>> output.shape() + >>> output.shape (8, 128, 768) """ def __init__(self, vocab_size, embedding_size, use_one_hot=False, embedding_table='normal', dtype=mstype.float32): super(Embedding, self).__init__() validator.check_subclass("dtype", dtype, mstype.number_type, self.cls_name) + validator.check_value_type('use_one_hot', use_one_hot, [bool], self.cls_name) self.vocab_size = vocab_size self.embedding_size = embedding_size self.use_one_hot = use_one_hot diff --git a/mindspore/nn/layer/image.py b/mindspore/nn/layer/image.py index 39cc7895f3..b23f20deb8 100644 --- a/mindspore/nn/layer/image.py +++ b/mindspore/nn/layer/image.py @@ -23,7 +23,7 @@ from mindspore._checkparam import Validator as validator from mindspore._checkparam import Rel from ..cell import Cell -__all__ = ['ImageGradients', 'SSIM', 'PSNR'] +__all__ = ['ImageGradients', 'SSIM', 'PSNR', 'CentralCrop'] class ImageGradients(Cell): r""" @@ -264,3 +264,72 @@ class PSNR(Cell): psnr = 10 * P.Log()(F.square(max_val) / mse) / F.scalar_log(10.0) return psnr + + +@constexpr +def _raise_dims_rank_error(input_shape, param_name, func_name): + """raise error if input is not 3d or 4d""" + raise ValueError(f"{func_name} {param_name} should be 3d or 4d, but got shape {input_shape}") + +@constexpr +def _get_bbox(rank, shape, central_fraction): + """get bbox start and size for slice""" + if rank == 3: + c, h, w = shape + else: + n, c, h, w = shape + + central_fraction = central_fraction.asnumpy()[0] + bbox_h_start = int((float(h) - float(h) * central_fraction) / 2) + bbox_w_start = int((float(w) - float(w) * central_fraction) / 2) + bbox_h_size = h - bbox_h_start * 2 + bbox_w_size = w - bbox_w_start * 2 + + if rank == 3: + bbox_begin = (0, bbox_h_start, bbox_w_start) + bbox_size = (c, bbox_h_size, bbox_w_size) + else: + bbox_begin = (0, 0, 
bbox_h_start, bbox_w_start) + bbox_size = (n, c, bbox_h_size, bbox_w_size) + + return bbox_begin, bbox_size + +class CentralCrop(Cell): + """ + Crop the central region of the images with the central_fraction. + + Args: + central_fraction (float): Fraction of size to crop. It must be float and in range (0.0, 1.0]. + + Inputs: + - **image** (Tensor) - A 3-D tensor of shape [C, H, W], or a 4-D tensor of shape [N, C, H, W]. + + Outputs: + Tensor, 3-D or 4-D float tensor, according to the input. + + Examples: + >>> net = nn.CentralCrop(central_fraction=0.5) + >>> image = Tensor(np.random.random((4, 3, 4, 4)), mindspore.float32) + >>> output = net(image) + """ + + def __init__(self, central_fraction): + super(CentralCrop, self).__init__() + validator.check_value_type("central_fraction", central_fraction, [float], self.cls_name) + self.central_fraction = validator.check_number_range('central_fraction', central_fraction, + 0.0, 1.0, Rel.INC_RIGHT, self.cls_name) + self.central_fraction_tensor = Tensor(np.array([central_fraction]).astype(np.float64)) + self.slice = P.Slice() + + def construct(self, image): + image_shape = F.shape(image) + rank = len(image_shape) + if not rank in (3, 4): + return _raise_dims_rank_error(image_shape, "image", self.cls_name) + if self.central_fraction == 1.0: + return image + + bbox_begin, bbox_size = _get_bbox(rank, image_shape, self.central_fraction_tensor) + image = self.slice(image, bbox_begin, bbox_size) + + return image diff --git a/mindspore/nn/layer/lstm.py b/mindspore/nn/layer/lstm.py index 6122e82aaa..71c2920850 100755 --- a/mindspore/nn/layer/lstm.py +++ b/mindspore/nn/layer/lstm.py @@ -13,15 +13,17 @@ # limitations under the License.
# ============================================================================ """lstm""" -from mindspore.ops import operations as P -from mindspore.nn.cell import Cell -from mindspore.common.parameter import Parameter -from mindspore.common.initializer import initializer -from mindspore._checkparam import Validator as validator -from mindspore import context +import math +import numpy as np import mindspore.nn as nn +from mindspore import context +from mindspore._checkparam import Validator as validator +from mindspore.common.initializer import initializer +from mindspore.common.parameter import Parameter, ParameterTuple from mindspore.common.tensor import Tensor -import numpy as np +from mindspore.nn.cell import Cell +from mindspore.ops import operations as P +from ..._checkparam import Rel __all__ = ['LSTM', 'LSTMCell'] @@ -122,6 +124,8 @@ class LSTM(Cell): self.num_layers = num_layers self.has_bias = has_bias self.batch_first = validator.check_value_type("batch_first", batch_first, [bool], self.cls_name) + self.hidden_size = validator.check_integer("hidden_size", hidden_size, 0, Rel.GT, self.cls_name) + self.num_layers = validator.check_integer("num_layers", num_layers, 0, Rel.GT, self.cls_name) self.dropout = float(dropout) self.bidirectional = bidirectional if self.batch_first: @@ -147,23 +151,31 @@ class LSTM(Cell): if self.has_bias: increment_size += 2 * gate_size weight_size += increment_size * num_directions - self.weight = Parameter(initializer(0.0, [weight_size, 1, 1]), name='weight') + stdv = 1 / math.sqrt(hidden_size) + w_np = np.random.uniform(-stdv, stdv, (weight_size, 1, 1)).astype(np.float32) + self.weight = Parameter(initializer(Tensor(w_np), [weight_size, 1, 1]), name='weight') else: - layer = [] - layer.append(nn.LSTMCell(input_size=self.input_size, - hidden_size=self.hidden_size, - layer_index=0, - has_bias=self.has_bias, - bidirectional=self.bidirectional, - dropout=self.dropout)) - for i in range(num_layers - 1): - 
layer.append(nn.LSTMCell(input_size=self.hidden_size * num_directions, - hidden_size=self.hidden_size, - layer_index=i + 1, - has_bias=self.has_bias, - bidirectional=self.bidirectional, - dropout=self.dropout)) - self.lstms = layer + input_size_list = [] + input_size_list.append(self.input_size) + for i in range(self.num_layers - 1): + input_size_list.append(self.hidden_size * num_directions) + weights = [] + layers = [] + bias_size = 0 if not self.has_bias else num_directions * self.hidden_size * 4 + stdv = 1 / math.sqrt(hidden_size) + for i in range(num_layers): + weight_size = (input_size_list[i] + self.hidden_size) * num_directions * self.hidden_size * 4 + if has_bias: + weight_size = weight_size + bias_size + w_np = np.random.uniform(-stdv, stdv, (weight_size, 1, 1)).astype(np.float32) + weights.append(Parameter(initializer(Tensor(w_np), w_np.shape), name='weight' + str(i))) + layers.append(nn.LSTMCell(input_size=input_size_list[i], + hidden_size=self.hidden_size, + has_bias=self.has_bias, + bidirectional=self.bidirectional, + dropout=self.dropout)) + self.lstms = layers + self.weight = ParameterTuple(tuple(weights)) self.fill = P.Fill() self.shape = P.Shape() @@ -177,12 +189,12 @@ class LSTM(Cell): output = self.transpose2(output, (1, 0, 2)) return (output, (h, c)) h, c = hx - output, hn, cn, _, _ = self.lstms[0](x, h[0], c[0]) + output, hn, cn, _, _ = self.lstms[0](x, h[0], c[0], self.weight[0]) for i in range(1, self.num_layers): - output, hn, cn, _, _ = self.lstms[i](output, h[i], c[i]) + output, hn, cn, _, _ = self.lstms[i](output, h[i], c[i], self.weight[i]) if self.batch_first: output = self.transpose2(output, (1, 0, 2)) - return output, hn, cn, _, _ + return (output, (hn, cn)) class LSTMCell(Cell): @@ -271,11 +283,9 @@ class LSTMCell(Cell): >>> output, hn, cn, _, _ = net(input, h0, c0) """ - def __init__(self, input_size, hidden_size, - layer_index=0, has_bias=True, batch_first=False, dropout=0, @@ -283,8 +293,6 @@ class LSTMCell(Cell): super(LSTMCell, 
self).__init__() self.input_size = input_size self.hidden_size = hidden_size - self.num_layers = 1 - self.layer_index = layer_index self.has_bias = has_bias self.batch_first = validator.check_value_type("batch_first", batch_first, [bool], self.cls_name) self.dropout = float(dropout) @@ -295,16 +303,7 @@ class LSTMCell(Cell): if self.batch_first: self.transpose1 = P.Transpose() self.transpose2 = P.Transpose() - w_np = np.ones([(self.input_size + self.hidden_size) * self.num_directions * self.hidden_size * 4, 1]).astype( - np.float32) * 0.01 - if has_bias: - b_np = np.ones([self.num_directions * self.hidden_size * 4, 1]).astype( - np.float32) * 0.01 - else: - b_np = np.zeros([self.num_directions * self.hidden_size * 4, 1]).astype( - np.float32) * 0.01 - wb_np = np.concatenate((w_np, b_np), axis=0).reshape([-1, 1, 1]) - self.w = Parameter(initializer(Tensor(wb_np), wb_np.shape), name='w' + str(self.layer_index)) + self.lstm = P.LSTM(input_size=self.input_size, hidden_size=self.hidden_size, num_layers=1, @@ -312,10 +311,10 @@ class LSTMCell(Cell): bidirectional=self.bidirectional, dropout=self.dropout) - def construct(self, x, h, c): + def construct(self, x, h, c, w): if self.batch_first: x = self.transpose1(x, (1, 0, 2)) - output, hn, cn, _, _ = self.lstm(x, h, c, self.w) + output, hn, cn, _, _ = self.lstm(x, h, c, w) if self.batch_first: output = self.transpose2(output, (1, 0, 2)) return output, hn, cn, _, _ diff --git a/mindspore/nn/layer/math.py b/mindspore/nn/layer/math.py index 8a714c1cde..1ecb20056e 100644 --- a/mindspore/nn/layer/math.py +++ b/mindspore/nn/layer/math.py @@ -15,12 +15,16 @@ """math""" import math from mindspore.ops import operations as P +from mindspore.ops.operations import _inner_ops as inner from mindspore.common.tensor import Tensor from ..cell import Cell from ...common import dtype as mstype from ..._checkparam import Validator as validator +from ..._checkparam import Rel + + +__all__ = ['ReduceLogSumExp', 'Range', 'LinSpace'] -__all__ = 
['ReduceLogSumExp', 'Range'] class ReduceLogSumExp(Cell): r""" @@ -79,8 +83,8 @@ class Range(Cell): start (Union[int, float]): If `limit` is `None`, the value acts as limit in the range and first entry defaults to `0`. Otherwise, it acts as first entry in the range. limit (Union[int, float]): Acts as upper limit of sequence. If `None`, defaults to the value of `start` - while set the first entry of the range to `0`. - delta (Union[int, float]): Increment of the range. Default: 1. + while set the first entry of the range to `0`. It can not be equal to `start`. + delta (Union[int, float]): Increment of the range. It can not be equal to zero. Default: 1. Outputs: Tensor, the dtype is int if the dtype of `start`, `limit` and `delta` all are int. Otherwise, dtype is float. @@ -93,10 +97,12 @@ class Range(Cell): def __init__(self, start, limit=None, delta=1): super(Range, self).__init__() - validator.check_value_type("start", start, [int, float], None) - validator.check_value_type("delta", delta, [int, float], None) + validator.check_value_type("start", start, [int, float], self.cls_name) + validator.check_value_type("delta", delta, [int, float], self.cls_name) + if delta == 0: + raise ValueError("The input of `delta` can not be equal to zero.") if limit is not None: - validator.check_value_type("limit", limit, [int, float], None) + validator.check_value_type("limit", limit, [int, float], self.cls_name) if isinstance(start, int) and isinstance(limit, int) and isinstance(delta, int): self.dtype = mstype.int32 else: @@ -112,7 +118,7 @@ class Range(Cell): limit = float(limit) if isinstance(delta, int): delta = float(delta) - self.range_x = P.Range(start, limit, delta) + self.range_x = inner.Range(start, limit, delta) if limit is None: length_input = math.ceil(start / delta) else: @@ -122,3 +128,48 @@ class Range(Cell): def construct(self): range_out = self.range_x(self.input_tensor) return range_out + + +class LinSpace(Cell): + r""" + Generates values in an interval. 
And return the corresponding interpolation according to assist. + + Args: + - **start** (Union[int, float]) - The start of interval, With shape of 0-D. + - **stop** (Union[int, float]) - The end of interval, With shape of 0-D. + - **num** (int) - ticks number in the interval, the ticks include start and stop value. + With shape of 0-D. + + Outputs: + Tensor, With type same as `start`. The shape is 1-D with length of `num`. + + Examples: + >>> linspace = nn.LinSpace() + >>> start = Tensor(1, mindspore.float32) + >>> stop = Tensor(10, mindspore.float32) + >>> num = Tensor(5, mindspore.int32) + >>> output = linspace(start, stop, num) + [1, 3.25, 5.5, 7.75, 10] + """ + + def __init__(self, start, stop, num): + super(LinSpace, self).__init__() + validator.check_value_type("start", start, [int, float], self.cls_name) + validator.check_value_type("stop", stop, [int, float], self.cls_name) + validator.check_value_type("num", num, [int], self.cls_name) + validator.check_integer("num", num, 0, Rel.GT, self.cls_name) + + self.is_single = bool(num == 1) + self.lin_space = inner.LinSpace() + self.start = Tensor(start, mstype.float32) + self.stop = Tensor(stop, mstype.float32) + self.assist = Tensor(list(range(num)), mstype.float32) + self.num = Tensor(num, mstype.int32) + self.start_array = Tensor([start], mstype.float32) + + def construct(self): + if self.is_single: + return self.start_array + + lin_space_out = self.lin_space(self.assist, self.start, self.stop, self.num) + return lin_space_out diff --git a/mindspore/nn/layer/normalization.py b/mindspore/nn/layer/normalization.py index f90b8d28ed..4c7ea9d4d6 100644 --- a/mindspore/nn/layer/normalization.py +++ b/mindspore/nn/layer/normalization.py @@ -18,17 +18,17 @@ from mindspore.ops import functional as F from mindspore.common.parameter import Parameter from mindspore.common.initializer import initializer from mindspore.ops.primitive import constexpr -from mindspore.common.tensor import Tensor -import mindspore.common.dtype
as mstype import mindspore.context as context from mindspore._checkparam import check_bool, check_typename from mindspore._extends import cell_attr_register from mindspore.communication.management import get_group_size, get_rank from mindspore.communication import management from mindspore._checkparam import check_int_positive +from mindspore.ops import _selected_ops from ..cell import Cell + __all__ = ['BatchNorm1d', 'BatchNorm2d', 'LayerNorm', 'GroupNorm', 'GlobalBatchNorm'] class _BatchNorm(Cell): @@ -85,13 +85,12 @@ class _BatchNorm(Cell): self.reshape = P.Reshape() self.is_ascend = context.get_context("device_target") == "Ascend" self.is_graph_mode = context.get_context("mode") == context.GRAPH_MODE - + self.momentum = 1.0 - momentum if context.get_context("enable_ge"): self.is_ge_backend = True - self.momentum = Tensor(1.0 - momentum, mstype.float32) else: self.is_ge_backend = False - self.momentum = 1.0 - momentum + if self.is_graph_mode and (self.is_ge_backend or self.is_ascend): self.bn_train = P.BatchNorm(is_training=True, epsilon=self.eps) @@ -119,12 +118,11 @@ class _BatchNorm(Cell): "local_rank_size is {}".format(group_size, get_group_size())) if len(world_rank) % group_size != 0: raise ValueError("please make your group size correct.") - world_rank_list = zip(*(iter(world_rank),) *group_size) + world_rank_list = zip(*(iter(world_rank),) * group_size) group_list = [list(i) for i in world_rank_list] return group_list - def _global_sync(self, x, axes, re_shape): """calculate global batch normalization output""" x_mean = self.reduce_mean(x, axes) @@ -191,15 +189,19 @@ class _BatchNorm(Cell): return 'num_features={}, eps={}, momentum={}, gamma={}, beta={}, moving_mean={}, moving_variance={}'.format( self.num_features, self.eps, self.momentum, self.gamma, self.beta, self.moving_mean, self.moving_variance) + @constexpr def _channel_check(channel, num_channel): if channel != num_channel: raise ValueError("the input channel is not equal with num_channel") + 
@constexpr def _shape_check(in_shape): if len(in_shape) != 4: raise ValueError("The input must has 4 dims") + + @constexpr def _shape_infer(x_shape, num_feature): """global batch normalization shape and axes infer""" @@ -211,6 +213,7 @@ def _shape_infer(x_shape, num_feature): re_shape = (1, num_feature) return axes, re_shape + class BatchNorm1d(_BatchNorm): r""" Batch normalization layer over a 2D input. @@ -260,6 +263,7 @@ class BatchNorm1d(_BatchNorm): >>> input = Tensor(np.random.randint(0, 255, [3, 16]), mindspore.float32) >>> net(input) """ + def __init__(self, num_features, eps=1e-5, @@ -279,6 +283,7 @@ class BatchNorm1d(_BatchNorm): moving_mean_init, moving_var_init, use_batch_statistics) + def _check_data_dim(self, x): if x.dim() != 2: pass @@ -333,6 +338,7 @@ class BatchNorm2d(_BatchNorm): >>> input = Tensor(np.random.randint(0, 255, [1, 3, 224, 224]), mindspore.float32) >>> net(input) """ + def __init__(self, num_features, eps=1e-5, @@ -352,6 +358,7 @@ class BatchNorm2d(_BatchNorm): moving_mean_init, moving_var_init, use_batch_statistics) + def _check_data_dim(self, x): if x.dim() != 4: pass @@ -375,7 +382,7 @@ class GlobalBatchNorm(_BatchNorm): Args: num_features (int): `C` from an expected input of size (N, C, H, W). - device_num_each_group (int): The number of devices in each group. + device_num_each_group (int): The number of devices in each group. Default: 1. eps (float): A value added to the denominator for numerical stability. Default: 1e-5. momentum (float): A floating hyperparameter of the momentum for the running_mean and running_var computation. Default: 0.9. 
@@ -407,6 +414,7 @@ class GlobalBatchNorm(_BatchNorm): >>> input = Tensor(np.random.randint(0, 255, [1, 3, 224, 224]), mindspore.float32) >>> global_bn_op(input) """ + def __init__(self, num_features, eps=1e-5, @@ -431,10 +439,12 @@ class GlobalBatchNorm(_BatchNorm): self.group = check_int_positive(device_num_each_group) if self.group <= 1: raise ValueError("the number of group must be greater than 1.") + def _check_data_dim(self, x): if x.dim == 0: pass + class LayerNorm(Cell): r""" Applies Layer Normalization over a mini-batch of inputs. @@ -474,10 +484,11 @@ class LayerNorm(Cell): Examples: >>> x = Tensor(np.ones([20, 5, 10, 10]), mindspore.float32) - >>> shape1 = x.shape()[1:] + >>> shape1 = x.shape[1:] >>> m = nn.LayerNorm(shape1, begin_norm_axis=1, begin_params_axis=1) >>> m(x) """ + def __init__(self, normalized_shape, begin_norm_axis=-1, @@ -498,8 +509,8 @@ class LayerNorm(Cell): gamma_init, normalized_shape), name="gamma") self.beta = Parameter(initializer( beta_init, normalized_shape), name="beta") - self.layer_norm = P.LayerNorm(begin_norm_axis=self.begin_norm_axis, begin_params_axis=self.begin_params_axis, - epsilon=self.epsilon) + self.layer_norm = _selected_ops.LayerNorm(begin_norm_axis=self.begin_norm_axis, + begin_params_axis=self.begin_params_axis) def construct(self, input_x): y, _, _ = self.layer_norm(input_x, self.gamma, self.beta) @@ -511,6 +522,7 @@ class LayerNorm(Cell): self.normalized_shape, self.begin_norm_axis, self.begin_params_axis, self.gamma, self.beta) return s + class GroupNorm(Cell): r""" Group Normalization over a mini-batch of inputs. 
@@ -547,6 +559,7 @@ class GroupNorm(Cell): >>> x = Tensor(np.ones([1, 64, 256, 256], np.float32)) >>> goup_norm_op(x) """ + def __init__(self, num_groups, num_channels, eps=1e-05, affine=True, gamma_init='ones', beta_init='zeros'): super(GroupNorm, self).__init__() self.num_groups = check_int_positive(num_groups) diff --git a/mindspore/nn/layer/pooling.py b/mindspore/nn/layer/pooling.py index 89bc65bb09..6c26fcea67 100644 --- a/mindspore/nn/layer/pooling.py +++ b/mindspore/nn/layer/pooling.py @@ -113,7 +113,7 @@ class MaxPool2d(_PoolNd): [0. 0. 4. 0.] [1. 8. 7. 0.]]]] >>> output = pool(x) - >>> output.shape() + >>> output.shape (1, 2, 2, 2) >>> output [[[[7. 8.] @@ -195,7 +195,7 @@ class AvgPool2d(_PoolNd): [0. 8. 9. 7.] [2. 1. 4. 9.]]]] >>> output = pool(x) - >>> output.shape() + >>> output.shape (1, 2, 2, 2) >>> output [[[[4.888889 4.4444447] @@ -260,7 +260,7 @@ class AvgPool1d(_PoolNd): >>> pool = nn.AvgPool1d(kernel_size=6, strides=1) >>> x = Tensor(np.random.randint(0, 10, [1, 3, 6]), mindspore.float32) >>> output = pool(x) - >>> output.shape() + >>> output.shape (1, 3, 1) """ diff --git a/mindspore/nn/layer/quant.py b/mindspore/nn/layer/quant.py index 305a69800f..14731c6262 100644 --- a/mindspore/nn/layer/quant.py +++ b/mindspore/nn/layer/quant.py @@ -12,8 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================ -"""Aware quantization.""" +"""Quantization aware.""" +from functools import partial import numpy as np import mindspore.common.dtype as mstype from mindspore.ops import operations as P @@ -22,15 +23,21 @@ from mindspore.common.parameter import Parameter from mindspore.common.initializer import initializer from mindspore.common.tensor import Tensor from mindspore._checkparam import check_int_positive, check_bool, twice -from mindspore._checkparam import Validator as validator +from mindspore._checkparam import Validator as validator, Rel from mindspore.nn.cell import Cell from mindspore.nn.layer.activation import get_activation import mindspore.context as context - +from .normalization import BatchNorm2d +from .activation import get_activation +from ..cell import Cell +from . import conv, basic +from ..._checkparam import ParamValidator as validator +from ...ops.operations import _quant_ops as Q __all__ = [ + 'Conv2dBnAct', + 'DenseBnAct', 'FakeQuantWithMinMax', - 'DepthwiseConv2dBatchNormQuant', 'Conv2dBatchNormQuant', 'Conv2dQuant', 'DenseQuant', @@ -43,12 +50,171 @@ __all__ = [ ] +class Conv2dBnAct(Cell): + r""" + A combination of convolution, Batchnorm, activation layer. + + For a more Detailed overview of Conv2d op. + + Args: + in_channels (int): The number of input channel :math:`C_{in}`. + out_channels (int): The number of output channel :math:`C_{out}`. + kernel_size (Union[int, tuple]): The data type is int or tuple with 2 integers. Specifies the height + and width of the 2D convolution window. Single int means the value if for both height and width of + the kernel. A tuple of 2 ints means the first value is for the height and the other is for the + width of the kernel. + stride (int): Specifies stride for all spatial dimensions with the same value. Value of stride should be + greater or equal to 1 but bounded by the height and width of the input. Default: 1. 
+ pad_mode (str): Specifies padding mode. The optional values are "same", "valid", "pad". Default: "same". + padding (int): Implicit paddings on both sides of the input. Default: 0. + dilation (int): Specifying the dilation rate to use for dilated convolution. If set to be :math:`k > 1`, + there will be :math:`k - 1` pixels skipped for each sampling location. Its value should be greater + or equal to 1 and bounded by the height and width of the input. Default: 1. + group (int): Split filter into groups, `in_ channels` and `out_channels` should be + divisible by the number of groups. Default: 1. + has_bias (bool): Specifies whether the layer uses a bias vector. Default: False. + weight_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the convolution kernel. + It can be a Tensor, a string, an Initializer or a numbers.Number. When a string is specified, + values from 'TruncatedNormal', 'Normal', 'Uniform', 'HeUniform' and 'XavierUniform' distributions as well + as constant 'One' and 'Zero' distributions are possible. Alias 'xavier_uniform', 'he_uniform', 'ones' + and 'zeros' are acceptable. Uppercase and lowercase are both acceptable. Refer to the values of + Initializer for more details. Default: 'normal'. + bias_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the bias vector. Possible + Initializer and string are the same as 'weight_init'. Refer to the values of + Initializer for more details. Default: 'zeros'. + batchnorm (bool): Specifies to used batchnorm or not. Default: None. + activation (string): Specifies activation type. The optional values are as following: + 'softmax', 'logsoftmax', 'relu', 'relu6', 'tanh', 'gelu', 'sigmoid', + 'prelu', 'leakyrelu', 'hswish', 'hsigmoid'. Default: None. + + Inputs: + - **input** (Tensor) - Tensor of shape :math:`(N, C_{in}, H_{in}, W_{in})`. + + Outputs: + Tensor of shape :math:`(N, C_{out}, H_{out}, W_{out})`. 
+ + Examples: + >>> net = Conv2dBnAct(120, 240, 4, batchnorm=True, activation='ReLU') + >>> input = Tensor(np.ones([1, 120, 1024, 640]), mindspore.float32) + >>> net(input).shape + (1, 240, 1024, 640) + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + pad_mode='same', + padding=0, + dilation=1, + group=1, + has_bias=False, + weight_init='normal', + bias_init='zeros', + batchnorm=None, + activation=None): + super(Conv2dBnAct, self).__init__() + self.conv = conv.Conv2d( + in_channels, + out_channels, + kernel_size, + stride, + pad_mode, + padding, + dilation, + group, + has_bias, + weight_init, + bias_init) + self.has_bn = batchnorm is not None + self.has_act = activation is not None + self.batchnorm = batchnorm + if batchnorm is True: + self.batchnorm = BatchNorm2d(out_channels) + elif batchnorm is not None: + validator.check_isinstance('batchnorm', batchnorm, (BatchNorm2d,)) + self.activation = get_activation(activation) + + def construct(self, x): + x = self.conv(x) + if self.has_bn: + x = self.batchnorm(x) + if self.has_act: + x = self.activation(x) + return x + + +class DenseBnAct(Cell): + r""" + A combination of Dense, Batchnorm, activation layer. + + For a more Detailed overview of Dense op. + + Args: + in_channels (int): The number of channels in the input space. + out_channels (int): The number of channels in the output space. + weight_init (Union[Tensor, str, Initializer, numbers.Number]): The trainable weight_init parameter. The dtype + is same as input x. The values of str refer to the function `initializer`. Default: 'normal'. + bias_init (Union[Tensor, str, Initializer, numbers.Number]): The trainable bias_init parameter. The dtype is + same as input x. The values of str refer to the function `initializer`. Default: 'zeros'. + has_bias (bool): Specifies whether the layer uses a bias vector. Default: True. + activation (str): Regularizer function applied to the output of the layer, eg. 'relu'. Default: None. 
+ batchnorm (bool): Specifies whether to use batchnorm. Default: None. + activation (str): Specifies activation type. The optional values are as following: + 'softmax', 'logsoftmax', 'relu', 'relu6', 'tanh', 'gelu', 'sigmoid', + 'prelu', 'leakyrelu', 'hswish', 'hsigmoid'. Default: None. + + Inputs: + - **input** (Tensor) - Tensor of shape :math:`(N, in\_channels)`. + + Outputs: + Tensor of shape :math:`(N, out\_channels)`. + + Examples: + >>> net = nn.DenseBnAct(3, 4) + >>> input = Tensor(np.random.randint(0, 255, [2, 3]), mindspore.float32) + >>> net(input) + """ + + def __init__(self, + in_channels, + out_channels, + weight_init='normal', + bias_init='zeros', + has_bias=True, + batchnorm=None, + activation=None): + super(DenseBnAct, self).__init__() + self.dense = basic.Dense( + in_channels, + out_channels, + weight_init, + bias_init, + has_bias) + self.has_bn = batchnorm is not None + self.has_act = activation is not None + if batchnorm is True: + self.batchnorm = BatchNorm2d(out_channels) + elif batchnorm is not None: + validator.check_isinstance('batchnorm', batchnorm, (BatchNorm2d,)) + self.activation = get_activation(activation) + + def construct(self, x): + x = self.dense(x) + if self.has_bn: + x = self.batchnorm(x) + if self.has_act: + x = self.activation(x) + return x + + class BatchNormFoldCell(Cell): """ Batch normalization folded. Args: - momentum (float): Momentum value should be [0, 1]. Default: 0.1. + momentum (float): Momentum value should be [0, 1]. Default: 0.9. epsilon (float): A small float number to avoid dividing by 0. 1e-5 if dtype in float32 else 1e-3. Default: 1e-5. 
freeze_bn (int): Delay in steps at which computation switches from regular batch @@ -76,11 +242,11 @@ class BatchNormFoldCell(Cell): self.epsilon = epsilon self.is_gpu = context.get_context('device_target') == "GPU" if self.is_gpu: - self.bn_train = P.BatchNormFold(momentum, epsilon, is_training=True, freeze_bn=freeze_bn) - self.bn_infer = P.BatchNormFold(momentum, epsilon, is_training=False, freeze_bn=freeze_bn) + self.bn_train = Q.BatchNormFold(momentum, epsilon, is_training=True, freeze_bn=freeze_bn) + self.bn_infer = Q.BatchNormFold(momentum, epsilon, is_training=False, freeze_bn=freeze_bn) else: self.bn_reduce = P.BNTrainingReduce() - self.bn_update = P.BatchNormFoldD(momentum, epsilon, is_training=True, freeze_bn=freeze_bn) + self.bn_update = Q.BatchNormFoldD(momentum, epsilon, is_training=True, freeze_bn=freeze_bn) def construct(self, x, mean, variance, global_step): if self.is_gpu: @@ -103,124 +269,22 @@ class BatchNormFoldCell(Cell): return batch_mean, batch_std, running_mean, running_std -class FakeQuantWithMinMaxD(Cell): - r""" - Aware Quantization training op of ascend. This OP provide Fake quantization observer - function on data with min and max. - - Args: - min_init (int, list): The dimension of channel or 1(layer). Default: -6. - max_init (int, list): The dimension of channel or 1(layer). Default: 6. - num_bits (int): Quantization number bit, support 4 and 8bit. Default: 8. - ema (bool): Exponential Moving Average algorithm update min and max. Default: False. - ema_decay (float): Exponential Moving Average algorithm parameter. Default: 0.9999. - per_channel (bool): Quantization by layer or channel. Default: False. - out_channels (int): declarate the min and max channel size, Default: 1. - quant_delay (int): Quantization delay parameters according by global step. Default: 0. - symmetric (bool): Quantization algorithm use symmetric or not. Default: False. - narrow_range (bool): Quantization algorithm use narrow range or not. Default: False. 
- - Inputs: - - **x** (Tensor) - The input of FakeQuantWithMinMax. - - Outputs: - Tensor, with the same type and shape as the `x`. - - Examples: - >>> fake_quant = nn.FakeQuantWithMinMaxD() - >>> input_x = Tensor(np.array([[1, 2, 1], [-2, 0, -1]]), mindspore.float32) - >>> result = fake_quant(input_x) - """ - def __init__(self, - min_init=-6, - max_init=6, - num_bits=8, - ema=False, - ema_decay=0.999, - per_channel=False, - channel_size=1, - quant_delay=0, - symmetric=False, - narrow_range=False, - training=True): - """init FakeQuantWithMinMax ascend layer""" - super(FakeQuantWithMinMaxD, self).__init__() - - self.min_init = min_init - self.num_bits = num_bits - self.max_init = max_init - self.ema = ema - self.ema_decay = ema_decay - self.per_channel = per_channel - self.channel_size = channel_size - self.quant_delay = quant_delay - self.symmetric = symmetric - self.narrow_range = narrow_range - self.training = training - - if not per_channel: - self.fake_quant = P.FakeQuantWithMinMax(num_bits=self.num_bits, - ema=self.ema, - ema_decay=self.ema_decay, - quant_delay=self.quant_delay, - symmetric=self.symmetric, - narrow_range=self.narrow_range, - training=training) - self.ema_update = P.FakeQuantWithMinMaxUpdate(num_bits=self.num_bits, - ema=self.ema, - ema_decay=self.ema_decay, - quant_delay=self.quant_delay, - symmetric=self.symmetric, - narrow_range=self.narrow_range, - training=training) - else: - raise RuntimeError("not support per channel") - - if isinstance(min_init, Parameter): - self.minq = min_init - self.maxq = max_init - else: - self.minq = Parameter(Tensor(np.array([min_init]).astype(np.float32)), - name='quant_min', - requires_grad=False) - self.maxq = Parameter(Tensor(np.array([max_init]).astype(np.float32)), - name='quant_max', - requires_grad=False) - self.reduce_min = P.ReduceMin() - self.reduce_max = P.ReduceMax() - - def extend_repr(self): - s = 'min_init={}, max_init={}, ema={}, ema_decay={}, per_channel={}, channel_size={}, 
quant_delay={}'.format( - self.min_init, self.max_init, self.ema, self.ema_decay, self.per_channel, self.channel_size, - self.quant_delay) - return s - - def construct(self, x, minq, maxq): - if self.training: - min_up, max_up = self.ema_update(x, minq, maxq) - out = self.fake_quant(x, min_up, max_up) - P.Assign()(self.minq, min_up) - P.Assign()(self.maxq, max_up) - else: - out = self.fake_quant(x, minq, maxq) - return out - - class FakeQuantWithMinMax(Cell): r""" - Aware Quantization training op. This OP provide Fake quantization observer function on data with min and max. + Quantization aware op. This OP provide Fake quantization observer function on data with min and max. Args: - min_init (int, list): The dimension of channel or 1(layer). Default: -6. - max_init (int, list): The dimension of channel or 1(layer). Default: 6. - num_bits (int): Quantization number bit, support 4 and 8bit. Default: 8. + min_init (int, float): The dimension of channel or 1(layer). Default: -6. + max_init (int, float): The dimension of channel or 1(layer). Default: 6. ema (bool): Exponential Moving Average algorithm update min and max. Default: False. - ema_decay (float): Exponential Moving Average algorithm parameter. Default: 0.9999. - per_channel (bool): Quantization by layer or channel. Default: False. - out_channels (int): declarate the min and max channel size, Default: 1. - quant_delay (int): Quantization delay parameters according by global step. Default: 0. + ema_decay (float): Exponential Moving Average algorithm parameter. Default: 0.999. + per_channel (bool): Quantization granularity based on layer or on channel. Default: False. + channel_axis (int): Quantization by channel axis. Default: 1. + num_channels (int): declarate the min and max channel size, Default: 1. + num_bits (int): Quantization number bit, support 4 and 8bit. Default: 8. symmetric (bool): Quantization algorithm use symmetric or not. Default: False. 
narrow_range (bool): Quantization algorithm use narrow range or not. Default: False. + quant_delay (int): Quantization delay parameters according by global step. Default: 0. Inputs: - **x** (Tensor) - The input of FakeQuantWithMinMax. @@ -237,289 +301,83 @@ class FakeQuantWithMinMax(Cell): def __init__(self, min_init=-6, max_init=6, - num_bits=8, ema=False, ema_decay=0.999, per_channel=False, - out_channels=1, - quant_delay=0, + channel_axis=1, + num_channels=1, + num_bits=8, symmetric=False, narrow_range=False, - training=True): + quant_delay=0): """init FakeQuantWithMinMax layer""" super(FakeQuantWithMinMax, self).__init__() - self.min_init = min_init - self.num_bits = num_bits self.max_init = max_init + self.num_bits = num_bits self.ema = ema self.ema_decay = ema_decay self.per_channel = per_channel - self.out_channels = out_channels + self.num_channels = num_channels + self.channel_axis = channel_axis self.quant_delay = quant_delay self.symmetric = symmetric self.narrow_range = narrow_range - self.training = training + self.is_ascend = context.get_context('device_target') == "Ascend" + # init tensor min and max for fake quant op + if self.per_channel: + min_array = np.array([self.min_init] * self.num_channels).astype(np.float32) + max_array = np.array([self.max_init] * self.num_channels).astype(np.float32) + else: + min_array = np.array([self.min_init]).astype(np.float32) + max_array = np.array([self.max_init]).astype(np.float32) + self.minq = Parameter(Tensor(min_array), name='quant_min', requires_grad=False) + self.maxq = Parameter(Tensor(max_array), name='quant_max', requires_grad=False) + + # init fake quant relative op if per_channel: - min_array = np.array([self.min_init for i in range(0, self.out_channels)]).astype(np.float32) - max_array = np.array([self.max_init for i in range(0, self.channel_size)]).astype(np.float32) - self.minq = Parameter(Tensor(min_array), name='quant_min', requires_grad=False) - self.maxq = Parameter(Tensor(max_array), 
name='quant_max', requires_grad=False) - self.fake_quant_train = P.FakeQuantWithMinMaxPerChannel(num_bits=self.num_bits, - ema=self.ema, - ema_decay=self.ema_decay, - quant_delay=self.quant_delay, - symmetric=self.symmetric, - narrow_range=self.narrow_range, - training=True) - self.fake_quant_infer = P.FakeQuantWithMinMaxPerChannel(num_bits=self.num_bits, - ema=self.ema, - ema_decay=self.ema_decay, - quant_delay=self.quant_delay, - symmetric=self.symmetric, - narrow_range=self.narrow_range, - training=False) + quant_fun = partial(Q.FakeQuantPerChannel, channel_axis=self.channel_axis) + ema_fun = partial(Q.MinMaxUpdatePerChannel, channel_axis=self.channel_axis) else: - min_array = np.array([min_init]).reshape(1).astype(np.float32) - max_array = np.array([max_init]).reshape(1).astype(np.float32) - self.minq = Parameter(Tensor(min_array), name='quant_min', requires_grad=False) - self.maxq = Parameter(Tensor(max_array), name='quant_max', requires_grad=False) - if context.get_context('device_target') == "Ascend": - self.fake_quant_train = FakeQuantWithMinMaxD(num_bits=self.num_bits, - ema=self.ema, - ema_decay=self.ema_decay, - quant_delay=self.quant_delay, - symmetric=self.symmetric, - narrow_range=self.narrow_range, - training=True, - min_init=self.minq, - max_init=self.maxq) - self.fake_quant_infer = FakeQuantWithMinMaxD(num_bits=self.num_bits, - ema=self.ema, - ema_decay=self.ema_decay, - quant_delay=self.quant_delay, - symmetric=self.symmetric, - narrow_range=self.narrow_range, - training=False, - min_init=self.minq, - max_init=self.maxq) - elif context.get_context('device_target') == "GPU": - self.fake_quant_train = P.FakeQuantWithMinMax(num_bits=self.num_bits, - ema=self.ema, - ema_decay=self.ema_decay, - quant_delay=self.quant_delay, - symmetric=self.symmetric, - narrow_range=self.narrow_range, - training=True) - self.fake_quant_infer = P.FakeQuantWithMinMax(num_bits=self.num_bits, - ema=self.ema, - ema_decay=ema_decay, - quant_delay=quant_delay, - 
symmetric=self.symmetric, - narrow_range=self.narrow_range, - training=False) - else: - raise ValueError("Not support platform.") + quant_fun = Q.FakeQuantPerLayer + ema_fun = Q.MinMaxUpdatePerLayer + + self.ema_update = ema_fun(ema=self.ema, ema_decay=self.ema_decay) + if self.is_ascend: + self.fake_quant_train = quant_fun(num_bits=self.num_bits, + symmetric=self.symmetric, + narrow_range=self.narrow_range) + self.fake_quant_infer = self.fake_quant_train + else: + quant_fun = partial(quant_fun, + ema=self.ema, + ema_decay=ema_decay, + num_bits=self.num_bits, + symmetric=self.symmetric, + narrow_range=self.narrow_range, + quant_delay=quant_delay) + self.fake_quant_train = quant_fun(training=True) + self.fake_quant_infer = quant_fun(training=False) def extend_repr(self): - s = 'min={}, max={}, ema={}, ema_decay={}, per_channel={}, quant_delay={}'.format( - self.min_init, self.max_init, self.ema, self.ema_decay, self.per_channel, self.quant_delay) + s = 'num_bits={}, symmetric={}, narrow_range={}, ema={}({}), per_channel={}({}, {}), ' \ + 'quant_delay={}, min_init={}, max_init={}'.format( + self.num_bits, self.symmetric, self.narrow_range, self.ema, self.ema_decay, self.per_channel, + self.channel_axis, self.num_channels, self.quant_delay, self.min_init, self.max_init) return s def construct(self, x): if self.training: + min_up, max_up = self.ema_update(x, self.minq, self.maxq) + P.Assign()(self.minq, min_up) + P.Assign()(self.maxq, max_up) out = self.fake_quant_train(x, self.minq, self.maxq) else: out = self.fake_quant_infer(x, self.minq, self.maxq) return out -class DepthwiseConv2dBatchNormQuant(Cell): - r""" - 2D depthwise convolution with BatchNormal op folded layer. - - For a more Detailed overview of Conv2d op. - - Args: - in_channels (int): The number of input channel :math:`C_{in}`. - out_channels (int): The number of output channel :math:`C_{out}`. - kernel_size (Union[int, tuple]): Specifies the height and width of the 2D convolution window. 
- stride (int): Specifies stride for all spatial dimensions with the same value. - pad_mode: (str): Specifies padding mode. The optional values are "same", "valid", "pad". Default: "same". - padding: (int): Implicit paddings on both sides of the input. Default: 0. - eps (int): Parameters for BatchNormal. Default: 1e-5. - momentum (int): Parameters for BatchNormal op. Default: 0.9. - weight_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the - convolution kernel. Default: 'None'. - beta_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the - beta vector. Default: 'None'. - gamma_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the - gamma vector. Default: 'None'. - mean_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the - mean vector. Default: 'None'. - var_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the - variance vector. Default: 'None'. - quant_delay (int): Quantization delay parameters according by global step. Default: 0. - freeze_bn (int): Quantization freeze BatchNormal op according by global step. Default: 100000. - fake (bool): Conv2dBatchNormQuant Cell add FakeQuantWithMinMax op or not. Default: True. - num_bits (int): Quantization number bit, support 4 and 8bit. Default: 8. - per_channel (bool): FakeQuantWithMinMax Parameters. Default: False. - symmetric (bool): Quantization algorithm use symmetric or not. Default: False. - narrow_range (bool): Quantization algorithm use narrow range or not. Default: False. - - Inputs: - - **x** (Tensor) - Tensor of shape :math:`(N, C_{in}, H_{in}, W_{in})`. - - Outputs: - Tensor of shape :math:`(N, C_{out}, H_{out}, W_{out})`. 
- - Examples: - >>> quant = nn.DepthwiseConv2dBatchNormQuant(1, 6, - kernel_size= (2, 2), - stride=(1, 1), - pad_mode="valid", - >>> dilation=(1, 1)) - >>> input_x = Tensor(np.random.randint(-2, 2, (2, 1, 1, 3)), mindspore.float32) - >>> result = quant(input_x) - """ - - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride=1, - pad_mode='same', - padding=0, - dilation=1, - group=1, - eps=1e-5, - momentum=0.997, - weight_init=None, - beta_init=None, - gamma_init=None, - mean_init=None, - var_init=None, - quant_delay=0, - freeze_bn=100000, - fake=True, - num_bits=8, - per_channel=False, - symmetric=False, - narrow_range=False): - """init DepthwiseConv2dBatchNormQuant layer""" - super(DepthwiseConv2dBatchNormQuant, self).__init__() - self.in_channels = in_channels - self.out_channels = out_channels - self.pad_mode = pad_mode - self.padding = padding - self.dilation = twice(dilation) - self.stride = twice(stride) - self.group = group - self.fake = fake - self.freeze_bn = freeze_bn - self.momentum = momentum - self.quant_delay = quant_delay - if isinstance(kernel_size, int): - self.kernel_size = (kernel_size, kernel_size) - else: - self.kernel_size = kernel_size - if group > 1: - validator.check_integer('group', group, 'in_channels', in_channels, 'Conv2dBatchNormQuant') - validator.check_integer('group', group, 'in_channels', out_channels, 'Conv2dBatchNormQuant') - self.is_depthwise = group > 1 - - channel_multiplier = out_channels // in_channels - self.conv = P.DepthwiseConv2dNative(channel_multiplier=channel_multiplier, - kernel_size=kernel_size, - stride=stride, - pad_mode=pad_mode, - pad=padding) - - if weight_init is None: - weight_init = initializer('normal', [channel_multiplier, in_channels, *kernel_size]) - self.weight = Parameter(weight_init, name='weight') - if gamma_init is None: - gamma_init = initializer('ones', [out_channels]) - self.gamma = Parameter(gamma_init, name='gamma') - if beta_init is None: - beta_init = initializer('zeros', 
[out_channels]) - self.beta = Parameter(beta_init, name='beta') - if mean_init is None: - mean_init = initializer('zeros', [out_channels]) - self.moving_mean = Parameter( - mean_init, name='moving_mean', requires_grad=False) - if var_init is None: - var_init = initializer('ones', [out_channels]) - self.moving_variance = Parameter( - var_init, name='moving_variance', requires_grad=False) - - self.step = Parameter(initializer( - 'normal', [1], dtype=mstype.int32), name='step', requires_grad=False) - - self.fake_quant_weight = FakeQuantWithMinMax(min_init=-6, - max_init=6, - ema=False, - num_bits=num_bits, - quant_delay=quant_delay, - per_channel=per_channel, - out_channels=out_channels, - symmetric=symmetric, - narrow_range=narrow_range) - self.batchnorm_fold = BatchNormFoldCell(epsilon=eps, momentum=momentum, freeze_bn=freeze_bn) - - self.correct_mul = P.CorrectionMul(self.is_depthwise) - if context.get_context('device_target') == "Ascend": - self.batchnorm_fold2_train = P.BatchNormFold2_D(freeze_bn=freeze_bn) - self.batchnorm_fold2_infer = P.BatchNormFold2_D(freeze_bn=0) - elif context.get_context('device_target') == "GPU": - self.batchnorm_fold2_train = P.BatchNormFold2(freeze_bn=freeze_bn) - self.batchnorm_fold2_infer = P.BatchNormFold2(freeze_bn=0) - else: - raise ValueError("Not support platform.") - self.one = Tensor(1, mstype.int32) - self.assignadd = P.AssignAdd() - self.is_gpu = context.get_context('device_target') == "GPU" - - def extend_repr(self): - s = 'in_channels={}, out_channels={}, kernel_size={}, stride={}, ' \ - 'pad_mode={}, padding={}, dilation={}, group={}, ' \ - 'fake={}, freeze_bn={}, momentum={}, quant_delay={}'.format( - self.in_channels, self.out_channels, self.kernel_size, self.stride, - self.pad_mode, self.padding, self.dilation, self.group, - self.fake, self.freeze_bn, self.momentum, self.quant_delay) - return s - - def construct(self, x): - out_conv = self.conv(x, self.weight) - # BN fold1 - batch_mean, batch_std, running_mean, 
running_std = self.batchnorm_fold(out_conv, - self.moving_mean, - self.moving_variance, - self.step) - # fake weight - weight = self.correct_mul(self.weight, self.gamma, running_std) - if self.fake: - weight = self.fake_quant_weight(weight) - out = self.conv(x, weight) - # BN fold2 - if self.is_gpu: - if self.training: - out = self.batchnorm_fold2_train(out, self.beta, self.gamma, - batch_std, batch_mean, running_std, running_mean, self.step) - F.control_depend(out, self.assignadd(self.step, self.one)) - else: - out = self.batchnorm_fold2_infer(out, self.beta, self.gamma, - batch_std, batch_mean, running_std, running_mean, self.step) - else: - if self.training: - out = self.batchnorm_fold2_train(out, self.beta, self.gamma, batch_std, batch_mean, running_std) - F.control_depend(out, self.assignadd(self.step, self.one)) - else: - out = self.batchnorm_fold2_infer(out, self.beta, self.gamma, batch_std, batch_mean, running_std) - return out - - class Conv2dBatchNormQuant(Cell): r""" 2D convolution with BatchNormal op folded layer. @@ -533,25 +391,25 @@ class Conv2dBatchNormQuant(Cell): stride (int): Specifies stride for all spatial dimensions with the same value. pad_mode: (str): Specifies padding mode. The optional values are "same", "valid", "pad". Default: "same". padding: (int): Implicit paddings on both sides of the input. Default: 0. - eps (int): Parameters for BatchNormal. Default: 1e-5. - momentum (int): Parameters for BatchNormal op. Default: 0.9. + eps (float): Parameters for BatchNormal. Default: 1e-5. + momentum (float): Parameters for BatchNormal op. Default: 0.997. weight_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the - convolution kernel. Default: 'None'. + convolution kernel. Default: 'normal'. beta_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the - beta vector. Default: 'None'. + beta vector. Default: 'zeros'. 
gamma_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the - gamma vector. Default: 'None'. + gamma vector. Default: 'ones'. mean_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the - mean vector. Default: 'None'. + mean vector. Default: 'zeros'. var_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the - variance vector. Default: 'None'. - quant_delay (int): Quantization delay parameters according by global step. Default: 0. - freeze_bn (int): Quantization freeze BatchNormal op according by global step. Default: 100000. + variance vector. Default: 'ones'. fake (bool): Conv2dBatchNormQuant Cell add FakeQuantWithMinMax op or not. Default: True. - num_bits (int): Quantization number bit, support 4 and 8bit. Default: 8. per_channel (bool): FakeQuantWithMinMax Parameters. Default: False. + num_bits (int): Quantization number bit, support 4 and 8bit. Default: 8. symmetric (bool): Quantization algorithm use symmetric or not. Default: False. narrow_range (bool): Quantization algorithm use narrow range or not. Default: False. + quant_delay (int): Quantization delay parameters according by global step. Default: 0. + freeze_bn (int): Quantization freeze BatchNormal op according by global step. Default: 100000. Inputs: - **x** (Tensor) - Tensor of shape :math:`(N, C_{in}, H_{in}, W_{in})`. @@ -559,7 +417,7 @@ class Conv2dBatchNormQuant(Cell): Outputs: Tensor of shape :math:`(N, C_{out}, H_{out}, W_{out})`. 
- Examples: + Examples: >>> batchnorm_quant = nn.Conv2dBatchNormQuant(1, 6, kernel_size= (2, 2), stride=(1, 1), pad_mode="valid", >>> dilation=(1, 1)) >>> input_x = Tensor(np.random.randint(-2, 2, (2, 1, 1, 3)), mindspore.float32) @@ -577,84 +435,92 @@ class Conv2dBatchNormQuant(Cell): group=1, eps=1e-5, momentum=0.997, - weight_init=None, - beta_init=None, - gamma_init=None, - mean_init=None, - var_init=None, - quant_delay=0, - freeze_bn=100000, + weight_init='normal', + beta_init='zeros', + gamma_init='ones', + mean_init='zeros', + var_init='ones', fake=True, - num_bits=8, per_channel=False, + num_bits=8, symmetric=False, - narrow_range=False): + narrow_range=False, + quant_delay=0, + freeze_bn=100000): """init Conv2dBatchNormQuant layer""" super(Conv2dBatchNormQuant, self).__init__() self.in_channels = in_channels self.out_channels = out_channels + self.kernel_size = twice(kernel_size) + self.stride = twice(stride) self.pad_mode = pad_mode self.padding = padding self.dilation = twice(dilation) - self.stride = twice(stride) self.group = group - self.fake = fake - self.freeze_bn = freeze_bn + self.eps = eps self.momentum = momentum self.quant_delay = quant_delay - if isinstance(kernel_size, int): - self.kernel_size = (kernel_size, kernel_size) - else: - self.kernel_size = kernel_size - if weight_init is None: - weight_init = initializer( - 'normal', [out_channels, in_channels // group, *self.kernel_size]) - self.weight = Parameter(weight_init, name='weight') - if gamma_init is None: - gamma_init = initializer('ones', [out_channels]) - self.gamma = Parameter(gamma_init, name='gamma') - if beta_init is None: - beta_init = initializer('zeros', [out_channels]) - self.beta = Parameter(beta_init, name='beta') - if mean_init is None: - mean_init = initializer('zeros', [out_channels]) - self.moving_mean = Parameter( - mean_init, name='moving_mean', requires_grad=False) - if var_init is None: - var_init = initializer('ones', [out_channels]) - self.moving_variance = 
Parameter( - var_init, name='moving_variance', requires_grad=False) - - self.step = Parameter(initializer( - 'normal', [1], dtype=mstype.int32), name='step', requires_grad=False) + self.freeze_bn = freeze_bn + self.fake = fake + self.num_bits = num_bits + self.per_channel = per_channel + self.symmetric = symmetric + self.narrow_range = narrow_range + self.is_gpu = context.get_context('device_target') == "GPU" + # initialize convolution op and Parameter + if context.get_context('device_target') == "Ascend" and group > 1: + validator.check_integer('group', group, in_channels, Rel.EQ) + validator.check_integer('group', group, out_channels, Rel.EQ) + self.conv = P.DepthwiseConv2dNative(channel_multiplier=1, + kernel_size=self.kernel_size, + pad_mode=pad_mode, + pad=padding, + stride=self.stride, + dilation=self.dilation) + weight_shape = [1, in_channels, *self.kernel_size] + channel_axis = 1 + else: + self.conv = P.Conv2D(out_channel=out_channels, + kernel_size=self.kernel_size, + pad_mode=pad_mode, + pad=padding, + stride=self.stride, + dilation=self.dilation, + group=group) + weight_shape = [out_channels, in_channels // group, *self.kernel_size] + channel_axis = 0 + self.weight = Parameter(initializer(weight_init, weight_shape), name='weight') + + # initialize batchnorm Parameter + self.gamma = Parameter(initializer(gamma_init, [out_channels]), name='gamma') + self.beta = Parameter(initializer(beta_init, [out_channels]), name='beta') + self.moving_mean = Parameter(initializer(mean_init, [out_channels]), name='moving_mean', requires_grad=False) + self.moving_variance = Parameter(initializer(var_init, [out_channels]), name='moving_variance', + requires_grad=False) + + # initialize fake ops self.fake_quant_weight = FakeQuantWithMinMax(min_init=-6, max_init=6, ema=False, - num_bits=num_bits, - quant_delay=quant_delay, per_channel=per_channel, - out_channels=out_channels, + channel_axis=channel_axis, + num_channels=out_channels, + num_bits=num_bits, symmetric=symmetric, - 
narrow_range=narrow_range) + narrow_range=narrow_range, + quant_delay=quant_delay) self.batchnorm_fold = BatchNormFoldCell(epsilon=eps, momentum=momentum, freeze_bn=freeze_bn) - self.conv = P.Conv2D(out_channel=out_channels, - kernel_size=kernel_size, - mode=1, - pad_mode=pad_mode, - pad=padding, - stride=stride, - dilation=1, - group=group) - self.correct_mul = P.CorrectionMul() + self.correct_mul = Q.CorrectionMul(channel_axis) if context.get_context('device_target') == "Ascend": - self.batchnorm_fold2_train = P.BatchNormFold2_D(freeze_bn=freeze_bn) - self.batchnorm_fold2_infer = P.BatchNormFold2_D(freeze_bn=0) + self.batchnorm_fold2_train = Q.BatchNormFold2_D(freeze_bn=freeze_bn) + self.batchnorm_fold2_infer = Q.BatchNormFold2_D(freeze_bn=0) elif context.get_context('device_target') == "GPU": - self.batchnorm_fold2_train = P.BatchNormFold2(freeze_bn=freeze_bn) - self.batchnorm_fold2_infer = P.BatchNormFold2(freeze_bn=0) + self.batchnorm_fold2_train = Q.BatchNormFold2(freeze_bn=freeze_bn) + self.batchnorm_fold2_infer = Q.BatchNormFold2(freeze_bn=0) else: - raise ValueError("Not support platform.") + raise ValueError("Unsupported platform: {}".format(context.get_context('device_target'))) + self.step = Parameter(initializer('normal', [1], dtype=mstype.int32), name='step', requires_grad=False) self.one = Tensor(1, mstype.int32) self.assignadd = P.AssignAdd() @@ -693,7 +559,7 @@ class Conv2dBatchNormQuant(Cell): out = self.batchnorm_fold2_train(out, self.beta, self.gamma, batch_std, batch_mean, running_std) F.control_depend(out, self.assignadd(self.step, self.one)) else: - out = self.batchnorm_fold2_infer(out, self.beta, self.gamma, batch_std, batch_mean, running_std) + out = self.batchnorm_fold2_infer(out, self.beta, self.gamma, running_std, running_mean, running_std) return out @@ -715,13 +581,13 @@ class Conv2dQuant(Cell): divisible by the number of groups. Default: 1. has_bias (bool): Specifies whether the layer uses a bias vector. Default: False. 
weight_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the convolution kernel. - Default: None. - bias_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the bias vector. Default: None. - quant_delay (int): Quantization delay parameters according by global step. Default: 0. - num_bits (int): Quantization number bit, support 4 and 8bit. Default: 8. + Default: 'normal'. + bias_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the bias vector. Default: 'zeros'. per_channel (bool): FakeQuantWithMinMax Parameters. Default: False. + num_bits (int): Quantization number bit, support 4 and 8bit. Default: 8. symmetric (bool): Quantization algorithm use symmetric or not. Default: False. narrow_range (bool): Quantization algorithm use narrow range or not. Default: False. + quant_delay (int): Quantization delay parameters according by global step. Default: 0. Inputs: - **x** (Tensor) - Tensor of shape :math:`(N, C_{in}, H_{in}, W_{in})`. 
@@ -746,13 +612,13 @@ class Conv2dQuant(Cell): dilation=1, group=1, has_bias=False, - weight_init=None, - bias_init=None, - quant_delay=0, - num_bits=8, + weight_init='normal', + bias_init='zeros', per_channel=False, + num_bits=8, symmetric=False, - narrow_range=False): + narrow_range=False, + quant_delay=0): super(Conv2dQuant, self).__init__() if isinstance(kernel_size, int): self.kernel_size = (kernel_size, kernel_size) @@ -768,15 +634,14 @@ class Conv2dQuant(Cell): self.group = group self.quant_delay = quant_delay - if weight_init is None: - weight_init = initializer( - 'normal', [out_channels, in_channels // group, *self.kernel_size]) - self.weight = Parameter(weight_init, name='weight') - if bias_init is None: - bias_init = initializer('zeros', [out_channels]) - if has_bias: - self.bias = Parameter(bias_init, name='bias') - self.bias_add = P.BiasAdd() + weight_shape = [out_channels, in_channels // group, *self.kernel_size] + self.weight = Parameter(initializer(weight_init, weight_shape), name='weight') + + self.bias_add = P.BiasAdd() + if check_bool(has_bias): + self.bias = Parameter(initializer(bias_init, [out_channels]), name='bias') + else: + self.bias = None self.conv = P.Conv2D(out_channel=self.out_channels, kernel_size=self.kernel_size, @@ -789,12 +654,13 @@ class Conv2dQuant(Cell): self.fake_quant_weight = FakeQuantWithMinMax(min_init=-6, max_init=6, ema=False, - num_bits=num_bits, - quant_delay=quant_delay, per_channel=per_channel, - out_channels=out_channels, + channel_axis=0, + num_channels=out_channels, + num_bits=num_bits, symmetric=symmetric, - narrow_range=narrow_range) + narrow_range=narrow_range, + quant_delay=quant_delay) def construct(self, x): weight = self.fake_quant_weight(self.weight) @@ -828,11 +694,11 @@ class DenseQuant(Cell): same as input x. The values of str refer to the function `initializer`. Default: 'zeros'. has_bias (bool): Specifies whether the layer uses a bias vector. Default: True. 
activation (str): Regularizer function applied to the output of the layer, eg. 'relu'. Default: None. - num_bits (int): Quantization number bit, support 4 and 8bit. Default: 8. - quant_delay (int): Quantization delay parameters according by global step. Default: 0. per_channel (bool): FakeQuantWithMinMax Parameters. Default: False. + num_bits (int): Quantization number bit, support 4 and 8bit. Default: 8. symmetric (bool): Quantization algorithm use symmetric or not. Default: False. narrow_range (bool): Quantization algorithm use narrow range or not. Default: False. + quant_delay (int): Quantization delay parameters according by global step. Default: 0. Inputs: - **x** (Tensor) - Tensor of shape :math:`(N, C_{in}, H_{in}, W_{in})`. @@ -854,11 +720,11 @@ class DenseQuant(Cell): bias_init='zeros', has_bias=True, activation=None, - num_bits=8, - quant_delay=0, per_channel=False, + num_bits=8, symmetric=False, - narrow_range=False): + narrow_range=False, + quant_delay=0): super(DenseQuant, self).__init__() self.in_channels = check_int_positive(in_channels) self.out_channels = check_int_positive(out_channels) @@ -888,12 +754,13 @@ class DenseQuant(Cell): self.fake_quant_weight = FakeQuantWithMinMax(min_init=-6, max_init=6, ema=False, - num_bits=num_bits, - quant_delay=quant_delay, per_channel=per_channel, - out_channels=out_channels, + channel_axis=0, + num_channels=out_channels, + num_bits=num_bits, symmetric=symmetric, - narrow_range=narrow_range) + narrow_range=narrow_range, + quant_delay=quant_delay) def construct(self, x): """Use operators to construct to Dense layer.""" @@ -917,17 +784,28 @@ class DenseQuant(Cell): return str_info -class ReLUQuant(Cell): +class _QuantActivation(Cell): + r""" + Base class for Quant activation function. Add Fake Quant OP after activation OP. + """ + + def get_origin(self): + raise NotImplementedError + + +class ReLUQuant(_QuantActivation): r""" ReLUQuant activation function. Add Fake Quant OP after Relu OP. 
For a more Detailed overview of ReLU op. Args: + ema_decay (float): Exponential Moving Average algorithm parameter. Default: 0.999. + per_channel (bool): Quantization granularity based on layer or on channel. Default: False. num_bits (int): Quantization number bit, support 4 and 8bit. Default: 8. - quant_delay (int): Quantization delay parameters according by global step. Default: 0. symmetric (bool): Quantization algorithm use symmetric or not. Default: False. narrow_range (bool): Quantization algorithm use narrow range or not. Default: False. + quant_delay (int): Quantization delay parameters according by global step. Default: 0. Inputs: - **x** (Tensor) - The input of ReLUQuant. @@ -942,18 +820,22 @@ class ReLUQuant(Cell): """ def __init__(self, + ema_decay=0.999, + per_channel=False, num_bits=8, - quant_delay=0, symmetric=False, - narrow_range=False): + narrow_range=False, + quant_delay=0): super(ReLUQuant, self).__init__() self.fake_quant_act = FakeQuantWithMinMax(min_init=0, max_init=6, - num_bits=num_bits, - quant_delay=quant_delay, ema=True, + ema_decay=ema_decay, + per_channel=per_channel, + num_bits=num_bits, symmetric=symmetric, - narrow_range=narrow_range) + narrow_range=narrow_range, + quant_delay=quant_delay) self.relu = P.ReLU() def construct(self, x): @@ -961,8 +843,11 @@ class ReLUQuant(Cell): x = self.fake_quant_act(x) return x + def get_origin(self): + return self.relu -class ReLU6Quant(Cell): + +class ReLU6Quant(_QuantActivation): r""" ReLU6Quant activation function. @@ -971,10 +856,12 @@ class ReLU6Quant(Cell): For a more Detailed overview of ReLU6 op. Args: + ema_decay (float): Exponential Moving Average algorithm parameter. Default: 0.999. + per_channel (bool): Quantization granularity based on layer or on channel. Default: False. num_bits (int): Quantization number bit, support 4 and 8bit. Default: 8. - quant_delay (int): Quantization delay parameters according by global step. Default: 0. 
symmetric (bool): Quantization algorithm use symmetric or not. Default: False. narrow_range (bool): Quantization algorithm use narrow range or not. Default: False. + quant_delay (int): Quantization delay parameters according by global step. Default: 0. Inputs: - **x** (Tensor) - The input of ReLU6Quant. @@ -988,16 +875,23 @@ class ReLU6Quant(Cell): >>> result = relu6_quant(input_x) """ - def __init__(self, num_bits=8, quant_delay=0, symmetric=False, - narrow_range=False): + def __init__(self, + ema_decay=0.999, + per_channel=False, + num_bits=8, + symmetric=False, + narrow_range=False, + quant_delay=0): super(ReLU6Quant, self).__init__() self.fake_quant_act = FakeQuantWithMinMax(min_init=0, max_init=6, - num_bits=num_bits, - quant_delay=quant_delay, ema=True, + ema_decay=ema_decay, + per_channel=per_channel, + num_bits=num_bits, symmetric=symmetric, - narrow_range=narrow_range) + narrow_range=narrow_range, + quant_delay=quant_delay) self.relu6 = P.ReLU6() def construct(self, x): @@ -1005,18 +899,23 @@ class ReLU6Quant(Cell): x = self.fake_quant_act(x) return x + def get_origin(self): + return self.relu6 + -class HSwishQuant(Cell): +class HSwishQuant(_QuantActivation): r""" HSwishQuant activation function. Add Fake Quant OP after HSwish OP. For a more Detailed overview of HSwish op. Args: + ema_decay (float): Exponential Moving Average algorithm parameter. Default: 0.999. + per_channel (bool): Quantization granularity based on layer or on channel. Default: False. num_bits (int): Quantization number bit, support 4 and 8bit. Default: 8. - quant_delay (int): Quantization delay parameters according by global step. Default: 0. symmetric (bool): Quantization algorithm use symmetric or not. Default: False. narrow_range (bool): Quantization algorithm use narrow range or not. Default: False. + quant_delay (int): Quantization delay parameters according by global step. Default: 0. Inputs: - **x** (Tensor) - The input of HSwishQuant. 
@@ -1031,25 +930,31 @@ class HSwishQuant(Cell): """ def __init__(self, + ema_decay=0.999, + per_channel=False, num_bits=8, - quant_delay=0, symmetric=False, - narrow_range=False): + narrow_range=False, + quant_delay=0): super(HSwishQuant, self).__init__() self.fake_quant_act_before = FakeQuantWithMinMax(min_init=-6, max_init=6, - num_bits=num_bits, - quant_delay=quant_delay, ema=True, + ema_decay=ema_decay, + per_channel=per_channel, + num_bits=num_bits, symmetric=symmetric, - narrow_range=narrow_range) + narrow_range=narrow_range, + quant_delay=quant_delay) self.fake_quant_act_after = FakeQuantWithMinMax(min_init=-6, max_init=6, - num_bits=num_bits, - quant_delay=quant_delay, ema=True, + ema_decay=ema_decay, + per_channel=per_channel, + num_bits=num_bits, symmetric=symmetric, - narrow_range=narrow_range) + narrow_range=narrow_range, + quant_delay=quant_delay) self.act = P.HSwish() def construct(self, x): @@ -1058,18 +963,23 @@ class HSwishQuant(Cell): x = self.fake_quant_act_after(x) return x + def get_origin(self): + return self.act -class HSigmoidQuant(Cell): + +class HSigmoidQuant(_QuantActivation): r""" HSigmoidQuant activation function. Add Fake Quant OP before and after HSigmoid OP. For a more Detailed overview of HSigmoid op. Args: + ema_decay (float): Exponential Moving Average algorithm parameter. Default: 0.999. + per_channel (bool): Quantization granularity based on layer or on channel. Default: False. num_bits (int): Quantization number bit, support 4 and 8bit. Default: 8. - quant_delay (int): Quantization delay parameters according by global step. Default: 0. symmetric (bool): Quantization algorithm use symmetric or not. Default: False. narrow_range (bool): Quantization algorithm use narrow range or not. Default: False. + quant_delay (int): Quantization delay parameters according by global step. Default: 0. Inputs: - **x** (Tensor) - The input of HSigmoidQuant. 
@@ -1084,25 +994,31 @@ class HSigmoidQuant(Cell): """ def __init__(self, + ema_decay=0.999, + per_channel=False, num_bits=8, - quant_delay=0, symmetric=False, - narrow_range=False): + narrow_range=False, + quant_delay=0): super(HSigmoidQuant, self).__init__() self.fake_quant_act_before = FakeQuantWithMinMax(min_init=-6, max_init=6, - num_bits=num_bits, - quant_delay=quant_delay, ema=True, + ema_decay=ema_decay, + per_channel=per_channel, + num_bits=num_bits, symmetric=symmetric, - narrow_range=narrow_range) + narrow_range=narrow_range, + quant_delay=quant_delay) self.fake_quant_act_after = FakeQuantWithMinMax(min_init=-6, max_init=6, - num_bits=num_bits, - quant_delay=quant_delay, ema=True, + ema_decay=ema_decay, + per_channel=per_channel, + num_bits=num_bits, symmetric=symmetric, - narrow_range=narrow_range) + narrow_range=narrow_range, + quant_delay=quant_delay) self.act = P.HSigmoid() def construct(self, x): @@ -1111,6 +1027,9 @@ class HSigmoidQuant(Cell): x = self.fake_quant_act_after(x) return x + def get_origin(self): + return self.act + class TensorAddQuant(Cell): r""" @@ -1119,10 +1038,12 @@ class TensorAddQuant(Cell): For a more Detailed overview of TensorAdd op. Args: + ema_decay (float): Exponential Moving Average algorithm parameter. Default: 0.999. + per_channel (bool): Quantization granularity based on layer or on channel. Default: False. num_bits (int): Quantization number bit, support 4 and 8bit. Default: 8. - quant_delay (int): Quantization delay parameters according by global step. Default: 0. symmetric (bool): Quantization algorithm use symmetric or not. Default: False. narrow_range (bool): Quantization algorithm use narrow range or not. Default: False. + quant_delay (int): Quantization delay parameters according by global step. Default: 0. Inputs: - **x** (Tensor) - The input of TensorAddQuant. 
@@ -1138,18 +1059,22 @@ class TensorAddQuant(Cell): """ def __init__(self, + ema_decay=0.999, + per_channel=False, num_bits=8, - quant_delay=0, symmetric=False, - narrow_range=False): + narrow_range=False, + quant_delay=0): super(TensorAddQuant, self).__init__() self.fake_quant_act = FakeQuantWithMinMax(min_init=-6, max_init=6, - num_bits=num_bits, - quant_delay=quant_delay, ema=True, + ema_decay=ema_decay, + per_channel=per_channel, + num_bits=num_bits, symmetric=symmetric, - narrow_range=narrow_range) + narrow_range=narrow_range, + quant_delay=quant_delay) self.add = P.TensorAdd() def construct(self, x1, x2): @@ -1165,10 +1090,12 @@ class MulQuant(Cell): For a more Detailed overview of Mul op. Args: + ema_decay (float): Exponential Moving Average algorithm parameter. Default: 0.999. + per_channel (bool): Quantization granularity based on layer or on channel. Default: False. num_bits (int): Quantization number bit, support 4 and 8bit. Default: 8. - quant_delay (int): Quantization delay parameters according by global step. Default: 0. symmetric (bool): Quantization algorithm use symmetric or not. Default: False. narrow_range (bool): Quantization algorithm use narrow range or not. Default: False. + quant_delay (int): Quantization delay parameters according by global step. Default: 0. Inputs: - **x** (Tensor) - The input of MulQuant. 
@@ -1179,21 +1106,99 @@ class MulQuant(Cell): """ def __init__(self, + ema_decay=0.999, + per_channel=False, num_bits=8, - quant_delay=0, symmetric=False, - narrow_range=False): + narrow_range=False, + quant_delay=0): super(MulQuant, self).__init__() self.fake_quant_act = FakeQuantWithMinMax(min_init=-6, max_init=6, - num_bits=num_bits, - quant_delay=quant_delay, ema=True, + ema_decay=ema_decay, + per_channel=per_channel, + num_bits=num_bits, symmetric=symmetric, - narrow_range=narrow_range) + narrow_range=narrow_range, + quant_delay=quant_delay) self.mul = P.Mul() def construct(self, x1, x2): x = self.mul(x1, x2) x = self.fake_quant_act(x) return x + + +class QuantBlock(Cell): + r""" + A quant block of Conv/Dense, activation layer for Ascend deploy. + + Calculate Conv or Dense in Int8, with AscendQuant and AscendDeQuant. + + Notes: + This block is only for deploy, and not trainable. + + Args: + in_channels (int): The number of channels in the input space. + out_channels (int): The number of channels in the output space. + weight_init (Union[Tensor, str, Initializer, numbers.Number]): The trainable weight_init parameter. The dtype + is same as input x. The values of str refer to the function `initializer`. Default: 'normal'. + bias_init (Union[Tensor, str, Initializer, numbers.Number]): The trainable bias_init parameter. The dtype is + same as input x. The values of str refer to the function `initializer`. Default: 'zeros'. + has_bias (bool): Specifies whether the layer uses a bias vector. Default: True. + activation (str): Regularizer function applied to the output of the layer, eg. 'relu'. Default: None. + batchnorm (bool): Specifies to used batchnorm or not. Default: None. + activation (string): Specifies activation type. The optional values are as following: + 'softmax', 'logsoftmax', 'relu', 'relu6', 'tanh', 'gelu', 'sigmoid', + 'prelu', 'leakyrelu', 'hswish', 'hsigmoid'. Default: None. 
+ + Inputs: + - **input** (Tensor) - Tensor of shape :math:`(N, in\_channels)`. + + Outputs: + Tensor of shape :math:`(N, out\_channels)`. + + Examples: + >>> net = nn.Dense(3, 4) + >>> input = Tensor(np.random.randint(0, 255, [2, 3]), mindspore.float32) + >>> net(input) + """ + + def __init__(self, + core_op, + weight, + quant_op, + dequant_op, + dequant_scale, + bias=None, + activation=None): + super(QuantBlock, self).__init__() + self.core_op = core_op + self.weight = weight + self.quant = quant_op + self.dequant = dequant_op + self.dequant_scale = dequant_scale + self.bias = bias + self.has_bias = bias is None + self.activation = activation + self.has_act = activation is None + + def construct(self, x): + x = self.quant(x) + x = self.core_op(x, self.weight) + if self.has_bias: + output = self.bias_add(output, self.bias) + if self.has_act: + x = self.activation(x) + x = self.dequant(x, self.dequant_scale) + return x + + def extend_repr(self): + str_info = f'quant={self.quant}, core_op={type(self.core_op)}' + if self.has_bias: + str_info = str_info + f', bias={self.bias}' + if self.has_act: + str_info = str_info + f', activation={self.activation}' + str_info = str_info + f', dequant={self.dequant}' + return str_info diff --git a/mindspore/nn/loss/loss.py b/mindspore/nn/loss/loss.py index c7e38fd943..4639229c41 100644 --- a/mindspore/nn/loss/loss.py +++ b/mindspore/nn/loss/loss.py @@ -18,6 +18,7 @@ from mindspore.common.tensor import Tensor from mindspore.ops import operations as P from mindspore.ops import functional as F from mindspore.ops.primitive import constexpr +from mindspore.ops import _selected_ops from mindspore.nn.cell import Cell from mindspore._checkparam import Validator as validator from mindspore._checkparam import Rel @@ -44,7 +45,7 @@ class _Loss(Cell): if reduction == 'none': self.reduce = False - self.reduce_mean = P.ReduceMean() + self.reduce_mean = _selected_ops.ReduceMean() self.reduce_sum = P.ReduceSum() def get_axis(self, x): @@ -245,11 
+246,11 @@ class SoftmaxCrossEntropyWithLogits(_Loss): super(SoftmaxCrossEntropyWithLogits, self).__init__(reduction) self.is_grad = is_grad self.sparse = sparse - validator.check_integer("num_classes", num_classes, 1, Rel.GT, self.cls_name) - validator.check_number_range("smooth_factor", smooth_factor, 0, 1, Rel.INC_BOTH, self.cls_name) + validator.check_number_range( + "smooth_factor", smooth_factor, 0, 1, Rel.INC_BOTH, self.cls_name) self.smooth_factor = smooth_factor self.num_classes = num_classes - self.softmax_cross_entropy = P.SoftmaxCrossEntropyWithLogits() + self.softmax_cross_entropy = _selected_ops.SoftmaxCrossEntropyWithLogits() self.one_hot = P.OneHot() self.on_value = Tensor(1.0 - self.smooth_factor, mstype.float32) self.off_value = Tensor(1.0 * self.smooth_factor / (self.num_classes - 1), mstype.float32) @@ -393,7 +394,7 @@ class CosineEmbeddingLoss(_Loss): pos_value = 1.0 - cosine neg_value = self.maximum(cosine - self.margin, 0.0) - zeros = F.zeros_like_tensor(cosine) + zeros = F.zeros_like(cosine) pos_part = F.select(y == 1, pos_value, zeros) neg_part = F.select(y == -1, neg_value, zeros) output_unreduced = pos_part + neg_part diff --git a/mindspore/nn/optim/__init__.py b/mindspore/nn/optim/__init__.py index 8f21179893..f1dac586bc 100644 --- a/mindspore/nn/optim/__init__.py +++ b/mindspore/nn/optim/__init__.py @@ -26,6 +26,8 @@ from .sgd import SGD from .lars import LARS from .ftrl import FTRL from .rmsprop import RMSProp +from .proximal_ada_grad import ProximalAdagrad +from .lazyadam import LazyAdam -__all__ = ['Optimizer', 'Momentum', 'LARS', 'Adam', 'AdamWeightDecay', - 'AdamWeightDecayDynamicLR', 'Lamb', 'SGD', 'FTRL', 'RMSProp'] +__all__ = ['Optimizer', 'Momentum', 'LARS', 'Adam', 'AdamWeightDecay', 'LazyAdam', + 'AdamWeightDecayDynamicLR', 'Lamb', 'SGD', 'FTRL', 'RMSProp', 'ProximalAdagrad'] diff --git a/mindspore/nn/optim/adam.py b/mindspore/nn/optim/adam.py index 2138aed741..5a40d30d5a 100755 --- a/mindspore/nn/optim/adam.py +++ 
b/mindspore/nn/optim/adam.py @@ -26,12 +26,10 @@ from mindspore._checkparam import Validator as validator from mindspore._checkparam import Rel from .optimizer import Optimizer -_learning_rate_update_func = ['linear', 'cos', 'sin'] +_adam_opt = C.MultitypeFuncGraph("adam_opt") -adam_opt = C.MultitypeFuncGraph("adam_opt") - -@adam_opt.register("Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Bool") +@_adam_opt.register("Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Bool") def _update_run_op(beta1, beta2, eps, lr, weight_decay_tensor, param, m, v, gradient, decay_flag): """ Update parameters. @@ -67,16 +65,16 @@ def _update_run_op(beta1, beta2, eps, lr, weight_decay_tensor, param, m, v, grad next_v = op_mul(beta2, v_fp32) + op_mul(op_cast(F.tuple_to_array((1.0,)), mstype.float32) - beta2, op_square(gradient_fp32)) - update = next_m / (op_sqrt(next_v) + eps) + update = next_m / (eps + op_sqrt(next_v)) if decay_flag: - update = update + op_mul(weight_decay_tensor, param_fp32) + update = op_mul(weight_decay_tensor, param_fp32) + update update_with_lr = op_mul(lr, update) next_param = param_fp32 - op_reshape(update_with_lr, op_shape(param_fp32)) - next_v = F.depend(next_v, F.assign(param, next_param)) - next_v = F.depend(next_v, F.assign(m, next_m)) - next_v = F.depend(next_v, F.assign(v, next_v)) + next_v = F.depend(next_v, F.assign(param, op_cast(next_param, mstype.float16))) + next_v = F.depend(next_v, F.assign(m, op_cast(next_m, mstype.float16))) + next_v = F.depend(next_v, F.assign(v, op_cast(next_v, mstype.float16))) return next_v @@ -94,19 +92,30 @@ def _check_param_value(beta1, beta2, eps, weight_decay, prim_name): def _check_learning_rate_value(learning_rate, end_learning_rate, decay_steps, power, prim_name): """Check the type of inputs.""" - validator.check_float_positive('learning_rate', learning_rate, prim_name) - validator.check_float_legal_value('learning_rate', 
learning_rate, prim_name) - validator.check_float_positive('end_learning_rate', end_learning_rate, prim_name) - validator.check_float_legal_value('end_learning_rate', end_learning_rate, prim_name) + validator.check_value_type("learning_rate", learning_rate, [float], prim_name) + validator.check_number_range("learning_rate", learning_rate, 0.0, float("inf"), Rel.INC_LEFT, prim_name) + validator.check_value_type("end_learning_rate", end_learning_rate, [float], prim_name) + validator.check_number_range("end_learning_rate", end_learning_rate, 0.0, float("inf"), Rel.INC_LEFT, prim_name) validator.check_float_positive('power', power, prim_name) validator.check_float_legal_value('power', power, prim_name) validator.check_integer('decay_steps', decay_steps, 0, Rel.GT, prim_name) -@adam_opt.register("Function", "Tensor", "Tensor", "Tensor", "Tensor", "Number", "Tensor", "Tensor", "Tensor", "Tensor", - "Tensor") -def _run_opt_with_one_number(opt, beta1_power, beta2_power, beta1, beta2, eps, lr, gradient, params, moment1, - moment2): +@_adam_opt.register("Function", "Function", "Tensor", "Tensor", "Tensor", "Tensor", "Number", "Tensor", "Tuple", + "Tensor", "Tensor", "Tensor") +def _run_opt_with_sparse(opt, sparse_opt, beta1_power, beta2_power, beta1, beta2, eps, lr, gradient, params, + moment1, moment2): + """Apply sparse adam optimizer to the weight parameter when the gradient is sparse.""" + success = True + success = F.depend(success, sparse_opt(params, moment1, moment2, beta1_power, beta2_power, lr, beta1, beta2, + eps, gradient[1], gradient[0])) + return success + + +@_adam_opt.register("Function", "Function", "Tensor", "Tensor", "Tensor", "Tensor", "Number", "Tensor", "Tensor", + "Tensor", "Tensor", "Tensor") +def _run_opt_with_one_number(opt, sparse_opt, beta1_power, beta2_power, beta1, beta2, eps, lr, gradient, params, + moment1, moment2): """Apply adam optimizer to the weight parameter using Tensor.""" success = True success = F.depend(success, opt(params, moment1, 
moment2, beta1_power, beta2_power, lr, beta1, beta2, @@ -144,10 +153,16 @@ class Adam(Optimizer): value of weight_decay > 0. When not separating parameter groups, the `weight_decay` in the API will be applied on the parameters if `weight_decay` > 0 and the 'beta' and 'gamma' are not in the name of parameters. + To improve parameter groups performance, the customized order of parameters can be supported. + + The sparse strategy is applied while the SparseGatherV2 operator being used for forward network and the + `sparse_grad` of `Parameter` being set. The sparse feature is under continuous development. The sparse + behavior is currently performed on the CPU, weight decay is not supported. + Args: params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated, the element in `params` should be class `Parameter`. When the `params` is a list of `dict`, the "params", - "lr" and "weight_decay" are the keys can be parsed. + "lr", "weight_decay" and "order_params" are the keys can be parsed. - params: Required. The value should be a list of `Parameter`. @@ -157,13 +172,19 @@ class Adam(Optimizer): - weight_decay: Optional. If "weight_decay" in the keys, the value of corresponding weight decay will be used. If not, the `weight_decay` in the API will be used. - learning_rate (Union[float, Tensor, Iterable]): A value for the learning rate. When the learning_rate is - Iterable or a Tensor and the dims of the Tensor is 1, - use dynamic learning rate, then the i-th step will - take the i-th value as the learning rate. - When the learning_rate is float or learning_rate is a Tensor - but the dims of the Tensor is 0, use fixed learning rate. - Other cases are not supported. Default: 1e-3. + - order_params: Optional. If "order_params" in the keys, the value should be the order of parameters and + the order will be followed in optimizer. 
There are no other keys in the `dict` and the parameters which + in the value of 'order_params' but not in any group will use default learning rate and default weight + decay. + + learning_rate (Union[int, float, Tensor, Iterable]): A value for the learning rate. When the learning_rate is + Iterable or a Tensor and the dims of the Tensor is 1, + use dynamic learning rate, then the i-th step will + take the i-th value as the learning rate. + When the learning_rate is float or learning_rate is a + Tensor but the dims of the Tensor is 0, use fixed learning + rate. Other cases are not supported. It should be equal to + or greater than 0. Default: 1e-3. beta1 (float): The exponential decay rate for the 1st moment estimates. Should be in range (0.0, 1.0). Default: 0.9. beta2 (float): The exponential decay rate for the 2nd moment estimates. Should be in range (0.0, 1.0). Default: @@ -176,9 +197,8 @@ class Adam(Optimizer): use_nesterov (bool): Whether to use Nesterov Accelerated Gradient (NAG) algorithm to update the gradients. If True, updates the gradients using NAG. If False, updates the gradients without using NAG. Default: False. - weight_decay (float): Weight decay (L2 penalty). Default: 0.0. - loss_scale (float): A floating point value for the loss scale. Should be equal to or greater than 1. Default: - 1.0. + weight_decay (float): Weight decay (L2 penalty). It should be equal to or greater than 0. Default: 0.0. + loss_scale (float): A floating point value for the loss scale. Should be greater than 0. Default: 1.0. Inputs: - **gradients** (tuple[Tensor]) - The gradients of `params`, the shape is the same as `params`. 
@@ -193,13 +213,16 @@ class Adam(Optimizer): >>> >>> #2) Use parameter groups and set different values >>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params())) - >>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params())) - >>> group_params = [{'params': conv_params, 'weight_decay': 0.01, 'lr': 0.01}, - >>> {'params': no_conv_params}] + >>> bias_params = list(filter(lambda x: 'bias' in x.name, net.trainable_params())) + >>> group_params = [{'params': conv_params, 'weight_decay': 0.01}, + >>> {'params': bias_params, 'lr': 0.01}, + >>> {'order_params': net.trainable_params()}] >>> opt = nn.Adam(group_params, learning_rate=0.1, weight_decay=0.0) - >>> # the conv_params's parameters will use a learning rate of 0.01 and a weight decay of 0.01 - >>> # the no_cov_params's parameters don't set learning and weight decay. So they will use a - >>> # learning rate of 0.1 and a weight decay of 0.0. + >>> # The conv_params's parameters will use a learning rate of default value 0.1 and a weight decay of 0.01. + >>> # The bias_params's parameters will use a learning rate of 0.01 and a weight decay of default value 0.0. + >>> # The final parameters order in which the optimizer will be followed is the value of 'order_params'. + >>> # The parameters which in the value of 'order_params' but not in any group will use a learning rate + >>> # of default value 0.1 and a weight decay of default value 0.0. 
>>> >>> loss = nn.SoftmaxCrossEntropyWithLogits() >>> model = Model(net, loss_fn=loss, optimizer=optim) @@ -211,8 +234,6 @@ class Adam(Optimizer): _check_param_value(beta1, beta2, eps, weight_decay, self.cls_name) validator.check_value_type("use_locking", use_locking, [bool], self.cls_name) validator.check_value_type("use_nesterov", use_nesterov, [bool], self.cls_name) - validator.check_value_type("loss_scale", loss_scale, [float], self.cls_name) - validator.check_number_range("loss_scale", loss_scale, 1.0, float("inf"), Rel.INC_LEFT, self.cls_name) self.beta1 = Tensor(beta1, mstype.float32) self.beta2 = Tensor(beta2, mstype.float32) @@ -225,11 +246,7 @@ class Adam(Optimizer): self.hyper_map = C.HyperMap() self.opt = P.Adam(use_locking, use_nesterov) - - self.pow = P.Pow() - self.sqrt = P.Sqrt() - self.one = Tensor(np.array([1.0]).astype(np.float32)) - self.realdiv = P.RealDiv() + self.sparse_opt = P.SparseApplyAdam(use_locking, use_nesterov) def construct(self, gradients): params = self.parameters @@ -244,13 +261,13 @@ class Adam(Optimizer): beta2_power = self.beta2_power * self.beta2 self.beta2_power = beta2_power if self.is_group_lr: - success = self.hyper_map(F.partial(adam_opt, self.opt, beta1_power, beta2_power, self.beta1, - self.beta2, self.eps), - lr, gradients, params, moment1, moment2) + success = self.map_(F.partial(_adam_opt, self.opt, self.sparse_opt, beta1_power, beta2_power, + self.beta1, self.beta2, self.eps), + lr, gradients, params, moment1, moment2) else: - success = self.hyper_map(F.partial(adam_opt, self.opt, beta1_power, beta2_power, self.beta1, - self.beta2, self.eps, lr), - gradients, params, moment1, moment2) + success = self.map_(F.partial(_adam_opt, self.opt, self.sparse_opt, beta1_power, beta2_power, + self.beta1, self.beta2, self.eps, lr), + gradients, params, moment1, moment2) return success @@ -267,14 +284,15 @@ class AdamWeightDecay(Optimizer): take the i-th value as the learning rate. 
When the learning_rate is float or learning_rate is a Tensor but the dims of the Tensor is 0, use fixed learning rate. - Other cases are not supported. Default: 1e-3. + Other cases are not supported. It should be equal to or + greater than 0. Default: 1e-3. beta1 (float): The exponential decay rate for the 1st moment estimates. Default: 0.9. Should be in range (0.0, 1.0). beta2 (float): The exponential decay rate for the 2nd moment estimates. Default: 0.999. Should be in range (0.0, 1.0). eps (float): Term added to the denominator to improve numerical stability. Default: 1e-6. Should be greater than 0. - weight_decay (float): Weight decay (L2 penalty). Default: 0.0. + weight_decay (float): Weight decay (L2 penalty). It should be equal to or greater than 0. Default: 0.0. decay_filter (Function): A function to determine whether to apply weight decay on parameters. Default: lambda x: 'LayerNorm' not in x.name and 'bias' not in x.name. @@ -310,7 +328,7 @@ class AdamWeightDecay(Optimizer): def construct(self, gradients): lr = self.get_lr() - updated_velocity = self.hyper_map(F.partial(adam_opt, self.beta1, self.beta2, self.eps, lr, + updated_velocity = self.hyper_map(F.partial(_adam_opt, self.beta1, self.beta2, self.eps, lr, self.weight_decay_tensor), self.params, self.moments1, self.moments2, gradients, self.decay_flag) @@ -324,17 +342,20 @@ class AdamWeightDecayDynamicLR(Optimizer): Args: params (list[Parameter]): A list of parameter, which will be updated. The element in `params` should be class mindspore.Parameter. - decay_steps (int): The steps of the decay. - learning_rate (float): A floating point value for the learning rate. Default: 0.001. - end_learning_rate (float): A floating point value for the end learning rate. Default: 0.0001. - power (float): Power. Default: 10.0. + decay_steps (int): The steps of the decay. It must be int and positive. + warmup_steps (int): The steps of lr warm up. Default: 0. 
+ learning_rate (float): A floating point value for the learning rate. It should be equal to or + greater than 0. Default: 0.001. + end_learning_rate (float): A floating point value for the end learning rate. It should be equal + to or greater than 0. Default: 0.0001. + power (float): The Power of the polynomial. It must be positive. Default: 10.0. beta1 (float): The exponential decay rate for the 1st moment estimates. Default: 0.9. Should be in range (0.0, 1.0). beta2 (float): The exponential decay rate for the 2nd moment estimates. Default: 0.999. Should be in range (0.0, 1.0). eps (float): Term added to the denominator to improve numerical stability. Default: 1e-6. Should be greater than 0. - weight_decay (float): Weight decay (L2 penalty). Default: 0.0. + weight_decay (float): Weight decay (L2 penalty). It should be equal to or greater than 0. Default: 0.0. decay_filter (Function): A function to determine whether to apply weight decay on parameters. Default: lambda x: 'LayerNorm' not in x.name and 'bias' not in x.name. 
@@ -353,6 +374,7 @@ class AdamWeightDecayDynamicLR(Optimizer): def __init__(self, params, decay_steps, + warmup_steps=0, learning_rate=0.001, end_learning_rate=0.0001, power=10.0, @@ -360,13 +382,13 @@ class AdamWeightDecayDynamicLR(Optimizer): beta2=0.999, eps=1e-6, weight_decay=0.0, - decay_filter=lambda x: 'beta' not in x.name and 'gamma' not in x.name, - warmup_steps=0): - super(AdamWeightDecayDynamicLR, self).__init__(learning_rate, params) + decay_filter=lambda x: 'beta' not in x.name and 'gamma' not in x.name): + super(AdamWeightDecayDynamicLR, self).__init__(0.0, params) if self.is_group: raise RuntimeError(f"The {self.cls_name} optimizer cannot support group setting.") _check_param_value(beta1, beta2, eps, weight_decay, self.cls_name) _check_learning_rate_value(learning_rate, end_learning_rate, decay_steps, power, self.cls_name) + validator.check_integer('warmup_steps', warmup_steps, 0, Rel.GE, self.cls_name) # turn them to scalar when me support scalar/tensor mix operations self.global_step = Parameter(initializer(0, [1]), name="global_step") self.warmup_steps = Tensor(np.array([warmup_steps]).astype(np.float32)) @@ -402,7 +424,7 @@ class AdamWeightDecayDynamicLR(Optimizer): warmup_lr = self.start_learning_rate * warmup_percent is_warmup = self.cast(self.greater(self.warmup_steps, self.global_step), mstype.float32) lr = (self.one - is_warmup) * lr + is_warmup * warmup_lr - updated_velocity = self.hyper_map(F.partial(adam_opt, self.beta1, self.beta2, self.eps, lr, + updated_velocity = self.hyper_map(F.partial(_adam_opt, self.beta1, self.beta2, self.eps, lr, self.weight_decay_tensor), self.params, self.moments1, self.moments2, gradients, self.decay_flag) diff --git a/mindspore/nn/optim/ftrl.py b/mindspore/nn/optim/ftrl.py index 33edafa4e2..a40d6737cb 100644 --- a/mindspore/nn/optim/ftrl.py +++ b/mindspore/nn/optim/ftrl.py @@ -18,28 +18,34 @@ from mindspore.common import Tensor import mindspore.common.dtype as mstype from mindspore._checkparam import 
Validator as validator from mindspore._checkparam import Rel -from .optimizer import Optimizer, apply_decay, grad_scale +from .optimizer import Optimizer, _apply_decay, _grad_scale -ftrl_opt = C.MultitypeFuncGraph("ftrl_opt") +_ftrl_opt = C.MultitypeFuncGraph("ftrl_opt") -@ftrl_opt.register("Function", "Tensor", "Number", "Number", "Number", "Tensor", "Tensor", "Tensor", "Tensor") -def _tensor_run_opt(opt, learning_rate, l1, l2, lr_power, linear, gradient, weight, moment): +@_ftrl_opt.register("Function", "Function", "Tensor", "Number", "Number", "Number", "Tensor", "Tuple", "Tensor", + "Tensor") +def _tensor_run_opt_with_sparse(opt, spars_opt, learning_rate, l1, l2, lr_power, linear, gradient, weight, moment): + """Apply sparse ftrl optimizer to the weight parameter when the gradient is sparse.""" + success = True + success = F.depend(success, spars_opt(weight, moment, linear, gradient[1], gradient[0])) + return success + + +@_ftrl_opt.register("Function", "Function", "Tensor", "Number", "Number", "Number", "Tensor", "Tensor", "Tensor", + "Tensor") +def _tensor_run_opt(opt, spars_opt, learning_rate, l1, l2, lr_power, linear, gradient, weight, moment): """Apply ftrl optimizer to the weight parameter.""" success = True success = F.depend(success, opt(weight, moment, linear, gradient, learning_rate, l1, l2, lr_power)) return success -def _check_param(initial_accum, learning_rate, lr_power, l1, l2, use_locking, loss_scale=1.0, weight_decay=0.0, - prim_name=None): +def _check_param(initial_accum, lr_power, l1, l2, use_locking, weight_decay=0.0, prim_name=None): """Check param.""" validator.check_value_type("initial_accum", initial_accum, [float], prim_name) validator.check_number("initial_accum", initial_accum, 0.0, Rel.GE, prim_name) - validator.check_value_type("learning_rate", learning_rate, [float], prim_name) - validator.check_number("learning_rate", learning_rate, 0.0, Rel.GT, prim_name) - validator.check_value_type("lr_power", lr_power, [float], prim_name) 
validator.check_number("lr_power", lr_power, 0.0, Rel.LE, prim_name) @@ -51,9 +57,6 @@ def _check_param(initial_accum, learning_rate, lr_power, l1, l2, use_locking, lo validator.check_value_type("use_locking", use_locking, [bool], prim_name) - validator.check_value_type("loss_scale", loss_scale, [float], prim_name) - validator.check_number("loss_scale", loss_scale, 1.0, Rel.GE, prim_name) - validator.check_value_type("weight_decay", weight_decay, [float], prim_name) validator.check_number("weight_decay", weight_decay, 0.0, Rel.GE, prim_name) @@ -67,6 +70,11 @@ class FTRL(Optimizer): `_. Refer to paper `Ad Click Prediction: a View from the Trenches `_ for engineering document. + Note: + The sparse strategy is applied while the SparseGatherV2 operator being used for forward network and the + `sparse_grad` of `Parameter` being set. The sparse feature is under continuous development. The sparse + behavior is currently performed on the CPU, weight decay is not supported. + Args: params (list[Parameter]): A list of parameter, which will be updated. The element in `params` should be Parameter. 
@@ -95,32 +103,30 @@ class FTRL(Optimizer): """ def __init__(self, params, initial_accum=0.1, learning_rate=0.001, lr_power=-0.5, l1=0.0, l2=0.0, use_locking=False, loss_scale=1.0, weight_decay=0.0): - super(FTRL, self).__init__(learning_rate, params) + super(FTRL, self).__init__(learning_rate, params, loss_scale=loss_scale) if self.is_group: raise RuntimeError(f"The {self.cls_name} optimizer cannot support group setting.") - _check_param(initial_accum, learning_rate, lr_power, l1, l2, use_locking, loss_scale, weight_decay, - self.cls_name) + _check_param(initial_accum, lr_power, l1, l2, use_locking, weight_decay, self.cls_name) self.moments = self.parameters.clone(prefix="moments", init=initial_accum) self.linear = self.parameters.clone(prefix="linear", init='zeros') self.l1 = l1 self.l2 = l2 self.lr_power = lr_power - self.reciprocal_scale = 1.0 / loss_scale self.weight_decay = weight_decay self.decay_tf = tuple((lambda: True)() for x in self.parameters) self.hyper_map = C.HyperMap() self.opt = P.ApplyFtrl(use_locking=use_locking) - self.one = Tensor(1, mstype.int32) + self.sparse_opt = P.SparseApplyFtrl(learning_rate, l1, l2, lr_power, use_locking=use_locking) def construct(self, grads): params = self.parameters moments = self.moments linear = self.linear - if self.weight_decay > 0.0: - grads = self.hyper_map(F.partial(apply_decay, self.weight_decay), self.decay_tf, params, grads) - if self.reciprocal_scale != 1.0: - grads = self.hyper_map(F.partial(grad_scale, self.reciprocal_scale), grads) lr = self.learning_rate - success = self.hyper_map(F.partial(ftrl_opt, self.opt, lr, self.l1, self.l2, self.lr_power), - linear, grads, params, moments) + if self.weight_decay > 0.0: + grads = self.hyper_map(F.partial(_apply_decay, self.weight_decay), self.decay_tf, params, grads) + + grads = self.scale_grad(grads) + success = self.map_(F.partial(_ftrl_opt, self.opt, self.sparse_opt, lr, self.l1, self.l2, self.lr_power), + linear, grads, params, moments) return success diff 
--git a/mindspore/nn/optim/lamb.py b/mindspore/nn/optim/lamb.py index f189f1cd02..832b35d66f 100755 --- a/mindspore/nn/optim/lamb.py +++ b/mindspore/nn/optim/lamb.py @@ -14,6 +14,7 @@ # ============================================================================ """lamb""" import numpy as np +from mindspore import context from mindspore.common import dtype as mstype from mindspore.common.initializer import initializer from mindspore.ops import operations as P @@ -25,13 +26,15 @@ from mindspore._checkparam import Validator as validator from mindspore._checkparam import Rel from .optimizer import Optimizer from .. import layer +from .. import graph_kernels as G num_one = Tensor(np.ones([1]), mstype.float32) -lamb_opt = C.MultitypeFuncGraph("lamb_opt") +_lamb_opt = C.MultitypeFuncGraph("lamb_opt") -@lamb_opt.register("Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", - "Tensor", "Bool") + +@_lamb_opt.register("Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", + "Tensor", "Tensor", "Tensor", "Tensor", "Bool") def _update_run_op(beta1, beta2, eps, lr, weight_decay_tensor, global_step, param, m, v, gradient, decay_flag): """ @@ -72,9 +75,11 @@ def _update_run_op(beta1, beta2, eps, lr, weight_decay_tensor, global_step, para v_fp32 = op_cast(v, mstype.float32) gradient_fp32 = op_cast(gradient, mstype.float32) - next_m = op_mul(beta1, m_fp32) + op_mul(op_cast(num_one, mstype.float32) - beta1, gradient_fp32) + next_m = op_mul(beta1, m_fp32) + op_mul(op_cast(num_one, + mstype.float32) - beta1, gradient_fp32) - next_v = op_mul(beta2, v_fp32) + op_mul(op_cast(num_one, mstype.float32) - beta2, op_square(gradient_fp32)) + next_v = op_mul(beta2, v_fp32) + op_mul(op_cast(num_one, + mstype.float32) - beta2, op_square(gradient_fp32)) next_mm = next_m / (op_cast(num_one, mstype.float32) - op_pow(beta1, op_cast(global_step + num_one, mstype.float32))) @@ -83,8 +88,9 @@ def _update_run_op(beta1, beta2, eps, lr, weight_decay_tensor, 
global_step, para w_norm = op_norm(param_fp32) g_norm = op_norm(gradient_fp32) - g_norm_hat = op_norm(op_mul(next_mm, op_rsqrt(next_vv + eps)) + weight_decay_tensor * param_fp32) - zeros = F.zeros_like_tensor(w_norm) + g_norm_hat = op_norm(op_mul(next_mm, op_rsqrt( + next_vv + eps)) + weight_decay_tensor * param_fp32) + zeros = F.zeros_like(w_norm) ones = op_fill(op_dtype(w_norm), op_shape(w_norm), 1.0) trust_ratio = op_select( op_greater(w_norm, zeros), @@ -108,13 +114,79 @@ def _update_run_op(beta1, beta2, eps, lr, weight_decay_tensor, global_step, para return next_v +lamb_opt_graph_kernel = C.MultitypeFuncGraph("lamb_opt_graph_kernel") + + +@lamb_opt_graph_kernel.register("Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", + "Tensor", "Tensor", "Tensor", "Tensor", "Bool") +def _update_run_op_graph_kernel(beta1, beta2, eps, lr, weight_decay_tensor, + global_step, param, m, v, gradient, decay_flag): + """ + Update parameters. + + Args: + beta1 (Tensor): The exponential decay rate for the 1st moment estimates. Should be in range (0.0, 1.0). + beta2 (Tensor): The exponential decay rate for the 2nd moment estimates. Should be in range (0.0, 1.0). + eps (Tensor): Term added to the denominator to improve numerical stability. Should be greater than 0. + lr (Tensor): Learning rate. + weight_decay_tensor (Tensor): Weight decay. Should be equal to or greater than 0. + global_step (Tensor): Global step. + param (Tensor): Parameters. + m (Tensor): m value of parameters. + v (Tensor): v value of parameters. + gradient (Tensor): Gradient of parameters. + decay_flag (bool): Specifies whether param update with weight decay. + + Returns: + Tensor, the new value of v after updating. 
+ """ + op_mul = P.Mul() + op_square = P.Square() + op_cast = P.Cast() + op_shape = P.Shape() + op_pow = P.Pow() + op_norm = layer.Norm() + op_fill = P.Fill() + op_dtype = P.DType() + + param_fp32 = op_cast(param, mstype.float32) + gradient_fp32 = op_cast(gradient, mstype.float32) + + i6_ex = op_cast(global_step + num_one, mstype.float32) + i9 = op_cast(num_one, mstype.float32) - beta1 + x1 = op_cast(num_one, mstype.float32) - beta2 + i6 = op_cast(num_one, mstype.float32) - op_pow(beta1, i6_ex) + i3 = op_cast(num_one, mstype.float32) - op_pow(beta2, i6_ex) + i1 = op_square(gradient_fp32) + add3, update = G.LambNextMV()(i1, v, i3, gradient, m, i6, param, beta1, + i9, beta2, x1, weight_decay_tensor, eps) + + if decay_flag: + update = update + op_mul(weight_decay_tensor, param_fp32) + + w_norm = op_norm(param_fp32) + g_norm = op_norm(gradient_fp32) + g_norm_hat = op_norm(add3) + + zeros = F.zeros_like(w_norm) + ones = op_fill(op_dtype(w_norm), op_shape(w_norm), 1.0) + tens = op_fill(op_dtype(w_norm), op_shape(w_norm), 10.0) + + next_param = G.LambUpdateWithLR()(g_norm, w_norm, g_norm_hat, lr, update, + param, zeros, ones, tens) + next_v = F.control_depend(add3, next_param) + return next_v + + def _check_param_value(decay_steps, warmup_steps, start_learning_rate, end_learning_rate, power, beta1, beta2, eps, weight_decay, prim_name): """Check the type of inputs.""" - validator.check_float_positive('start_learning_rate', start_learning_rate, prim_name) - validator.check_float_legal_value('start_learning_rate', start_learning_rate, prim_name) + validator.check_value_type("start_learning_rate", start_learning_rate, [float], prim_name) + validator.check_number_range("start_learning_rate rate", start_learning_rate, 0.0, float("inf"), Rel.INC_LEFT, + prim_name) validator.check_value_type("end_learning_rate", end_learning_rate, [float], prim_name) - validator.check_float_legal_value('end_learning_rate', end_learning_rate, prim_name) + 
validator.check_number_range("end_learning_rate", end_learning_rate, 0.0, float("inf"), Rel.INC_LEFT, + prim_name) validator.check_float_positive('power', power, prim_name) validator.check_float_legal_value('power', power, prim_name) validator.check_integer('decay_steps', decay_steps, 0, Rel.GT, prim_name) @@ -122,11 +194,16 @@ def _check_param_value(decay_steps, warmup_steps, start_learning_rate, validator.check_value_type("beta1", beta1, [float], prim_name) validator.check_value_type("beta2", beta2, [float], prim_name) validator.check_value_type("eps", eps, [float], prim_name) - validator.check_value_type("weight_dacay", weight_decay, [float], prim_name) - validator.check_number_range("beta1", beta1, 0.0, 1.0, Rel.INC_NEITHER, prim_name) - validator.check_number_range("beta2", beta2, 0.0, 1.0, Rel.INC_NEITHER, prim_name) - validator.check_number_range("eps", eps, 0.0, float("inf"), Rel.INC_NEITHER, prim_name) - validator.check_number_range("weight_decay", weight_decay, 0.0, float("inf"), Rel.INC_LEFT, prim_name) + validator.check_value_type( + "weight_dacay", weight_decay, [float], prim_name) + validator.check_number_range( + "beta1", beta1, 0.0, 1.0, Rel.INC_NEITHER, prim_name) + validator.check_number_range( + "beta2", beta2, 0.0, 1.0, Rel.INC_NEITHER, prim_name) + validator.check_number_range( + "eps", eps, 0.0, float("inf"), Rel.INC_NEITHER, prim_name) + validator.check_number_range( + "weight_decay", weight_decay, 0.0, float("inf"), Rel.INC_LEFT, prim_name) class Lamb(Optimizer): @@ -141,10 +218,12 @@ class Lamb(Optimizer): params (list[Parameter]): A list of parameter, which will be updated. The element in `params` should be class mindspore.Parameter. decay_steps (int): The steps of the lr decay. Should be equal to or greater than 1. - warmup_steps (int): The steps of lr warm up. Default: 0. - start_learning_rate (float): A floating point value for the learning rate. Default: 0.1. 
- end_learning_rate (float): A floating point value for the end learning rate. Default: 0.0001. - power (float): The power of the polynomial. Default: 1.0. + warmup_steps (int): The steps of lr warm up. Should be equal to or greater than 0. Default: 0. + start_learning_rate (float): A floating point value for the learning rate. Should be equal to + or greater than 0. Default: 0.1. + end_learning_rate (float): A floating point value for the end learning rate. Should be equal to + or greater than 0. Default: 0.0001. + power (float): The power of the polynomial. It must be positive. Default: 1.0. beta1 (float): The exponential decay rate for the 1st moment estimates. Default: 0.9. Should be in range (0.0, 1.0). beta2 (float): The exponential decay rate for the 2nd moment estimates. Default: 0.999. @@ -180,10 +259,10 @@ class Lamb(Optimizer): eps=1e-6, weight_decay=0.0, decay_filter=lambda x: 'layernorm' not in x.name.lower() and 'bias' not in x.name.lower()): - - super(Lamb, self).__init__(start_learning_rate, params) + super(Lamb, self).__init__(0.0, params) if self.is_group: - raise RuntimeError(f"The {self.cls_name} optimizer cannot support group setting.") + raise RuntimeError( + f"The {self.cls_name} optimizer cannot support group setting.") _check_param_value(decay_steps, warmup_steps, start_learning_rate, end_learning_rate, power, beta1, beta2, eps, weight_decay, self.cls_name) @@ -195,14 +274,18 @@ class Lamb(Optimizer): if warmup_steps > 0: self.warmup_flag = True self.decay_steps = Tensor(np.array([decay_steps]).astype(np.float32)) - self.start_learning_rate = Tensor(np.array([start_learning_rate]).astype(np.float32)) - self.end_learning_rate = Tensor(np.array([end_learning_rate]).astype(np.float32)) - self.diff_learning_rate = Tensor(np.array([start_learning_rate - end_learning_rate]).astype(np.float32)) + self.start_learning_rate = Tensor( + np.array([start_learning_rate]).astype(np.float32)) + self.end_learning_rate = Tensor( + 
np.array([end_learning_rate]).astype(np.float32)) + self.diff_learning_rate = Tensor( + np.array([start_learning_rate - end_learning_rate]).astype(np.float32)) self.power = power self.beta1 = Tensor(np.array([beta1]).astype(np.float32)) self.beta2 = Tensor(np.array([beta2]).astype(np.float32)) self.eps = Tensor(np.array([eps]).astype(np.float32)) - self.weight_decay_tensor = Tensor(np.array([weight_decay]).astype(np.float32)) + self.weight_decay_tensor = Tensor( + np.array([weight_decay]).astype(np.float32)) self.params = self.parameters self.moments1 = self.params.clone(prefix="lamb_m", init='zeros') self.moments2 = self.params.clone(prefix="lamb_v", init='zeros') @@ -214,19 +297,29 @@ class Lamb(Optimizer): self.greater = P.Greater() self.one = Tensor(np.array([1.0]).astype(np.float32)) self.cast = P.Cast() + self.enable_graph_kernel = context.get_context("enable_graph_kernel") def construct(self, gradients): step = self.min(self.global_step, self.decay_steps) p = step / self.decay_steps - lr = self.diff_learning_rate * self.pow(self.one - p, self.power) + self.end_learning_rate + lr = self.diff_learning_rate * \ + self.pow(self.one - p, self.power) + self.end_learning_rate if self.warmup_flag: warmup_percent = self.global_step / self.warmup_steps warmup_lr = self.start_learning_rate * warmup_percent - is_warmup = self.cast(self.greater(self.warmup_steps, self.global_step), mstype.float32) + is_warmup = self.cast(self.greater( + self.warmup_steps, self.global_step), mstype.float32) lr = (self.one - is_warmup) * lr + is_warmup * warmup_lr - updated_velocity = self.hyper_map(F.partial(lamb_opt, self.beta1, self.beta2, self.eps, lr, - self.weight_decay_tensor, self.global_step), - self.params, self.moments1, self.moments2, gradients, self.decay_flag) + if self.enable_graph_kernel: + updated_velocity = self.hyper_map(F.partial(lamb_opt_graph_kernel, + self.beta1, self.beta2, self.eps, lr, + self.weight_decay_tensor, self.global_step), + self.params, self.moments1, 
self.moments2, gradients, self.decay_flag) + else: + updated_velocity = self.hyper_map(F.partial(_lamb_opt, + self.beta1, self.beta2, self.eps, lr, + self.weight_decay_tensor, self.global_step), + self.params, self.moments1, self.moments2, gradients, self.decay_flag) added_global_step = self.global_step + self.one F.control_depend(lr, added_global_step) diff --git a/mindspore/nn/optim/lars.py b/mindspore/nn/optim/lars.py index 3d85a05867..b55d1c5574 100755 --- a/mindspore/nn/optim/lars.py +++ b/mindspore/nn/optim/lars.py @@ -22,12 +22,12 @@ from mindspore.ops import operations as P from mindspore.ops import composite as C from mindspore.ops import functional as F from mindspore._checkparam import Validator as validator -from .optimizer import grad_scale, Optimizer +from .optimizer import _grad_scale, Optimizer -lars_opt = C.MultitypeFuncGraph("lars_opt") +_lars_opt = C.MultitypeFuncGraph("lars_opt") -@lars_opt.register("Function", "Number", "Tensor", "Tensor", "Tensor", "Bool", "Bool") +@_lars_opt.register("Function", "Number", "Tensor", "Tensor", "Tensor", "Bool", "Bool") def _tensor_run_opt(lars, weight_decay, learning_rate, gradient, weight, decay_flag, lars_flag): """Apply lars optimizer to the weight parameter.""" if lars_flag: @@ -59,13 +59,13 @@ class LARS(Optimizer): optimizer (Optimizer): MindSpore optimizer for which to wrap and modify gradients. epsilon (float): Term added to the denominator to improve numerical stability. Default: 1e-05. hyperpara (float): Trust coefficient for calculating the local learning rate. Default: 0.001. - weight_decay (float): Weight decay (L2 penalty). Default: 0.0. + weight_decay (float): Weight decay (L2 penalty). It should be equal to or greater than 0. Default: 0.0. use_clip (bool): Whether to use clip operation for calculating the local learning rate. Default: False. decay_filter (Function): A function to determine whether apply weight decay on parameters. 
Default: lambda x: 'LayerNorm' not in x.name and 'bias' not in x.name. lars_filter (Function): A function to determine whether apply lars algorithm. Default: lambda x: 'LayerNorm' not in x.name and 'bias' not in x.name. - loss_scale (float): A floating point value for the loss scale. Default: 1.0. + loss_scale (float): A floating point value for the loss scale. It should be greater than 0. Default: 1.0. Inputs: - **gradients** (tuple[Tensor]) - The gradients of `params` in optimizer, the shape is @@ -94,7 +94,7 @@ class LARS(Optimizer): self.learning_rate = optimizer.learning_rate self.lars = P.LARSUpdate(epsilon, hyperpara, use_clip) self.reciprocal_scale = 1.0 / loss_scale - self.weight_decay = weight_decay * loss_scale + self.weight_decay = weight_decay self.cast = P.Cast() self.decay_flag = tuple(decay_filter(x) for x in self.parameters) self.lars_flag = tuple(lars_filter(x) for x in self.parameters) @@ -119,9 +119,9 @@ class LARS(Optimizer): else: lr = self.learning_rate if self.reciprocal_scale != 1.0: - gradients = self.hyper_map(F.partial(grad_scale, self.reciprocal_scale), gradients) + gradients = self.hyper_map(F.partial(_grad_scale, self.reciprocal_scale), gradients) - grad_t = self.hyper_map(F.partial(lars_opt, self.lars, self.weight_decay, lr), + grad_t = self.hyper_map(F.partial(_lars_opt, self.lars, self.weight_decay, lr), gradients, params, self.decay_flag, self.lars_flag) success = self.opt(grad_t) diff --git a/mindspore/nn/optim/lazyadam.py b/mindspore/nn/optim/lazyadam.py new file mode 100644 index 0000000000..48d33bf798 --- /dev/null +++ b/mindspore/nn/optim/lazyadam.py @@ -0,0 +1,199 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""lazy adam""" +from mindspore.common import dtype as mstype +from mindspore.common.initializer import initializer +from mindspore.ops import operations as P +from mindspore.ops import composite as C +from mindspore.ops import functional as F +from mindspore.common.parameter import Parameter +from mindspore.common.tensor import Tensor +from mindspore._checkparam import Validator as validator +from mindspore._checkparam import Rel +from .optimizer import Optimizer + +_lazy_adam_opt = C.MultitypeFuncGraph("lazy_adam_opt") + + +@_lazy_adam_opt.register("Function", "Function", "Tensor", "Tensor", "Tensor", "Tensor", "Number", "Tensor", "Tuple", + "Tensor", "Tensor", "Tensor") +def _run_opt_with_sparse(opt, sparse_opt, beta1_power, beta2_power, beta1, beta2, eps, lr, gradient, params, + moment1, moment2): + """Apply sparse lazy adam optimizer to the weight parameter when the gradient is sparse.""" + success = True + success = F.depend(success, sparse_opt(params, moment1, moment2, beta1_power, beta2_power, lr, beta1, beta2, + eps, gradient[1], gradient[0])) + return success + + +@_lazy_adam_opt.register("Function", "Function", "Tensor", "Tensor", "Tensor", "Tensor", "Number", "Tensor", "Tensor", + "Tensor", "Tensor", "Tensor") +def _run_opt_with_one_number(opt, sparse_opt, beta1_power, beta2_power, beta1, beta2, eps, lr, gradient, params, + moment1, moment2): + """Apply adam optimizer to the weight parameter using Tensor.""" + success = True + success = F.depend(success, opt(params, 
moment1, moment2, beta1_power, beta2_power, lr, beta1, beta2, + eps, gradient)) + return success + + +def _check_param_value(beta1, beta2, eps, weight_decay, prim_name): + """Check the type of inputs.""" + validator.check_value_type("beta1", beta1, [float], prim_name) + validator.check_value_type("beta2", beta2, [float], prim_name) + validator.check_value_type("eps", eps, [float], prim_name) + validator.check_value_type("weight_dacay", weight_decay, [float], prim_name) + validator.check_number_range("beta1", beta1, 0.0, 1.0, Rel.INC_NEITHER, prim_name) + validator.check_number_range("beta2", beta2, 0.0, 1.0, Rel.INC_NEITHER, prim_name) + validator.check_number_range("eps", eps, 0.0, float("inf"), Rel.INC_NEITHER, prim_name) + validator.check_number_range("weight_decay", weight_decay, 0.0, float("inf"), Rel.INC_LEFT, prim_name) + + +class LazyAdam(Optimizer): + r""" + Updates gradients by Adaptive Moment Estimation (Adam) algorithm. + + The Adam algorithm is proposed in `Adam: A Method for Stochastic Optimization `_. + + The updating formulas are as follows, + + .. math:: + \begin{array}{ll} \\ + m = \beta_1 * m + (1 - \beta_1) * g \\ + v = \beta_2 * v + (1 - \beta_2) * g * g \\ + l = \alpha * \frac{\sqrt{1-\beta_2^t}}{1-\beta_1^t} \\ + w = w - l * \frac{m}{\sqrt{v} + \epsilon} + \end{array} + + :math:`m` represents the 1st moment vector `moment1`, :math:`v` represents the 2nd moment vector `moment2`, + :math:`g` represents `gradients`, :math:`l` represents scaling factor `lr`, :math:`\beta_1, \beta_2` represent + `beta1` and `beta2`, :math:`t` represents updating step while :math:`beta_1^t` and :math:`beta_2^t` represent + `beta1_power` and `beta2_power`, :math:`\alpha` represents `learning_rate`, :math:`w` represents `params`, + :math:`\epsilon` represents `eps`. + + Note: + The LazyAdam optimizer supports separating parameter groups. Different parameter groups can set different + `learning_rate` and `weight_decay`. 
+ + When separating parameter groups, the weight decay in each group will be applied on the parameters if the + value of weight_decay > 0. When not separating parameter groups, the `weight_decay` in the API will be + applied on the parameters if `weight_decay` > 0 and the 'beta' and 'gamma' are not in the name of parameters. + + The sparse strategy is applied while the SparseGatherV2 operator being used for forward network and the + `sparse_grad` of `Parameter` being set. The sparse behavior, to be noticed, is not equivalent to the + original Adam algorithm, as only the current indices params will be updated. The sparse feature is under + continuous development. The sparse behavior is currently performed on the CPU, weight decay is + not supported. + + Args: + params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated, + the element in `params` should be class `Parameter`. When the `params` is a list of `dict`, the "params", + "lr" and "weight_decay" are the keys can be parsed. + + - params: Required. The value should be a list of `Parameter`. + + - lr: Optional. If "lr" in the keys, the value of corresponding learning rate will be used. + If not, the `learning_rate` in the API will be used. + + - weight_decay: Optional. If "weight_decay" in the keys, the value of corresponding weight decay + will be used. If not, the `weight_decay` in the API will be used. + + learning_rate (Union[float, Tensor, Iterable]): A value for the learning rate. When the learning_rate is + Iterable or a Tensor and the dims of the Tensor is 1, + use dynamic learning rate, then the i-th step will + take the i-th value as the learning rate. + When the learning_rate is float or learning_rate is a Tensor + but the dims of the Tensor is 0, use fixed learning rate. + Other cases are not supported. Default: 1e-3. + beta1 (float): The exponential decay rate for the 1st moment estimates. Should be in range (0.0, 1.0). Default: + 0.9. 
+ beta2 (float): The exponential decay rate for the 2nd moment estimates. Should be in range (0.0, 1.0). Default: + 0.999. + eps (float): Term added to the denominator to improve numerical stability. Should be greater than 0. Default: + 1e-8. + use_locking (bool): Whether to enable a lock to protect updating variable tensors. + If True, updating of the var, m, and v tensors will be protected by a lock. + If False, the result is unpredictable. Default: False. + use_nesterov (bool): Whether to use Nesterov Accelerated Gradient (NAG) algorithm to update the gradients. + If True, updates the gradients using NAG. + If False, updates the gradients without using NAG. Default: False. + weight_decay (float): Weight decay (L2 penalty). Default: 0.0. + loss_scale (float): A floating point value for the loss scale. Should be equal to or greater than 1. Default: + 1.0. + + Inputs: + - **gradients** (tuple[Tensor]) - The gradients of `params`, the shape is the same as `params`. + + Outputs: + Tensor[bool], the value is True. + + Examples: + >>> net = Net() + >>> #1) All parameters use the same learning rate and weight decay + >>> optim = nn.LazyAdam(params=net.trainable_params()) + >>> + >>> #2) Use parameter groups and set different values + >>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params())) + >>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params())) + >>> group_params = [{'params': conv_params, 'weight_decay': 0.01, 'lr': 0.01}, + >>> {'params': no_conv_params}] + >>> opt = nn.LazyAdam(group_params, learning_rate=0.1, weight_decay=0.0) + >>> # the conv_params's parameters will use a learning rate of 0.01 and a weight decay of 0.01 + >>> # the no_conv_params's parameters don't set learning and weight decay. So they will use a + >>> # learning rate of 0.1 and a weight decay of 0.0. 
+ >>> + >>> loss = nn.SoftmaxCrossEntropyWithLogits() + >>> model = Model(net, loss_fn=loss, optimizer=optim) + """ + + def __init__(self, params, learning_rate=1e-3, beta1=0.9, beta2=0.999, eps=1e-8, use_locking=False, + use_nesterov=False, weight_decay=0.0, loss_scale=1.0): + super(LazyAdam, self).__init__(learning_rate, params, weight_decay, loss_scale) + _check_param_value(beta1, beta2, eps, weight_decay, self.cls_name) + validator.check_value_type("use_locking", use_locking, [bool], self.cls_name) + validator.check_value_type("use_nesterov", use_nesterov, [bool], self.cls_name) + + self.beta1 = Tensor(beta1, mstype.float32) + self.beta2 = Tensor(beta2, mstype.float32) + self.beta1_power = Parameter(initializer(1, [1], mstype.float32), name="beta1_power") + self.beta2_power = Parameter(initializer(1, [1], mstype.float32), name="beta2_power") + self.eps = eps + self.use_nesterov = use_nesterov + self.use_locking = use_locking + + self.moment1 = self.parameters.clone(prefix="moment1", init='zeros') + self.moment2 = self.parameters.clone(prefix="moment2", init='zeros') + + self.hyper_map = C.HyperMap() + self.opt = P.Adam(use_locking, use_nesterov) + self.sparse_opt = P.SparseApplyLazyAdam(use_locking, use_nesterov) + + def construct(self, gradients): + gradients = self.decay_weight(gradients) + gradients = self.scale_grad(gradients) + lr = self.get_lr() + + self.beta1_power = self.beta1_power * self.beta1 + self.beta2_power = self.beta2_power * self.beta2 + + if self.is_group_lr: + success = self.map_(F.partial(_lazy_adam_opt, self.opt, self.sparse_opt, self.beta1_power, + self.beta2_power, self.beta1, self.beta2, self.eps), + lr, gradients, self.parameters, self.moment1, self.moment2) + else: + success = self.map_(F.partial(_lazy_adam_opt, self.opt, self.sparse_opt, self.beta1_power, + self.beta2_power, self.beta1, self.beta2, self.eps, lr), + gradients, self.parameters, self.moment1, self.moment2) + return success diff --git a/mindspore/nn/optim/momentum.py 
b/mindspore/nn/optim/momentum.py index 080377b71d..ebdc5d86bf 100755 --- a/mindspore/nn/optim/momentum.py +++ b/mindspore/nn/optim/momentum.py @@ -13,17 +13,19 @@ # limitations under the License. # ============================================================================ """momentum""" -from mindspore.ops import functional as F, composite as C, operations as P +from mindspore.ops import functional as F, composite as C +from mindspore.ops import _selected_ops from mindspore.common.parameter import Parameter from mindspore.common.tensor import Tensor import mindspore.common.dtype as mstype from mindspore._checkparam import check_bool +from mindspore._checkparam import Validator as validator from .optimizer import Optimizer -momentum_opt = C.MultitypeFuncGraph("momentum_opt") +_momentum_opt = C.MultitypeFuncGraph("momentum_opt") -@momentum_opt.register("Function", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor") +@_momentum_opt.register("Function", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor") def _tensor_run_opt_ext(opt, momentum, learning_rate, gradient, weight, moment): """Apply momentum optimizer to the weight parameter using Tensor.""" success = True @@ -45,10 +47,12 @@ class Momentum(Optimizer): value of weight_decay > 0. When not separating parameter groups, the `weight_decay` in the API will be applied on the parameters if `weight_decay` > 0 and the 'beta' and 'gamma' are not in the name of parameters. + To improve parameter groups performance, the customized order of parameters can be supported. + Args: params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated, the element in `params` should be class `Parameter`. When the `params` is a list of `dict`, the "params", - "lr" and "weight_decay" are the keys can be parsed. + "lr", "weight_decay" and "order_params" are the keys can be parsed. - params: Required. The value should be a list of `Parameter`. 
@@ -58,16 +62,23 @@ class Momentum(Optimizer): - weight_decay: Optional. If "weight_decay" in the keys, the value of corresponding weight decay will be used. If not, the `weight_decay` in the API will be used. - learning_rate (Union[float, Tensor, Iterable]): A value for the learning rate. When the learning_rate is - Iterable or a Tensor and the dims of the Tensor is 1, - use dynamic learning rate, then the i-th step will - take the i-th value as the learning rate. - When the learning_rate is float or learning_rate is a Tensor - but the dims of the Tensor is 0, use fixed learning rate. - Other cases are not supported. + - order_params: Optional. If "order_params" in the keys, the value should be the order of parameters and + the order will be followed in optimizer. There are no other keys in the `dict` and the parameters which + in the value of 'order_params' but not in any group will use default learning rate and default weight + decay. + + learning_rate (Union[int, float, Tensor, Iterable]): A value for the learning rate. When the learning_rate is + Iterable or a Tensor and the dims of the Tensor is 1, + use dynamic learning rate, then the i-th step will + take the i-th value as the learning rate. + When the learning_rate is float or learning_rate is a + Tensor but the dims of the Tensor is 0, use fixed learning + rate. Other cases are not supported. It should be equal to + or greater than 0.0. momentum (float): Hyperparameter of type float, means momentum for the moving average. - weight_decay (float): Weight decay (L2 penalty). Default: 0.0. - loss_scale (float): A floating point value for the loss scale. Default: 1.0. + It should be at least 0.0. + weight_decay (int, float): Weight decay (L2 penalty). It should be equal to or greater than 0.0. Default: 0.0. + loss_scale (int, float): A floating point value for the loss scale. It should be greater than 0.0. Default: 1.0. use_nesterov (bool): Enable Nesterov momentum. Default: False. 
Inputs: @@ -86,19 +97,23 @@ class Momentum(Optimizer): >>> >>> #2) Use parameter groups and set different values >>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params())) - >>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params())) - >>> group_params = [{'params': conv_params, 'weight_decay': 0.01, 'lr': 0.01}, - >>> {'params': no_conv_params}] + >>> bias_params = list(filter(lambda x: 'bias' in x.name, net.trainable_params())) + >>> group_params = [{'params': conv_params, 'weight_decay': 0.01}, + >>> {'params': bias_params, 'lr': 0.01}, + >>> {'order_params': net.trainable_params()}] >>> opt = nn.Momentum(group_params, learning_rate=0.1, momentum=0.9, weight_decay=0.0) - >>> # the conv_params's parameters will use a learning rate of 0.01 and a weight decay of 0.01 - >>> # the no_cov_params's parameters don't set learning and weight decay. So they will use a - >>> # learning rate of 0.1 and a weight decay of 0.0. + >>> # The conv_params's parameters will use a learning rate of default value 0.1 and a weight decay of 0.01. + >>> # The bias_params's parameters will use a learning rate of 0.01 and a weight decay of default value 0.0. + >>> # The final parameters order in which the optimizer will be followed is the value of 'order_params'. + >>> # The parameters which in the value of 'order_params' but not in any group will use a learning rate + >>> # of default value 0.1 and a weight decay of default value 0.0. 
>>> >>> loss = nn.SoftmaxCrossEntropyWithLogits() >>> model = Model(net, loss_fn=loss, optimizer=optim, metrics=None) """ def __init__(self, params, learning_rate, momentum, weight_decay=0.0, loss_scale=1.0, use_nesterov=False): super(Momentum, self).__init__(learning_rate, params, weight_decay, loss_scale) + validator.check_value_type("momentum", momentum, [float], self.cls_name) if isinstance(momentum, float) and momentum < 0.0: raise ValueError("momentum should be at least 0.0, but got momentum {}".format(momentum)) self.momentum = Parameter(Tensor(momentum, mstype.float32), name="momentum") @@ -106,7 +121,7 @@ class Momentum(Optimizer): self.use_nesterov = check_bool(use_nesterov) self.moments = self.params.clone(prefix="moments", init='zeros') self.hyper_map = C.HyperMap() - self.opt = P.ApplyMomentum(use_nesterov=self.use_nesterov) + self.opt = _selected_ops.ApplyMomentum(use_nesterov=self.use_nesterov) def construct(self, gradients): params = self.params @@ -115,7 +130,7 @@ class Momentum(Optimizer): gradients = self.scale_grad(gradients) lr = self.get_lr() if self.is_group_lr: - success = self.hyper_map(F.partial(momentum_opt, self.opt, self.momentum), lr, gradients, params, moments) + success = self.hyper_map(F.partial(_momentum_opt, self.opt, self.momentum), lr, gradients, params, moments) else: - success = self.hyper_map(F.partial(momentum_opt, self.opt, self.momentum, lr), gradients, params, moments) + success = self.hyper_map(F.partial(_momentum_opt, self.opt, self.momentum, lr), gradients, params, moments) return success diff --git a/mindspore/nn/optim/optimizer.py b/mindspore/nn/optim/optimizer.py index 28c5d9e939..45eb604bf5 100755 --- a/mindspore/nn/optim/optimizer.py +++ b/mindspore/nn/optim/optimizer.py @@ -48,6 +48,8 @@ class Optimizer(Cell): value of weight_decay > 0. 
When not separating parameter groups, the `weight_decay` in the API will be applied on the parameters if `weight_decay` > 0 and the 'beta' and 'gamma' are not in the name of parameters. + To improve parameter groups performance, the customized order of parameters can be supported. + Args: learning_rate (Union[float, Tensor, Iterable]): A value for the learning rate. When the learning_rate is Iterable or a Tensor and the dims of the Tensor is 1, @@ -55,12 +57,12 @@ class Optimizer(Cell): take the i-th value as the learning rate. When the learning_rate is float or learning_rate is a Tensor but the dims of the Tensor is 0, use fixed learning rate. - Other cases are not supported. Should be greater than 0. - If the type of `learning_rate` input is int, it will be + Other cases are not supported. It should be equal to or greater + than 0. If the type of `learning_rate` input is int, it will be converted to float. parameters (Union[list[Parameter], list[dict]]): When the `parameters` is a list of `Parameter` which will be updated, the element in `parameters` should be class `Parameter`. When the `parameters` is a list of `dict`, - the "params", "lr" and "weight_decay" are the keys can be parsed. + the "params", "lr", "weight_decay" and "order_params" are the keys can be parsed. - params: Required. The value should be a list of `Parameter`. @@ -70,6 +72,11 @@ class Optimizer(Cell): - weight_decay: Optional. If "weight_decay" in the keys, the value of corresponding weight decay will be used. If not, the `weight_decay` in the API will be used. + - order_params: Optional. If "order_params" in the keys, the value should be the order of parameters and + the order will be followed in optimizer. There are no other keys in the `dict` and the parameters which + in the value of 'order_params' but not in any group will use default learning rate and default weight + decay. + weight_decay (float): A floating point value for the weight decay. It should be equal to or greater than 0. 
If the type of `weight_decay` input is int, it will be converted to float. Default: 0.0. loss_scale (float): A floating point value for the loss scale. It should be greater than 0. If the @@ -93,16 +100,17 @@ class Optimizer(Cell): if isinstance(loss_scale, int): loss_scale = float(loss_scale) - validator.check_value_type("loss_scale", loss_scale, [float], None) - validator.check_number_range("loss_scale", loss_scale, 0.0, float("inf"), Rel.INC_NEITHER, None) + validator.check_value_type("loss_scale", loss_scale, [float], self.cls_name) + validator.check_number_range("loss_scale", loss_scale, 0.0, float("inf"), Rel.INC_NEITHER, self.cls_name) if isinstance(weight_decay, int): weight_decay = float(weight_decay) - validator.check_value_type("weight_decay", weight_decay, [float], None) - validator.check_number_range("weight_decay", weight_decay, 0.0, float("inf"), Rel.INC_LEFT, None) + validator.check_value_type("weight_decay", weight_decay, [float], self.cls_name) + validator.check_number_range("weight_decay", weight_decay, 0.0, float("inf"), Rel.INC_LEFT, self.cls_name) self.is_group = False self.is_group_lr = False + self.is_group_params_ordered = False self.loss_scale = loss_scale if isinstance(learning_rate, int): learning_rate = float(learning_rate) @@ -145,6 +153,7 @@ class Optimizer(Cell): self.reciprocal_scale = 1.0 / loss_scale self.exec_weight_decay = any(self.decay_flags) self.param_length = len(self.parameters) + self.map_ = C.Map() def decay_weight(self, gradients): """ @@ -162,11 +171,11 @@ class Optimizer(Cell): params = self.parameters if self.is_group: if self.exec_weight_decay: - gradients = self.hyper_map(F.partial(apply_decay), self.weight_decay, self.decay_flags, + gradients = self.hyper_map(F.partial(_apply_decay), self.weight_decay, self.decay_flags, params, gradients) else: if self.weight_decay > 0: - gradients = self.hyper_map(F.partial(apply_decay, self.weight_decay), self.decay_flags, + gradients = self.hyper_map(F.partial(_apply_decay, 
self.weight_decay), self.decay_flags, params, gradients) return gradients @@ -187,7 +196,7 @@ class Optimizer(Cell): """ if self.reciprocal_scale != 1.0: - gradients = self.hyper_map(F.partial(grad_scale, self.reciprocal_scale), gradients) + gradients = self.map_(F.partial(_grad_scale, self.reciprocal_scale), gradients) return gradients @@ -210,9 +219,8 @@ class Optimizer(Cell): raise TypeError("Learning rate should be float, Tensor or Iterable.") return lr - def _init_group_params(self, parameters, learning_rate, weight_decay): - """Init learning rate or weight decay in group params.""" - origin_dynamic_lr = self.dynamic_lr + def _parse_group_params(self, parameters, learning_rate): + """Parse group params.""" if self.dynamic_lr: dynamic_lr_length = learning_rate.size() else: @@ -220,6 +228,15 @@ class Optimizer(Cell): for group_param in parameters: lr_length = dynamic_lr_length + if 'order_params' in group_param.keys(): + if len(group_param.keys()) > 1: + raise ValueError("The order params dict in group parameters should " + "only include the 'order_params' key.") + if not isinstance(group_param['order_params'], Iterable): + raise TypeError("The value of 'order_params' should be an Iterable type.") + self.is_group_params_ordered = True + continue + if 'lr' in group_param.keys(): self.is_group_lr = True self._get_single_lr(group_param['lr']) @@ -229,10 +246,20 @@ class Optimizer(Cell): elif isinstance(group_param['lr'], Tensor): lr_length = group_param['lr'].size() self.dynamic_lr = True + if dynamic_lr_length not in (lr_length, 0): raise ValueError("The dynamic learning rate in group should be the same size.") + + if not group_param['params']: + raise ValueError("Optimizer got an empty group parameter list.") + dynamic_lr_length = lr_length + self.dynamic_lr_length = dynamic_lr_length + def _init_group_params(self, parameters, learning_rate, weight_decay): + """Init learning rate or weight decay in group params.""" + origin_dynamic_lr = self.dynamic_lr + 
self._parse_group_params(parameters, learning_rate) if self.dynamic_lr and not origin_dynamic_lr: self.gather = P.GatherV2() self.assignadd = P.AssignAdd() @@ -240,20 +267,20 @@ class Optimizer(Cell): params_store = [] for group_param in parameters: - if not group_param['params']: - raise ValueError("Optimizer got an empty parameter list.") + if 'order_params' in group_param.keys(): + ordered_parameters = group_param['order_params'] + continue self.group_params += group_param['params'] if 'lr' in group_param.keys(): params_dynamic_lr = isinstance(group_param['lr'], (Iterable, Tensor)) - if self.dynamic_lr and not params_dynamic_lr: - lr = Tensor(np.array([group_param['lr']] * dynamic_lr_length).astype(np.float32)) + lr = Tensor(np.array([group_param['lr']] * self.dynamic_lr_length).astype(np.float32)) else: lr = self._get_single_lr(group_param['lr']) else: if self.dynamic_lr and not origin_dynamic_lr: - lr = Tensor(np.array([self.scalar_lr] * dynamic_lr_length).astype(np.float32)) + lr = Tensor(np.array([self.scalar_lr] * self.dynamic_lr_length).astype(np.float32)) else: lr = learning_rate @@ -273,10 +300,33 @@ class Optimizer(Cell): validator.check_value_type("parameter", param, [Parameter], self.cls_name) if param.name in params_store: raise RuntimeError(f"The {param.name} parameter has appeared in parameter groups.") + params_store.append(param.name) self.group_lr.append(Parameter(lr, name="lr_" + param.name)) self.group_weight_decay.append(weight_decay_) + if self.is_group_params_ordered: + self._order_and_adjust_group_params(ordered_parameters, learning_rate, weight_decay) + + def _order_and_adjust_group_params(self, ordered_parameters, learning_rate, weight_decay): + """ + Order group parameter, learning rate and weight decay in group params. And assign the parameters + which in the value of 'order_params' but not in any group to default value. 
+ """ + params_length = len(ordered_parameters) + ordered_learning_rate = [Parameter(learning_rate, name="lr_" + param.name) for param in ordered_parameters] + ordered_weight_decay = [weight_decay * self.loss_scale] * params_length + params_name = [param.name for param in ordered_parameters] + + for param, lr, wd in zip(self.group_params, self.group_lr, self.group_weight_decay): + index = params_name.index(param.name) + ordered_learning_rate[index] = lr + ordered_weight_decay[index] = wd + + self.group_params = list(ordered_parameters) + self.group_lr = ordered_learning_rate + self.group_weight_decay = ordered_weight_decay + def get_lr(self): """ Get the learning rate of current step. @@ -339,10 +389,10 @@ class Optimizer(Cell): op_add = P.AddN() -apply_decay = C.MultitypeFuncGraph("apply_decay") +_apply_decay = C.MultitypeFuncGraph("apply_decay") -@apply_decay.register("Number", "Bool", "Tensor", "Tensor") +@_apply_decay.register("Number", "Bool", "Tensor", "Tensor") def _tensor_apply_decay(weight_decay, if_apply, weight, gradient): """Get grad with weight_decay.""" if if_apply: @@ -350,12 +400,20 @@ def _tensor_apply_decay(weight_decay, if_apply, weight, gradient): return gradient -grad_scale = C.MultitypeFuncGraph("grad_scale") +_grad_scale = C.MultitypeFuncGraph("grad_scale") -@grad_scale.register("Number", "Tensor") +@_grad_scale.register("Number", "Tensor") def tensor_grad_scale(scale, grad): """Get grad with scale.""" if scale == 1.0: return grad return grad * scale + + +@_grad_scale.register("Number", "Tuple") +def tensor_grad_scale_with_sparse(scale, grad): + """Get grad with scale.""" + if scale == 1.0: + return grad + return grad[0], grad[1] * scale, grad[2] diff --git a/mindspore/nn/optim/proximal_ada_grad.py b/mindspore/nn/optim/proximal_ada_grad.py new file mode 100644 index 0000000000..380720404a --- /dev/null +++ b/mindspore/nn/optim/proximal_ada_grad.py @@ -0,0 +1,112 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache 
License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""PROXIMAL_ADA_GRAD""" +from mindspore.ops import functional as F, composite as C, operations as P +from mindspore.common import Tensor +import mindspore.common.dtype as mstype +from mindspore._checkparam import Validator as validator +from mindspore._checkparam import Rel +from .optimizer import Optimizer + +_proximal_ada_grad_opt = C.MultitypeFuncGraph("proximal_ada_grad_opt") + +@_proximal_ada_grad_opt.register("Function", "Function", "Tensor", "Tensor", "Tensor", "Tuple", "Tensor", "Tensor") +def _tensor_run_opt_with_sparse(opt, sparse_opt, learning_rate, l1, l2, gradient, weight, accum): + """Apply sparse proximal_ada_grad optimizer to the weight parameter.""" + success = True + success = F.depend(success, sparse_opt(weight, accum, learning_rate, l1, l2, gradient[1], gradient[0])) + return success + + +@_proximal_ada_grad_opt.register("Function", "Function", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor") +def _tensor_run_opt(opt, sparse_opt, learning_rate, l1, l2, gradient, weight, accum): + """Apply proximal_ada_grad optimizer to the weight parameter.""" + success = True + success = F.depend(success, opt(weight, accum, learning_rate, l1, l2, gradient)) + return success + + +def _check_param_value(accum, l1, l2, use_locking, prim_name=None): + """Check inputs param.""" + validator.check_value_type("accum", accum, [float], prim_name) + 
validator.check_value_type("l1", l1, [float], prim_name) + validator.check_value_type("l2", l2, [float], prim_name) + validator.check_value_type("use_locking", use_locking, [bool], prim_name) + validator.check_number_range("accum", accum, 0.0, float("inf"), Rel.INC_LEFT, prim_name) + validator.check_number_range("l1", l1, 0.0, float("inf"), Rel.INC_LEFT, prim_name) + validator.check_number_range("l2", l2, 0.0, float("inf"), Rel.INC_LEFT, prim_name) + + +class ProximalAdagrad(Optimizer): + """ + Implement the ProximalAdagrad algorithm with ApplyProximalAdagrad Operator. + + ProximalAdagrad is an online Learning and Stochastic Optimization. + Refer to paper `Efficient Learning using Forward-Backward Splitting + `_. + + Note: + The sparse strategy is applied while the SparseGatherV2 operator being used for forward network and the + `sparse_grad` of `Parameter` being set as True. The sparse feature is under continuous development. The sparse + behavior is currently performed on the CPU, weight decay is not supported. + + Args: + params (list[Parameter]): A list of parameter, which will be updated. The element in `params` + should be Parameter. + accum (float): The starting value for accumulators, must be zero or positive values. Default: 0.1. + learning_rate (float): The learning rate value, must be greater than or equal to zero. Default: 0.001. + l1 (float): l1 regularization strength, must be greater than or equal to zero. Default: 0.0. + l2 (float): l2 regularization strength, must be greater than or equal to zero. Default: 0.0. + use_locking (bool): If True use locks for update operation. Default: False. + loss_scale (float): Value for the loss scale. It should be equal to or greater than 1.0. Default: 1.0. + weight_decay (float): Weight decay value to multiply weight, must be zero or positive value. Default: 0.0. + + Inputs: + - **grads** (tuple[Tensor]) - The gradients of `params` in optimizer, the shape is as same as the `params` + in optimizer.
+ + Outputs: + Tensor[bool], the value is True. + + Examples: + >>> net = Net() + >>> loss = nn.SoftmaxCrossEntropyWithLogits() + >>> opt = nn.ProximalAdagrad(net.trainable_params()) + >>> model = Model(net, loss_fn=loss, optimizer=opt, metrics=None) + """ + + def __init__(self, params, accum=0.1, learning_rate=0.001, l1=0.0, l2=0.0, + use_locking=False, loss_scale=1.0, weight_decay=0.0): + super(ProximalAdagrad, self).__init__(learning_rate, params, weight_decay, loss_scale) + if self.is_group: + raise RuntimeError(f"The {self.cls_name} optimizer cannot support group setting.") + _check_param_value(accum, l1, l2, use_locking, self.cls_name) + self.accum = self.parameters.clone(prefix="accum", init=accum) + self.l1 = Tensor(l1, mstype.float32) + self.l2 = Tensor(l2, mstype.float32) + self.weight_decay = weight_decay + self.hyper_map = C.HyperMap() + self.opt = P.ApplyProximalAdagrad(use_locking=use_locking) + self.sparse_opt = P.SparseApplyProximalAdagrad(use_locking=use_locking) + + def construct(self, grads): + params = self.parameters + accum = self.accum + grads = self.decay_weight(grads) + grads = self.scale_grad(grads) + lr = self.learning_rate + success = self.map_(F.partial(_proximal_ada_grad_opt, self.opt, self.sparse_opt, lr, self.l1, self.l2), + grads, params, accum) + return success diff --git a/mindspore/nn/optim/rmsprop.py b/mindspore/nn/optim/rmsprop.py index 4d572574ae..05c42fb444 100644 --- a/mindspore/nn/optim/rmsprop.py +++ b/mindspore/nn/optim/rmsprop.py @@ -18,21 +18,21 @@ from mindspore._checkparam import Validator as validator from mindspore._checkparam import Rel from .optimizer import Optimizer -rmsprop_opt = C.MultitypeFuncGraph("rmsprop_opt") -centered_rmsprop_opt = C.MultitypeFuncGraph("rmsprop_opt") +_rmsprop_opt = C.MultitypeFuncGraph("rmsprop_opt") +_centered_rmsprop_opt = C.MultitypeFuncGraph("rmsprop_opt") -@rmsprop_opt.register("Function", "Number", "Number", "Number", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor") -def 
_rmsprop_opt(opt, decay, epsilon, momentum, learning_rate, weight, ms, mom, grad): +@_rmsprop_opt.register("Function", "Number", "Number", "Number", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor") +def _rmsprop_opt_(opt, decay, epsilon, momentum, learning_rate, weight, ms, mom, grad): """Apply rmsprop optimizer to the weight parameter using dynamic learning rate.""" success = True success = F.depend(success, opt(weight, ms, mom, learning_rate, grad, decay, momentum, epsilon)) return success -@centered_rmsprop_opt.register("Function", "Number", "Number", "Number", "Tensor", "Tensor", "Tensor", "Tensor", - "Tensor", "Tensor") -def _centered_rmsprop_opt(opt, decay, epsilon, momentum, learning_rate, weight, mg, ms, mom, grad): +@_centered_rmsprop_opt.register("Function", "Number", "Number", "Number", "Tensor", "Tensor", "Tensor", "Tensor", + "Tensor", "Tensor") +def _centered_rmsprop_opt_(opt, decay, epsilon, momentum, learning_rate, weight, mg, ms, mom, grad): """Apply centered rmsprop optimizer to the weight parameter using dynamic learning rate.""" success = True success = F.depend(success, opt(weight, mg, ms, mom, grad, learning_rate, decay, momentum, epsilon)) @@ -51,6 +51,8 @@ class RMSProp(Optimizer): value of weight_decay > 0. When not separating parameter groups, the `weight_decay` in the API will be applied on the parameters if `weight_decay` > 0 and the 'beta' and 'gamma' are not in the name of parameters. + To improve parameter groups performance, the customized order of parameters can be supported. + Update `params` according to the RMSProp algorithm. The equation is as follows: @@ -93,7 +95,7 @@ class RMSProp(Optimizer): Args: params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated, the element in `params` should be class `Parameter`. When the `params` is a list of `dict`, the "params", - "lr" and "weight_decay" are the keys can be parsed. 
+ "lr", "weight_decay" and "order_params" are the keys can be parsed. - params: Required. The value should be a list of `Parameter`. @@ -103,6 +105,11 @@ class RMSProp(Optimizer): - weight_decay: Optional. If "weight_decay" in the keys, the value of corresponding weight decay will be used. If not, the `weight_decay` in the API will be used. + - order_params: Optional. If "order_params" in the keys, the value should be the order of parameters and + the order will be followed in optimizer. There are no other keys in the `dict` and the parameters which + in the value of 'order_params' but not in any group will use default learning rate and default weight + decay. + learning_rate (Union[float, Tensor, Iterable]): A value for the learning rate. When the learning_rate is Iterable or a Tensor and the dims of the Tensor is 1, use dynamic learning rate, then the i-th step will @@ -133,13 +140,16 @@ class RMSProp(Optimizer): >>> >>> #2) Use parameter groups and set different values >>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params())) - >>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params())) - >>> group_params = [{'params': conv_params, 'weight_decay': 0.01, 'lr': 0.01}, - >>> {'params': no_conv_params}] + >>> bias_params = list(filter(lambda x: 'bias' in x.name, net.trainable_params())) + >>> group_params = [{'params': conv_params, 'weight_decay': 0.01}, + >>> {'params': bias_params, 'lr': 0.01}, + >>> {'order_params': net.trainable_params()}] >>> opt = nn.RMSProp(group_params, learning_rate=0.1, weight_decay=0.0) - >>> # the conv_params's parameters will use a learning rate of 0.01 and a weight decay of 0.01 - >>> # the no_cov_params's parameters don't set learning and weight decay. So they will use a - >>> # learning rate of 0.1 and a weight decay of 0.0. + >>> # The conv_params's parameters will use a learning rate of default value 0.1 and a weight decay of 0.01. 
+ >>> # The bias_params's parameters will use a learning rate of 0.01 and a weight decay of default value 0.0. + >>> # The final parameters order in which the optimizer will be followed is the value of 'order_params'. + >>> # The parameters which in the value of 'order_params' but not in any group will use a learning rate + >>> # of default value 0.1 and a weight decay of default value 0.0. >>> >>> loss = nn.SoftmaxCrossEntropyWithLogits() >>> model = Model(net, loss_fn=loss, optimizer=optim) @@ -177,17 +187,17 @@ class RMSProp(Optimizer): lr = self.get_lr() if self.centered: if self.is_group_lr: - success = self.hyper_map(F.partial(centered_rmsprop_opt, self.opt, self.decay, self.epsilon, + success = self.hyper_map(F.partial(_centered_rmsprop_opt, self.opt, self.decay, self.epsilon, self.momentum), lr, params, self.mg, self.ms, self.moment, gradients) else: - success = self.hyper_map(F.partial(centered_rmsprop_opt, self.opt, self.decay, self.epsilon, + success = self.hyper_map(F.partial(_centered_rmsprop_opt, self.opt, self.decay, self.epsilon, self.momentum, lr), params, self.mg, self.ms, self.moment, gradients) else: if self.is_group_lr: - success = self.hyper_map(F.partial(rmsprop_opt, self.opt, self.decay, self.epsilon, + success = self.hyper_map(F.partial(_rmsprop_opt, self.opt, self.decay, self.epsilon, self.momentum), lr, params, self.ms, self.moment, gradients) else: - success = self.hyper_map(F.partial(rmsprop_opt, self.opt, self.decay, self.epsilon, + success = self.hyper_map(F.partial(_rmsprop_opt, self.opt, self.decay, self.epsilon, self.momentum, lr), params, self.ms, self.moment, gradients) return success diff --git a/mindspore/nn/optim/sgd.py b/mindspore/nn/optim/sgd.py index bf49244550..d2680a38e5 100755 --- a/mindspore/nn/optim/sgd.py +++ b/mindspore/nn/optim/sgd.py @@ -20,10 +20,10 @@ import mindspore.common.dtype as mstype from mindspore._checkparam import Validator as validator from .optimizer import Optimizer -sgd_opt = 
C.MultitypeFuncGraph("sgd_opt") +_sgd_opt = C.MultitypeFuncGraph("sgd_opt") -@sgd_opt.register("Function", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor") +@_sgd_opt.register("Function", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor") def _tensor_run_opt_ext(opt, momentum, learning_rate, gradient, weight, accum, stat): """Apply sgd optimizer to the weight parameter using Tensor.""" success = True @@ -47,10 +47,12 @@ class SGD(Optimizer): value of weight_decay > 0. When not separating parameter groups, the `weight_decay` in the API will be applied on the parameters if `weight_decay` > 0 and the 'beta' and 'gamma' are not in the name of parameters. + To improve parameter groups performance, the customized order of parameters can be supported. + Args: params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated, the element in `params` should be class `Parameter`. When the `params` is a list of `dict`, the "params", - "lr" and "weight_decay" are the keys can be parsed. + "lr", "weight_decay" and "order_params" are the keys can be parsed. - params: Required. The value should be a list of `Parameter`. @@ -60,16 +62,22 @@ class SGD(Optimizer): - weight_decay: Optional. If "weight_decay" in the keys, the value of corresponding weight decay will be used. If not, the `weight_decay` in the API will be used. + - order_params: Optional. If "order_params" in the keys, the value should be the order of parameters and + the order will be followed in optimizer. There are no other keys in the `dict` and the parameters which + in the value of 'order_params' but not in any group will use default learning rate and default weight + decay. + learning_rate (Union[float, Tensor, Iterable]): A value for the learning rate. When the learning_rate is Iterable or a Tensor and the dims of the Tensor is 1, use dynamic learning rate, then the i-th step will take the i-th value as the learning rate. 
When the learning_rate is float or learning_rate is a Tensor but the dims of the Tensor is 0, use fixed learning rate. - Other cases are not supported. Default: 0.1. - momentum (float): A floating point value the momentum. Default: 0.0. - dampening (float): A floating point value of dampening for momentum. Default: 0.0. - weight_decay (float): Weight decay (L2 penalty). Default: 0.0. + Other cases are not supported. It should be equal to or + greater than 0. Default: 0.1. + momentum (float): A floating point value the momentum. should be at least 0.0. Default: 0.0. + dampening (float): A floating point value of dampening for momentum. should be at least 0.0. Default: 0.0. + weight_decay (float): Weight decay (L2 penalty). It should be equal to or greater than 0. Default: 0.0. nesterov (bool): Enables the Nesterov momentum. Default: False. loss_scale (float): A floating point value for the loss scale, which should be larger than 0.0. Default: 1.0. @@ -90,13 +98,16 @@ class SGD(Optimizer): >>> >>> #2) Use parameter groups and set different values >>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params())) - >>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params())) - >>> group_params = [{'params': conv_params, 'weight_decay': 0.01, 'lr': 0.01}, - >>> {'params': no_conv_params}] + >>> bias_params = list(filter(lambda x: 'bias' in x.name, net.trainable_params())) + >>> group_params = [{'params': conv_params, 'weight_decay': 0.01}, + >>> {'params': bias_params, 'lr': 0.01}, + >>> {'order_params': net.trainable_params()}] >>> opt = nn.SGD(group_params, learning_rate=0.1, weight_decay=0.0) - >>> # the conv_params's parameters will use a learning rate of 0.01 and a weight decay of 0.01 - >>> # the no_cov_params's parameters don't set learning and weight decay. So they will use a - >>> # learning rate of 0.1 and a weight decay of 0.0. 
+ >>> # The conv_params's parameters will use a learning rate of default value 0.1 and a weight decay of 0.01. + >>> # The bias_params's parameters will use a learning rate of 0.01 and a weight decay of default value 0.0. + >>> # The final parameters order in which the optimizer will be followed is the value of 'order_params'. + >>> # The parameters which in the value of 'order_params' but not in any group will use a learning rate + >>> # of default value 0.1 and a weight decay of default value 0.0. >>> >>> loss = nn.SoftmaxCrossEntropyWithLogits() >>> model = Model(net, loss_fn=loss, optimizer=optim) @@ -143,7 +154,7 @@ class SGD(Optimizer): gradients = self.scale_grad(gradients) lr = self.get_lr() if self.is_group_lr: - success = self.hyper_map(F.partial(sgd_opt, self.opt, self.momentum), lr, gradients, params, accum, stat) + success = self.hyper_map(F.partial(_sgd_opt, self.opt, self.momentum), lr, gradients, params, accum, stat) else: - success = self.hyper_map(F.partial(sgd_opt, self.opt, self.momentum, lr), gradients, params, accum, stat) + success = self.hyper_map(F.partial(_sgd_opt, self.opt, self.momentum, lr), gradients, params, accum, stat) return success diff --git a/mindspore/nn/wrap/cell_wrapper.py b/mindspore/nn/wrap/cell_wrapper.py index fe69a2a6ea..f0d920f51f 100644 --- a/mindspore/nn/wrap/cell_wrapper.py +++ b/mindspore/nn/wrap/cell_wrapper.py @@ -21,7 +21,6 @@ from ...common.parameter import Parameter, ParameterTuple from ...ops import composite as C from ...ops import functional as F from ...ops import operations as P -from ...ops.composite.base import _mp_cast_helper from ...ops.operations.comm_ops import _VirtualDataset from ..cell import Cell from .grad_reducer import DistributedGradReducer @@ -166,6 +165,7 @@ class TrainOneStepCell(Cell): def __init__(self, network, optimizer, sens=1.0): super(TrainOneStepCell, self).__init__(auto_prefix=False) self.network = network + self.network.set_grad() self.network.add_flags(defer_inline=True) 
self.weights = optimizer.parameters self.optimizer = optimizer @@ -344,7 +344,7 @@ class WithEvalCell(Cell): def construct(self, data, label): outputs = self._network(data) if self.add_cast_fp32: - label = _mp_cast_helper(mstype.float32, label) + label = F.mixed_precision_cast(mstype.float32, label) outputs = F.cast(outputs, mstype.float32) loss = self._loss_fn(outputs, label) return loss, outputs, label diff --git a/mindspore/nn/wrap/grad_reducer.py b/mindspore/nn/wrap/grad_reducer.py index 8383910a60..c66bfbe646 100644 --- a/mindspore/nn/wrap/grad_reducer.py +++ b/mindspore/nn/wrap/grad_reducer.py @@ -52,6 +52,31 @@ def _tensors_allreduce_mean(mul, degree, allreduce_filter, grad): return grad +@reduce_opt.register("Function", "Number", "Bool", "Tuple") +def _tensors_allreduce_mean_with_sparse(mul, degree, allreduce_filter, grad): + """ + Apply mean and allgather on gradient instead of allreduce for sparse feature. + Allgather is a communication operation used for distributed deep learning. + + Args: + mul (Primitive): Div operation. + degree (int): The mean coefficient. + allreduce_filter (bool): When it is true, allgather would apply. + grad (Tuple): The indices, gradient tensor and tensor_shape before operation. + + Returns: + Tuple, include indices, the gradient tensor and tensor_shape after operation. + """ + if allreduce_filter: + indices = _all_gather(grad[0]) + degree = F.scalar_cast(degree, F.dtype(grad[1])) + dout = _all_gather(grad[1]) + cast_op = P.Cast() + dout = mul(dout, cast_op(F.scalar_to_array(1.0/degree), F.dtype(dout))) + grad = (indices, dout, dout[2]) + return grad + + @reduce_opt.register("Bool", "Tensor") def _tensors_allreduce(allreduce_filter, grad): """ @@ -69,6 +94,26 @@ def _tensors_allreduce(allreduce_filter, grad): return grad +@reduce_opt.register("Bool", "Tuple") +def _tensors_allreduce_with_sparse(allreduce_filter, grad): + """ + Apply mean and allgather on gradient instead of allreduce for sparse feature. 
+ Allgather is a communication operation used for distributed deep learning. + + Args: + allreduce_filter (bool): When it is true, allgather would apply. + grad (Tuple): The indices, gradient tensor and tensor_shape before operation. + + Returns: + Tuple, include indices, the gradient tensor and tensor_shape after operation. + """ + if allreduce_filter: + indices = _all_gather(grad[0]) + dout = _all_gather(grad[1]) + grad = (indices, dout, dout[2]) + return grad + + _get_datatype = C.MultitypeFuncGraph("_get_datatype") diff --git a/mindspore/ops/_grad/grad_array_ops.py b/mindspore/ops/_grad/grad_array_ops.py index b7b7af8082..a2a808781e 100644 --- a/mindspore/ops/_grad/grad_array_ops.py +++ b/mindspore/ops/_grad/grad_array_ops.py @@ -19,6 +19,7 @@ from .. import operations as P from ..operations import _grad_ops as G from ..operations import _inner_ops as inner from ..composite.multitype_ops.zeros_like_impl import zeros_like +from ..functional import broadcast_gradient_args from .. import functional as F from .grad_base import bprop_getters from ..primitive import constexpr @@ -30,6 +31,7 @@ unsorted_segment_sum = P.UnsortedSegmentSum() transpose = P.Transpose() shape_op = P.Shape() reshape = P.Reshape() +size_op = P.Size() invert_permutation = P.InvertPermutation() logical_and = P.LogicalAnd() @@ -192,24 +194,27 @@ def get_bprop_tile(self): @bprop_getters.register(inner.EmbeddingLookup) def get_bprop_embedding_lookup(self): """Generate bprop for EmbeddingLookup""" - host_sub = P.Sub().add_prim_attr('primitive_target', 'CPU') + sub_op = P.Sub() + reshape_op = P.Reshape() host_reshape = P.Reshape().add_prim_attr('primitive_target', 'CPU') def bprop_sparse(x, indices, offset, reduce_scatter_flag, split_num, out, dout): x_shp = shape_op(x) - if reduce_scatter_flag is True: - elu_grad = G.EmbeddingLookupCommGrad() - actual_dout = elu_grad(dout, split_num) - else: - actual_dout = dout - new_indices = host_sub(indices - offset) + new_indices = sub_op(indices, offset) # 
Reshape the 'new_indices' new_indices_shape_changed = (size_op(new_indices),) - new_indices = host_reshape(new_indices, new_indices_shape_changed) - # Reshape the 'actual_dout' + new_indices = reshape_op(new_indices, new_indices_shape_changed) x_shp_tail = x_shp[1:] actual_dout_shape_changed = new_indices_shape_changed + x_shp_tail - actual_dout = host_reshape(actual_dout, actual_dout_shape_changed) - return (new_indices, actual_dout, x_shp), zeros_like(new_indices), zeros_like(axis), \ + if reduce_scatter_flag is True: + # On host + elu_grad = G.EmbeddingLookupCommGrad() + actual_dout = elu_grad(dout, split_num) + # Reshape the 'actual_dout' on host + actual_dout = host_reshape(actual_dout, actual_dout_shape_changed) + else: + # Reshape the 'actual_dout' on device + actual_dout = reshape_op(dout, actual_dout_shape_changed) + return (new_indices, actual_dout, x_shp), zeros_like(indices), zeros_like(offset), \ zeros_like(reduce_scatter_flag), zeros_like(split_num) return bprop_sparse @@ -309,7 +314,38 @@ def get_bprop_gather_v2(self): return bprop -@bprop_getters.register(P.Range) +@bprop_getters.register(P.SparseGatherV2) +def get_bprop_sparse_gather_v2(self): + """Generate bprop for SparseGatherV2""" + + def bprop(x, indices, axis, out, dout): + x_shp = shape_op(x) + if axis == 0: + indices_size = (size_op(indices),) + x_tail_shp = x_shp[1:] + values_shape = indices_size + x_tail_shp + values = reshape(dout, values_shape) + indices = reshape(indices, indices_size) + return (indices, values, x_shp), zeros_like(indices), zeros_like(axis) + if F.rank(dout) == 0: + dout = P.ExpandDims()(dout, -1) + if F.rank(indices) == 0: + indices = P.ExpandDims()(indices, -1) + out_shp = shape_op(dout) + ind_shp = shape_op(indices) + # Example: out_shape:(3,2,3) axis 1 -> (1,0,2) + perm_1 = _generate_shape_index(out_shp, ind_shp, axis) + values_transpose = transpose(dout, perm_1) + params_grad = unsorted_segment_sum(values_transpose, indices, shape_op(x)[axis]) + # Example: 
out_shape:(3,2,3) axis 2 -> (1,2,0) + perm_2 = _generate_inverse_index(x_shp, axis) + params_grad = transpose(params_grad, perm_2) + return params_grad, zeros_like(indices), zeros_like(axis) + + return bprop + + +@bprop_getters.register(inner.Range) def get_bprop_range(self): """Generate bprop for Range""" @@ -449,6 +485,31 @@ def get_bprop_scatter_nd_update(self): return bprop +@bprop_getters.register(P.TensorScatterUpdate) +def get_bprop_tensor_scatter_update(self): + """Generate bprop for TensorScatterUpdate""" + gather_nd = P.GatherNd() + tensor_scatter_update = P.TensorScatterUpdate() + + def bprop(x, indices, update, out, dout): + x_grad = tensor_scatter_update(dout, indices, zeros_like(update)) + update_grad = gather_nd(dout, indices) + return x_grad, zeros_like(indices), update_grad + + return bprop + + +@bprop_getters.register(P.ScatterMax) +def get_bprop_scatter_max(self): + """Generate bprop for ScatterMax""" + gather = P.GatherV2() + + def bprop(x, indices, update, out, dout): + return dout, zeros_like(indices), gather(dout, indices, 0) + + return bprop + + @bprop_getters.register(P.Argmax) def get_bprop_argmax(self): """Generate bprop for Argmax""" @@ -607,6 +668,24 @@ def get_bprop_batch_to_space_nd(self): return (dx,) return bprop +@bprop_getters.register(P.BroadcastTo) +def get_bprop_broadcast_to(self): + """Generate bprop for BroadcastTo""" + reduce_keep_dim = P.ReduceSum(keep_dims=True) + broadcast_shape = self.shape + + def bprop(x, out, dout): + x_shape = shape_op(x) + dout_shape = shape_op(dout) + + if x_shape == dout_shape: + return (dout,) + _, reduction_axes = broadcast_gradient_args(broadcast_shape, x_shape) + reduced_grad = reduce_keep_dim(dout, reduction_axes) + dx = reshape(reduced_grad, x_shape) + return (dx,) + return bprop + @bprop_getters.register(P.ReverseSequence) def get_bprop_reverse_sequence(self): diff --git a/mindspore/ops/_grad/grad_comm_ops.py b/mindspore/ops/_grad/grad_comm_ops.py index 057d150be1..7477d50895 100644 --- 
a/mindspore/ops/_grad/grad_comm_ops.py +++ b/mindspore/ops/_grad/grad_comm_ops.py @@ -26,9 +26,10 @@ from .grad_base import bprop_getters @bprop_getters.register(AllReduce) def get_bprop_all_reduce(self): - """Generate bprop for AllReduce.""" + """Generate bprop for AllReduce, do allreduce or allgather, allgather for sparse feature.""" all_reduce_grad = AllReduce(ReduceOp.SUM, self.group) + all_gather = AllGather(group=self.group) if self.instance_name: instance_name = "grad" + self.instance_name all_reduce_grad.set_prim_instance_name(instance_name) @@ -42,15 +43,28 @@ def get_bprop_all_reduce(self): if self.op == ReduceOp.SUM: def bprop(x, out, dout): - dx = all_reduce_grad(dout) + if F.issubclass_(F.typeof(dout), mstype.tensor): + dx = all_reduce_grad(dout) + else: + indices = all_gather(dout[0]) + grad = all_gather(dout[1]) + dx = (indices, grad, dout[2]) return (dx,) else: def bprop(x, out, dout): - dx = all_reduce_grad(dout) - z = equal(x, out) - z = cast(z, dtype(dx)) - dx = mul(dx, z) + if F.issubclass_(F.typeof(dout), mstype.tensor): + dx = all_reduce_grad(dout) + z = equal(x, out) + z = cast(z, dtype(dx)) + dx = mul(dx, z) + else: + indices = all_gather(dout[0]) + grad = all_gather(dout[1]) + z = equal(x, out) + z = cast(z, dtype(grad)) + grad = mul(grad, z) + dx = (indices, grad, dout[2]) return (dx,) return bprop @@ -147,12 +161,16 @@ def get_bprop_all_to_all(self): @bprop_getters.register(_MirrorOperator) def get_bprop_mirror_operator(self): - """Backpropagator for _MirrorOperator, do allreduce for the devices in group(only for one group).""" + """ + Backpropagator for _MirrorOperator, do allreduce or allgather for the devices in group(only for one group), + allgather for sparse feature. 
+ """ group = self.group dev_num = self.dev_num mean_flag = self.mean_flag all_reduce = AllReduce(group=group) + all_gather = AllGather(group=group) mul = P.Mul() cast = P.Cast() @@ -170,12 +188,25 @@ def get_bprop_mirror_operator(self): def bprop(x, out, dout): if mean_flag: - dx = all_reduce(dout) - float_one = F.scalar_cast(1.0, F.dtype(dx)) - num = F.scalar_cast(dev_num, F.dtype(dx)) - dx = mul(dx, cast(F.scalar_to_array(float_one/num), F.dtype(dx))) + if F.issubclass_(F.typeof(dout), mstype.tensor): + dx = all_reduce(dout) + float_one = F.scalar_cast(1.0, F.dtype(dx)) + num = F.scalar_cast(dev_num, F.dtype(dx)) + dx = mul(dx, cast(F.scalar_to_array(float_one/num), F.dtype(dx))) + else: + indices = all_gather(dout[0]) + grad = all_gather(dout[1]) + float_one = F.scalar_cast(1.0, F.dtype(grad)) + num = F.scalar_cast(dev_num, F.dtype(grad)) + grad = mul(grad, cast(F.scalar_to_array(float_one/num), F.dtype(grad))) + dx = (indices, grad, dout[2]) else: - dx = all_reduce(dout) + if F.issubclass_(F.typeof(dout), mstype.tensor): + dx = all_reduce(dout) + else: + indices = all_gather(dout[0]) + grad = all_gather(dout[1]) + dx = (indices, grad, dout[2]) return (dx,) return bprop diff --git a/mindspore/ops/_grad/grad_implementations.py b/mindspore/ops/_grad/grad_implementations.py index ee3117c83a..87566b1110 100644 --- a/mindspore/ops/_grad/grad_implementations.py +++ b/mindspore/ops/_grad/grad_implementations.py @@ -195,7 +195,7 @@ def bprop_array_reduce(fn, x, shp, out, dout): return F.distribute(dout, F.shape(x)), C.zeros_like(shp) -@bprops.register("depend") +@bprops.register("Depend") def bprop_depend(x, y, out, dout): """Backpropagator for primitive `depend`.""" return dout, C.zeros_like(y) @@ -236,7 +236,6 @@ def bprop_control_depend(x, y, out, dout): """Backpropagator for primitive `Control_depend`.""" return C.zeros_like(x), C.zeros_like(y) - @bprops.register("switch") def bprop_switch(cond, tb, fb, out, dout): """Backpropagator for primitive `switch`.""" diff 
--git a/mindspore/ops/_grad/grad_math_ops.py b/mindspore/ops/_grad/grad_math_ops.py index ffd79e49b9..1e4f932442 100755 --- a/mindspore/ops/_grad/grad_math_ops.py +++ b/mindspore/ops/_grad/grad_math_ops.py @@ -17,15 +17,18 @@ from functools import reduce import numpy as np +from mindspore.ops import _selected_grad_ops as SG from .. import functional as F from .. import operations as P from ..operations import _grad_ops as G +from ..operations import _inner_ops as inner from ..composite.multitype_ops.zeros_like_impl import zeros_like from ..functional import broadcast_gradient_args, reduced_shape, tuple_div from .grad_base import bprop_getters from ..primitive import constexpr from ..composite.multitype_ops import _constexpr_utils as const_utils + shape_op = P.Shape() reduce_sum = P.ReduceSum() reshape = P.Reshape() @@ -232,6 +235,21 @@ def get_bprop_div(self): return bprop +@bprop_getters.register(P.DivNoNan) +def get_bprop_div_no_nan(self): + """Grad definition for `DivNoNan` operation.""" + div_no_nan_op = P.DivNoNan() + neg = P.Neg() + mul_op = P.Mul() + + def bprop(x, y, out, dout): + bc_x = div_no_nan_op(dout, y) + bc_y = neg(mul_op(bc_x, out)) + return binop_grad_common(x, y, bc_x, bc_y) + + return bprop + + @bprop_getters.register(P.Floor) def get_bprop_floor(self): """Grad definition for `floor` operation.""" @@ -239,6 +257,21 @@ def get_bprop_floor(self): shape_ = P.Shape() dtype_ = P.DType() + def bprop(x, out, dout): + bc_x = fill_(dtype_(x), shape_(x), 0.) + return (bc_x,) + + + return bprop + + +@bprop_getters.register(P.Ceil) +def get_bprop_ceil(self): + """Grad definition for `ceil` operation.""" + fill_ = P.Fill() + shape_ = P.Shape() + dtype_ = P.DType() + def bprop(x, out, dout): bc_x = fill_(dtype_(x), shape_(x), 0.) 
return (bc_x,) @@ -422,10 +455,23 @@ def get_bprop_exp(self): return bprop +@bprop_getters.register(P.Expm1) +def get_bprop_expm1(self): + """Grad definition for `Expm1` operation.""" + exp_ = P.Exp() + + def bprop(x, out, dout): + g = exp_(x) + dx = g * dout + return (dx,) + + return bprop + + @bprop_getters.register(P.Minimum) def get_bprop_minimum(self): """Grad definition for `Minimum` operation.""" - input_grad = G.MinimumGrad() + input_grad = SG.MinimumGrad() def bprop(x, y, out, dout): dx, dy = input_grad(x, y, dout) @@ -437,7 +483,7 @@ def get_bprop_minimum(self): @bprop_getters.register(P.Maximum) def get_bprop_maximum(self): """Grad definition for `Maximum` operation.""" - input_grad = G.MaximumGrad() + input_grad = SG.MaximumGrad() def bprop(x, y, out, dout): dx, dy = input_grad(x, y, dout) @@ -639,6 +685,16 @@ def get_bprop_not_equal(self): return bprop +@bprop_getters.register(P.ApproximateEqual) +def get_bprop_approximate_equal(self): + """Grad definition for `ApproximateEqual` operation.""" + + def bprop(x, y, out, dout): + return zeros_like(x), zeros_like(y) + + return bprop + + @bprop_getters.register(P.Greater) def get_bprop_greater(self): """Grad definition for `Greater` operation.""" @@ -793,6 +849,18 @@ def get_bprop_asinh(self): return bprop +@bprop_getters.register(P.Sinh) +def get_bprop_sinh(self): + """Grad definition for `Sinh` operation.""" + cosh = P.Cosh() + + def bprop(x, out, dout): + dx = cosh(x) * dout + return (dx,) + + return bprop + + @bprop_getters.register(P.Cos) def get_bprop_cos(self): """Grad definition for `Cos` operation.""" @@ -830,10 +898,22 @@ def get_bprop_acosh(self): return bprop +@bprop_getters.register(P.Cosh) +def get_bprop_cosh(self): + """Grad definition for `Cosh` operation.""" + sinh = P.Sinh() + + def bprop(x, out, dout): + dx = sinh(x) * dout + return (dx,) + + return bprop + + @bprop_getters.register(P.Abs) def get_bprop_abs(self): """Grad definition for `Abs` operation.""" - abs_grad = G.AbsGrad() + 
abs_grad = SG.AbsGrad() def bprop(x, out, dout): dx = abs_grad(x, dout) @@ -852,6 +932,18 @@ def get_bprop_scalar_cast(self): return bprop +@bprop_getters.register(P.AccumulateNV2) +def get_bprop_scalar_accumulatenv2(self): + """Generate bprop for AccumulateNV2""" + + def bprop(x, out, dout): + dx = () + for _ in range(len(x)): + dx = dx + (dout,) + return dx + return bprop + + @bprop_getters.register(P.AddN) def get_bprop_scalar_addn(self): """Generate bprop for AddN""" @@ -934,15 +1026,16 @@ def get_bprop_bessel_i1e(self): reciprocal = P.Reciprocal() cast = P.Cast() dtype = P.DType() + abs_ops = P.Abs() def bprop(x, out, dout): zeros = zeros_like(x) np_eps = const_utils.get_np_eps(dtype(x)) eps = cast(np_eps, dtype(x)) - x_is_valid = less(eps, x) + x_is_valid = less(eps, abs_ops(x)) x_safe = select(x_is_valid, x, eps + zeros) - tmp = bessel_i0e(x_safe) - out * (sign(x) + reciprocal(x_safe)) - dx = select(x_is_valid, tmp, 0.5 + zeros) + tmp = bessel_i0e(x_safe) - out * (sign(x_safe) + reciprocal(x_safe)) + dx = select(x_is_valid, tmp, cast(0.5, dtype(x)) + zeros) * dout return (dx,) return bprop @@ -958,3 +1051,24 @@ def get_bprop_atanh(self): dx = div(1, tmp) * dout return (dx,) return bprop + + +@bprop_getters.register(P.Inv) +def get_bprop_inv(self): + """Grad definition for 'Inv' operation""" + inv_grad = G.InvGrad() + + def bprop(x, out, dout): + dx = inv_grad(out, dout) + return (dx,) + return bprop + + +@bprop_getters.register(inner.LinSpace) +def get_bprop_lin_space(self): + """Grad definition for `LinSpace` operation.""" + + def bprop(assist, start, stop, num, out, dout): + return zeros_like(assist), zeros_like(start), zeros_like(stop), zeros_like(num) + + return bprop diff --git a/mindspore/ops/_grad/grad_nn_ops.py b/mindspore/ops/_grad/grad_nn_ops.py index 4c4acb802c..1254f9e7a2 100755 --- a/mindspore/ops/_grad/grad_nn_ops.py +++ b/mindspore/ops/_grad/grad_nn_ops.py @@ -14,6 +14,7 @@ # 
============================================================================ """Define the grad rules of neural network related operations.""" +from mindspore.ops import _selected_grad_ops as SG from .grad_base import bprop_getters from .. import functional as F from .. import operations as P @@ -23,10 +24,11 @@ from ..operations import _inner_ops as inner from ... import context + @bprop_getters.register(P.BiasAdd) def get_bprop_bias_add(self): """Grad definition for `BiasAdd` operation.""" - bias_grad = G.BiasAddGrad() + bias_grad = SG.BiasAddGrad() def bprop(x, w, out, dout): return dout, bias_grad(dout) @@ -303,7 +305,6 @@ def get_bprop_softmax(self): sub = P.Sub() mul = P.Mul() axis = self.axis - def bprop(x, out, dout): dx = mul(out, sub(dout, sum_func(mul(out, dout), axis))) return (dx,) @@ -338,10 +339,10 @@ def get_bprop_softplus(self): @bprop_getters.register(P.Tanh) def get_bprop_tanh(self): """Grad definition for `Tanh` operation.""" - logsoftmax_grad = G.TanhGrad() + tanh_grad = SG.TanhGrad() def bprop(x, out, dout): - dx = logsoftmax_grad(out, dout) + dx = tanh_grad(out, dout) return (dx,) return bprop @@ -404,7 +405,8 @@ def get_bprop_layer_norm(self): layer_norm_grad = G.LayerNormGrad(self.begin_norm_axis, self.begin_params_axis) def bprop(x, gamma, beta, out, dout): - dx, d_gamma, d_beta = layer_norm_grad(x, dout[0], out[2], out[1], gamma) + dx, d_gamma, d_beta = layer_norm_grad( + x, dout[0], out[2], out[1], gamma) return dx, d_gamma, d_beta return bprop @@ -687,7 +689,7 @@ def get_bprop_binary_cross_entropy(self): @bprop_getters.register(P.Dropout) def get_bprop_dropout(self): """Grad definition for `Dropout` operation.""" - grad = P.DropoutGrad(self.drop_prob) + grad = P.DropoutGrad(self.keep_prob) def bprop(x, out, dout): _, mask = out diff --git a/mindspore/ops/_grad/grad_quant_ops.py b/mindspore/ops/_grad/grad_quant_ops.py index 1e694a7dba..a2b0ba8d97 100644 --- a/mindspore/ops/_grad/grad_quant_ops.py +++ 
b/mindspore/ops/_grad/grad_quant_ops.py @@ -13,17 +13,20 @@ # limitations under the License. # ============================================================================ -"""Generate bprop for aware quantization ops""" +"""Generate bprop for quantization aware ops""" from .. import operations as P +from ..operations import _quant_ops as Q from .grad_base import bprop_getters from ..composite.multitype_ops.zeros_like_impl import zeros_like +from ... import context -@bprop_getters.register(P.FakeQuantWithMinMax) +@bprop_getters.register(Q.FakeQuantPerLayer) def get_bprop_fakequant_with_minmax(self): - """Generate bprop for FakeQuantWithMinMax for GPU and Ascend""" - op = P.FakeQuantWithMinMaxGrad(num_bits=self.num_bits, quant_delay=self.quant_delay) + """Generate bprop for FakeQuantPerLayer for GPU and Ascend""" + op = Q.FakeQuantPerLayerGrad( + num_bits=self.num_bits, quant_delay=self.quant_delay) def bprop(x, x_min, x_max, out, dout): dx = op(dout, x, x_min, x_max) @@ -32,10 +35,14 @@ def get_bprop_fakequant_with_minmax(self): return bprop -@bprop_getters.register(P.FakeQuantWithMinMaxPerChannel) +@bprop_getters.register(Q.FakeQuantPerChannel) def get_bprop_fakequant_with_minmax_perchannel(self): - """Generate bprop for FakeQuantWithMinMaxPerChannel for GPU""" - op = P.FakeQuantWithMinMaxPerChannelGrad(num_bits=self.num_bits, quant_delay=self.quant_delay) + """Generate bprop for FakeQuantPerChannel""" + op = Q.FakeQuantPerChannelGrad(num_bits=self.num_bits, + quant_delay=self.quant_delay, + symmetric=self.symmetric, + narrow_range=self.symmetric, + channel_axis=self.channel_axis) def bprop(x, x_min, x_max, out, dout): dx = op(dout, x, x_min, x_max) @@ -44,10 +51,10 @@ def get_bprop_fakequant_with_minmax_perchannel(self): return bprop -@bprop_getters.register(P.BatchNormFold) +@bprop_getters.register(Q.BatchNormFold) def get_bprop_batchnorm_fold(self): """Generate bprop for BatchNormFold for GPU""" - op = P.BatchNormFoldGrad(self.epsilon, self.is_training, 
self.freeze_bn) + op = Q.BatchNormFoldGrad(self.epsilon, self.is_training, self.freeze_bn) def bprop(x, mean, variance, global_step, out, dout): dx = op(dout[0], dout[1], x, out[0], out[1], global_step) @@ -56,36 +63,45 @@ def get_bprop_batchnorm_fold(self): return bprop -@bprop_getters.register(P.CorrectionMul) +@bprop_getters.register(Q.CorrectionMul) def get_bprop_correction_mul(self): """Generate bprop for CorrectionMul for Ascend and GPU""" - grad = P.CorrectionMulGrad(self.channel_axis) + grad_dx = Q.CorrectionMulGrad(self.channel_axis) + grad_d_batch_std = Q.CorrectionMulGradReduce(self.channel_axis) def bprop(x, batch_std, running_std, out, dout): - dx, d_batch_std = grad(dout, x, batch_std, running_std) + dx, d_batch_std = grad_dx(dout, x, batch_std, running_std) return dx, d_batch_std, zeros_like(running_std) + def bprop_npu(x, batch_std, running_std, out, dout): + dx, mul_dx = grad_dx(dout, x, batch_std, running_std) + d_batch_std = grad_d_batch_std(mul_dx) + return dx, d_batch_std, zeros_like(running_std) + + if context.get_context('device_target') == "Ascend": + return bprop_npu + return bprop -@bprop_getters.register(P.BatchNormFold2) +@bprop_getters.register(Q.BatchNormFold2) def get_bprop_batchnorm_fold2(self): """Generate bprop for BatchNormFold2 for GPU""" - op_f = P.BatchNormFold2Grad(freeze_bn=self.freeze_bn) + op_f = Q.BatchNormFold2Grad(freeze_bn=self.freeze_bn) def bprop(x, beta, gamma, batch_std, batch_mean, running_std, running_mean, global_step, out, dout): d_batch_std, d_batch_mean, d_beta, d_gamma, d_x = op_f(dout, x, gamma, batch_std, batch_mean, running_std, running_mean, global_step) return d_x, d_beta, d_gamma, d_batch_std, d_batch_mean, zeros_like(running_std), zeros_like(running_mean), \ - zeros_like(global_step) + zeros_like(global_step) return bprop -@bprop_getters.register(P.BatchNormFoldD) +@bprop_getters.register(Q.BatchNormFoldD) def get_bprop_BatchNormFold(self): """Generate bprop for BatchNormFold for Ascend""" - op = 
P.BatchNormFoldGrad_(self.epsilon, self.is_training, self.freeze_bn) + op = Q.BatchNormFoldGradD(self.epsilon, self.is_training, self.freeze_bn) def bprop(x, x_sum, x_square_sum, mean, variance, out, dout): dx = op(dout[1], dout[2], x, out[1], out[2]) @@ -102,11 +118,11 @@ def get_bprop_BNTrainingReduce(self): return bprop -@bprop_getters.register(P.BatchNormFold2_D) +@bprop_getters.register(Q.BatchNormFold2_D) def get_bprop_batchnorm_fold2_(self): """Generate bprop for BatchNormFold2 for Ascend""" - op_reduce = P.BatchNormFold2GradReduce(freeze_bn=self.freeze_bn) - op_f = P.BatchNormFold2GradD(freeze_bn=self.freeze_bn) + op_reduce = Q.BatchNormFold2GradReduce(freeze_bn=self.freeze_bn) + op_f = Q.BatchNormFold2GradD(freeze_bn=self.freeze_bn) def bprop(x, beta, gamma, batch_std, batch_mean, running_std, out, dout): dout_reduce, dout_x_reduce = op_reduce(dout, x) @@ -117,9 +133,19 @@ def get_bprop_batchnorm_fold2_(self): return bprop -@bprop_getters.register(P.FakeQuantWithMinMaxUpdate) -def get_bprop_fakequant_with_minmax_update(self): - """Generate bprop for FakeQuantWithMinMaxUpdate for Ascend""" +@bprop_getters.register(Q.MinMaxUpdatePerLayer) +def get_bprop_fakequant_with_minmax_per_layer_update(self): + """Generate bprop for MinMaxUpdatePerLayer for Ascend""" + + def bprop(x, x_min, x_max, out, dout): + return zeros_like(x), zeros_like(x_min), zeros_like(x_max) + + return bprop + + +@bprop_getters.register(Q.MinMaxUpdatePerChannel) +def get_bprop_fakequant_with_minmax_per_channel_update(self): + """Generate bprop for MinMaxUpdatePerChannel for Ascend""" def bprop(x, x_min, x_max, out, dout): return zeros_like(x), zeros_like(x_min), zeros_like(x_max) diff --git a/mindspore/ops/_op_impl/__init__.py b/mindspore/ops/_op_impl/__init__.py index 725977877d..65a12cd73c 100644 --- a/mindspore/ops/_op_impl/__init__.py +++ b/mindspore/ops/_op_impl/__init__.py @@ -19,6 +19,5 @@ from .aicpu import * if "Windows" not in platform.system(): from .akg.gpu import * from .tbe 
import * - from ._custom_op import * __all__ = [] diff --git a/mindspore/ops/_op_impl/_custom_op/batchnorm_fold.py b/mindspore/ops/_op_impl/_custom_op/batchnorm_fold.py index 63b9e2b7d2..11434223d3 100644 --- a/mindspore/ops/_op_impl/_custom_op/batchnorm_fold.py +++ b/mindspore/ops/_op_impl/_custom_op/batchnorm_fold.py @@ -16,6 +16,7 @@ """_BatchNormFold op""" from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType +import te from te import tvm from topi import generic from topi.cce import util @@ -64,7 +65,6 @@ def batchnorm_fold(x, x_sum, x_square_sum, mean, variance, momentum=0.9, epsilon=1e-5, is_training=True, freeze_bn=0, data_format="NCHW", kernel_name="batchnorm_fold"): """batchnorm_fold TBE op""" - momentum = 1.0 - momentum util.check_kernel_name(kernel_name) data_format = data_format.upper() if data_format != "NCHW": @@ -119,13 +119,12 @@ def batchnorm_fold(x, x_sum, x_square_sum, mean, variance, variance_div = te.lang.cce.vmuls(x_square_sum, num_rec) mean_square = te.lang.cce.vmul(batch_mean, batch_mean) batch_var_biased = te.lang.cce.vsub(variance_div, mean_square) - + batch_std = te.lang.cce.vsqrt(te.lang.cce.vadds(batch_var_biased, epsilon)) if num == 1: batch_var_scaler = 0.0 else: batch_var_scaler = float(num) / (num - 1) - batch_variance = te.lang.cce.vmuls(batch_var_biased, batch_var_scaler) - batch_std = te.lang.cce.vsqrt(te.lang.cce.vadds(batch_variance, epsilon)) + batch_var_unbiased = te.lang.cce.vmuls(batch_var_biased, batch_var_scaler) factor = 1.0 - momentum factor_reverse = momentum @@ -133,7 +132,7 @@ def batchnorm_fold(x, x_sum, x_square_sum, mean, variance, mean_mul_rev = te.lang.cce.vmuls(mean, factor_reverse) mean_updated = te.lang.cce.vadd(mean_mul, mean_mul_rev) - var_mul = te.lang.cce.vmuls(batch_variance, factor) + var_mul = te.lang.cce.vmuls(batch_var_unbiased, factor) var_mul_rev = te.lang.cce.vmuls(variance, factor_reverse) variance_updated = te.lang.cce.vadd(var_mul, var_mul_rev) diff --git 
a/mindspore/ops/_op_impl/_custom_op/correction_mul_grad.py b/mindspore/ops/_op_impl/_custom_op/correction_mul_grad.py index 810ce7323c..da3a634454 100644 --- a/mindspore/ops/_op_impl/_custom_op/correction_mul_grad.py +++ b/mindspore/ops/_op_impl/_custom_op/correction_mul_grad.py @@ -37,13 +37,7 @@ correction_mul_grad_op_info = TBERegOp("CorrectionMulGrad") \ .input(2, "batch_std", None, "required", None) \ .input(3, "running_std", None, "required", None) \ .output(0, "dx", True, "required", "all") \ - .output(1, "d_batch_std", True, "required", "all") \ - .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, - DataType.F16_Default, DataType.F16_Default) \ - .dtype_format(DataType.F16_5HD, DataType.F16_5HD, DataType.F16_5HD, DataType.F16_5HD, - DataType.F16_5HD, DataType.F16_5HD) \ - .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, - DataType.F32_Default, DataType.F32_Default) \ + .output(1, "mul_dx", True, "required", "all") \ .dtype_format(DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD) \ .get_op_info() @@ -62,21 +56,14 @@ def correction_mul_grad_compute(dout, x, batch_std, running_std, channel, data_f factor = te.lang.cce.vdiv(batch_std, running_std) factor_b = te.lang.cce.broadcast(factor, shape_x) dx = te.lang.cce.vmul(dout, factor_b) - mul_data = te.lang.cce.vmul(dout, x) - if channel == 0: - if data_format == "NCHW": - axis = [1, 2, 3] - else: - axis = [1, 2, 3, 4] - else: - axis = [2, 3] - red_data = te.lang.cce.sum(mul_data, axis, keepdims=True) - d_batch_std = te.lang.cce.vdiv(red_data, running_std) - return [dx, d_batch_std] + mul_dx = te.lang.cce.vmul(dout, x) + running_std_b = te.lang.cce.broadcast(running_std, shape_x) + mul_dx = te.lang.cce.vdiv(mul_dx, running_std_b) + return [dx, mul_dx] @util.check_input_type(dict, dict, dict, dict, dict, dict, int, str) -def correction_mul_grad(dout, x, 
batch_std, running_std, dx, d_batch_std, channel, kernel_name="correction_mul_grad"): +def correction_mul_grad(dout, x, batch_std, running_std, dx, mul_dx, channel, kernel_name="correction_mul_grad"): """CorrectionMulGrad op""" shape_dout = dout.get("shape") shape_x = dout.get("shape") @@ -93,13 +80,13 @@ def correction_mul_grad(dout, x, batch_std, running_std, dx, d_batch_std, channe util.check_dtype_rule(inp_dtype_dout, ("float16", "float32")) util.check_dtype_rule(inp_dtype_x, ("float16", "float32")) - util.check_dtype_rule(inp_dtype_batch_std, ("float32",)) - util.check_dtype_rule(inp_dtype_running_std, ("float32",)) + util.check_dtype_rule(inp_dtype_batch_std, ("float16", "float32")) + util.check_dtype_rule(inp_dtype_running_std, ("float16", "float32")) util.compare_tensor_dict_key(dout, x, "dtype") util.compare_tensor_dict_key(dout, x, "shape") util.compare_tensor_dict_key(dx, x, "shape") util.compare_tensor_dict_key(batch_std, running_std, "shape") - util.compare_tensor_dict_key(batch_std, d_batch_std, "shape") + util.compare_tensor_dict_key(dx, mul_dx, "shape") util.check_kernel_name(kernel_name) util.check_shape_rule(shape_x) @@ -126,7 +113,84 @@ def correction_mul_grad(dout, x, batch_std, running_std, dx, d_batch_std, channe with tvm.target.cce(): sch = generic.auto_schedule(res_list) - tensor_list = [dout_t, x_t, batch_std_t, running_std_t] + list(res_list) + tensor_list = [dout_t, x_t, batch_std_t, running_std_t] + res_list + config = {"print_ir": False, + "name": kernel_name, + "tensor_list": tensor_list} + + te.lang.cce.cce_build_code(sch, config) + + +correction_mul_grad_reduce_op_info = TBERegOp("CorrectionMulGradReduce") \ + .fusion_type("OPAQUE") \ + .async_flag(False) \ + .binfile_name("correction_mul_grad_reduce.so") \ + .compute_cost(10) \ + .kernel_name("correction_mul_grad_reduce") \ + .partial_flag(True) \ + .op_pattern("formatAgnostic") \ + .attr("channel_axis", "optional", "int", "all") \ + .input(0, "dout", None, "required", None) \ + 
.output(0, "d_batch_std", True, "required", "all") \ + .dtype_format(DataType.F32_5HD, DataType.F32_5HD) \ + .get_op_info() + + +@op_info_register(correction_mul_grad_reduce_op_info) +def _correction_mul_grad_reduce_tbe(): + """CorrectionMulGradReduce TBE register""" + return + + +@fusion_manager.register("correction_mul_grad_reduce") +def correction_mul_grad_reduce_compute(mul_dx, channel, data_format, kernel_name="correction_mul"): + """CorrectionMulGradReduce compute""" + if channel == 0: + if data_format == "NCHW": + axis = [1, 2, 3] + else: + axis = [1, 2, 3, 4] + else: + axis = [2, 3] + d_batch_std = te.lang.cce.sum(mul_dx, axis, keepdims=True) + return d_batch_std + + +@util.check_input_type(dict, dict, int, str) +def correction_mul_grad_reduce(mul_dx, d_batch_std, channel, kernel_name="correction_mul_grad_reduce"): + """CorrectionMulGradReduce op""" + shape_dout = mul_dx.get("shape") + shape_x = mul_dx.get("shape") + + dtype_dout = mul_dx.get("dtype") + + inp_dtype_dout = dtype_dout.lower() + + util.check_dtype_rule(inp_dtype_dout, ("float16", "float32")) + + util.check_kernel_name(kernel_name) + util.check_shape_rule(shape_x) + util.check_shape_size(shape_x, SHAPE_SIZE_LIMIT) + + data_format = mul_dx.get("format") + ori_format = mul_dx.get("format") + if data_format.upper() not in ("NC1HWC0", "NCHW"): + raise RuntimeError("Un supported data format {}".format(data_format)) + if data_format.upper() == "NCHW" and ori_format != "NCHW": + raise RuntimeError("data_format(NCHW) must same as ori_format") + + shape_c = [1] * len(shape_x) + shape_c[channel] = d_batch_std.get("ori_shape")[0] + if data_format == "NC1HWC0" and channel == 1: + shape_c = d_batch_std.get("shape") + + dout_t = tvm.placeholder(shape_dout, name="dout", dtype=inp_dtype_dout) + res = correction_mul_grad_reduce_compute(dout_t, channel, data_format, kernel_name) + + with tvm.target.cce(): + sch = generic.auto_schedule(res) + + tensor_list = [dout_t, res] config = {"print_ir": False, "name": 
kernel_name, "tensor_list": tensor_list} diff --git a/mindspore/ops/_op_impl/_custom_op/fake_quant_perchannel.py b/mindspore/ops/_op_impl/_custom_op/fake_quant_perchannel.py new file mode 100644 index 0000000000..f6c133c808 --- /dev/null +++ b/mindspore/ops/_op_impl/_custom_op/fake_quant_perchannel.py @@ -0,0 +1,146 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""FakeQuantPerChannel op""" +import te.lang.cce +from te import tvm +from te.platform.fusion_manager import fusion_manager +from topi import generic +from topi.cce import util +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +fake_quant_perchannel_op_info = TBERegOp("FakeQuantPerChannel") \ + .fusion_type("ELEMWISE") \ + .async_flag(False) \ + .binfile_name("fake_quant_perchannel.so") \ + .compute_cost(10) \ + .kernel_name("fake_quant_perchannel") \ + .partial_flag(True) \ + .attr("symmetric", "optional", "bool", "all") \ + .attr("narrow_range", "optional", "bool", "all") \ + .attr("num_bits", "optional", "int", "all") \ + .attr("channel_axis", "optional", "int", "all") \ + .input(0, "x", None, "required", None) \ + .input(1, "min", None, "required", None) \ + .input(2, "max", None, "required", None) \ + .output(0, "y", True, "required", "all") \ + .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, 
DataType.F16_Default) \ + .dtype_format(DataType.F16_5HD, DataType.F16_5HD, DataType.F16_5HD, DataType.F16_5HD) \ + .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, DataType.F32_Default) \ + .dtype_format(DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD) \ + .get_op_info() + + +@op_info_register(fake_quant_perchannel_op_info) +def _fake_quant_perchannel_tbe(): + """FakeQuantPerChannel TBE register""" + return + + +@fusion_manager.register("fake_quant_perchannel") +def fake_quant_perchannel_compute(x, min_val, max_val, y, quant_min, quant_max, + kernel_name="fake_quant_perchannel"): + """FakeQuantPerChannel""" + x_shape = te.lang.cce.util.shape_to_list(x.shape) + minmax_shape = te.lang.cce.util.shape_to_list(min_val.shape) + quant_min = tvm.const(quant_min, x.dtype) + quant_max = tvm.const(quant_max, x.dtype) + quant_min = te.lang.cce.broadcast(quant_min, minmax_shape, x.dtype) + quant_max = te.lang.cce.broadcast(quant_max, minmax_shape, x.dtype) + + # CalNudge(NudgeMinMax) + scale = te.lang.cce.vdiv(te.lang.cce.vsub( + max_val, min_val), te.lang.cce.vsub(quant_max, quant_min)) + zp_from_min = te.lang.cce.vsub(quant_min, te.lang.cce.vdiv(min_val, scale)) + + # Nudge zero point + nudge_zp_ = te.lang.cce.vmin( + quant_max, te.lang.cce.vmax(quant_min, zp_from_min)) + nudge_zp = te.lang.cce.floor(te.lang.cce.vadds(nudge_zp_, 0.5)) + nudge_min = te.lang.cce.vmul(te.lang.cce.vsub(quant_min, nudge_zp), scale) + nudge_max = te.lang.cce.vmul(te.lang.cce.vsub(quant_max, nudge_zp), scale) + + # FakeQuant + nudge_min_b = te.lang.cce.broadcast(nudge_min, x_shape) + nudge_max_b = te.lang.cce.broadcast(nudge_max, x_shape) + scale_b = te.lang.cce.broadcast(scale, x_shape) + + input_x = te.lang.cce.vmin(nudge_max_b, te.lang.cce.vmax(nudge_min_b, x)) + nudge_input_ = te.lang.cce.vdiv( + te.lang.cce.vsub(input_x, nudge_min_b), scale_b) + nudge_input = te.lang.cce.floor(te.lang.cce.vadds(nudge_input_, 0.5)) + res = 
te.lang.cce.vadd(te.lang.cce.vmul(nudge_input, scale_b), nudge_min_b) + + return res + + +@util.check_input_type(dict, dict, dict, dict, bool, bool, int, int, str) +def fake_quant_perchannel(x, min_val, max_val, y, + symmetric, narrow_range, num_bits, channel_axis, + kernel_name="fake_quant_perchannel"): + """FakeQuantPerChannel""" + x_shape = x.get("shape") + x_shape_ = x.get("ori_shape") + x_format = x.get("format") + x_dtype = x.get("dtype") + min_shape = min_val.get("ori_shape") + min_dtype = min_val.get("dtype") + max_shape = max_val.get("ori_shape") + max_dtype = max_val.get("dtype") + + util.check_kernel_name(kernel_name) + util.check_shape_rule(x_shape) + util.check_shape_rule(min_shape, 1, 1, x_shape_[channel_axis]) + util.check_shape_rule(max_shape, 1, 1, x_shape_[channel_axis]) + util.check_tensor_shape_size(x_shape) + util.check_tensor_shape_size(min_shape) + util.check_tensor_shape_size(max_shape) + + check_list = ["float32", "float16"] + x_dtype = x_dtype.lower() + min_dtype = min_dtype.lower() + max_dtype = max_dtype.lower() + util.check_dtype_rule(x_dtype, check_list) + util.check_dtype_rule(min_dtype, check_list) + util.check_dtype_rule(max_dtype, check_list) + + if symmetric: + quant_min = 0 - 2 ** (num_bits - 1) + quant_max = 2 ** (num_bits - 1) - 1 + else: + quant_min = 0 + quant_max = 2 ** num_bits - 1 + if narrow_range: + quant_min = quant_min + 1 + + shape_c = [1] * len(x_shape) + shape_c[channel_axis] = min_val.get("ori_shape")[0] + if x_format == "NC1HWC0" and channel_axis == 1: + shape_c = min_val.get("shape") + input_data = tvm.placeholder(x_shape, name="x", dtype=x_dtype) + min_data = tvm.placeholder(shape_c, name="min_val", dtype=x_dtype) + max_data = tvm.placeholder(shape_c, name="max_val", dtype=x_dtype) + res = fake_quant_perchannel_compute(input_data, min_data, max_data, y, + quant_min, quant_max, kernel_name) + + with tvm.target.cce(): + sch = generic.auto_schedule(res) + + tensor_list = [input_data, min_data, max_data, res] + 
config = {"print_ir": False, + "name": kernel_name, + "tensor_list": tensor_list} + + te.lang.cce.cce_build_code(sch, config) diff --git a/mindspore/ops/_op_impl/_custom_op/fake_quant_perchannel_grad.py b/mindspore/ops/_op_impl/_custom_op/fake_quant_perchannel_grad.py new file mode 100644 index 0000000000..4e9053fcb1 --- /dev/null +++ b/mindspore/ops/_op_impl/_custom_op/fake_quant_perchannel_grad.py @@ -0,0 +1,172 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""FakeQuantPerChannelGrad op""" +import te.lang.cce +from te import tvm +from te.platform.fusion_manager import fusion_manager +from topi import generic +from topi.cce import util +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +SHAPE_SIZE_LIMIT = 2147483648 +D_TYPE = 'float32' + +fake_quant_perchannel_grad_op_info = TBERegOp("FakeQuantPerChannelGrad") \ + .fusion_type("OPAQUE") \ + .async_flag(False) \ + .binfile_name("fake_quant_perchannel_grad.so") \ + .compute_cost(10) \ + .kernel_name("fake_quant_perchannel_grad") \ + .partial_flag(True) \ + .attr("symmetric", "optional", "bool", "all") \ + .attr("narrow_range", "optional", "bool", "all") \ + .attr("num_bits", "optional", "int", "all") \ + .attr("channel_axis", "optional", "int", "all") \ + .input(0, "dout", None, "required", None) \ + .input(1, "x", None, "required", None) \ + .input(2, "min", None, "required", None) \ + .input(3, "max", None, "required", None) \ + .output(0, "dx", True, "required", "all") \ + .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, + DataType.F16_Default) \ + .dtype_format(DataType.F16_5HD, DataType.F16_5HD, DataType.F16_5HD, DataType.F16_5HD, DataType.F16_5HD) \ + .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, + DataType.F32_Default) \ + .dtype_format(DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD) \ + .get_op_info() + + +def _less_compare_float32(data_x, data_y): + """_less_compare_float32 compute""" + input_shape = te.lang.cce.util.shape_to_list(data_x.shape) + min_value = tvm.const(2 ** (-126), dtype=D_TYPE) + max_value = tvm.const(2 ** 62, dtype=D_TYPE) + factor_value = tvm.const(2 ** 2, dtype=D_TYPE) + data_zero = te.lang.cce.broadcast( + tvm.const(0, dtype=D_TYPE), input_shape, D_TYPE) + min_value_tensor = 
te.lang.cce.vadds(data_zero, min_value) + + res_sub = te.lang.cce.vsub(data_y, data_x) + res_min = te.lang.cce.vmin(res_sub, min_value_tensor) + res_max = te.lang.cce.vmax(res_min, data_zero) + + res_max_mul = te.lang.cce.vmuls(res_max, max_value) + res_max_mul_max = te.lang.cce.vmuls(res_max_mul, max_value) + res = te.lang.cce.vmuls(res_max_mul_max, factor_value) + + return res + + +@op_info_register(fake_quant_perchannel_grad_op_info) +def _fake_quant_perchannel_grad_tbe(): + """FakeQuantPerChannelGrad TBE register""" + return + + +@fusion_manager.register("fake_quant_perchannel_grad") +def fake_quant_perchannel_grad_compute(dout, x, min_val, max_val, quant_min, quant_max, + kernel_name="fake_quant_perchannel_grad"): + """FakeQuantPerChannelGrad""" + x_shape = te.lang.cce.util.shape_to_list(x.shape) + minmax_shape = te.lang.cce.util.shape_to_list(min_val.shape) + quant_min = tvm.const(quant_min, x.dtype) + quant_max = tvm.const(quant_max, x.dtype) + quant_min = te.lang.cce.broadcast(quant_min, minmax_shape, x.dtype) + quant_max = te.lang.cce.broadcast(quant_max, minmax_shape, x.dtype) + + # CalNudge(NudgeMinMax) + scale = te.lang.cce.vdiv(te.lang.cce.vsub( + max_val, min_val), te.lang.cce.vsub(quant_max, quant_min)) + zp_from_min = te.lang.cce.vsub(quant_min, te.lang.cce.vdiv(min_val, scale)) + + # Nudge zero point + nudge_zp_ = te.lang.cce.vmin( + quant_max, te.lang.cce.vmax(quant_min, zp_from_min)) + nudge_zp = te.lang.cce.floor(te.lang.cce.vadds(nudge_zp_, 0.5)) + nudge_min = te.lang.cce.vmul(te.lang.cce.vsub(quant_min, nudge_zp), scale) + nudge_max = te.lang.cce.vmul(te.lang.cce.vsub(quant_max, nudge_zp), scale) + + # FakeQuant Grad + nudge_min_b = te.lang.cce.broadcast(nudge_min, x_shape) + nudge_max_b = te.lang.cce.broadcast(nudge_max, x_shape) + + bool_over_min = _less_compare_float32(nudge_min_b, x) + bool_less_max = _less_compare_float32(x, nudge_max_b) + bool_between = te.lang.cce.vmul(bool_over_min, bool_less_max) + res = te.lang.cce.vmul(dout, 
bool_between) + + return res + + +@util.check_input_type(dict, dict, dict, dict, dict, bool, bool, int, int, str) +def fake_quant_perchannel_grad(dout, x, min_val, max_val, dx, + symmetric, narrow_range, num_bits, channel_axis, + kernel_name="fake_quant_perchannel_grad"): + """FakeQuantPerChannelGrad""" + x_shape = x.get("shape") + x_shape_ = x.get("ori_shape") + x_format = x.get("format") + x_dtype = x.get("dtype") + min_shape = min_val.get("ori_shape") + min_dtype = min_val.get("dtype") + max_shape = max_val.get("ori_shape") + max_dtype = max_val.get("dtype") + + util.check_kernel_name(kernel_name) + util.check_shape_rule(x_shape) + util.check_shape_rule(min_shape, 1, 1, x_shape_[channel_axis]) + util.check_shape_rule(max_shape, 1, 1, x_shape_[channel_axis]) + util.check_tensor_shape_size(x_shape) + util.check_tensor_shape_size(min_shape) + util.check_tensor_shape_size(max_shape) + + check_list = ["float32", "float16"] + x_dtype = x_dtype.lower() + min_dtype = min_dtype.lower() + max_dtype = max_dtype.lower() + util.check_dtype_rule(x_dtype, check_list) + util.check_dtype_rule(min_dtype, check_list) + util.check_dtype_rule(max_dtype, check_list) + + if symmetric: + quant_min = 0 - 2 ** (num_bits - 1) + quant_max = 2 ** (num_bits - 1) - 1 + else: + quant_min = 0 + quant_max = 2 ** num_bits - 1 + if narrow_range: + quant_min = quant_min + 1 + + shape_c = [1] * len(x_shape) + shape_c[channel_axis] = min_val.get("ori_shape")[0] + if x_format == "NC1HWC0" and channel_axis == 1: + shape_c = min_val.get("shape") + dout_data = tvm.placeholder(x_shape, name="dout", dtype=x_dtype) + input_data = tvm.placeholder(x_shape, name="x", dtype=x_dtype) + min_data = tvm.placeholder(shape_c, name="min_val", dtype=x_dtype) + max_data = tvm.placeholder(shape_c, name="max_val", dtype=x_dtype) + res = fake_quant_perchannel_grad_compute(dout_data, input_data, min_data, max_data, + quant_min, quant_max, kernel_name) + + with tvm.target.cce(): + sch = generic.auto_schedule(res) + + 
tensor_list = [dout_data, input_data, min_data, max_data, res] + config = {"print_ir": False, + "name": kernel_name, + "tensor_list": tensor_list} + + te.lang.cce.cce_build_code(sch, config) diff --git a/mindspore/ops/_op_impl/_custom_op/fake_quant_with_min_max.py b/mindspore/ops/_op_impl/_custom_op/fake_quant_perlayer.py similarity index 69% rename from mindspore/ops/_op_impl/_custom_op/fake_quant_with_min_max.py rename to mindspore/ops/_op_impl/_custom_op/fake_quant_perlayer.py index 4afdf3a051..3e75e9e0a5 100644 --- a/mindspore/ops/_op_impl/_custom_op/fake_quant_with_min_max.py +++ b/mindspore/ops/_op_impl/_custom_op/fake_quant_perlayer.py @@ -13,8 +13,7 @@ # limitations under the License. # ============================================================================ -"""FakeQuantWithMinMax op""" - +"""FakeQuantPerLayer op""" from functools import reduce as functools_reduce import te.lang.cce from te import tvm @@ -23,20 +22,16 @@ from topi import generic from topi.cce import util from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType -fake_quant_op_info = TBERegOp("FakeQuantWithMinMax") \ +fake_quant_per_layer_op_info = TBERegOp("FakeQuantPerLayer") \ .fusion_type("ELEMWISE") \ .async_flag(False) \ - .binfile_name("fake_quant_with_min_max_vars_ema.so") \ + .binfile_name("fake_quant_per_layer.so") \ .compute_cost(10) \ - .kernel_name("fake_quant_with_min_max_vars_ema") \ + .kernel_name("fake_quant_per_layer") \ .partial_flag(True) \ - .attr("ema", "optional", "bool", "all") \ - .attr("ema_decay", "optional", "float", "all") \ .attr("symmetric", "optional", "bool", "all") \ .attr("narrow_range", "optional", "bool", "all") \ - .attr("training", "optional", "bool", "all") \ .attr("num_bits", "optional", "int", "all") \ - .attr("quant_delay", "optional", "int", "all") \ .input(0, "x", None, "required", None) \ .input(1, "min", None, "required", None) \ .input(2, "max", None, "required", None) \ @@ -48,28 +43,32 @@ fake_quant_op_info = 
TBERegOp("FakeQuantWithMinMax") \ .get_op_info() -@op_info_register(fake_quant_op_info) -def _fake_quant_tbe(): - """FakeQuantWithMinMax TBE register""" +@op_info_register(fake_quant_per_layer_op_info) +def _fake_quant_per_layer_tbe(): + """FakeQuantPerLayer TBE register""" return -@fusion_manager.register("fake_quant_with_min_max_vars_ema") -def fake_quant_with_min_max_vars_ema_compute(x, min_val, max_val, y, quant_min, quant_max, - kernel_name="correction_mul"): - """FakeQuantWithMinMax""" +@fusion_manager.register("fake_quant_per_layer") +def fake_quant_per_layer_compute(x, min_val, max_val, y, quant_min, quant_max, symmetric, + kernel_name="fake_quant_per_layer"): + """FakeQuantPerLayer""" shape = te.lang.cce.util.shape_to_list(x.shape) shape_min = te.lang.cce.util.shape_to_list(min_val.shape) quant_min = te.lang.cce.broadcast(quant_min, shape_min, x.dtype) quant_max = te.lang.cce.broadcast(quant_max, shape_min, x.dtype) - min_val = te.lang.cce.broadcast(min_val, shape_min, x.dtype) - max_val = te.lang.cce.broadcast(max_val, shape_min, x.dtype) + if symmetric: + max_val = te.lang.cce.vmax(te.lang.cce.vmuls(min_val, -1.), max_val) + min_val = te.lang.cce.vmuls(max_val, -1.) 
# CalNudge(NudgeMinMax) - scale = te.lang.cce.vdiv(te.lang.cce.vsub(max_val, min_val), te.lang.cce.vsub(quant_max, quant_min)) + scale = te.lang.cce.vdiv(te.lang.cce.vsub( + max_val, min_val), te.lang.cce.vsub(quant_max, quant_min)) zp_from_min = te.lang.cce.vsub(quant_min, te.lang.cce.vdiv(min_val, scale)) # Nudge zero point - nudge_zp = te.lang.cce.round(te.lang.cce.vmin(quant_max, te.lang.cce.vmax(quant_min, zp_from_min))) + nudge_zp_ = te.lang.cce.vmin( + quant_max, te.lang.cce.vmax(quant_min, zp_from_min)) + nudge_zp = te.lang.cce.floor(te.lang.cce.vadds(nudge_zp_, 0.5)) nudge_min = te.lang.cce.vmul(te.lang.cce.vsub(quant_min, nudge_zp), scale) nudge_max = te.lang.cce.vmul(te.lang.cce.vsub(quant_max, nudge_zp), scale) @@ -80,18 +79,19 @@ def fake_quant_with_min_max_vars_ema_compute(x, min_val, max_val, y, quant_min, # FakeQuant input_x = te.lang.cce.vmin(nudge_max, te.lang.cce.vmax(nudge_min, x)) - nudge_input = te.lang.cce.floor(te.lang.cce.vadds(te.lang.cce.vdiv(te.lang.cce.vsub(input_x, nudge_min), scale), - 0.5)) + nudge_input_ = te.lang.cce.vdiv( + te.lang.cce.vsub(input_x, nudge_min), scale) + nudge_input = te.lang.cce.floor(te.lang.cce.vadds(nudge_input_, 0.5)) res = te.lang.cce.vadd(te.lang.cce.vmul(nudge_input, scale), nudge_min) return res -@util.check_input_type(dict, dict, dict, dict, bool, float, bool, bool, bool, int, int, str) -def fake_quant_with_min_max_vars_ema(x, min_val, max_val, y, - ema, ema_decay, symmetric, narrow_range, training, num_bits, quant_delay, - kernel_name="fake_quant"): - """FakeQuantWithMinMax""" +@util.check_input_type(dict, dict, dict, dict, bool, bool, int, str) +def fake_quant_per_layer(x, min_val, max_val, y, + symmetric, narrow_range, num_bits, + kernel_name="fake_quant_per_layer"): + """FakeQuantPerLayer""" input_shape = x.get("shape") input_dtype = x.get("dtype") min_shape = min_val.get("ori_shape") @@ -120,20 +120,16 @@ def fake_quant_with_min_max_vars_ema(x, min_val, max_val, y, input_shape = 
(functools_reduce(lambda x, y: x * y, input_shape[:]),) shape_min, _, _ = util.produce_shapes(min_shape, input_shape) - if symmetric: - quant_min = 0 - 2 ** (num_bits - 1) - quant_max = 2 ** (num_bits - 1) - 1 - else: - quant_min = 0 - quant_max = 2 ** num_bits - 1 + quant_min = 0 + quant_max = 2 ** num_bits - 1 if narrow_range: quant_min = quant_min + 1 input_data = tvm.placeholder(input_shape, name="x", dtype=x_dtype) min_data = tvm.placeholder(shape_min, name="min_data", dtype=min_dtype) max_data = tvm.placeholder(shape_min, name="max_data", dtype=max_dtype) - res = fake_quant_with_min_max_vars_ema_compute(input_data, min_data, max_data, y, - quant_min, quant_max, kernel_name) + res = fake_quant_per_layer_compute(input_data, min_data, max_data, y, + quant_min, quant_max, symmetric, kernel_name) with tvm.target.cce(): sch = generic.auto_schedule(res) diff --git a/mindspore/ops/_op_impl/_custom_op/fake_quant_with_min_max_grad.py b/mindspore/ops/_op_impl/_custom_op/fake_quant_perlayer_grad.py similarity index 75% rename from mindspore/ops/_op_impl/_custom_op/fake_quant_with_min_max_grad.py rename to mindspore/ops/_op_impl/_custom_op/fake_quant_perlayer_grad.py index be5dcb6591..a78effcc4f 100644 --- a/mindspore/ops/_op_impl/_custom_op/fake_quant_with_min_max_grad.py +++ b/mindspore/ops/_op_impl/_custom_op/fake_quant_perlayer_grad.py @@ -13,7 +13,7 @@ # limitations under the License. 
# ============================================================================ -"""FakeQuantWithMinMaxGrad op""" +"""FakeQuantPerLayerGrad op""" from functools import reduce as functools_reduce import te.lang.cce @@ -26,15 +26,16 @@ from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType SHAPE_SIZE_LIMIT = 2147483648 D_TYPE = 'float32' -fake_quant_grad_op_info = TBERegOp("FakeQuantWithMinMaxGrad") \ +fake_quant_per_layer_grad_op_info = TBERegOp("FakeQuantPerLayerGrad") \ .fusion_type("OPAQUE") \ .async_flag(False) \ - .binfile_name("fake_quant_with_min_max_grad.so") \ + .binfile_name("fake_quant_per_layer_grad.so") \ .compute_cost(10) \ - .kernel_name("fake_quant_with_min_max_grad") \ + .kernel_name("fake_quant_per_layer_grad") \ .partial_flag(True) \ .attr("num_bits", "optional", "int", "all") \ - .attr("quant_delay", "optional", "int", "all") \ + .attr("symmetric", "optional", "bool", "all") \ + .attr("narrow_range", "optional", "bool", "all") \ .input(0, "dout", None, "required", None) \ .input(1, "x", None, "required", None) \ .input(2, "min", None, "required", None) \ @@ -55,7 +56,8 @@ def _less_compare_float32(data_x, data_y): min_value = tvm.const(2 ** (-126), dtype=D_TYPE) max_value = tvm.const(2 ** 62, dtype=D_TYPE) factor_value = tvm.const(2 ** 2, dtype=D_TYPE) - data_zero = te.lang.cce.broadcast(tvm.const(0, dtype=D_TYPE), shape_inputs, D_TYPE) + data_zero = te.lang.cce.broadcast( + tvm.const(0, dtype=D_TYPE), shape_inputs, D_TYPE) min_value_tensor = te.lang.cce.vadds(data_zero, min_value) res_sub = te.lang.cce.vsub(data_y, data_x) @@ -69,16 +71,16 @@ def _less_compare_float32(data_x, data_y): return res -@op_info_register(fake_quant_grad_op_info) -def _fake_quant_grad_tbe(): - """FakeQuantWithMinMaxGrad TBE register""" +@op_info_register(fake_quant_per_layer_grad_op_info) +def _fake_quant_per_layer_grad_tbe(): + """FakeQuantPerLayerGrad TBE register""" return -@fusion_manager.register("fake_quant_with_min_max_grad") -def 
fake_quant_with_min_max_grad_compute(dout, x, min_val, max_val, quant_min, quant_max, - kernel_name="fake_quant_with_min_max_grad"): - """FakeQuantWithMinMaxGrad""" +@fusion_manager.register("fake_quant_per_layer_grad") +def fake_quant_per_layer_grad_compute(dout, x, min_val, max_val, quant_min, quant_max, symmetric, + kernel_name="fake_quant_per_layer_grad"): + """FakeQuantPerLayerGrad""" shape = te.lang.cce.util.shape_to_list(x.shape) shape_min = te.lang.cce.util.shape_to_list(min_val.shape) quant_min = tvm.const(quant_min, x.dtype) @@ -86,11 +88,18 @@ def fake_quant_with_min_max_grad_compute(dout, x, min_val, max_val, quant_min, q quant_min = te.lang.cce.broadcast(quant_min, shape_min) quant_max = te.lang.cce.broadcast(quant_max, shape_min) + if symmetric: + max_val = te.lang.cce.vmax(te.lang.cce.vmuls(min_val, -1.), max_val) + min_val = te.lang.cce.vmuls(max_val, -1.) + # CalNudge(NudgeMinMax) - scale = te.lang.cce.vdiv(te.lang.cce.vsub(max_val, min_val), te.lang.cce.vsub(quant_max, quant_min)) + scale = te.lang.cce.vdiv(te.lang.cce.vsub( + max_val, min_val), te.lang.cce.vsub(quant_max, quant_min)) zp_from_min = te.lang.cce.vsub(quant_min, te.lang.cce.vdiv(min_val, scale)) # Nudge zero point - nudge_zp = te.lang.cce.round(te.lang.cce.vmin(quant_max, te.lang.cce.vmax(quant_min, zp_from_min))) + nudge_zp_ = te.lang.cce.vmin( + quant_max, te.lang.cce.vmax(quant_min, zp_from_min)) + nudge_zp = te.lang.cce.floor(te.lang.cce.vadds(nudge_zp_, 0.5)) nudge_min = te.lang.cce.vmul(te.lang.cce.vsub(quant_min, nudge_zp), scale) nudge_max = te.lang.cce.vmul(te.lang.cce.vsub(quant_max, nudge_zp), scale) nudge_min = te.lang.cce.broadcast(nudge_min, shape) @@ -104,10 +113,11 @@ def fake_quant_with_min_max_grad_compute(dout, x, min_val, max_val, quant_min, q return res -@util.check_input_type(dict, dict, dict, dict, dict, int, int, str) -def fake_quant_with_min_max_grad(dout, x, min_val, max_val, dx, num_bits, quant_delay, - kernel_name="fake_quant_with_min_max_grad"): - 
"""FakeQuantWithMinMaxGrad""" +@util.check_input_type(dict, dict, dict, dict, dict, int, bool, bool, str) +def fake_quant_per_layer_grad(dout, x, min_val, max_val, dx, + num_bits, symmetric, narrow_range, + kernel_name="fake_quant_per_layer_grad"): + """FakeQuantPerLayerGrad""" input_shape = x.get("shape") input_dtype = x.get("dtype") min_shape = min_val.get("ori_shape") @@ -138,12 +148,15 @@ def fake_quant_with_min_max_grad(dout, x, min_val, max_val, dx, num_bits, quant_ quant_min = 0 quant_max = 2 ** num_bits - 1 + if narrow_range: + quant_min = quant_min + 1 + dout_data = tvm.placeholder(input_shape, name="dout", dtype=x_dtype) input_data = tvm.placeholder(input_shape, name="x", dtype=x_dtype) min_data = tvm.placeholder(shape_min, name="min_data", dtype=min_dtype) max_data = tvm.placeholder(shape_min, name="max_data", dtype=max_dtype) - res = fake_quant_with_min_max_grad_compute(dout_data, input_data, min_data, max_data, quant_min, - quant_max, kernel_name) + res = fake_quant_per_layer_grad_compute(dout_data, input_data, min_data, max_data, + quant_min, quant_max, symmetric, kernel_name) with tvm.target.cce(): sch = generic.auto_schedule(res) diff --git a/mindspore/ops/_op_impl/_custom_op/minmax_update_perchannel.py b/mindspore/ops/_op_impl/_custom_op/minmax_update_perchannel.py new file mode 100644 index 0000000000..1ff63464c3 --- /dev/null +++ b/mindspore/ops/_op_impl/_custom_op/minmax_update_perchannel.py @@ -0,0 +1,126 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""MinMaxUpdatePerChannel op""" +import te.lang.cce +from te import tvm +from te.platform.fusion_manager import fusion_manager +from topi import generic +from topi.cce import util +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +minmax_update_perchannel_op_info = TBERegOp("MinMaxUpdatePerChannel") \ + .fusion_type("OPAQUE") \ + .async_flag(False) \ + .binfile_name("minmax_update_perchannel.so") \ + .compute_cost(10) \ + .kernel_name("minmax_update_perchannel") \ + .partial_flag(True) \ + .attr("ema", "optional", "bool", "all") \ + .attr("ema_decay", "optional", "float", "all") \ + .attr("channel_axis", "optional", "int", "all") \ + .input(0, "x", None, "required", None) \ + .input(1, "min", None, "required", None) \ + .input(2, "max", None, "required", None) \ + .output(0, "min_up", True, "required", "all") \ + .output(1, "max_up", True, "required", "all") \ + .dtype_format(DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD, + DataType.F32_5HD) \ + .get_op_info() + + +@op_info_register(minmax_update_perchannel_op_info) +def _minmax_update_perchannel_tbe(): + """MinMaxUpdatePerChannel TBE register""" + return + + +@fusion_manager.register("minmax_update_perchannel") +def minmax_update_perchannel_compute(x, min_val, max_val, + ema, ema_decay, channel_axis): + """MinMaxUpdatePerChannel compute""" + shape_min = te.lang.cce.util.shape_to_list(min_val.shape) + + if not ema: + ema_decay = 0.0 + + # CalMinMax + if channel_axis == 0: + axis = [1, 2, 3, 4] + else: + axis = [0, 2, 3] + + x_min = te.lang.cce.reduce_min(x, axis=axis) + x_max = te.lang.cce.reduce_max(x, axis=axis) + x_min = te.lang.cce.broadcast(x_min, shape_min) + x_max = te.lang.cce.broadcast(x_max, shape_min) + min_val = te.lang.cce.vadd(te.lang.cce.vmuls( + min_val, 
ema_decay), te.lang.cce.vmuls(x_min, (1 - ema_decay))) + max_val = te.lang.cce.vadd(te.lang.cce.vmuls( + max_val, ema_decay), te.lang.cce.vmuls(x_max, (1 - ema_decay))) + min_val = te.lang.cce.vmins(min_val, 0) + max_val = te.lang.cce.vmaxs(max_val, 0) + + return [min_val, max_val] + + +@util.check_input_type(dict, dict, dict, dict, dict, bool, float, int, str) +def minmax_update_perchannel(x, min_val, max_val, min_up, max_up, + ema, ema_decay, channel_axis, + kernel_name="minmax_update_perchannel"): + """MinMaxUpdatePerChannel op""" + x_shape = x.get("ori_shape") + x_format = x.get("format") + x_dtype = x.get("dtype") + min_shape = min_val.get("ori_shape") + min_dtype = min_val.get("dtype") + max_shape = max_val.get("ori_shape") + max_dtype = max_val.get("dtype") + + util.check_kernel_name(kernel_name) + util.check_shape_rule(x_shape) + util.check_shape_rule(min_shape, 1, 1, x_shape[channel_axis]) + util.check_shape_rule(max_shape, 1, 1, x_shape[channel_axis]) + util.check_tensor_shape_size(x_shape) + util.check_tensor_shape_size(min_shape) + util.check_tensor_shape_size(max_shape) + + check_list = ["float32", "float16"] + x_dtype = x_dtype.lower() + min_dtype = min_dtype.lower() + max_dtype = max_dtype.lower() + util.check_dtype_rule(x_dtype, check_list) + util.check_dtype_rule(min_dtype, check_list) + util.check_dtype_rule(max_dtype, check_list) + + if channel_axis == 0: + shape_c = min_val.get("ori_shape") + else: + shape_c = [min_val.get("shape")[1], min_val.get("shape")[-1]] + input_data = tvm.placeholder(x.get("shape"), name="x", dtype=x_dtype) + min_data = tvm.placeholder(shape_c, name="min_val", dtype=x_dtype) + max_data = tvm.placeholder(shape_c, name="max_val", dtype=x_dtype) + res_list = minmax_update_perchannel_compute(input_data, min_data, max_data, + ema, ema_decay, channel_axis) + + with tvm.target.cce(): + sch = generic.auto_schedule(res_list) + + tensor_list = [input_data, min_data, max_data] + list(res_list) + config = {"print_ir": False, + 
"name": kernel_name, + "tensor_list": tensor_list} + + te.lang.cce.cce_build_code(sch, config) diff --git a/mindspore/ops/_op_impl/_custom_op/fake_quant_with_min_max_update.py b/mindspore/ops/_op_impl/_custom_op/minmax_update_perlayer.py similarity index 60% rename from mindspore/ops/_op_impl/_custom_op/fake_quant_with_min_max_update.py rename to mindspore/ops/_op_impl/_custom_op/minmax_update_perlayer.py index e5c932aa0f..4d2096d55b 100644 --- a/mindspore/ops/_op_impl/_custom_op/fake_quant_with_min_max_update.py +++ b/mindspore/ops/_op_impl/_custom_op/minmax_update_perlayer.py @@ -13,7 +13,7 @@ # limitations under the License. # ============================================================================ -"""FakeQuantWithMinMaxUpdate op""" +"""MinMaxUpdatePerLayer op""" from functools import reduce as functools_reduce import te.lang.cce from te import tvm @@ -22,21 +22,15 @@ from topi import generic from topi.cce import util from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType - -fake_quant_update5d_op_info = TBERegOp("FakeQuantWithMinMaxUpdate") \ +minmax_update_perlayer_op_info = TBERegOp("MinMaxUpdatePerLayer") \ .fusion_type("OPAQUE") \ .async_flag(False) \ - .binfile_name("fake_quant_with_min_max_update5d.so") \ + .binfile_name("minmax_update_perlayer.so") \ .compute_cost(10) \ - .kernel_name("fake_quant_with_min_max_update") \ + .kernel_name("minmax_update_perlayer") \ .partial_flag(True) \ .attr("ema", "optional", "bool", "all") \ .attr("ema_decay", "optional", "float", "all") \ - .attr("symmetric", "optional", "bool", "all") \ - .attr("narrow_range", "optional", "bool", "all") \ - .attr("training", "optional", "bool", "all") \ - .attr("num_bits", "optional", "int", "all") \ - .attr("quant_delay", "optional", "int", "all") \ .input(0, "x", None, "required", None) \ .input(1, "min", None, "required", None) \ .input(2, "max", None, "required", None) \ @@ -47,42 +41,42 @@ fake_quant_update5d_op_info = 
TBERegOp("FakeQuantWithMinMaxUpdate") \ .get_op_info() -@op_info_register(fake_quant_update5d_op_info) -def _fake_quant_update5d_tbe(): - """_FakeQuantWithMinMaxUpdate5D TBE register""" +@op_info_register(minmax_update_perlayer_op_info) +def _minmax_update_perlayer_tbe(): + """MinMaxUpdatePerLayer TBE register""" return -@fusion_manager.register("fake_quant_with_min_max_update") -def fake_quant_with_min_max_update_compute(x, min_val, max_val, ema, ema_decay, quant_min, quant_max, training, - kernel_name="fake_quant_update"): - """FakeQuantWithMinMaxUpdate compute""" +@fusion_manager.register("minmax_update_perlayer") +def minmax_update_perlayer_compute(x, min_val, max_val, ema, ema_decay): + """MinMaxUpdatePerLayer compute""" shape = te.lang.cce.util.shape_to_list(x.shape) shape_min = te.lang.cce.util.shape_to_list(min_val.shape) min_val = te.lang.cce.broadcast(min_val, shape_min, x.dtype) max_val = te.lang.cce.broadcast(max_val, shape_min, x.dtype) if not ema: ema_decay = 0.0 - if training: - # CalMinMax - axis = tuple(range(len(shape))) - x_min = te.lang.cce.reduce_min(x, axis=axis) - x_max = te.lang.cce.reduce_max(x, axis=axis) - x_min = te.lang.cce.broadcast(x_min, shape_min) - x_max = te.lang.cce.broadcast(x_max, shape_min) - min_val = te.lang.cce.vadd(te.lang.cce.vmuls(min_val, ema_decay), te.lang.cce.vmuls(x_min, (1 - ema_decay))) - max_val = te.lang.cce.vadd(te.lang.cce.vmuls(max_val, ema_decay), te.lang.cce.vmuls(x_max, (1 - ema_decay))) - min_val = te.lang.cce.vmins(min_val, 0) - max_val = te.lang.cce.vmaxs(max_val, 0) + + # CalMinMax + axis = tuple(range(len(shape))) + x_min = te.lang.cce.reduce_min(x, axis=axis) + x_max = te.lang.cce.reduce_max(x, axis=axis) + x_min = te.lang.cce.broadcast(x_min, shape_min) + x_max = te.lang.cce.broadcast(x_max, shape_min) + min_val = te.lang.cce.vadd(te.lang.cce.vmuls( + min_val, ema_decay), te.lang.cce.vmuls(x_min, (1 - ema_decay))) + max_val = te.lang.cce.vadd(te.lang.cce.vmuls( + max_val, ema_decay), 
te.lang.cce.vmuls(x_max, (1 - ema_decay))) + min_val = te.lang.cce.vmins(min_val, 0) + max_val = te.lang.cce.vmaxs(max_val, 0) return [min_val, max_val] -@util.check_input_type(dict, dict, dict, dict, dict, bool, float, bool, bool, bool, int, int, str) -def fake_quant_with_min_max_update(x, min_val, max_val, min_up, max_up, - ema, ema_decay, symmetric, narrow_range, training, num_bits, quant_delay, - kernel_name="fake_quant_update"): - """FakeQuantWithMinMax op""" +@util.check_input_type(dict, dict, dict, dict, dict, bool, float, str) +def minmax_update_perlayer(x, min_val, max_val, min_up, max_up, + ema, ema_decay, kernel_name="minmax_update_perlayer"): + """MinMaxUpdatePerLayer op""" input_shape = x.get("shape") input_dtype = x.get("dtype") min_shape = min_val.get("ori_shape") @@ -111,20 +105,10 @@ def fake_quant_with_min_max_update(x, min_val, max_val, min_up, max_up, input_shape = (functools_reduce(lambda x, y: x * y, input_shape[:]),) shape_min, _, _ = util.produce_shapes(min_shape, input_shape) - if symmetric: - quant_min = 0 - 2 ** (num_bits - 1) - quant_max = 2 ** (num_bits - 1) - 1 - else: - quant_min = 0 - quant_max = 2 ** num_bits - 1 - if narrow_range: - quant_min = quant_min + 1 - input_data = tvm.placeholder(input_shape, name="x", dtype=x_dtype) min_data = tvm.placeholder(shape_min, name="min_data", dtype=min_dtype) max_data = tvm.placeholder(shape_min, name="max_data", dtype=max_dtype) - res_list = fake_quant_with_min_max_update_compute(input_data, min_data, max_data, - ema, ema_decay, quant_min, quant_max, training, kernel_name) + res_list = minmax_update_perlayer_compute(input_data, min_data, max_data, ema, ema_decay) with tvm.target.cce(): sch = generic.auto_schedule(res_list) diff --git a/mindspore/ops/_op_impl/aicpu/__init__.py b/mindspore/ops/_op_impl/aicpu/__init__.py index f514ac183e..c83a6ec46e 100644 --- a/mindspore/ops/_op_impl/aicpu/__init__.py +++ b/mindspore/ops/_op_impl/aicpu/__init__.py @@ -25,11 +25,12 @@ from .flatten import 
_flatten_aicpu from .squeeze import _squeeze_aicpu from .expand_dims import _expand_dims_aicpu from .random_choice_with_mask import _random_choice_with_mask_aicpu +from .pack import _pack_aicpu +from .normal import _normal_aicpu from .ctcloss import _ctcloss_aicpu -from .rnnt_loss import _rnnt_loss_aicpu -from .random_categorical import _random_categorical_aicpu from .reverse_sequence import _reverse_sequence_aicpu -from .pack import _pack_aicpu from .crop_and_resize import _crop_and_resize_aicpu +from .rnnt_loss import _rnnt_loss_aicpu +from .random_categorical import _random_categorical_aicpu from .cast import _cast_aicpu from .mirror_pad import _mirror_pad_aicpu diff --git a/mindspore/ops/_op_impl/aicpu/normal.py b/mindspore/ops/_op_impl/aicpu/normal.py new file mode 100644 index 0000000000..fdb96e362f --- /dev/null +++ b/mindspore/ops/_op_impl/aicpu/normal.py @@ -0,0 +1,33 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""Normal op""" +from mindspore.ops.op_info_register import op_info_register, AiCPURegOp, DataType + +normal_op_info = AiCPURegOp("Normal") \ + .fusion_type("OPAQUE") \ + .input(0, "shape", "required") \ + .input(1, "mean", "required") \ + .input(2, "stddev", "required") \ + .output(0, "y", "required") \ + .attr("seed", "int") \ + .dtype_format(DataType.I32_Default, DataType.F32_Default, DataType.F32_Default, DataType.F32_Default) \ + .dtype_format(DataType.I32_NCHW, DataType.F32_NCHW, DataType.F32_NCHW, DataType.F32_NCHW) \ + .get_op_info() + +@op_info_register(normal_op_info) +def _normal_aicpu(): + """Normal AiCPU register""" + return diff --git a/mindspore/ops/_op_impl/aicpu/topk.py b/mindspore/ops/_op_impl/aicpu/topk.py index a68ae3557d..80cf1c5203 100644 --- a/mindspore/ops/_op_impl/aicpu/topk.py +++ b/mindspore/ops/_op_impl/aicpu/topk.py @@ -24,6 +24,7 @@ top_k_op_info = AiCPURegOp("TopK") \ .output(0, "values", "required") \ .output(1, "indices", "required") \ .dtype_format(DataType.F16_Default, DataType.I32_Default, DataType.F16_Default, DataType.I32_Default) \ + .dtype_format(DataType.F32_Default, DataType.I32_Default, DataType.F32_Default, DataType.I32_Default) \ .dtype_format(DataType.I32_Default, DataType.I32_Default, DataType.I32_Default, DataType.I32_Default) \ .get_op_info() diff --git a/mindspore/ops/_op_impl/akg/__init__.py b/mindspore/ops/_op_impl/akg/__init__.py index e69de29bb2..f38b99f5e4 100644 --- a/mindspore/ops/_op_impl/akg/__init__.py +++ b/mindspore/ops/_op_impl/akg/__init__.py @@ -0,0 +1,88 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""autodiff ops""" +from .abs import _abs_akg +from .add_n import _add_n_akg +from .add import _add_akg +from .apply_momentum import _apply_momentum_akg +from .assign import _assign_akg +from .inplace_assign import _inplace_assign_akg +from .assign_add import _assign_add_akg +from .bias_add_grad import _bias_add_grad_akg +from .bias_add import _bias_add_akg +from .cast import _cast_akg +from .clear_zero import _clear_zero_akg +from .conv_bn1 import _conv_bn1_akg +from .conv2d_backprop_filter import _conv2d_backprop_filter_akg +from .conv2d_backprop_input import _conv2d_backprop_input_akg +from .conv2d import _conv2d_akg +from .div import _div_akg +from .equal_count import _equal_count_akg +from .exp import _exp_akg +from .five2four import _five2four_akg +from .four2five import _four2five_akg +from .fused_batch_norm_grad import _fused_batch_norm_grad_akg +from .fused_batch_norm_infer import _fused_batch_norm_infer_akg +from .fused_batch_norm import _fused_batch_norm_akg +from .fused_bn1_grad import _bn1_grad_akg +from .fused_bn1 import _fused_bn1_akg +from .fused_bn2_grad import _bn2_grad_akg +from .fused_bn2 import _fused_bn2_akg +from .fused_bn3_grad import _bn3_grad_akg +from .fused_bn3 import _fused_bn3_akg +from .gather_v2 import _gather_v2_akg +from .less import _less_akg +from .log import _log_akg +from .matmul import _matmul_akg +from .max_pool_grad_with_argmax import _max_pool_grad_with_argmax_akg +from .max_pool_with_argmax import _max_pool_with_argmax_akg +from .max 
import _max_akg +from .maximum import _maximum_akg +from .mean_grad import _mean_grad_akg +from .mean import _mean_akg +from .minimum import _minimum_akg +from .mul import _mul_akg +from .neg import _neg_akg +from .one_hot import _one_hot_akg +from .pow import _power_akg +from .real_div import _real_div_akg +from .reciprocal import _reciprocal_akg +from .reduce_max import _reduce_max_akg +from .reduce_mean import _reduce_mean_akg +from .reduce_sum import _reduce_sum_akg +from .relu_grad import _relu_grad_akg +from .relu import _relu_akg +from .reshape import _reshape_akg +from .round import _round_akg +from .rsqrt import _rsqrt_akg +from .select import _select_akg +from .softmax import _softmax_akg +from .sparse_softmax_cross_entropy_with_logits import _sparse_softmax_cross_entropy_with_logits_akg +from .sqrt import _sqrt_akg +from .strided_slice import _strided_slice_akg +from .sub import _sub_akg +from .sum import _sum_akg +from .tile import _tile_akg +from .zeros_like import _zeros_like_akg +from .argmax import _argmax_akg +from .floordiv import _floor_div_akg +from .equal import _equal_akg +from .greater_equal import _greater_equal_akg +from .less_equal import _less_equal_akg +from .expand_dims import _expand_dims_akg +from .greater import _greater_akg +from .equiv_format import _equiv_format_akg +from . import gpu diff --git a/mindspore/ops/_op_impl/akg/abs.py b/mindspore/ops/_op_impl/akg/abs.py new file mode 100644 index 0000000000..8c08f405da --- /dev/null +++ b/mindspore/ops/_op_impl/akg/abs.py @@ -0,0 +1,58 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Abs op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Abs", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32", + "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", + "FRACTAL_NZ", "FRACTAL_NZ" + ], + "name": "x" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32", + "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", + "FRACTAL_NZ", "FRACTAL_NZ" + ], + "name": "output" + } + ] +}""") +def _abs_akg(): + """Abs AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/add.py b/mindspore/ops/_op_impl/akg/add.py new file mode 100644 index 0000000000..60544ea1c7 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/add.py @@ -0,0 +1,72 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""TensorAdd op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "TensorAdd", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "int32", "float16", "int32", "float32", "float32", + "int32", "float16", "float32", "int32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "DefaultFormat", "NC1HWC0", + "FracZ", "FracZ", "FracZ", "FRACTAL_NZ", "FRACTAL_NZ", "FRACTAL_NZ" + ], + "param_type": "required", + "name": "x" + }, + { + "index": 1, + "dtype": [ + "float16", "int32", "float16", "int32", "float32", "float32", + "int32", "float16", "float32", "int32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "DefaultFormat", "NC1HWC0", + "FracZ", "FracZ", "FracZ", "FRACTAL_NZ", "FRACTAL_NZ", "FRACTAL_NZ" + ], + "param_type": "required", + "name": "y" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "int32", "float16", "int32", "float32", "float32", + "int32", "float16", "float32", "int32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "DefaultFormat", "NC1HWC0", + "FracZ", "FracZ", "FracZ", "FRACTAL_NZ", "FRACTAL_NZ", "FRACTAL_NZ" + ], + "name": "output" + } + ] +}""") +def _add_akg(): + """TensorAdd AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/add_n.py b/mindspore/ops/_op_impl/akg/add_n.py new file mode 100644 index 0000000000..53320f752e --- /dev/null +++ b/mindspore/ops/_op_impl/akg/add_n.py @@ -0,0 +1,58 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except 
in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""AddN op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "AddN", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16","float32","float16","float32", "float16", "float32", + "float16","float32" + ], + "format": [ + "DefaultFormat","DefaultFormat","NC1HWC0","NC1HWC0", "FracZ", "FracZ", + "FRACTAL_NZ", "FRACTAL_NZ" + ], + "param_type": "dynamic", + "name": "inputs" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16","float32","float16","float32", "float16", "float32", + "float16","float32" + ], + "format": [ + "DefaultFormat","DefaultFormat","NC1HWC0","NC1HWC0", "FracZ", "FracZ", + "FRACTAL_NZ", "FRACTAL_NZ" + ], + "name": "output" + } + ] +}""") +def _add_n_akg(): + """AddN AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/apply_momentum.py b/mindspore/ops/_op_impl/akg/apply_momentum.py new file mode 100644 index 0000000000..7160571882 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/apply_momentum.py @@ -0,0 +1,103 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""ApplyMomentum op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "ApplyMomentum", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + { + "name": "use_nesterov", + "param_type": "optional", + "type": "bool" + }, + { + "name": "gradient_scale", + "param_type": "optional", + "type": "float" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float32","float32","float32" + ], + "format": [ + "DefaultFormat","NC1HWC0","FracZ" + ], + "name": "variable" + }, + { + "index": 1, + "dtype": [ + "float32","float32","float32" + ], + "format": [ + "DefaultFormat","NC1HWC0","FracZ" + ], + "name": "accumulation" + }, + { + "index": 2, + "dtype": [ + "float32","float32","float32" + ], + "format": [ + "DefaultFormat","DefaultFormat","DefaultFormat" + ], + "name": "learning_rate" + }, + { + "index": 3, + "dtype": [ + "float32","float32","float32" + ], + "format": [ + "DefaultFormat","NC1HWC0","FracZ" + ], + "name": "gradient" + }, + { + "index": 4, + "dtype": [ + "float32","float32","float32" + ], + "format": [ + "DefaultFormat","DefaultFormat","DefaultFormat" + ], + "name": "momentum" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float32","float32","float32" + ], + "format": [ + "DefaultFormat","NC1HWC0","FracZ" + ], + "name": "output" + } + ] +}""") +def _apply_momentum_akg(): + """ApplyMomentum AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/argmax.py 
b/mindspore/ops/_op_impl/akg/argmax.py new file mode 100644 index 0000000000..b04862cbeb --- /dev/null +++ b/mindspore/ops/_op_impl/akg/argmax.py @@ -0,0 +1,58 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Argmax op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Argmax", + "imply_type": "AutoDiff", + "fusion_type": "OPAQUE", + "attr": [ + { + "name": "axis", + "param_type": "optional", + "type": "int" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0" + ], + "name": "x" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "int32", "int32", "int32", "int32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _argmax_akg(): + """Argmax AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/assign.py b/mindspore/ops/_op_impl/akg/assign.py new file mode 100644 index 0000000000..e7c5a082bd --- /dev/null +++ b/mindspore/ops/_op_impl/akg/assign.py @@ -0,0 +1,63 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Assign op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Assign", + "imply_type": "AutoDiff", + "fusion_type": "OPAQUE", + "attr": [ + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "int32", "float16", "float32", "int32", "float16", "float32", "int32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0", "FracZ", "FracZ", "FracZ" + ], + "name": "ref" + }, + { + "index": 1, + "dtype": [ + "int32", "float16", "float32", "int32", "float16", "float32", "int32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0", "FracZ", "FracZ", "FracZ" + ], + "name": "value" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "int32", "float16", "float32", "int32", "float16", "float32", "int32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0", "FracZ", "FracZ", "FracZ" + ], + "name": "output" + } + ] +}""") +def _assign_akg(): + """Assign AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/assign_add.py b/mindspore/ops/_op_impl/akg/assign_add.py new file mode 100644 index 0000000000..7d0d345764 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/assign_add.py @@ -0,0 +1,64 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# 
you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""AssignAdd op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "AssignAdd", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "int32", "float16", "float32", "int32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0" + ], + "name": "ref" + }, + { + "index": 1, + "dtype": [ + "int32", "float16", "float32", "int32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0" + ], + "name": "value" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "int32", "float16", "float32", "int32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _assign_add_akg(): + """AssignAdd AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/bias_add.py b/mindspore/ops/_op_impl/akg/bias_add.py new file mode 100644 index 0000000000..74f2bf7bcf --- /dev/null +++ b/mindspore/ops/_op_impl/akg/bias_add.py @@ -0,0 +1,68 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""BiasAdd op""" + +from mindspore.ops.op_info_register import op_info_register + +@op_info_register("""{ + "op_name": "BiasAdd", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + { + "name": "data_format", + "param_type": "optional", + "type": "listStr" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16","float32","float16","float32","float16","float32" + ], + "format": [ + "NHWC","NHWC","NC1HWC0","NC1HWC0","DefaultFormat","DefaultFormat" + ], + "name": "x" + }, + { + "index": 1, + "dtype": [ + "float16","float32","float16","float32","float16","float32" + ], + "format": [ + "NHWC","NHWC","NC1HWC0","NC1HWC0","DefaultFormat","DefaultFormat" + ], + "name": "b" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16","float32","float16","float32","float16","float32" + ], + "format": [ + "DefaultFormat","DefaultFormat","NC1HWC0","NC1HWC0","DefaultFormat","DefaultFormat" + ], + "name": "output" + } + ] +}""") +def _bias_add_akg(): + """BiasAddGrad AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/bias_add_grad.py b/mindspore/ops/_op_impl/akg/bias_add_grad.py new file mode 100644 index 0000000000..7726af6692 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/bias_add_grad.py @@ -0,0 +1,58 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""BiasAddGrad op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "BiasAddGrad", + "imply_type": "AutoDiff", + "fusion_type": "COMMREDUCE", + "attr": [ + { + "name": "data_format", + "param_type": "optional", + "type": "listStr" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16","float32","float16","float32","float16","float32" + ], + "format": [ + "NHWC","NHWC","NC1HWC0","NC1HWC0","DefaultFormat","DefaultFormat" + ], + "name": "dout" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16","float32","float16","float32","float16","float32" + ], + "format": [ + "DefaultFormat","DefaultFormat","NC1HWC0","NC1HWC0","DefaultFormat","DefaultFormat" + ], + "name": "output" + } + ] +}""") +def _bias_add_grad_akg(): + """BiasAddGrad AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/cast.py b/mindspore/ops/_op_impl/akg/cast.py new file mode 100644 index 0000000000..a78d4d87e4 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/cast.py @@ -0,0 +1,74 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Cast op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Cast", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + { + "name": "dst_type", + "param_type": "required", + "type": "str" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "bool", "bool", + "float16", "float32", "int32", "int32", + "bool", + "float16", "float32", "bool", "bool", + "float16", "float32", "bool", "bool" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "DefaultFormat", + "DefaultFormat", "DefaultFormat", "DefaultFormat", "DefaultFormat", + "DefaultFormat", + "NC1HWC0", "NC1HWC0", "NC1HWC0", "NC1HWC0", + "FRACTAL_NZ", "FRACTAL_NZ", "FRACTAL_NZ", "FRACTAL_NZ" + ], + "name": "x" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float32", "float16", "int32", "float16", + "int32", "int32", "float16", "float32", + "float32", + "float32", "float16", "int32", "float32", + "float32", "float16", "int32", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "DefaultFormat", + "DefaultFormat", "DefaultFormat", "DefaultFormat", "DefaultFormat", + "DefaultFormat", + "NC1HWC0", "NC1HWC0", "NC1HWC0", "NC1HWC0", + "FRACTAL_NZ", "FRACTAL_NZ", "FRACTAL_NZ", "FRACTAL_NZ" + ], + "name": "output" + } + ] +}""") +def _cast_akg(): + """Cast AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/clear_zero.py 
b/mindspore/ops/_op_impl/akg/clear_zero.py new file mode 100644 index 0000000000..38bf35044f --- /dev/null +++ b/mindspore/ops/_op_impl/akg/clear_zero.py @@ -0,0 +1,64 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""ClearZero op""" + +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "ClearZero", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + { + "name": "pad_mod", + "param_type": "optional", + "type": "string" + }, + { + "name": "window", + "param_type": "optional", + "type": "int" + }, + { + "name": "pad", + "param_type": "optional", + "type": "int" + }, + { + "name": "stride", + "param_type": "optional", + "type": "int" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "int32", "float16", "float32", "int32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0" + ], + "name": "x" + } + ], + "outputs": [ + ] +}""") +def _clear_zero_akg(): + """MaxPoolGradWithArgmax AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/conv2d.py b/mindspore/ops/_op_impl/akg/conv2d.py new file mode 100644 index 0000000000..709aca7001 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/conv2d.py @@ -0,0 +1,88 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache 
License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Conv2D op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Conv2D", + "imply_type": "AutoDiff", + "fusion_type": "CONVLUTION", + "attr": [ + { + "name": "x_shape", + "param_type": "required", + "type": "listInt" + }, + { + "name": "w_shape", + "param_type": "required", + "type": "listInt" + }, + { + "name": "pad_list", + "param_type": "required", + "type": "listInt" + }, + { + "name": "stride", + "param_type": "optional", + "type": "int" + }, + { + "name": "dilation", + "param_type": "optional", + "type": "int" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16" + ], + "format": [ + "NC1HWC0" + ], + "name": "x" + }, + { + "index": 1, + "dtype": [ + "float16" + ], + "format": [ + "FracZ" + ], + "name": "w" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16" + ], + "format": [ + "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _conv2d_akg(): + """Conv2D AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/conv2d_backprop_filter.py b/mindspore/ops/_op_impl/akg/conv2d_backprop_filter.py new file mode 100644 index 0000000000..1e4e4f1a1e --- /dev/null +++ b/mindspore/ops/_op_impl/akg/conv2d_backprop_filter.py @@ -0,0 +1,88 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file 
except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Conv2DBackpropFilter op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Conv2DBackpropFilter", + "imply_type": "AutoDiff", + "fusion_type": "CONVLUTION", + "attr": [ + { + "name": "input_shape", + "param_type": "required", + "type": "listInt" + }, + { + "name": "filter_sizes", + "param_type": "required", + "type": "listInt" + }, + { + "name": "stride", + "param_type": "optional", + "type": "int" + }, + { + "name": "pad_list", + "param_type": "required", + "type": "listInt" + }, + { + "name": "dilation", + "param_type": "optional", + "type": "int" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16" + ], + "format": [ + "NC1HWC0" + ], + "name": "out_backprop" + }, + { + "index": 1, + "dtype": [ + "float16" + ], + "format": [ + "NC1HWC0" + ], + "name": "input" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float32" + ], + "format": [ + "FracZ" + ], + "name": "output" + } + ] +}""") +def _conv2d_backprop_filter_akg(): + """Conv2DBackpropFilter AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/conv2d_backprop_input.py b/mindspore/ops/_op_impl/akg/conv2d_backprop_input.py new file mode 100644 index 0000000000..52c7f2e7b3 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/conv2d_backprop_input.py @@ -0,0 +1,88 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not 
use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Conv2DBackpropInput op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Conv2DBackpropInput", + "imply_type": "AutoDiff", + "fusion_type": "CONVLUTION", + "attr": [ + { + "name": "input_sizes", + "param_type": "required", + "type": "listInt" + }, + { + "name": "filter_shape", + "param_type": "required", + "type": "listInt" + }, + { + "name": "stride", + "param_type": "optional", + "type": "int" + }, + { + "name": "pad_list", + "param_type": "required", + "type": "listInt" + }, + { + "name": "dilation", + "param_type": "optional", + "type": "int" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16" + ], + "format": [ + "NC1HWC0" + ], + "name": "out_backprop" + }, + { + "index": 1, + "dtype": [ + "float16" + ], + "format": [ + "FracZ" + ], + "name": "filter" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16" + ], + "format": [ + "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _conv2d_backprop_input_akg(): + """Conv2DBackpropInput AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/conv_bn1.py b/mindspore/ops/_op_impl/akg/conv_bn1.py new file mode 100644 index 0000000000..118c94e6fc --- /dev/null +++ b/mindspore/ops/_op_impl/akg/conv_bn1.py @@ -0,0 +1,108 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in 
compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""ConvBN1 op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "ConvBN1", + "imply_type": "AutoDiff", + "fusion_type": "CONVLUTION", + "attr": [ + { + "name": "x_shape", + "param_type": "required", + "type": "listInt" + }, + { + "name": "w_shape", + "param_type": "required", + "type": "listInt" + }, + { + "name": "pad_list", + "param_type": "required", + "type": "listInt" + }, + { + "name": "stride", + "param_type": "optional", + "type": "int" + }, + { + "name": "dilation", + "param_type": "optional", + "type": "int" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16" + ], + "format": [ + "NC1HWC0" + ], + "name": "x" + }, + { + "index": 1, + "dtype": [ + "float16" + ], + "format": [ + "FracZ" + ], + "name": "w" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16" + ], + "format": [ + "NC1HWC0" + ], + "name": "conv_res_16" + }, + { + "index": 1, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "var_part" + }, + { + "index": 2, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "mean" + } + ] +}""") +def _conv_bn1_akg(): + """ConvBN1 AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/div.py b/mindspore/ops/_op_impl/akg/div.py new file mode 100644 index 0000000000..56cdcca868 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/div.py @@ -0,0 +1,64 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed 
under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Div op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Div", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "int32", "float16", "float32", "int32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0" + ], + "name": "x" + }, + { + "index": 1, + "dtype": [ + "float16", "float32", "int32", "float16", "float32", "int32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0" + ], + "name": "y" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "int32", "float16", "float32", "int32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _div_akg(): + """Div AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/equal.py b/mindspore/ops/_op_impl/akg/equal.py new file mode 100644 index 0000000000..35874c62bb --- /dev/null +++ b/mindspore/ops/_op_impl/akg/equal.py @@ -0,0 +1,64 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Equal op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Equal", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "int32", "float16", "float32", "int32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0" + ], + "name": "x" + }, + { + "index": 1, + "dtype": [ + "int32", "float16", "float32", "int32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0" + ], + "name": "y" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "bool", "bool", "bool", "bool", "bool", "bool" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _equal_akg(): + """Equal AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/equal_count.py b/mindspore/ops/_op_impl/akg/equal_count.py new file mode 100644 index 0000000000..9c575db7b3 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/equal_count.py @@ -0,0 +1,64 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""EqualCount op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "EqualCount", + "imply_type": "AutoDiff", + "fusion_type": "OPAQUE", + "attr": [ + + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "int32" + ], + "format": [ + "DefaultFormat" + ], + "name": "x" + }, + { + "index": 1, + "dtype": [ + "int32" + ], + "format": [ + "DefaultFormat" + ], + "name": "y" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "int32" + ], + "format": [ + "DefaultFormat" + ], + "name": "output" + } + ] +}""") +def _equal_count_akg(): + """EqualCount AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/equiv_format.py b/mindspore/ops/_op_impl/akg/equiv_format.py new file mode 100644 index 0000000000..111451b15c --- /dev/null +++ b/mindspore/ops/_op_impl/akg/equiv_format.py @@ -0,0 +1,54 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""EquivFormat op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "EquivFormat", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "FRACTAL_NZ", "FRACTAL_NZ" + ], + "name": "x" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32" + ], + "format": [ + "FRACTAL_NZ", "FRACTAL_NZ", "DefaultFormat", "DefaultFormat" + ], + "name": "output" + } + ] +}""") +def _equiv_format_akg(): + """EquivFormat AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/exp.py b/mindspore/ops/_op_impl/akg/exp.py new file mode 100644 index 0000000000..273b3348a4 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/exp.py @@ -0,0 +1,59 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""Exp op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Exp", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32", + "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", + "FRACTAL_NZ", "FRACTAL_NZ" + ], + "param_type": "required", + "name": "x" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32", + "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", + "FRACTAL_NZ", "FRACTAL_NZ" + ], + "name": "output" + } + ] +}""") +def _exp_akg(): + """Exp AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/expand_dims.py b/mindspore/ops/_op_impl/akg/expand_dims.py new file mode 100644 index 0000000000..9e1b18153a --- /dev/null +++ b/mindspore/ops/_op_impl/akg/expand_dims.py @@ -0,0 +1,58 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""ExpandDims op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "ExpandDims", + "imply_type": "AutoDiff", + "fusion_type": "OPAQUE", + "attr": [ + { + "name": "axis", + "param_type": "required", + "type": "int" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "int32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat" + ], + "name": "x" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "int32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat" + ], + "name": "y" + } + ] +}""") +def _expand_dims_akg(): + """ExpandDims AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/five2four.py b/mindspore/ops/_op_impl/akg/five2four.py new file mode 100644 index 0000000000..1dac2c3628 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/five2four.py @@ -0,0 +1,68 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""Five2Four op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Five2Four", + "imply_type": "AutoDiff", + "fusion_type": "OPAQUE", + "attr": [ + { + "name": "shape4d", + "param_type": "required", + "type": "listInt" + }, + { + "name": "dstType", + "param_type": "required", + "type": "str" + }, + { + "name": "output_format", + "param_type": "required", + "type": "str" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16","float16","float16","float32","float16","float32" + ], + "format": [ + "NC1HWC0","NC1HWC0","NC1HWC0","NC1HWC0","NC1HWC0","NC1HWC0" + ], + "name": "x" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16","float16","float32","float32","float32","float32" + ], + "format": [ + "DefaultFormat","NHWC","DefaultFormat","DefaultFormat","NHWC","NHWC" + ], + "name": "output" + } + ] +}""") +def _five2four_akg(): + """Five2Four AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/floordiv.py b/mindspore/ops/_op_impl/akg/floordiv.py new file mode 100644 index 0000000000..99e577b4be --- /dev/null +++ b/mindspore/ops/_op_impl/akg/floordiv.py @@ -0,0 +1,64 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""FloorDiv op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "FloorDiv", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0" + ], + "name": "x" + }, + { + "index": 1, + "dtype": [ + "float16", "float32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0" + ], + "name": "y" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "int32", "int32", "int32", "int32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _floor_div_akg(): + """FloorDiv AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/four2five.py b/mindspore/ops/_op_impl/akg/four2five.py new file mode 100644 index 0000000000..01b6f85715 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/four2five.py @@ -0,0 +1,63 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""Four2Five op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Four2Five", + "imply_type": "AutoDiff", + "fusion_type": "OPAQUE", + "attr": [ + { + "name": "data_format", + "param_type": "optional", + "type": "listStr" + }, + { + "name": "dst_type", + "param_type": "required", + "type": "str" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float32", "float16","float32", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NHWC", "NHWC", "NHWC" + ], + "name": "x" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float16", "float32", "float16", "float16", "float32" + ], + "format": [ + "NC1HWC0", "NC1HWC0", "NC1HWC0", "NC1HWC0", "NC1HWC0", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _four2five_akg(): + """Four2Five AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/fused_batch_norm.py b/mindspore/ops/_op_impl/akg/fused_batch_norm.py new file mode 100644 index 0000000000..5ce9839328 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/fused_batch_norm.py @@ -0,0 +1,149 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""FusedBatchNorm op""" + +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "FusedBatchNorm", + "imply_type": "AutoDiff", + "fusion_type": "OPAQUE", + "attr": [ + { + "name": "momentum", + "param_type": "optional", + "type": "float" + }, + { + "name": "epsilon", + "param_type": "optional", + "type": "float" + }, + { + "name": "data_format", + "param_type": "optional", + "type": "listStr" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "x" + }, + { + "index": 1, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "scale" + }, + { + "index": 2, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "b" + }, + { + "index": 3, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "mean" + }, + { + "index": 4, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "variance" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "y" + }, + { + "index": 1, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "running_mean" + }, + { + "index": 2, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "running_variance" + }, + { + "index": 3, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "save_mean" + }, + { + "index": 4, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "save_inv_variance" + } + ] +}""") +def _fused_batch_norm_akg(): + """FusedBatchNorm AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/fused_batch_norm_grad.py b/mindspore/ops/_op_impl/akg/fused_batch_norm_grad.py new file mode 100644 index 0000000000..9191548f73 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/fused_batch_norm_grad.py @@ -0,0 +1,119 @@ +# Copyright 2020 Huawei Technologies 
Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""FusedBatchNormGrad op""" + +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "FusedBatchNormGrad", + "imply_type": "AutoDiff", + "fusion_type": "OPAQUE", + "attr": [ + { + "name": "data_format", + "param_type": "optional", + "type": "listStr" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "dy" + }, + { + "index": 1, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "x" + }, + { + "index": 2, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "scale" + }, + { + "index": 3, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "save_mean" + }, + { + "index": 4, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "save_inv_variance" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "dx" + }, + { + "index": 1, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "bn_scale" + }, + { + "index": 2, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "bn_bias" + } + ] +}""") +def _fused_batch_norm_grad_akg(): + """BiasAddGrad AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/fused_batch_norm_infer.py 
b/mindspore/ops/_op_impl/akg/fused_batch_norm_infer.py new file mode 100644 index 0000000000..1e7743fa8f --- /dev/null +++ b/mindspore/ops/_op_impl/akg/fused_batch_norm_infer.py @@ -0,0 +1,109 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""FusedBatchNormInfer op""" + +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "FusedBatchNormInfer", + "imply_type": "AutoDiff", + "fusion_type": "OPAQUE", + "attr": [ + { + "name": "momentum", + "param_type": "optional", + "type": "float" + }, + { + "name": "epsilon", + "param_type": "optional", + "type": "float" + }, + { + "name": "data_format", + "param_type": "optional", + "type": "listStr" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "x" + }, + { + "index": 1, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "scale" + }, + { + "index": 2, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "b" + }, + { + "index": 3, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "mean" + }, + { + "index": 4, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "variance" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "y" + } + ] +}""") +def 
_fused_batch_norm_infer_akg(): + """FusedBatchNormInfer AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/fused_bn1.py b/mindspore/ops/_op_impl/akg/fused_bn1.py new file mode 100644 index 0000000000..fdaa673f25 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/fused_bn1.py @@ -0,0 +1,64 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""FusedBN1 op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "FusedBN1", + "imply_type": "AutoDiff", + "fusion_type": "COMMREDUCE", + "attr": [ + + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32" + ], + "format": [ + "NC1HWC0", "NC1HWC0" + ], + "name": "data" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float32", "float32" + ], + "format": [ + "NC1HWC0", "NC1HWC0" + ], + "name": "output" + }, + { + "index": 1, + "dtype": [ + "float32", "float32" + ], + "format": [ + "NC1HWC0", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _fused_bn1_akg(): + """FusedBN1 AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/fused_bn1_grad.py b/mindspore/ops/_op_impl/akg/fused_bn1_grad.py new file mode 100644 index 0000000000..8de6796d6f --- /dev/null +++ b/mindspore/ops/_op_impl/akg/fused_bn1_grad.py @@ -0,0 +1,93 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the 
Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""BNGrad1 op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "BNGrad1", + "imply_type": "AutoDiff", + "fusion_type": "COMMREDUCE", + "attr": [ + + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32" + ], + "format": [ + "NC1HWC0", "NC1HWC0" + ], + "name": "dy" + }, + { + "index": 1, + "dtype": [ + "float16", "float32" + ], + "format": [ + "NC1HWC0", "NC1HWC0" + ], + "name": "data" + },{ + "index": 2, + "dtype": [ + "float32", "float32" + ], + "format": [ + "NC1HWC0", "NC1HWC0" + ], + "name": "mean" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float32", "float32" + ], + "format": [ + "NC1HWC0", "NC1HWC0" + ], + "name": "output" + }, + { + "index": 1, + "dtype": [ + "float32", "float32" + ], + "format": [ + "NC1HWC0", "NC1HWC0" + ], + "name": "output" + }, + { + "index": 2, + "dtype": [ + "float32", "float32" + ], + "format": [ + "NC1HWC0", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _bn1_grad_akg(): + """BNGrad1 AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/fused_bn2.py b/mindspore/ops/_op_impl/akg/fused_bn2.py new file mode 100644 index 0000000000..e26a5ad8a0 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/fused_bn2.py @@ -0,0 +1,108 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the 
"License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""FusedBN2 op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "FusedBN2", + "imply_type": "AutoDiff", + "fusion_type": "COMMREDUCE", + "attr": [ + { + "name": "momentum", + "param_type": "optional", + "type": "float" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "mean" + }, + { + "index": 1, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "var_part" + }, + { + "index": 2, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "running_mean" + }, + { + "index": 3, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "running_var" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "output" + }, + { + "index": 1, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "output" + }, + { + "index": 2, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _fused_bn2_akg(): + """FusedBN2 AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/fused_bn2_grad.py b/mindspore/ops/_op_impl/akg/fused_bn2_grad.py new file mode 100644 index 0000000000..e29a9177b6 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/fused_bn2_grad.py @@ -0,0 +1,132 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# 
Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""BNGrad1 op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "BNGrad2", + "imply_type": "AutoDiff", + "fusion_type": "COMMREDUCE", + "attr": [ + { + "name": "eps", + "param_type": "optional", + "type": "float" + }, + { + "name": "data_shape", + "param_type": "optional", + "type": "listInt" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "dgamma_red_hw" + }, + { + "index": 1, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "dbeta_red_hw" + },{ + "index": 2, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "variance" + }, + { + "index": 3, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "gamma" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "output" + }, + { + "index": 1, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "output" + }, + { + "index": 2, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "output" + }, + { + "index": 3, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "output" + }, + { + "index": 4, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _bn2_grad_akg(): + 
"""BNGrad2 AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/fused_bn3.py b/mindspore/ops/_op_impl/akg/fused_bn3.py new file mode 100644 index 0000000000..74f3f652f3 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/fused_bn3.py @@ -0,0 +1,95 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""FusedBN3 op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "FusedBN3", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + { + "name": "eps", + "param_type": "optional", + "type": "float" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16" + ], + "format": [ + "NC1HWC0" + ], + "name": "data" + }, + { + "index": 1, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "mean" + },{ + "index": 2, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "variance" + },{ + "index": 3, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "gamma" + },{ + "index": 4, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "beta" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16" + ], + "format": [ + "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _fused_bn3_akg(): + """FusedBN3 AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/fused_bn3_grad.py 
b/mindspore/ops/_op_impl/akg/fused_bn3_grad.py new file mode 100644 index 0000000000..5ffc57a68e --- /dev/null +++ b/mindspore/ops/_op_impl/akg/fused_bn3_grad.py @@ -0,0 +1,93 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""BNGrad3 op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "BNGrad3", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32" + ], + "format": [ + "NC1HWC0", "NC1HWC0" + ], + "name": "dy" + }, + { + "index": 1, + "dtype": [ + "float32", "float32" + ], + "format": [ + "NC1HWC0", "NC1HWC0" + ], + "name": "rs" + },{ + "index": 2, + "dtype": [ + "float32", "float32" + ], + "format": [ + "NC1HWC0", "NC1HWC0" + ], + "name": "dgamma_dx" + }, + { + "index": 3, + "dtype": [ + "float32", "float32" + ], + "format": [ + "NC1HWC0", "NC1HWC0" + ], + "name": "dbeta_dx" + }, + { + "index": 4, + "dtype": [ + "float32", "float32" + ], + "format": [ + "NC1HWC0", "NC1HWC0" + ], + "name": "data_minus_mean" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32" + ], + "format": [ + "NC1HWC0", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _bn3_grad_akg(): + """BNGrad3 AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/gather_v2.py 
b/mindspore/ops/_op_impl/akg/gather_v2.py new file mode 100644 index 0000000000..84ab7eb669 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/gather_v2.py @@ -0,0 +1,68 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""GatherV2 op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "GatherV2", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + { + "name": "axis", + "param_type": "optional", + "type": "int" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "int32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat" + ], + "name": "params" + }, + { + "index": 1, + "dtype": [ + "int32", "int32", "int32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat" + ], + "name": "indices" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "int32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat" + ], + "name": "output" + } + ] +}""") +def _gather_v2_akg(): + """GatherV2 AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/gpu/__init__.py b/mindspore/ops/_op_impl/akg/gpu/__init__.py index 08beb44340..7af6949104 100644 --- a/mindspore/ops/_op_impl/akg/gpu/__init__.py +++ b/mindspore/ops/_op_impl/akg/gpu/__init__.py @@ -32,3 +32,5 @@ from .logical_and 
import _logical_and_akg from .logical_not import _logical_not_akg from .logical_or import _logical_or_akg from .lessequal import _lessequal_akg +from .notequal import _notequal_akg +from .greater_equal import _greater_equal_akg diff --git a/mindspore/ops/_op_impl/akg/gpu/greater_equal.py b/mindspore/ops/_op_impl/akg/gpu/greater_equal.py new file mode 100644 index 0000000000..b000cbd0e3 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/gpu/greater_equal.py @@ -0,0 +1,32 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""GreaterEqual op""" +from mindspore.ops.op_info_register import op_info_register, AkgRegOp, DataType + +greater_equal_op_info = AkgRegOp("GreaterEqual") \ + .fusion_type("OPAQUE") \ + .input(0, "x") \ + .input(1, "y") \ + .output(0, "output") \ + .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.BOOL_Default) \ + .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.BOOL_Default) \ + .dtype_format(DataType.I32_Default, DataType.I32_Default, DataType.BOOL_Default) \ + .get_op_info() + + +@op_info_register(greater_equal_op_info) +def _greater_equal_akg(): + """GreaterEqual register""" + return diff --git a/mindspore/ops/_op_impl/akg/gpu/lessequal.py b/mindspore/ops/_op_impl/akg/gpu/lessequal.py index a3e4d4dc35..a8babf7ae4 100644 --- a/mindspore/ops/_op_impl/akg/gpu/lessequal.py +++ b/mindspore/ops/_op_impl/akg/gpu/lessequal.py @@ -15,7 +15,7 @@ """LessEqual op""" from mindspore.ops.op_info_register import op_info_register, AkgRegOp, DataType -equal_op_info = AkgRegOp("LessEqual") \ +lessequal_op_info = AkgRegOp("LessEqual") \ .fusion_type("OPAQUE") \ .input(0, "x") \ .input(1, "y") \ @@ -26,7 +26,7 @@ equal_op_info = AkgRegOp("LessEqual") \ .get_op_info() -@op_info_register(equal_op_info) +@op_info_register(lessequal_op_info) def _lessequal_akg(): """LessEqual register""" return diff --git a/mindspore/ops/_op_impl/akg/gpu/notequal.py b/mindspore/ops/_op_impl/akg/gpu/notequal.py new file mode 100644 index 0000000000..dc13449fc1 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/gpu/notequal.py @@ -0,0 +1,32 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""NotEqual op""" +from mindspore.ops.op_info_register import op_info_register, AkgRegOp, DataType + +notequal_op_info = AkgRegOp("NotEqual") \ + .fusion_type("OPAQUE") \ + .input(0, "x") \ + .input(1, "y") \ + .output(0, "output") \ + .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.BOOL_Default) \ + .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.BOOL_Default) \ + .dtype_format(DataType.I32_Default, DataType.I32_Default, DataType.BOOL_Default) \ + .get_op_info() + + +@op_info_register(notequal_op_info) +def _notequal_akg(): + """NotEqual AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/greater.py b/mindspore/ops/_op_impl/akg/greater.py new file mode 100644 index 0000000000..941946163a --- /dev/null +++ b/mindspore/ops/_op_impl/akg/greater.py @@ -0,0 +1,64 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""Greater op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Greater", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float16", "float32", "float32" + ], + "format": [ + "DefaultFormat", "NC1HWC0", "DefaultFormat", "NC1HWC0" + ], + "name": "x" + }, + { + "index": 1, + "dtype": [ + "float16", "float16", "float32", "float32" + ], + "format": [ + "DefaultFormat", "NC1HWC0", "DefaultFormat", "NC1HWC0" + ], + "name": "y" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "bool", "bool", "bool", "bool" + ], + "format": [ + "DefaultFormat", "NC1HWC0", "DefaultFormat", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _greater_akg(): + """Greater AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/greater_equal.py b/mindspore/ops/_op_impl/akg/greater_equal.py new file mode 100644 index 0000000000..11642baa86 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/greater_equal.py @@ -0,0 +1,64 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""GreaterEqual op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "GreaterEqual", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "int32", "float16", "float32", "int32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0" + ], + "name": "x" + }, + { + "index": 1, + "dtype": [ + "int32", "float16", "float32", "int32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0" + ], + "name": "y" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "bool", "bool", "bool", "bool", "bool", "bool" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _greater_equal_akg(): + """Equal AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/inplace_assign.py b/mindspore/ops/_op_impl/akg/inplace_assign.py new file mode 100644 index 0000000000..1cc40abe9b --- /dev/null +++ b/mindspore/ops/_op_impl/akg/inplace_assign.py @@ -0,0 +1,78 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""InplaceAssign op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "InplaceAssign", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + { + "name": "fake_output", + "param_type": "optional", + "type": "bool" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "int32", "float16", "float32", "int32", "float16", "float32", "int32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0", "FracZ", "FracZ", "FracZ" + ], + "name": "x" + }, + { + "index": 1, + "dtype": [ + "int32", "float16", "float32", "int32", "float16", "float32", "int32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0", "FracZ", "FracZ", "FracZ" + ], + "name": "y" + }, + { + "index": 2, + "dtype": [ + "int32", "float16", "float32", "int32", "float16", "float32", "int32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0", "FracZ", "FracZ", "FracZ" + ], + "name": "z" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "int32", "float16", "float32", "int32", "float16", "float32", "int32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0", "FracZ", "FracZ", "FracZ" + ], + "name": "output" + } + ] +}""") +def _inplace_assign_akg(): + """InplaceAssign AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/less.py b/mindspore/ops/_op_impl/akg/less.py new file mode 100644 index 0000000000..499ed2e8fc --- /dev/null +++ b/mindspore/ops/_op_impl/akg/less.py @@ -0,0 +1,64 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance 
with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Less op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Less", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float16" + ], + "format": [ + "DefaultFormat", "NC1HWC0" + ], + "name": "x" + }, + { + "index": 1, + "dtype": [ + "float16", "float16" + ], + "format": [ + "DefaultFormat", "NC1HWC0" + ], + "name": "y" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "bool", "bool" + ], + "format": [ + "DefaultFormat", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _less_akg(): + """Less AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/less_equal.py b/mindspore/ops/_op_impl/akg/less_equal.py new file mode 100644 index 0000000000..97fbdec090 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/less_equal.py @@ -0,0 +1,64 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""LessEqual op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "LessEqual", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "int32", "float16", "float32", "int32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0" + ], + "name": "x" + }, + { + "index": 1, + "dtype": [ + "int32", "float16", "float32", "int32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0" + ], + "name": "y" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "bool", "bool", "bool", "bool", "bool", "bool" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _less_equal_akg(): + """Equal AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/log.py b/mindspore/ops/_op_impl/akg/log.py new file mode 100644 index 0000000000..526538d17d --- /dev/null +++ b/mindspore/ops/_op_impl/akg/log.py @@ -0,0 +1,55 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""Log op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Log", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "FRACTAL_NZ", "FRACTAL_NZ" + ], + "param_type": "required", + "name": "x" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "FRACTAL_NZ", "FRACTAL_NZ" + ], + "name": "output" + } + ] +}""") +def _log_akg(): + """Log AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/matmul.py b/mindspore/ops/_op_impl/akg/matmul.py new file mode 100644 index 0000000000..084ba754fa --- /dev/null +++ b/mindspore/ops/_op_impl/akg/matmul.py @@ -0,0 +1,73 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""MatMul op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "MatMul", + "imply_type": "AutoDiff", + "fusion_type": "OPAQUE", + "attr": [ + { + "name": "transpose_a", + "param_type": "optional", + "type": "bool" + }, + { + "name": "transpose_b", + "param_type": "optional", + "type": "bool" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat" + ], + "name": "x1" + }, + { + "index": 1, + "dtype": [ + "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat" + ], + "name": "x2" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat" + ], + "name": "output" + } + ] +}""") +def _matmul_akg(): + """MatMul AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/max.py b/mindspore/ops/_op_impl/akg/max.py new file mode 100644 index 0000000000..21fd4ef9c4 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/max.py @@ -0,0 +1,63 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""Max op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Max", + "imply_type": "AutoDiff", + "fusion_type": "COMMREDUCE", + "attr": [ + { + "name": "axis", + "param_type": "required", + "type": "listInt" + }, + { + "name": "keep_dims", + "param_type": "required", + "type": "bool" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "int32", "float16", "float32", "int32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0" + ], + "name": "x" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "int32", "float16", "float32", "int32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _max_akg(): + """Max AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/max_pool_grad_with_argmax.py b/mindspore/ops/_op_impl/akg/max_pool_grad_with_argmax.py new file mode 100644 index 0000000000..4adad3eb88 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/max_pool_grad_with_argmax.py @@ -0,0 +1,93 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""MaxPoolGradWithArgmax op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "MaxPoolGradWithArgmax", + "imply_type": "AutoDiff", + "fusion_type": "CONVLUTION", + "attr": [ + { + "name": "pad_mode", + "param_type": "optional", + "type": "str" + }, + { + "name": "window", + "param_type": "optional", + "type": "int" + }, + { + "name": "pad", + "param_type": "optional", + "type": "int" + }, + { + "name": "stride", + "param_type": "optional", + "type": "int" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float16" + ], + "format": [ + "NC1HWC0", "NC1HWC0" + ], + "name": "x" + }, + { + "index": 1, + "dtype": [ + "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat" + ], + "name": "argmax" + }, + { + "index": 2, + "dtype": [ + "float16", "float32" + ], + "format": [ + "NC1HWC0", "NC1HWC0" + ], + "name": "grad" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32" + ], + "format": [ + "NC1HWC0", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _max_pool_grad_with_argmax_akg(): + """MaxPoolGradWithArgmax AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/max_pool_with_argmax.py b/mindspore/ops/_op_impl/akg/max_pool_with_argmax.py new file mode 100644 index 0000000000..3ae36d4793 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/max_pool_with_argmax.py @@ -0,0 +1,83 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""MaxPoolWithArgmax op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "MaxPoolWithArgmax", + "imply_type": "AutoDiff", + "fusion_type": "CONVLUTION", + "attr": [ + { + "name": "pad_mode", + "param_type": "optional", + "type": "str" + }, + { + "name": "window", + "param_type": "optional", + "type": "int" + }, + { + "name": "pad", + "param_type": "optional", + "type": "int" + }, + { + "name": "stride", + "param_type": "optional", + "type": "int" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16" + ], + "format": [ + "NC1HWC0" + ], + "name": "x" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16" + ], + "format": [ + "NC1HWC0" + ], + "name": "output" + }, + { + "index": 1, + "dtype": [ + "float16" + ], + "format": [ + "DefaultFormat" + ], + "name": "argmax" + } + ] +}""") +def _max_pool_with_argmax_akg(): + """MaxPoolWithArgmax AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/maximum.py b/mindspore/ops/_op_impl/akg/maximum.py new file mode 100644 index 0000000000..8d8de5270a --- /dev/null +++ b/mindspore/ops/_op_impl/akg/maximum.py @@ -0,0 +1,64 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Maximum op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Maximum", + "imply_type": "AutoDiff", + "fusion_type": "COMMREDUCE", + "attr": [], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "int32", "float16", "float32", "int32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0" + ], + "param_type": "required", + "name": "x" + }, + { + "index": 1, + "dtype": [ + "float16", "float32", "int32", "float16", "float32", "int32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0" + ], + "param_type": "required", + "name": "y" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "int32", "float16", "float32", "int32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _maximum_akg(): + """Maximum AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/mean.py b/mindspore/ops/_op_impl/akg/mean.py new file mode 100644 index 0000000000..0b49e76865 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/mean.py @@ -0,0 +1,54 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""SimpleMean op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "SimpleMean", + "imply_type": "AutoDiff", + "fusion_type": "COMMREDUCE", + "attr": [ + + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0" + ], + "name": "x" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _mean_akg(): + """SimpleMean AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/mean_grad.py b/mindspore/ops/_op_impl/akg/mean_grad.py new file mode 100644 index 0000000000..3b8379d1f0 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/mean_grad.py @@ -0,0 +1,58 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""SimpleMeanGrad op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "SimpleMeanGrad", + "imply_type": "AutoDiff", + "fusion_type": "COMMREDUCE", + "attr": [ + { + "name": "input_shape", + "param_type": "required", + "type": "listInt" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0" + ], + "name": "HEAD" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _mean_grad_akg(): + """SimpleMeanGrad AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/minimum.py b/mindspore/ops/_op_impl/akg/minimum.py new file mode 100644 index 0000000000..759df2085f --- /dev/null +++ b/mindspore/ops/_op_impl/akg/minimum.py @@ -0,0 +1,70 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""Minimum op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Minimum", + "imply_type": "AutoDiff", + "fusion_type": "COMMREDUCE", + "attr": [], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "int32", "float16", "float32", "int32", + "float16", "float32", "int32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0", + "FRACTAL_NZ", "FRACTAL_NZ", "FRACTAL_NZ" + ], + "param_type": "required", + "name": "x" + }, + { + "index": 1, + "dtype": [ + "float16", "float32", "int32", "float16", "float32", "int32", + "float16", "float32", "int32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0", + "FRACTAL_NZ", "FRACTAL_NZ", "FRACTAL_NZ" + ], + "param_type": "required", + "name": "y" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "int32", "float16", "float32", "int32", + "float16", "float32", "int32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0", + "FRACTAL_NZ", "FRACTAL_NZ", "FRACTAL_NZ" + ], + "name": "output" + } + ] +}""") +def _minimum_akg(): + """Minimum AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/mul.py b/mindspore/ops/_op_impl/akg/mul.py new file mode 100644 index 0000000000..ab02c2d89e --- /dev/null +++ b/mindspore/ops/_op_impl/akg/mul.py @@ -0,0 +1,86 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Mul op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Mul", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + { + "name": "x_shape", + "param_type": "required", + "type": "listInt" + }, + { + "name": "y_shape", + "param_type": "required", + "type": "listInt" + }, + { + "name": "data_format", + "param_type": "required", + "type": "listStr" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32", "float16", "float32", + "float16", "float32" + ], + "format": [ + "FracZ", "FracZ", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", + "FRACTAL_NZ", "FRACTAL_NZ" + ], + "param_type": "required", + "name": "x" + }, + { + "index": 1, + "dtype": [ + "float16", "float32", "float16", "float32", "float16", "float32", + "float16", "float32" + ], + "format": [ + "FracZ", "FracZ", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", + "FRACTAL_NZ", "FRACTAL_NZ" + ], + "param_type": "required", + "name": "y" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32", "float16", "float32", + "float16", "float32" + ], + "format": [ + "FracZ", "FracZ", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", + "FRACTAL_NZ", "FRACTAL_NZ" + ], + "name": "output" + } + ] +}""") +def _mul_akg(): + """Mul AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/neg.py b/mindspore/ops/_op_impl/akg/neg.py new file mode 
100644 index 0000000000..bc00d60271 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/neg.py @@ -0,0 +1,59 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Neg op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Neg", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "int32", "float16", "float32", "int32", + "float16", "float32", "int32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0", + "FRACTAL_NZ", "FRACTAL_NZ", "FRACTAL_NZ" + ], + "param_type": "required", + "name": "x" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "int32", "float16", "float32", "int32", + "float16", "float32", "int32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0", + "FRACTAL_NZ", "FRACTAL_NZ", "FRACTAL_NZ" + ], + "name": "output" + } + ] +}""") +def _neg_akg(): + """Neg AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/one_hot.py b/mindspore/ops/_op_impl/akg/one_hot.py new file mode 100644 index 0000000000..c5034dbbd4 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/one_hot.py @@ -0,0 +1,83 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed 
under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""OneHot op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "OneHot", + "imply_type": "AutoDiff", + "fusion_type": "OPAQUE", + "attr": [ + { + "name": "depth", + "param_type": "required", + "type": "int" + }, + { + "name": "axis", + "param_type": "required", + "type": "int" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "int32", "int32", "int32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat" + ], + "name": "indices" + }, + { + "index": 1, + "dtype": [ + "int32", "float32", "float16" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat" + ], + "name": "on_value" + }, + { + "index": 2, + "dtype": [ + "int32", "float32", "float16" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat" + ], + "name": "off_value" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "int32", "float32", "float16" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat" + ], + "name": "output" + } + ] +}""") +def _one_hot_akg(): + """OneHot AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/pow.py b/mindspore/ops/_op_impl/akg/pow.py new file mode 100644 index 0000000000..d782968c05 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/pow.py @@ -0,0 +1,65 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the 
Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Pow op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Pow", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "int32", "float16", "int32", "float32", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "DefaultFormat", "NC1HWC0" + ], + "param_type": "required", + "name": "x" + }, + { + "index": 1, + "dtype": [ + "float16", "int32", "float16", "int32", "float32", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "DefaultFormat", "NC1HWC0" + ], + "param_type": "required", + "name": "power" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "int32", "float16", "int32", "float32", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "DefaultFormat", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _power_akg(): + """Pow AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/real_div.py b/mindspore/ops/_op_impl/akg/real_div.py new file mode 100644 index 0000000000..9fa37a24e3 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/real_div.py @@ -0,0 +1,72 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this 
file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""RealDiv op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "RealDiv", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32", + "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", + "FRACTAL_NZ", "FRACTAL_NZ" + ], + "param_type": "required", + "name": "x" + }, + { + "index": 1, + "dtype": [ + "float16", "float32", "float16", "float32", + "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", + "FRACTAL_NZ", "FRACTAL_NZ" + ], + "param_type": "required", + "name": "y" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32", + "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", + "FRACTAL_NZ", "FRACTAL_NZ" + ], + "name": "output" + } + ] +}""") +def _real_div_akg(): + """RealDiv AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/reciprocal.py b/mindspore/ops/_op_impl/akg/reciprocal.py new file mode 100644 index 0000000000..9fd7cc40b4 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/reciprocal.py @@ -0,0 +1,54 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance 
with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Reciprocal op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Reciprocal", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0" + ], + "name": "x" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _reciprocal_akg(): + """Reciprocal AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/reduce_max.py b/mindspore/ops/_op_impl/akg/reduce_max.py new file mode 100644 index 0000000000..b9db8ea83a --- /dev/null +++ b/mindspore/ops/_op_impl/akg/reduce_max.py @@ -0,0 +1,63 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""ReduceMax op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "ReduceMax", + "imply_type": "AutoDiff", + "fusion_type": "COMMREDUCE", + "attr": [ + { + "name": "axis", + "param_type": "required", + "type": "listInt" + }, + { + "name": "keep_dims", + "param_type": "required", + "type": "bool" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float16" + ], + "format": [ + "DefaultFormat", "NC1HWC0" + ], + "name": "x" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float16" + ], + "format": [ + "DefaultFormat", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _reduce_max_akg(): + """ReduceMax AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/reduce_mean.py b/mindspore/ops/_op_impl/akg/reduce_mean.py new file mode 100644 index 0000000000..0a4ffdf221 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/reduce_mean.py @@ -0,0 +1,63 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""ReduceMean op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "ReduceMean", + "imply_type": "AutoDiff", + "fusion_type": "COMMREDUCE", + "attr": [ + { + "name": "axis", + "param_type": "required", + "type": "listInt" + }, + { + "name": "keep_dims", + "param_type": "required", + "type": "bool" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0" + ], + "name": "x" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _reduce_mean_akg(): + """ReduceMean AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/reduce_sum.py b/mindspore/ops/_op_impl/akg/reduce_sum.py new file mode 100644 index 0000000000..20d091ac76 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/reduce_sum.py @@ -0,0 +1,73 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""ReduceSum op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "ReduceSum", + "imply_type": "AutoDiff", + "fusion_type": "COMMREDUCE", + "attr": [ + { + "name": "axis", + "param_type": "required", + "type": "listInt" + }, + { + "name": "keep_dims", + "param_type": "required", + "type": "bool" + }, + { + "name": "atomic_add", + "param_type": "optional", + "type": "str" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32", + "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", + "FRACTAL_NZ", "FRACTAL_NZ" + ], + "param_type": "required", + "name": "x" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32", + "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", + "FRACTAL_NZ", "FRACTAL_NZ" + ], + "name": "output" + } + ] +}""") +def _reduce_sum_akg(): + """ReduceSum AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/relu.py b/mindspore/ops/_op_impl/akg/relu.py new file mode 100644 index 0000000000..b32725f885 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/relu.py @@ -0,0 +1,54 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""ReLU op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "ReLU", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0" + ], + "name": "x" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _relu_akg(): + """ReLU AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/relu_grad.py b/mindspore/ops/_op_impl/akg/relu_grad.py new file mode 100644 index 0000000000..c785b750fe --- /dev/null +++ b/mindspore/ops/_op_impl/akg/relu_grad.py @@ -0,0 +1,64 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""ReluGrad op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "ReluGrad", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0" + ], + "name": "y_backprop" + }, + { + "index": 1, + "dtype": [ + "float16", "float32", "float16" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0" + ], + "name": "x" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _relu_grad_akg(): + """ReluGrad AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/reshape.py b/mindspore/ops/_op_impl/akg/reshape.py new file mode 100644 index 0000000000..d200b66fa2 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/reshape.py @@ -0,0 +1,58 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""Reshape op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Reshape", + "imply_type": "AutoDiff", + "fusion_type": "OPAQUE", + "attr": [ + { + "name": "shape", + "param_type": "required", + "type": "listInt" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0" + ], + "name": "tensor" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _reshape_akg(): + """Reshape AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/round.py b/mindspore/ops/_op_impl/akg/round.py new file mode 100644 index 0000000000..0625c3ceda --- /dev/null +++ b/mindspore/ops/_op_impl/akg/round.py @@ -0,0 +1,54 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""Round op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Round", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0" + ], + "name": "x" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _round_akg(): + """Round AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/rsqrt.py b/mindspore/ops/_op_impl/akg/rsqrt.py new file mode 100644 index 0000000000..9264864f91 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/rsqrt.py @@ -0,0 +1,54 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""Rsqrt op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Rsqrt", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "int32", "float16", "float32", "int32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0" + ], + "param_type": "required", + "name": "x" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "int32", "float16", "float32", "int32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _rsqrt_akg(): + """Rsqrt AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/select.py b/mindspore/ops/_op_impl/akg/select.py new file mode 100644 index 0000000000..006c6a5444 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/select.py @@ -0,0 +1,76 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""Select op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Select", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "bool", "bool", "bool", "bool", "bool", "bool" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "DefaultFormat", "NC1HWC0" + ], + "param_type": "required", + "name": "condition" + }, + { + "index": 1, + "dtype": [ + "float16", "int32", "float16", "int32", "float32", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "DefaultFormat", "NC1HWC0" + ], + "param_type": "required", + "name": "x" + }, + { + "index": 2, + "dtype": [ + "float16", "int32", "float16", "int32", "float32", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "DefaultFormat", "NC1HWC0" + ], + "param_type": "required", + "name": "y" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "int32", "float16", "int32", "float32", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "DefaultFormat", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _select_akg(): + """Select AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/softmax.py b/mindspore/ops/_op_impl/akg/softmax.py new file mode 100644 index 0000000000..a41c2aef36 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/softmax.py @@ -0,0 +1,58 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Softmax op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Softmax", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + { + "name": "axis", + "param_type": "required", + "type": "listInt" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0" + ], + "name": "x" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _softmax_akg(): + """Softmax AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/sparse_softmax_cross_entropy_with_logits.py b/mindspore/ops/_op_impl/akg/sparse_softmax_cross_entropy_with_logits.py new file mode 100644 index 0000000000..e9e828f312 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/sparse_softmax_cross_entropy_with_logits.py @@ -0,0 +1,73 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""SparseSoftmaxCrossEntropyWithLogits op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "SparseSoftmaxCrossEntropyWithLogits", + "imply_type": "AutoDiff", + "fusion_type": "OPAQUE", + "attr": [ + { + "name": "is_grad", + "param_type": "optional", + "type": "bool" + }, + { + "name": "sens", + "param_type": "optional", + "type": "float" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float32" + ], + "format": [ + "DefaultFormat" + ], + "name": "features" + }, + { + "index": 1, + "dtype": [ + "int32" + ], + "format": [ + "DefaultFormat" + ], + "name": "labels" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float32" + ], + "format": [ + "DefaultFormat" + ], + "name": "output" + } + ] +}""") +def _sparse_softmax_cross_entropy_with_logits_akg(): + """SparseSoftmaxCrossEntropyWithLogits AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/sqrt.py b/mindspore/ops/_op_impl/akg/sqrt.py new file mode 100644 index 0000000000..fcaa84b3d4 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/sqrt.py @@ -0,0 +1,54 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Sqrt op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Sqrt", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "int32", "float16", "float32", "int32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0" + ], + "param_type": "required", + "name": "x" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "int32", "float16", "float32", "int32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _sqrt_akg(): + """Sqrt AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/strided_slice.py b/mindspore/ops/_op_impl/akg/strided_slice.py new file mode 100644 index 0000000000..bdbd8dfc2f --- /dev/null +++ b/mindspore/ops/_op_impl/akg/strided_slice.py @@ -0,0 +1,93 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""StridedSlice op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "StridedSlice", + "imply_type": "AutoDiff", + "fusion_type": "OPAQUE", + "attr": [ + { + "name": "begin", + "param_type": "required", + "type": "listInt" + }, + { + "name": "end", + "param_type": "required", + "type": "listInt" + }, + { + "name": "strides", + "param_type": "required", + "type": "listInt" + }, + { + "name": "begin_mask", + "param_type": "required", + "type": "int" + }, + { + "name": "end_mask", + "param_type": "required", + "type": "int" + }, + { + "name": "ellipsis_mask", + "param_type": "required", + "type": "int" + }, + { + "name": "new_axis_mask", + "param_type": "required", + "type": "int" + }, + { + "name": "shrink_axis_mask", + "param_type": "required", + "type": "int" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "int32", "float16", "float32", "int32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0" + ], + "name": "x" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "int32", "float16", "float32", "int32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _strided_slice_akg(): + """StridedSlice AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/sub.py b/mindspore/ops/_op_impl/akg/sub.py new file mode 
100644 index 0000000000..846aa280bb --- /dev/null +++ b/mindspore/ops/_op_impl/akg/sub.py @@ -0,0 +1,72 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Sub op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Sub", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "int32", "float16", "float32", "int32", "float16", "float32", + "int32", "float16", "float32", "int32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0", + "FracZ", "FracZ", "FracZ", "FRACTAL_NZ", "FRACTAL_NZ", "FRACTAL_NZ" + ], + "param_type": "required", + "name": "x" + }, + { + "index": 1, + "dtype": [ + "int32", "float16", "float32", "int32", "float16", "float32", + "int32", "float16", "float32", "int32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0", + "FracZ", "FracZ", "FracZ", "FRACTAL_NZ", "FRACTAL_NZ", "FRACTAL_NZ" + ], + "param_type": "required", + "name": "y" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "int32", "float16", "float32", "int32", "float16", "float32", + "int32", "float16", "float32", "int32", "float16", "float32" + ], + "format": [ + "DefaultFormat", 
"DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0", + "FracZ", "FracZ", "FracZ", "FRACTAL_NZ", "FRACTAL_NZ", "FRACTAL_NZ" + ], + "name": "output" + } + ] +}""") +def _sub_akg(): + """Sub AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/sum.py b/mindspore/ops/_op_impl/akg/sum.py new file mode 100644 index 0000000000..501b387b25 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/sum.py @@ -0,0 +1,68 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""Sum op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Sum", + "imply_type": "AutoDiff", + "fusion_type": "COMMREDUCE", + "attr": [ + { + "name": "axis", + "param_type": "required", + "type": "listInt" + }, + { + "name": "keepdims", + "param_type": "required", + "type": "bool" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32", + "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", + "FRACTAL_NZ", "FRACTAL_NZ" + ], + "param_type": "required", + "name": "x" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32", + "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", + "FRACTAL_NZ", "FRACTAL_NZ" + ], + "name": "output" + } + ] +}""") +def _sum_akg(): + """Sum AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/tile.py b/mindspore/ops/_op_impl/akg/tile.py new file mode 100644 index 0000000000..bd13978fe7 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/tile.py @@ -0,0 +1,58 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""Tile op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Tile", + "imply_type": "AutoDiff", + "fusion_type": "OPAQUE", + "attr": [ + { + "name": "multiples", + "param_type": "required", + "type": "listInt" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "int32", "float16", "float32", "int32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0" + ], + "name": "x" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "int32", "float16", "float32", "int32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _tile_akg(): + """Tile AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/zeros_like.py b/mindspore/ops/_op_impl/akg/zeros_like.py new file mode 100644 index 0000000000..a02ece22d7 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/zeros_like.py @@ -0,0 +1,54 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""ZerosLike op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "ZerosLike", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0" + ], + "name": "x" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _zeros_like_akg(): + """ZerosLike AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/tbe/__init__.py b/mindspore/ops/_op_impl/tbe/__init__.py index 3d1825b53e..631ec1bf44 100644 --- a/mindspore/ops/_op_impl/tbe/__init__.py +++ b/mindspore/ops/_op_impl/tbe/__init__.py @@ -15,6 +15,8 @@ """tbe ops""" from .abs import _abs_tbe +from .inplace_add import _inplace_add_tbe +from .inplace_sub import _inplace_sub_tbe from .abs_grad import _abs_grad_tbe from .acos import _acos_tbe from .acos_grad import _acos_grad_tbe @@ -24,9 +26,15 @@ from .adam_apply_one_with_decay import _adam_apply_one_with_decay_tbe from .add import _add_tbe from .apply_centered_rms_prop import _apply_centered_rms_prop_tbe from .add_n import _add_n_tbe +from .accumulate_n_v2 import _accumulate_n_v2_tbe from .apply_ftrl import _apply_ftrl_tbe from .apply_momentum import _apply_momentum_tbe from .apply_adam import _apply_adam_tbe +from .apply_ada_max import _apply_ada_max_tbe +from .apply_adadelta import _apply_adadelta_tbe +from .apply_adagrad import _apply_adagrad_tbe +from .apply_adagrad_v2 import _apply_adagrad_v2_tbe +from .approximate_equal import _approximate_equal_tbe from .adam_apply_one import _adam_apply_one_tbe from .assign import _assign_tbe from .assign_add import _assign_add_tbe @@ -83,6 
+91,7 @@ from .strided_slice_d import _strided_slice_d_tbe from .strided_slice_grad_d import _strided_slice_grad_d_tbe from .split_d import _split_d_tbe from .exp import _exp_tbe +from .expm1 import _expm1_tbe from .elu import _elu_tbe from .elu_grad import _elu_grad_tbe from .div import _div_tbe @@ -177,6 +186,7 @@ from .space_to_batch import _space_to_batch_tbe from .depth_to_space import _depth_to_space_tbe from .space_to_depth import _space_to_depth_tbe from .floor import _floor_tbe +from .ceil import _ceil_tbe from .log1p import _log1p_tbe from .resize_bilinear import _resize_bilinear_tbe from .resize_bilinear_grad import _resize_bilinear_grad_tbe @@ -193,6 +203,7 @@ from .sgd import _sgd_tbe from .lars_update import _lars_update_tbe from .arg_min import _arg_min_tbe from .bn_training_update_v2 import _bn_training_update_v2_tbe +from .bn_training_update_v3 import _bn_training_update_v3_tbe from .square_sum_all import _square_sum_all_tbe from .pack import _pack_tbe from .unpack import _unpack_tbe @@ -214,9 +225,9 @@ from .bessel_i0e import _bessel_i0e_tbe from .bessel_i1e import _bessel_i1e_tbe from .batch_to_space_nd import _batch_to_space_nd_tbe from .space_to_batch_nd import _space_to_batch_nd_tbe -from .bitwise_and import bitwise_and_op_info -from .bitwise_or import bitwise_or_op_info -from .bitwise_xor import bitwise_xor_op_info +from .bitwise_and import _bitwise_and_tbe +from .bitwise_or import _bitwise_or_tbe +from .bitwise_xor import _bitwise_xor_tbe from .reduce_all import _reduce_all_tbe from .sparse_apply_adagrad import _sparse_apply_adagrad_tbe from .unsorted_segment_min import _unsorted_segment_min_tbe @@ -224,10 +235,35 @@ from .asin import _asin_tbe from .asin_grad import _asin_grad_tbe from .asinh import _asinh_tbe from .asinh_grad import _asinh_grad_tbe +from .div_no_nan import _div_no_nan_tbe from .atan import _atan_tbe from .atan_grad import _atan_grad_tbe from .atanh import _atanh_tbe +from .cosh import _cosh_tbe +from .sinh import _sinh_tbe 
+from .inv import _inv_tbe +from .inv_grad import _inv_grad_tbe +from .invert import _invert_tbe from .basic_lstm_cell import _basic_lstm_cell_tbe from .basic_lstm_cell_c_state_grad import _basic_lstm_cell_c_state_grad_tbe from .basic_lstm_cell_weight_grad import _basic_lstm_cell_weight_grad_tbe from .basic_lstm_cell_input_grad import _basic_lstm_cell_input_grad_tbe +from .confusion_matrix import _confusion_matrix_tbe +from .broadcast_to import _broadcast_to_tbe +from .strided_read import _strided_read_tbe +from .strided_write import _strided_write_tbe +from .range import _range_tbe +from .fused_mul_add_n_l2loss import _fused_mul_add_n_l2loss_tbe +from .fused_mul_apply_momentum_extern import _fused_mul_apply_momentum_extern_tbe +from .lamb_next_right import _lamb_next_right_tbe +from .sparse_gather_v2 import _sparse_gather_v2_tbe +from .data_format_dim_map import _data_format_dim_map_tbe +from .histogram_fixed_width import _histogram_fixed_width_tbe +from .tensor_scatter_update import _tensor_scatter_update_tbe +from .inplace_update import _inplace_update_tbe +from .splitv import _split_v_tbe +from .in_top_k import _in_top_k_tbe +from .lin_space import _lin_space_tbe +from .matrix_diag import _matrix_diag_tbe +from .matrix_diag_part import _matrix_diag_part_tbe +from .matrix_set_diag import _matrix_set_diag_tbe diff --git a/mindspore/ops/_op_impl/tbe/abs.py b/mindspore/ops/_op_impl/tbe/abs.py index 30a75812bd..66c1d409fb 100644 --- a/mindspore/ops/_op_impl/tbe/abs.py +++ b/mindspore/ops/_op_impl/tbe/abs.py @@ -26,12 +26,9 @@ abs_op_info = TBERegOp("Abs") \ .op_pattern("formatAgnostic") \ .input(0, "x", None, "required", None) \ .output(0, "y", True, "required", "all") \ - .dtype_format(DataType.F16_Default, DataType.F16_Default) \ - .dtype_format(DataType.F16_5HD, DataType.F16_5HD) \ - .dtype_format(DataType.F32_Default, DataType.F32_Default) \ - .dtype_format(DataType.F32_5HD, DataType.F32_5HD) \ - .dtype_format(DataType.I32_Default, DataType.I32_Default) \ - 
.dtype_format(DataType.I32_5HD, DataType.I32_5HD) \ + .dtype_format(DataType.F16_None, DataType.F16_None) \ + .dtype_format(DataType.F32_None, DataType.F32_None) \ + .dtype_format(DataType.I32_None, DataType.I32_None) \ .get_op_info() diff --git a/mindspore/ops/_op_impl/tbe/abs_grad.py b/mindspore/ops/_op_impl/tbe/abs_grad.py index ba630f6570..3e7ac70d80 100644 --- a/mindspore/ops/_op_impl/tbe/abs_grad.py +++ b/mindspore/ops/_op_impl/tbe/abs_grad.py @@ -23,7 +23,6 @@ abs_grad_op_info = TBERegOp("AbsGrad") \ .compute_cost(10) \ .kernel_name("abs_grad") \ .partial_flag(True) \ - .op_pattern("formatAgnostic") \ .input(0, "y", None, "required", None) \ .input(1, "dy", None, "required", None) \ .output(0, "z", False, "required", "all") \ diff --git a/mindspore/ops/_op_impl/tbe/accumulate_n_v2.py b/mindspore/ops/_op_impl/tbe/accumulate_n_v2.py new file mode 100644 index 0000000000..fdd72a9494 --- /dev/null +++ b/mindspore/ops/_op_impl/tbe/accumulate_n_v2.py @@ -0,0 +1,41 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""AccumulateNV2 op""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +accumulate_n_v2_op_info = TBERegOp("AccumulateNV2") \ + .fusion_type("ELEMWISE") \ + .async_flag(False) \ + .binfile_name("accumulate_n_v2.so") \ + .compute_cost(10) \ + .kernel_name("accumulate_n_v2") \ + .partial_flag(True) \ + .attr("n", "required", "int", "all") \ + .input(0, "x", False, "dynamic", "all") \ + .output(0, "y", False, "required", "all") \ + .op_pattern("broadcast") \ + .dtype_format(DataType.F16_Default, DataType.F16_Default) \ + .dtype_format(DataType.F32_Default, DataType.F32_Default) \ + .dtype_format(DataType.I32_Default, DataType.I32_Default) \ + .dtype_format(DataType.I8_Default, DataType.I8_Default) \ + .dtype_format(DataType.U8_Default, DataType.U8_Default) \ + .get_op_info() + + +@op_info_register(accumulate_n_v2_op_info) +def _accumulate_n_v2_tbe(): + """AccumulateNV2 TBE register""" + return diff --git a/mindspore/ops/_op_impl/tbe/acos.py b/mindspore/ops/_op_impl/tbe/acos.py index 94dd8ba2bd..98516f4496 100644 --- a/mindspore/ops/_op_impl/tbe/acos.py +++ b/mindspore/ops/_op_impl/tbe/acos.py @@ -26,7 +26,9 @@ acos_op_info = TBERegOp("ACos") \ .op_pattern("formatAgnostic") \ .input(0, "x", False, "required", "all") \ .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.F16_Default, DataType.F16_Default) \ .dtype_format(DataType.F16_5HD, DataType.F16_5HD) \ + .dtype_format(DataType.F32_Default, DataType.F32_Default) \ .dtype_format(DataType.F32_5HD, DataType.F32_5HD) \ .get_op_info() diff --git a/mindspore/ops/_op_impl/tbe/acosh.py b/mindspore/ops/_op_impl/tbe/acosh.py index 6be222f115..0bf8755bc0 100644 --- a/mindspore/ops/_op_impl/tbe/acosh.py +++ b/mindspore/ops/_op_impl/tbe/acosh.py @@ -26,7 +26,9 @@ acosh_op_info = TBERegOp("Acosh") \ .op_pattern("formatAgnostic") \ .input(0, "x", False, "required", "all") \ .output(0, "y", False, 
"required", "all") \ + .dtype_format(DataType.F16_Default, DataType.F16_Default) \ .dtype_format(DataType.F16_5HD, DataType.F16_5HD) \ + .dtype_format(DataType.F32_Default, DataType.F32_Default) \ .dtype_format(DataType.F32_5HD, DataType.F32_5HD) \ .get_op_info() diff --git a/mindspore/ops/_op_impl/tbe/add.py b/mindspore/ops/_op_impl/tbe/add.py index 63e1efb1c6..d3db3de0ad 100644 --- a/mindspore/ops/_op_impl/tbe/add.py +++ b/mindspore/ops/_op_impl/tbe/add.py @@ -26,6 +26,7 @@ add_op_info = TBERegOp("Add") \ .input(0, "x1", False, "required", "all") \ .input(1, "x2", False, "required", "all") \ .output(0, "y", False, "required", "all") \ + .op_pattern("dynamicFormat") \ .dtype_format(DataType.I32_Default, DataType.I32_Default, DataType.I32_Default) \ .dtype_format(DataType.I32_5HD, DataType.I32_5HD, DataType.I32_5HD) \ .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.F16_Default) \ diff --git a/mindspore/ops/_op_impl/tbe/add_n.py b/mindspore/ops/_op_impl/tbe/add_n.py index 3e8a6c0016..1c42b4bb2d 100644 --- a/mindspore/ops/_op_impl/tbe/add_n.py +++ b/mindspore/ops/_op_impl/tbe/add_n.py @@ -26,17 +26,10 @@ add_n_op_info = TBERegOp("AddN") \ .attr("n", "required", "int", "all") \ .input(0, "x", False, "dynamic", "all") \ .output(0, "y", False, "required", "all") \ - .dtype_format(DataType.F16_Default, DataType.F16_Default) \ - .dtype_format(DataType.F16_5HD, DataType.F16_5HD) \ - .dtype_format(DataType.F16_FracZ, DataType.F16_FracZ) \ - .dtype_format(DataType.F16_FracNZ, DataType.F16_FracNZ) \ - .dtype_format(DataType.F32_Default, DataType.F32_Default) \ - .dtype_format(DataType.F32_5HD, DataType.F32_5HD) \ - .dtype_format(DataType.F32_FracZ, DataType.F32_FracZ) \ - .dtype_format(DataType.F32_FracNZ, DataType.F32_FracNZ) \ - .dtype_format(DataType.I32_Default, DataType.I32_Default) \ - .dtype_format(DataType.I32_5HD, DataType.I32_5HD) \ - .dtype_format(DataType.I32_FracZ, DataType.I32_FracZ) \ + .op_pattern("broadcast") \ + 
.dtype_format(DataType.F16_None, DataType.F16_None) \ + .dtype_format(DataType.F32_None, DataType.F32_None) \ + .dtype_format(DataType.I32_None, DataType.I32_None) \ .get_op_info() diff --git a/mindspore/ops/_op_impl/tbe/apply_ada_max.py b/mindspore/ops/_op_impl/tbe/apply_ada_max.py new file mode 100644 index 0000000000..8394623bbf --- /dev/null +++ b/mindspore/ops/_op_impl/tbe/apply_ada_max.py @@ -0,0 +1,68 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""ApplyAdaMaxD op""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +apply_ada_max_d_op_info = TBERegOp("ApplyAdaMax") \ + .fusion_type("OPAQUE") \ + .async_flag(False) \ + .binfile_name("apply_ada_max_d.so") \ + .compute_cost(10) \ + .kernel_name("apply_ada_max_d") \ + .partial_flag(True) \ + .input(0, "var", False, "required", "all") \ + .input(1, "m", False, "required", "all") \ + .input(2, "v", False, "required", "all") \ + .input(3, "beta1_power", False, "required", "all") \ + .input(4, "lr", False, "required", "all") \ + .input(5, "beta1", False, "required", "all") \ + .input(6, "beta2", False, "required", "all") \ + .input(7, "epsilon", False, "required", "all") \ + .input(8, "grad", False, "required", "all") \ + .output(0, "var", False, "required", "all") \ + .output(1, "m", False, "required", "all") \ + .output(2, "v", False, "required", "all") \ + .dtype_format(DataType.F16_5HD, DataType.F16_5HD, DataType.F16_5HD, DataType.F16_Default, + DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, + DataType.F16_5HD, DataType.F16_5HD, DataType.F16_5HD, DataType.F16_5HD) \ + .dtype_format(DataType.F16_FracZ, DataType.F16_FracZ, DataType.F16_FracZ, DataType.F16_Default, + DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, + DataType.F16_FracZ, DataType.F16_FracZ, DataType.F16_FracZ, DataType.F16_FracZ) \ + .dtype_format(DataType.F16_C1HWNCoC0, DataType.F16_C1HWNCoC0, DataType.F16_C1HWNCoC0, DataType.F16_Default, + DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, + DataType.F16_C1HWNCoC0, DataType.F16_C1HWNCoC0, DataType.F16_C1HWNCoC0, DataType.F16_C1HWNCoC0) \ + .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, + DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, 
DataType.F16_Default, + DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, DataType.F16_Default) \ + .dtype_format(DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD, DataType.F32_Default, + DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, + DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD) \ + .dtype_format(DataType.F32_FracZ, DataType.F32_FracZ, DataType.F32_FracZ, DataType.F32_Default, + DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, + DataType.F32_FracZ, DataType.F32_FracZ, DataType.F32_FracZ, DataType.F32_FracZ) \ + .dtype_format(DataType.F32_C1HWNCoC0, DataType.F32_C1HWNCoC0, DataType.F32_C1HWNCoC0, DataType.F32_Default, + DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, + DataType.F32_C1HWNCoC0, DataType.F32_C1HWNCoC0, DataType.F32_C1HWNCoC0, DataType.F32_C1HWNCoC0) \ + .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, + DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, + DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, DataType.F32_Default) \ + .get_op_info() + + +@op_info_register(apply_ada_max_d_op_info) +def _apply_ada_max_tbe(): + """ApplyAdaMaxD TBE register""" + return diff --git a/mindspore/ops/_op_impl/tbe/apply_adadelta.py b/mindspore/ops/_op_impl/tbe/apply_adadelta.py new file mode 100644 index 0000000000..a5c76b62cc --- /dev/null +++ b/mindspore/ops/_op_impl/tbe/apply_adadelta.py @@ -0,0 +1,66 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""ApplyAdadeltaD op""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +apply_adadelta_d_op_info = TBERegOp("ApplyAdadelta") \ + .fusion_type("OPAQUE") \ + .async_flag(False) \ + .binfile_name("apply_adadelta_d.so") \ + .compute_cost(10) \ + .kernel_name("apply_adadelta_d") \ + .partial_flag(True) \ + .input(0, "var", False, "required", "all") \ + .input(1, "accum", False, "required", "all") \ + .input(2, "accum_update", False, "required", "all") \ + .input(3, "lr", False, "required", "all") \ + .input(4, "rho", False, "required", "all") \ + .input(5, "epsilon", False, "required", "all") \ + .input(6, "grad", False, "required", "all") \ + .output(0, "var", False, "required", "all") \ + .output(1, "accum", False, "required", "all") \ + .output(2, "accum_update", False, "required", "all") \ + .dtype_format(DataType.F16_5HD, DataType.F16_5HD, DataType.F16_5HD, DataType.F16_Default, + DataType.F16_Default, DataType.F16_Default, DataType.F16_5HD, DataType.F16_5HD, + DataType.F16_5HD, DataType.F16_5HD) \ + .dtype_format(DataType.F16_FracZ, DataType.F16_FracZ, DataType.F16_FracZ, DataType.F16_Default, + DataType.F16_Default, DataType.F16_Default, DataType.F16_FracZ, DataType.F16_FracZ, + DataType.F16_FracZ, DataType.F16_FracZ) \ + .dtype_format(DataType.F16_C1HWNCoC0, DataType.F16_C1HWNCoC0, DataType.F16_C1HWNCoC0, DataType.F16_Default, + DataType.F16_Default, DataType.F16_Default, DataType.F16_C1HWNCoC0, DataType.F16_C1HWNCoC0, + 
DataType.F16_C1HWNCoC0, DataType.F16_C1HWNCoC0) \ + .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, + DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, + DataType.F16_Default, DataType.F16_Default) \ + .dtype_format(DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD, DataType.F32_Default, + DataType.F32_Default, DataType.F32_Default, DataType.F32_5HD, DataType.F32_5HD, + DataType.F32_5HD, DataType.F32_5HD) \ + .dtype_format(DataType.F32_FracZ, DataType.F32_FracZ, DataType.F32_FracZ, DataType.F32_Default, + DataType.F32_Default, DataType.F32_Default, DataType.F32_FracZ, DataType.F32_FracZ, + DataType.F32_FracZ, DataType.F32_FracZ) \ + .dtype_format(DataType.F32_C1HWNCoC0, DataType.F32_C1HWNCoC0, DataType.F32_C1HWNCoC0, DataType.F32_Default, + DataType.F32_Default, DataType.F32_Default, DataType.F32_C1HWNCoC0, DataType.F32_C1HWNCoC0, + DataType.F32_C1HWNCoC0, DataType.F32_C1HWNCoC0) \ + .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, + DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, + DataType.F32_Default, DataType.F32_Default) \ + .get_op_info() + + +@op_info_register(apply_adadelta_d_op_info) +def _apply_adadelta_tbe(): + """ApplyAdadeltaD TBE register""" + return diff --git a/mindspore/ops/_op_impl/tbe/apply_adagrad.py b/mindspore/ops/_op_impl/tbe/apply_adagrad.py new file mode 100644 index 0000000000..6b9975a479 --- /dev/null +++ b/mindspore/ops/_op_impl/tbe/apply_adagrad.py @@ -0,0 +1,55 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""ApplyAdagradD op""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +apply_adagrad_d_op_info = TBERegOp("ApplyAdagrad") \ + .fusion_type("OPAQUE") \ + .async_flag(False) \ + .binfile_name("apply_adagrad_d.so") \ + .compute_cost(10) \ + .kernel_name("apply_adagrad_d") \ + .partial_flag(True) \ + .attr("update_slots", "optional", "bool", "true,false", "false") \ + .input(0, "var", False, "required", "all") \ + .input(1, "accum", False, "required", "all") \ + .input(2, "lr", False, "required", "all") \ + .input(3, "grad", False, "required", "all") \ + .output(0, "var", False, "required", "all") \ + .output(1, "accum", False, "required", "all") \ + .dtype_format(DataType.F16_5HD, DataType.F16_5HD, DataType.F16_Default, DataType.F16_5HD, + DataType.F16_5HD, DataType.F16_5HD) \ + .dtype_format(DataType.F16_FracZ, DataType.F16_FracZ, DataType.F16_Default, DataType.F16_FracZ, + DataType.F16_FracZ, DataType.F16_FracZ) \ + .dtype_format(DataType.F16_C1HWNCoC0, DataType.F16_C1HWNCoC0, DataType.F16_Default, DataType.F16_C1HWNCoC0, + DataType.F16_C1HWNCoC0, DataType.F16_C1HWNCoC0) \ + .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, + DataType.F16_Default, DataType.F16_Default) \ + .dtype_format(DataType.F32_5HD, DataType.F32_5HD, DataType.F32_Default, DataType.F32_5HD, + DataType.F32_5HD, DataType.F32_5HD) \ + .dtype_format(DataType.F32_FracZ, DataType.F32_FracZ, DataType.F32_Default, 
DataType.F32_FracZ, + DataType.F32_FracZ, DataType.F32_FracZ) \ + .dtype_format(DataType.F32_C1HWNCoC0, DataType.F32_C1HWNCoC0, DataType.F32_Default, DataType.F32_C1HWNCoC0, + DataType.F32_C1HWNCoC0, DataType.F32_C1HWNCoC0) \ + .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, + DataType.F32_Default, DataType.F32_Default) \ + .get_op_info() + + +@op_info_register(apply_adagrad_d_op_info) +def _apply_adagrad_tbe(): + """ApplyAdagradD TBE register""" + return diff --git a/mindspore/ops/_op_impl/tbe/apply_adagrad_v2.py b/mindspore/ops/_op_impl/tbe/apply_adagrad_v2.py new file mode 100644 index 0000000000..fbaf51e643 --- /dev/null +++ b/mindspore/ops/_op_impl/tbe/apply_adagrad_v2.py @@ -0,0 +1,56 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""ApplyAdagradV2D op""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +apply_adagrad_v2_d_op_info = TBERegOp("ApplyAdagradV2") \ + .fusion_type("OPAQUE") \ + .async_flag(False) \ + .binfile_name("apply_adagradv2_d.so") \ + .compute_cost(10) \ + .kernel_name("apply_adagradv2_d") \ + .partial_flag(True) \ + .attr("epsilon", "required", "float", "all") \ + .attr("update_slots", "optional", "bool", "true,false", "false") \ + .input(0, "var", False, "required", "all") \ + .input(1, "accum", False, "required", "all") \ + .input(2, "lr", False, "required", "all") \ + .input(3, "grad", False, "required", "all") \ + .output(0, "var", False, "required", "all") \ + .output(1, "accum", False, "required", "all") \ + .dtype_format(DataType.F16_5HD, DataType.F16_5HD, DataType.F16_Default, DataType.F16_5HD, + DataType.F16_5HD, DataType.F16_5HD) \ + .dtype_format(DataType.F16_FracZ, DataType.F16_FracZ, DataType.F16_Default, DataType.F16_FracZ, + DataType.F16_FracZ, DataType.F16_FracZ) \ + .dtype_format(DataType.F16_C1HWNCoC0, DataType.F16_C1HWNCoC0, DataType.F16_Default, DataType.F16_C1HWNCoC0, + DataType.F16_C1HWNCoC0, DataType.F16_C1HWNCoC0) \ + .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, + DataType.F16_Default, DataType.F16_Default) \ + .dtype_format(DataType.F32_5HD, DataType.F32_5HD, DataType.F32_Default, DataType.F32_5HD, + DataType.F32_5HD, DataType.F32_5HD) \ + .dtype_format(DataType.F32_FracZ, DataType.F32_FracZ, DataType.F32_Default, DataType.F32_FracZ, + DataType.F32_FracZ, DataType.F32_FracZ) \ + .dtype_format(DataType.F32_C1HWNCoC0, DataType.F32_C1HWNCoC0, DataType.F32_Default, DataType.F32_C1HWNCoC0, + DataType.F32_C1HWNCoC0, DataType.F32_C1HWNCoC0) \ + .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, + DataType.F32_Default, 
DataType.F32_Default) \ + .get_op_info() + + +@op_info_register(apply_adagrad_v2_d_op_info) +def _apply_adagrad_v2_tbe(): + """ApplyAdagradV2D TBE register""" + return diff --git a/mindspore/ops/_op_impl/tbe/apply_ftrl.py b/mindspore/ops/_op_impl/tbe/apply_ftrl.py index e37648191e..56c6bf3612 100644 --- a/mindspore/ops/_op_impl/tbe/apply_ftrl.py +++ b/mindspore/ops/_op_impl/tbe/apply_ftrl.py @@ -32,30 +32,32 @@ apply_ftrl_op_info = TBERegOp("ApplyFtrl") \ .input(6, "l2", False, "required", "all") \ .input(7, "lr_power", False, "required", "all") \ .output(0, "var", False, "required", "all") \ + .output(1, "accum", False, "required", "all") \ + .output(2, "linear", False, "required", "all") \ .dtype_format(DataType.F16_5HD, DataType.F16_5HD, DataType.F16_5HD, DataType.F16_5HD, DataType.F16_5HD, DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, - DataType.F16_5HD) \ + DataType.F16_5HD, DataType.F16_5HD, DataType.F16_5HD) \ .dtype_format(DataType.F16_FracZ, DataType.F16_FracZ, DataType.F16_FracZ, DataType.F16_FracZ, DataType.F16_FracZ, DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, - DataType.F16_FracZ) \ + DataType.F16_FracZ, DataType.F16_FracZ, DataType.F16_FracZ) \ .dtype_format(DataType.F16_C1HWNCoC0, DataType.F16_C1HWNCoC0, DataType.F16_C1HWNCoC0, DataType.F16_C1HWNCoC0, DataType.F16_C1HWNCoC0, DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, - DataType.F16_C1HWNCoC0) \ + DataType.F16_C1HWNCoC0, DataType.F16_C1HWNCoC0, DataType.F16_C1HWNCoC0) \ .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, - DataType.F16_Default) \ + DataType.F16_Default, DataType.F16_Default, DataType.F16_Default) \ .dtype_format(DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD, DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, - DataType.F32_5HD) 
\ + DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD) \ .dtype_format(DataType.F32_FracZ, DataType.F32_FracZ, DataType.F32_FracZ, DataType.F32_FracZ, DataType.F32_FracZ, DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, - DataType.F32_FracZ) \ + DataType.F32_FracZ, DataType.F32_FracZ, DataType.F32_FracZ) \ .dtype_format(DataType.F32_C1HWNCoC0, DataType.F32_C1HWNCoC0, DataType.F32_C1HWNCoC0, DataType.F32_C1HWNCoC0, DataType.F32_C1HWNCoC0, DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, - DataType.F32_C1HWNCoC0) \ + DataType.F32_C1HWNCoC0, DataType.F32_C1HWNCoC0, DataType.F32_C1HWNCoC0) \ .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, - DataType.F32_Default) \ + DataType.F32_Default, DataType.F32_Default, DataType.F32_Default) \ .get_op_info() diff --git a/mindspore/ops/_op_impl/tbe/apply_momentum.py b/mindspore/ops/_op_impl/tbe/apply_momentum.py index 42ce9d0e41..deb8f0d387 100644 --- a/mindspore/ops/_op_impl/tbe/apply_momentum.py +++ b/mindspore/ops/_op_impl/tbe/apply_momentum.py @@ -30,22 +30,23 @@ apply_momentum_op_info = TBERegOp("ApplyMomentum") \ .input(3, "grad", False, "required", "all") \ .input(4, "momentum", False, "required", "all") \ .output(0, "var", False, "required", "all") \ + .output(1, "accum", False, "required", "all") \ .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, - DataType.F16_Default, DataType.F16_Default) \ + DataType.F16_Default, DataType.F16_Default, DataType.F16_Default) \ .dtype_format(DataType.F16_5HD, DataType.F16_5HD, DataType.F16_Default, DataType.F16_5HD, - DataType.F16_Default, DataType.F16_5HD) \ + DataType.F16_Default, DataType.F16_5HD, DataType.F16_5HD) \ .dtype_format(DataType.F16_C1HWNCoC0, DataType.F16_C1HWNCoC0, DataType.F16_Default, DataType.F16_C1HWNCoC0, - DataType.F16_Default, 
DataType.F16_C1HWNCoC0) \ + DataType.F16_Default, DataType.F16_C1HWNCoC0, DataType.F16_C1HWNCoC0) \ .dtype_format(DataType.F16_FracZ, DataType.F16_FracZ, DataType.F16_Default, DataType.F16_FracZ, - DataType.F16_Default, DataType.F16_FracZ) \ + DataType.F16_Default, DataType.F16_FracZ, DataType.F16_FracZ) \ .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, - DataType.F32_Default, DataType.F32_Default) \ + DataType.F32_Default, DataType.F32_Default, DataType.F32_Default) \ .dtype_format(DataType.F32_5HD, DataType.F32_5HD, DataType.F32_Default, DataType.F32_5HD, - DataType.F32_Default, DataType.F32_5HD) \ + DataType.F32_Default, DataType.F32_5HD, DataType.F32_5HD) \ .dtype_format(DataType.F32_C1HWNCoC0, DataType.F32_C1HWNCoC0, DataType.F32_Default, DataType.F32_C1HWNCoC0, - DataType.F32_Default, DataType.F32_C1HWNCoC0) \ + DataType.F32_Default, DataType.F32_C1HWNCoC0, DataType.F32_C1HWNCoC0) \ .dtype_format(DataType.F32_FracZ, DataType.F32_FracZ, DataType.F32_Default, DataType.F32_FracZ, - DataType.F32_Default, DataType.F32_FracZ) \ + DataType.F32_Default, DataType.F32_FracZ, DataType.F32_FracZ) \ .get_op_info() diff --git a/mindspore/ops/_op_impl/tbe/apply_proximal_adagrad.py b/mindspore/ops/_op_impl/tbe/apply_proximal_adagrad.py index 9099c6e24f..c9b8adf4f4 100644 --- a/mindspore/ops/_op_impl/tbe/apply_proximal_adagrad.py +++ b/mindspore/ops/_op_impl/tbe/apply_proximal_adagrad.py @@ -13,15 +13,15 @@ # limitations under the License. 
# ============================================================================ -"""ApplyProximalAdagrad op""" +"""ApplyProximalAdagradD op""" from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType -apply_proximal_adagrad_op_info = TBERegOp("ApplyProximalAdagrad") \ +apply_proximal_adagrad_d_op_info = TBERegOp("ApplyProximalAdagrad") \ .fusion_type("OPAQUE") \ .async_flag(False) \ - .binfile_name("apply_proximal_adagrad.so") \ + .binfile_name("apply_proximal_adagrad_d.so") \ .compute_cost(10) \ - .kernel_name("apply_proximal_adagrad") \ + .kernel_name("apply_proximal_adagrad_d") \ .partial_flag(True) \ .attr("use_locking", "optional", "bool", "true,false", "false") \ .input(0, "var", False, "required", "all") \ @@ -31,26 +31,27 @@ apply_proximal_adagrad_op_info = TBERegOp("ApplyProximalAdagrad") \ .input(4, "l2", False, "required", "all") \ .input(5, "grad", False, "required", "all") \ .output(0, "var", False, "required", "all") \ + .output(1, "accum", False, "required", "all") \ .dtype_format(DataType.F16_5HD, DataType.F16_5HD, DataType.F16_Default, DataType.F16_Default, - DataType.F16_Default, DataType.F16_5HD, DataType.F16_5HD) \ + DataType.F16_Default, DataType.F16_5HD, DataType.F16_5HD, DataType.F16_5HD) \ .dtype_format(DataType.F16_C1HWNCoC0, DataType.F16_C1HWNCoC0, DataType.F16_Default, DataType.F16_Default, - DataType.F16_Default, DataType.F16_C1HWNCoC0, DataType.F16_C1HWNCoC0) \ + DataType.F16_Default, DataType.F16_C1HWNCoC0, DataType.F16_C1HWNCoC0, DataType.F16_C1HWNCoC0) \ .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, - DataType.F16_Default, DataType.F16_Default, DataType.F16_Default) \ + DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, DataType.F16_Default) \ .dtype_format(DataType.F16_FracZ, DataType.F16_FracZ, DataType.F16_Default, DataType.F16_Default, - DataType.F16_Default, DataType.F16_FracZ, DataType.F16_FracZ) \ + DataType.F16_Default, 
DataType.F16_FracZ, DataType.F16_FracZ, DataType.F16_FracZ) \ .dtype_format(DataType.F32_5HD, DataType.F32_5HD, DataType.F32_Default, DataType.F32_Default, - DataType.F32_Default, DataType.F32_5HD, DataType.F32_5HD) \ + DataType.F32_Default, DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD) \ .dtype_format(DataType.F32_C1HWNCoC0, DataType.F32_C1HWNCoC0, DataType.F32_Default, DataType.F32_Default, - DataType.F32_Default, DataType.F32_C1HWNCoC0, DataType.F32_C1HWNCoC0) \ + DataType.F32_Default, DataType.F32_C1HWNCoC0, DataType.F32_C1HWNCoC0, DataType.F32_C1HWNCoC0) \ .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, - DataType.F32_Default, DataType.F32_Default, DataType.F32_Default) \ + DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, DataType.F32_Default) \ .dtype_format(DataType.F32_FracZ, DataType.F32_FracZ, DataType.F32_Default, DataType.F32_Default, - DataType.F32_Default, DataType.F32_FracZ, DataType.F32_FracZ) \ + DataType.F32_Default, DataType.F32_FracZ, DataType.F32_FracZ, DataType.F32_FracZ) \ .get_op_info() -@op_info_register(apply_proximal_adagrad_op_info) +@op_info_register(apply_proximal_adagrad_d_op_info) def _apply_proximal_adagrad(): - """ApplyProximalAdagrad TBE register""" + """ApplyProximalAdagradD TBE register""" return diff --git a/mindspore/ops/_op_impl/tbe/approximate_equal.py b/mindspore/ops/_op_impl/tbe/approximate_equal.py new file mode 100644 index 0000000000..62b8a0c16d --- /dev/null +++ b/mindspore/ops/_op_impl/tbe/approximate_equal.py @@ -0,0 +1,41 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""ApproximateEqual op""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +approximate_equal_op_info = TBERegOp("ApproximateEqual") \ + .fusion_type("ELEMWISE") \ + .async_flag(False) \ + .binfile_name("approximate_equal.so") \ + .compute_cost(10) \ + .kernel_name("approximate_equal") \ + .partial_flag(True) \ + .op_pattern("broadcast") \ + .attr("tolerance", "optional", "float", "all") \ + .input(0, "x1", False, "required", "all") \ + .input(1, "x2", False, "required", "all") \ + .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.BOOL_Default) \ + .dtype_format(DataType.F16_5HD, DataType.F16_5HD, DataType.BOOL_5HD) \ + .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.BOOL_Default) \ + .dtype_format(DataType.F32_5HD, DataType.F32_5HD, DataType.BOOL_5HD) \ + .get_op_info() + + +@op_info_register(approximate_equal_op_info) +def _approximate_equal_tbe(): + """ApproximateEqual TBE register""" + return diff --git a/mindspore/ops/_op_impl/tbe/atan.py b/mindspore/ops/_op_impl/tbe/atan.py index 9562c573e3..293839eaf0 100644 --- a/mindspore/ops/_op_impl/tbe/atan.py +++ b/mindspore/ops/_op_impl/tbe/atan.py @@ -26,7 +26,9 @@ atan_op_info = TBERegOp("Atan") \ .op_pattern("formatAgnostic") \ .input(0, "x", False, "required", "all") \ .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.F16_Default, DataType.F16_Default) \ .dtype_format(DataType.F16_5HD, 
DataType.F16_5HD) \ + .dtype_format(DataType.F32_Default, DataType.F32_Default) \ .dtype_format(DataType.F32_5HD, DataType.F32_5HD) \ .get_op_info() diff --git a/mindspore/ops/_op_impl/tbe/atan2.py b/mindspore/ops/_op_impl/tbe/atan2.py index 30bea25d70..26ffdcb59a 100644 --- a/mindspore/ops/_op_impl/tbe/atan2.py +++ b/mindspore/ops/_op_impl/tbe/atan2.py @@ -27,7 +27,9 @@ atan2_op_info = TBERegOp("Atan2") \ .input(0, "x1", False, "required", "all") \ .input(1, "x2", False, "required", "all") \ .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.F16_Default) \ .dtype_format(DataType.F16_5HD, DataType.F16_5HD, DataType.F16_5HD) \ + .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default) \ .dtype_format(DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD) \ .get_op_info() diff --git a/mindspore/ops/_op_impl/tbe/atanh.py b/mindspore/ops/_op_impl/tbe/atanh.py index d88e0d6105..f60b01967c 100644 --- a/mindspore/ops/_op_impl/tbe/atanh.py +++ b/mindspore/ops/_op_impl/tbe/atanh.py @@ -26,7 +26,9 @@ atanh_op_info = TBERegOp("Atanh") \ .op_pattern("formatAgnostic") \ .input(0, "x", False, "required", "all") \ .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.F16_Default, DataType.F16_Default) \ .dtype_format(DataType.F16_5HD, DataType.F16_5HD) \ + .dtype_format(DataType.F32_Default, DataType.F32_Default) \ .dtype_format(DataType.F32_5HD, DataType.F32_5HD) \ .get_op_info() diff --git a/mindspore/ops/_op_impl/tbe/atomic_addr_clean.py b/mindspore/ops/_op_impl/tbe/atomic_addr_clean.py index e707a1f26f..98662fed91 100644 --- a/mindspore/ops/_op_impl/tbe/atomic_addr_clean.py +++ b/mindspore/ops/_op_impl/tbe/atomic_addr_clean.py @@ -23,7 +23,7 @@ atomic_addr_clean_op_info = TBERegOp("AtomicAddrClean") \ .compute_cost(10) \ .kernel_name("atomic_addr_clean") \ .partial_flag(True) \ - .attr("automic_add_mem_size", "required", "listInt", "all") \ + .attr("automic_add_mem_size", 
"required", "listUInt64", "all") \ .get_op_info() diff --git a/mindspore/ops/_op_impl/tbe/batch_matmul.py b/mindspore/ops/_op_impl/tbe/batch_matmul.py index 4efcf8031c..02f2dd5880 100644 --- a/mindspore/ops/_op_impl/tbe/batch_matmul.py +++ b/mindspore/ops/_op_impl/tbe/batch_matmul.py @@ -29,6 +29,7 @@ batch_matmul_op_info = TBERegOp("BatchMatMul") \ .input(1, "x2", False, "required", "all") \ .input(2, "bias", False, "optional", "all") \ .output(0, "y", False, "required", "all") \ + .op_pattern("dynamicFormat") \ .dtype_format(DataType.I32_Default, DataType.I32_Default, DataType.I32_Default, DataType.I32_Default) \ .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, DataType.F16_Default) \ .dtype_format(DataType.F16_FracNZ, DataType.F16_FracNZ, DataType.F16_Default, DataType.F16_FracNZ) \ diff --git a/mindspore/ops/_op_impl/tbe/bias_add.py b/mindspore/ops/_op_impl/tbe/bias_add.py index 24607af141..5ab1916299 100644 --- a/mindspore/ops/_op_impl/tbe/bias_add.py +++ b/mindspore/ops/_op_impl/tbe/bias_add.py @@ -27,6 +27,7 @@ bias_add_grad_op_info = TBERegOp("BiasAdd") \ .input(0, "x", False, "required", "all") \ .input(1, "bias", False, "required", "all") \ .output(0, "y", False, "required", "all") \ + .op_pattern("dynamicFormat") \ .dtype_format(DataType.I32_Default, DataType.I32_Default, DataType.I32_Default) \ .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.F16_Default) \ .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default) \ diff --git a/mindspore/ops/_op_impl/tbe/bias_add_grad.py b/mindspore/ops/_op_impl/tbe/bias_add_grad.py index 557dececb7..e59c197bce 100644 --- a/mindspore/ops/_op_impl/tbe/bias_add_grad.py +++ b/mindspore/ops/_op_impl/tbe/bias_add_grad.py @@ -26,6 +26,8 @@ bias_add_grad_op_info = TBERegOp("BiasAddGrad") \ .attr("data_format", "required", "str", "all") \ .input(0, "output_backprop", False, "required", "all") \ .output(0, "output", False, "required", "all") \ + 
.dtype_format(DataType.F16_Default, DataType.F16_Default) \ + .dtype_format(DataType.F16_FracNZ, DataType.F16_Default) \ .dtype_format(DataType.F32_Default, DataType.F32_Default) \ .dtype_format(DataType.F32_FracNZ, DataType.F32_Default) \ .get_op_info() diff --git a/mindspore/ops/_op_impl/tbe/bn_training_reduce.py b/mindspore/ops/_op_impl/tbe/bn_training_reduce.py index e19d4b65ff..f33cba2110 100644 --- a/mindspore/ops/_op_impl/tbe/bn_training_reduce.py +++ b/mindspore/ops/_op_impl/tbe/bn_training_reduce.py @@ -26,6 +26,7 @@ bn_training_reduce_op_info = TBERegOp("BNTrainingReduce") \ .input(0, "x", False, "required", "all", reshape_type="NC") \ .output(0, "sum", False, "required", "all") \ .output(1, "square_sum", False, "required", "all") \ + .op_pattern("dynamicFormat") \ .dtype_format(DataType.F16_5HD, DataType.F32_5HD, DataType.F32_5HD) \ .dtype_format(DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD) \ .get_op_info() diff --git a/mindspore/ops/_op_impl/tbe/bn_training_reduce_grad.py b/mindspore/ops/_op_impl/tbe/bn_training_reduce_grad.py index 66dc55ab10..89736a0097 100644 --- a/mindspore/ops/_op_impl/tbe/bn_training_reduce_grad.py +++ b/mindspore/ops/_op_impl/tbe/bn_training_reduce_grad.py @@ -32,6 +32,7 @@ bn_training_reduce_grad_op_info = TBERegOp("BNTrainingReduceGrad") \ .input(5, "batch_mean", False, "required", "all") \ .input(6, "batch_variance", False, "required", "all") \ .output(0, "y", False, "required", "all", reshape_type="NC") \ + .op_pattern("dynamicFormat") \ .dtype_format(DataType.F16_5HD, DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD, DataType.F16_5HD) \ .dtype_format(DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD, diff --git a/mindspore/ops/_op_impl/tbe/bn_training_update_grad.py b/mindspore/ops/_op_impl/tbe/bn_training_update_grad.py index 5098923281..1aa822a3c1 100644 --- a/mindspore/ops/_op_impl/tbe/bn_training_update_grad.py +++ 
b/mindspore/ops/_op_impl/tbe/bn_training_update_grad.py @@ -30,6 +30,7 @@ bn_training_update_grad_op_info = TBERegOp("BNTrainingUpdateGrad") \ .input(3, "batch_variance", False, "required", "all") \ .output(0, "diff_scale", False, "required", "all") \ .output(1, "diff_offset", False, "required", "all") \ + .op_pattern("dynamicFormat") \ .dtype_format(DataType.F16_5HD, DataType.F16_5HD, DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD) \ .dtype_format(DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD, diff --git a/mindspore/ops/_op_impl/tbe/bn_training_update_v2.py b/mindspore/ops/_op_impl/tbe/bn_training_update_v2.py index 03a51664e8..a54d91a483 100644 --- a/mindspore/ops/_op_impl/tbe/bn_training_update_v2.py +++ b/mindspore/ops/_op_impl/tbe/bn_training_update_v2.py @@ -32,6 +32,7 @@ bn_training_update_v2_op_info = TBERegOp("BNTrainingUpdateV2") \ .output(0, "y", False, "required", "all", reshape_type="NC") \ .output(1, "batch_mean", False, "required", "all") \ .output(2, "batch_variance", False, "required", "all") \ + .op_pattern("dynamicFormat") \ .dtype_format(DataType.F16_5HD, DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD, DataType.F16_5HD, DataType.F32_5HD, DataType.F32_5HD) \ diff --git a/mindspore/ops/_op_impl/tbe/bn_training_update_v3.py b/mindspore/ops/_op_impl/tbe/bn_training_update_v3.py new file mode 100644 index 0000000000..6d69c6e4be --- /dev/null +++ b/mindspore/ops/_op_impl/tbe/bn_training_update_v3.py @@ -0,0 +1,51 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""BNTrainingUpdateV3 op""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +bn_training_update_v3_op_info = TBERegOp("BNTrainingUpdateV3") \ + .fusion_type("OPAQUE") \ + .async_flag(False) \ + .binfile_name("bn_training_update_v3.so") \ + .compute_cost(10) \ + .kernel_name("bn_training_update_v3") \ + .partial_flag(True) \ + .attr("epsilon", "required", "float", "all") \ + .input(0, "x", False, "required", "all", reshape_type="NC") \ + .input(1, "sum", False, "required", "all") \ + .input(2, "square_sum", False, "required", "all") \ + .input(3, "scale", False, "required", "all") \ + .input(4, "offset", False, "required", "all") \ + .output(0, "y", False, "required", "all", reshape_type="NC") \ + .output(1, "batch_mean", False, "required", "all") \ + .output(2, "batch_variance", False, "required", "all") \ + .output(3, "reserve_1", False, "required", "all") \ + .output(4, "reserve_2", False, "required", "all") \ + .dtype_format(DataType.F16_5HD, DataType.F32_5HD, DataType.F32_5HD, + DataType.F32_5HD, DataType.F32_5HD, DataType.F16_5HD, + DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD, + DataType.F32_5HD) \ + .dtype_format(DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD, + DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD, + DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD, + DataType.F32_5HD) \ + .get_op_info() + + +@op_info_register(bn_training_update_v3_op_info) +def _bn_training_update_v3_tbe(): + """BNTrainingUpdateV3 
TBE register""" + return diff --git a/mindspore/ops/_op_impl/tbe/broadcast_to.py b/mindspore/ops/_op_impl/tbe/broadcast_to.py new file mode 100644 index 0000000000..5d4b642017 --- /dev/null +++ b/mindspore/ops/_op_impl/tbe/broadcast_to.py @@ -0,0 +1,40 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""BroadcastTo op""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +broadcast_to_op_info = TBERegOp("BroadcastTo") \ + .fusion_type("OPAQUE") \ + .async_flag(False) \ + .binfile_name("broadcast_to_d.so") \ + .compute_cost(10) \ + .kernel_name("broadcast_to_d") \ + .partial_flag(True) \ + .attr("shape", "required", "listInt", "all") \ + .input(0, "x", False, "required", "all") \ + .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.F16_Default, DataType.F16_Default) \ + .dtype_format(DataType.F32_Default, DataType.F32_Default) \ + .dtype_format(DataType.I32_Default, DataType.I32_Default) \ + .dtype_format(DataType.I8_Default, DataType.I8_Default) \ + .dtype_format(DataType.U8_Default, DataType.U16_Default) \ + .get_op_info() + + +@op_info_register(broadcast_to_op_info) +def _broadcast_to_tbe(): + """BroadcastTo TBE register""" + return diff --git a/mindspore/ops/_op_impl/tbe/cast.py b/mindspore/ops/_op_impl/tbe/cast.py index 07e14139da..0a809e28a7 100644 --- 
a/mindspore/ops/_op_impl/tbe/cast.py +++ b/mindspore/ops/_op_impl/tbe/cast.py @@ -26,32 +26,27 @@ cast_op_info = TBERegOp("Cast") \ .attr("dst_type", "required", "int", "all") \ .input(0, "x", False, "required", "all") \ .output(0, "y", False, "required", "all") \ - .dtype_format(DataType.BOOL_Default, DataType.F16_Default) \ - .dtype_format(DataType.BOOL_Default, DataType.U8_Default) \ - .dtype_format(DataType.BOOL_Default, DataType.F32_Default) \ - .dtype_format(DataType.BOOL_Default, DataType.I32_Default) \ - .dtype_format(DataType.I8_Default, DataType.F16_Default) \ - .dtype_format(DataType.I8_Default, DataType.F32_Default) \ - .dtype_format(DataType.I8_Default, DataType.I32_Default) \ - .dtype_format(DataType.U8_Default, DataType.F16_Default) \ - .dtype_format(DataType.U8_Default, DataType.F32_Default) \ - .dtype_format(DataType.U8_Default, DataType.I32_Default) \ - .dtype_format(DataType.I32_Default, DataType.BOOL_Default) \ - .dtype_format(DataType.I32_Default, DataType.F16_Default) \ - .dtype_format(DataType.I32_Default, DataType.F32_Default) \ - .dtype_format(DataType.I32_Default, DataType.I8_Default) \ - .dtype_format(DataType.I32_Default, DataType.U8_Default) \ - .dtype_format(DataType.F16_Default, DataType.U8_Default) \ - .dtype_format(DataType.F16_Default, DataType.F32_Default) \ - .dtype_format(DataType.F16_Default, DataType.I32_Default) \ - .dtype_format(DataType.F16_5HD, DataType.F32_5HD) \ - .dtype_format(DataType.F16_FracZ, DataType.F32_FracZ) \ - .dtype_format(DataType.F16_FracNZ, DataType.F32_FracNZ) \ - .dtype_format(DataType.F32_5HD, DataType.F16_5HD) \ - .dtype_format(DataType.F32_FracZ, DataType.F16_FracZ) \ - .dtype_format(DataType.F32_FracNZ, DataType.F16_FracNZ) \ - .dtype_format(DataType.F32_Default, DataType.F16_Default) \ - .dtype_format(DataType.F32_Default, DataType.I32_Default) \ + .op_pattern("formatAgnostic") \ + .dtype_format(DataType.BOOL_None, DataType.F16_None) \ + .dtype_format(DataType.BOOL_None, DataType.U8_None) \ + 
.dtype_format(DataType.BOOL_None, DataType.F32_None) \ + .dtype_format(DataType.BOOL_None, DataType.I32_None) \ + .dtype_format(DataType.I8_None, DataType.F16_None) \ + .dtype_format(DataType.I8_None, DataType.F32_None) \ + .dtype_format(DataType.I8_None, DataType.I32_None) \ + .dtype_format(DataType.U8_None, DataType.F16_None) \ + .dtype_format(DataType.U8_None, DataType.F32_None) \ + .dtype_format(DataType.U8_None, DataType.I32_None) \ + .dtype_format(DataType.I32_None, DataType.BOOL_None) \ + .dtype_format(DataType.I32_None, DataType.F16_None) \ + .dtype_format(DataType.I32_None, DataType.F32_None) \ + .dtype_format(DataType.I32_None, DataType.I8_None) \ + .dtype_format(DataType.I32_None, DataType.U8_None) \ + .dtype_format(DataType.F16_None, DataType.U8_None) \ + .dtype_format(DataType.F16_None, DataType.F32_None) \ + .dtype_format(DataType.F16_None, DataType.I32_None) \ + .dtype_format(DataType.F32_None, DataType.F16_None) \ + .dtype_format(DataType.F32_None, DataType.I32_None) \ .get_op_info() diff --git a/mindspore/ops/_op_impl/tbe/ceil.py b/mindspore/ops/_op_impl/tbe/ceil.py new file mode 100644 index 0000000000..d9a127603f --- /dev/null +++ b/mindspore/ops/_op_impl/tbe/ceil.py @@ -0,0 +1,36 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""Ceil op""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +ceil_op_info = TBERegOp("Ceil") \ + .fusion_type("ELEMWISE") \ + .async_flag(False) \ + .binfile_name("ceil.so") \ + .compute_cost(10) \ + .kernel_name("ceil") \ + .partial_flag(True) \ + .input(0, "x", False, "required", "all") \ + .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.F16_Default, DataType.F16_Default) \ + .dtype_format(DataType.F32_Default, DataType.F32_Default) \ + .get_op_info() + + +@op_info_register(ceil_op_info) +def _ceil_tbe(): + """Ceil TBE register""" + return diff --git a/mindspore/ops/_op_impl/tbe/concat.py b/mindspore/ops/_op_impl/tbe/concat.py index 56807b15fc..0bf636016f 100644 --- a/mindspore/ops/_op_impl/tbe/concat.py +++ b/mindspore/ops/_op_impl/tbe/concat.py @@ -26,6 +26,7 @@ concat_op_info = TBERegOp("Concat") \ .attr("axis", "required", "int", "all") \ .input(0, "input_values", False, "dynamic", "all") \ .output(0, "output_data", False, "required", "all") \ + .op_pattern("dynamicFormat") \ .dtype_format(DataType.BOOL_Default, DataType.BOOL_Default) \ .dtype_format(DataType.BOOL_5HD, DataType.BOOL_5HD) \ .dtype_format(DataType.I8_Default, DataType.I8_Default) \ diff --git a/mindspore/ops/_op_impl/tbe/confusion_matrix.py b/mindspore/ops/_op_impl/tbe/confusion_matrix.py new file mode 100644 index 0000000000..28dd17f23f --- /dev/null +++ b/mindspore/ops/_op_impl/tbe/confusion_matrix.py @@ -0,0 +1,63 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""ConfusionMatrix op""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +confusion_matrix_op_info = TBERegOp("ConfusionMatrix") \ + .fusion_type("OPAQUE") \ + .async_flag(False) \ + .binfile_name("confusion_matrix.so") \ + .compute_cost(10) \ + .kernel_name("confusion_matrix") \ + .partial_flag(True) \ + .attr("num_classes", "required", "int", "all") \ + .attr("dtype", "required", "str", "all") \ + .input(0, "labels", False, "required", "all") \ + .input(1, "predictions", False, "required", "all") \ + .input(2, "weights", False, "optional", "all") \ + .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, DataType.F16_Default) \ + .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.F32_Default, DataType.F32_Default) \ + .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.I32_Default, DataType.I32_Default) \ + .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.I8_Default, DataType.I8_Default) \ + .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.U8_Default, DataType.U8_Default) \ + .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F16_Default, DataType.F16_Default) \ + .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, DataType.F32_Default) \ + .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.I32_Default, DataType.I32_Default) \ + 
.dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.I8_Default, DataType.I8_Default) \ + .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.U8_Default, DataType.U8_Default) \ + .dtype_format(DataType.I32_Default, DataType.I32_Default, DataType.F16_Default, DataType.F16_Default) \ + .dtype_format(DataType.I32_Default, DataType.I32_Default, DataType.F32_Default, DataType.F32_Default) \ + .dtype_format(DataType.I32_Default, DataType.I32_Default, DataType.I32_Default, DataType.I32_Default) \ + .dtype_format(DataType.I32_Default, DataType.I32_Default, DataType.I8_Default, DataType.I8_Default) \ + .dtype_format(DataType.I32_Default, DataType.I32_Default, DataType.U8_Default, DataType.U8_Default) \ + .dtype_format(DataType.I8_Default, DataType.I8_Default, DataType.F16_Default, DataType.F16_Default) \ + .dtype_format(DataType.I8_Default, DataType.I8_Default, DataType.F32_Default, DataType.F32_Default) \ + .dtype_format(DataType.I8_Default, DataType.I8_Default, DataType.I32_Default, DataType.I32_Default) \ + .dtype_format(DataType.I8_Default, DataType.I8_Default, DataType.I8_Default, DataType.I8_Default) \ + .dtype_format(DataType.I8_Default, DataType.I8_Default, DataType.U8_Default, DataType.U8_Default) \ + .dtype_format(DataType.U8_Default, DataType.U8_Default, DataType.F16_Default, DataType.F16_Default) \ + .dtype_format(DataType.U8_Default, DataType.U8_Default, DataType.F32_Default, DataType.F32_Default) \ + .dtype_format(DataType.U8_Default, DataType.U8_Default, DataType.I32_Default, DataType.I32_Default) \ + .dtype_format(DataType.U8_Default, DataType.U8_Default, DataType.I8_Default, DataType.I8_Default) \ + .dtype_format(DataType.U8_Default, DataType.U8_Default, DataType.U8_Default, DataType.U8_Default) \ + .get_op_info() + + +@op_info_register(confusion_matrix_op_info) +def _confusion_matrix_tbe(): + """ConfusionMatrix TBE register""" + return diff --git a/mindspore/ops/_op_impl/tbe/conv2d.py b/mindspore/ops/_op_impl/tbe/conv2d.py 
index 425521901d..a2879d521a 100644 --- a/mindspore/ops/_op_impl/tbe/conv2d.py +++ b/mindspore/ops/_op_impl/tbe/conv2d.py @@ -23,6 +23,7 @@ conv2d_op_info = TBERegOp("Conv2D") \ .compute_cost(10) \ .kernel_name("conv2d") \ .partial_flag(True) \ + .op_pattern("dynamicFormat") \ .attr("stride", "required", "listInt", "all") \ .attr("pad_list", "required", "listInt", "all") \ .attr("dilation", "required", "listInt", "all") \ @@ -32,8 +33,7 @@ conv2d_op_info = TBERegOp("Conv2D") \ .input(2, "bias", False, "optional", "all") \ .input(3, "offset_w", False, "optional", "all") \ .output(0, "y", True, "required", "all") \ - .dtype_format(DataType.F16_5HD, DataType.F16_FracZ, DataType.F16_Default, DataType.I8_Default, - DataType.F16_5HD) \ + .dtype_format(DataType.F16_None, DataType.F16_None, DataType.F16_None, DataType.I8_None, DataType.F16_None) \ .get_op_info() diff --git a/mindspore/ops/_op_impl/tbe/cos.py b/mindspore/ops/_op_impl/tbe/cos.py index ecb1062100..3acb0c2a7e 100644 --- a/mindspore/ops/_op_impl/tbe/cos.py +++ b/mindspore/ops/_op_impl/tbe/cos.py @@ -26,7 +26,9 @@ cos_op_info = TBERegOp("Cos") \ .op_pattern("formatAgnostic") \ .input(0, "x", False, "required", "all") \ .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.F16_Default, DataType.F16_Default) \ .dtype_format(DataType.F16_5HD, DataType.F16_5HD) \ + .dtype_format(DataType.F32_Default, DataType.F32_Default) \ .dtype_format(DataType.F32_5HD, DataType.F32_5HD) \ .get_op_info() diff --git a/mindspore/ops/_op_impl/tbe/cosh.py b/mindspore/ops/_op_impl/tbe/cosh.py new file mode 100644 index 0000000000..75d48293e9 --- /dev/null +++ b/mindspore/ops/_op_impl/tbe/cosh.py @@ -0,0 +1,37 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Cosh op""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +cosh_op_info = TBERegOp("Cosh") \ + .fusion_type("ELEMWISE") \ + .async_flag(False) \ + .binfile_name("cosh.so") \ + .compute_cost(10) \ + .kernel_name("cosh") \ + .partial_flag(True) \ + .op_pattern("formatAgnostic") \ + .input(0, "x", False, "required", "all") \ + .output(0, "y", True, "required", "all") \ + .dtype_format(DataType.F16_5HD, DataType.F16_5HD) \ + .dtype_format(DataType.F32_5HD, DataType.F32_5HD) \ + .get_op_info() + + +@op_info_register(cosh_op_info) +def _cosh_tbe(): + """Cosh TBE register""" + return diff --git a/mindspore/ops/_op_impl/tbe/data_format_dim_map.py b/mindspore/ops/_op_impl/tbe/data_format_dim_map.py new file mode 100644 index 0000000000..0bbccd30b1 --- /dev/null +++ b/mindspore/ops/_op_impl/tbe/data_format_dim_map.py @@ -0,0 +1,38 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""DataFormatDimMap op""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +data_format_dim_map_op_info = TBERegOp("DataFormatDimMap") \ + .fusion_type("OPAQUE") \ + .async_flag(False) \ + .binfile_name("data_format_dim_map.so") \ + .compute_cost(10) \ + .kernel_name("data_format_dim_map") \ + .partial_flag(True) \ + .attr("dst_format", "optional", "str", "all") \ + .attr("src_format", "optional", "str", "all") \ + .input(0, "x", False, "required", "all") \ + .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.I32_5HD, DataType.I32_5HD) \ + .dtype_format(DataType.I32_Default, DataType.I32_Default) \ + .get_op_info() + + +@op_info_register(data_format_dim_map_op_info) +def _data_format_dim_map_tbe(): + """DataFormatDimMap TBE register""" + return diff --git a/mindspore/ops/_op_impl/tbe/div_no_nan.py b/mindspore/ops/_op_impl/tbe/div_no_nan.py new file mode 100644 index 0000000000..893b38042e --- /dev/null +++ b/mindspore/ops/_op_impl/tbe/div_no_nan.py @@ -0,0 +1,45 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""DivNoNan op""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +div_no_nan_op_info = TBERegOp("DivNoNan") \ + .fusion_type("ELEMWISE") \ + .async_flag(False) \ + .binfile_name("div_no_nan.so") \ + .compute_cost(10) \ + .kernel_name("div_no_nan") \ + .partial_flag(True) \ + .input(0, "x1", False, "required", "all") \ + .input(1, "x2", False, "required", "all") \ + .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.I8_Default, DataType.I8_Default, DataType.I8_Default) \ + .dtype_format(DataType.I8_5HD, DataType.I8_5HD, DataType.I8_5HD) \ + .dtype_format(DataType.U8_Default, DataType.U8_Default, DataType.U8_Default) \ + .dtype_format(DataType.U8_5HD, DataType.U8_5HD, DataType.U8_5HD) \ + .dtype_format(DataType.I32_Default, DataType.I32_Default, DataType.I32_Default) \ + .dtype_format(DataType.I32_5HD, DataType.I32_5HD, DataType.I32_5HD) \ + .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.F16_Default) \ + .dtype_format(DataType.F16_5HD, DataType.F16_5HD, DataType.F16_5HD) \ + .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default) \ + .dtype_format(DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD) \ + .get_op_info() + + +@op_info_register(div_no_nan_op_info) +def _div_no_nan_tbe(): + """DivNoNan TBE register""" + return diff --git a/mindspore/ops/_op_impl/tbe/dropout_do_mask.py b/mindspore/ops/_op_impl/tbe/dropout_do_mask.py index 2bef489b96..a24e02f964 100644 --- a/mindspore/ops/_op_impl/tbe/dropout_do_mask.py +++ b/mindspore/ops/_op_impl/tbe/dropout_do_mask.py @@ -27,6 +27,7 @@ drop_out_do_mask_op_info = TBERegOp("DropoutDoMask") \ .input(1, "mask", False, "required", "all") \ .input(2, "keep_prob", False, "required", "all") \ .output(0, "y", False, "required", "all") \ + .op_pattern("dynamicFormat") \ .dtype_format(DataType.F16_Default, DataType.U8_Default, DataType.F16_Default, 
DataType.F16_Default) \ .dtype_format(DataType.F32_Default, DataType.U8_Default, DataType.F32_Default, DataType.F32_Default) \ .get_op_info() diff --git a/mindspore/ops/_op_impl/tbe/elu.py b/mindspore/ops/_op_impl/tbe/elu.py index 9125d14727..e61e2851af 100644 --- a/mindspore/ops/_op_impl/tbe/elu.py +++ b/mindspore/ops/_op_impl/tbe/elu.py @@ -28,9 +28,7 @@ elu_op_info = TBERegOp("Elu") \ .input(0, "x", False, "required", "all") \ .output(0, "y", False, "required", "all") \ .dtype_format(DataType.F16_Default, DataType.F16_Default) \ - .dtype_format(DataType.F16_5HD, DataType.F16_5HD) \ .dtype_format(DataType.F32_Default, DataType.F32_Default) \ - .dtype_format(DataType.F32_5HD, DataType.F32_5HD) \ .get_op_info() diff --git a/mindspore/ops/_op_impl/tbe/erf.py b/mindspore/ops/_op_impl/tbe/erf.py index 2247197c4e..4c4893d505 100644 --- a/mindspore/ops/_op_impl/tbe/erf.py +++ b/mindspore/ops/_op_impl/tbe/erf.py @@ -26,9 +26,7 @@ erf_op_info = TBERegOp("Erf") \ .op_pattern("formatAgnostic") \ .input(0, "x", False, "required", "all") \ .output(0, "y", False, "required", "all") \ - .dtype_format(DataType.F16_5HD, DataType.F16_5HD) \ .dtype_format(DataType.F16_Default, DataType.F16_Default) \ - .dtype_format(DataType.F32_5HD, DataType.F32_5HD) \ .dtype_format(DataType.F32_Default, DataType.F32_Default) \ .get_op_info() diff --git a/mindspore/ops/_op_impl/tbe/erfc.py b/mindspore/ops/_op_impl/tbe/erfc.py index 7e1b76649a..7b0eccf52e 100644 --- a/mindspore/ops/_op_impl/tbe/erfc.py +++ b/mindspore/ops/_op_impl/tbe/erfc.py @@ -26,9 +26,7 @@ erfc_op_info = TBERegOp("Erfc") \ .op_pattern("formatAgnostic") \ .input(0, "x", False, "required", "all") \ .output(0, "y", False, "required", "all") \ - .dtype_format(DataType.F16_5HD, DataType.F16_5HD) \ .dtype_format(DataType.F16_Default, DataType.F16_Default) \ - .dtype_format(DataType.F32_5HD, DataType.F32_5HD) \ .dtype_format(DataType.F32_Default, DataType.F32_Default) \ .get_op_info() diff --git a/mindspore/ops/_op_impl/tbe/expm1.py 
b/mindspore/ops/_op_impl/tbe/expm1.py new file mode 100644 index 0000000000..a126aca36f --- /dev/null +++ b/mindspore/ops/_op_impl/tbe/expm1.py @@ -0,0 +1,37 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Expm1 op""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +expm1_op_info = TBERegOp("Expm1") \ + .fusion_type("ELEMWISE") \ + .async_flag(False) \ + .binfile_name("expm1.so") \ + .compute_cost(10) \ + .kernel_name("expm1") \ + .partial_flag(True) \ + .op_pattern("formatAgnostic") \ + .input(0, "x", False, "required", "all") \ + .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.F16_Default, DataType.F16_Default) \ + .dtype_format(DataType.F32_Default, DataType.F32_Default) \ + .get_op_info() + + +@op_info_register(expm1_op_info) +def _expm1_tbe(): + """Expm1 TBE register""" + return diff --git a/mindspore/ops/_op_impl/tbe/fused_mul_add.py b/mindspore/ops/_op_impl/tbe/fused_mul_add.py index ad3c601e5d..fa104fb561 100644 --- a/mindspore/ops/_op_impl/tbe/fused_mul_add.py +++ b/mindspore/ops/_op_impl/tbe/fused_mul_add.py @@ -27,6 +27,7 @@ fused_mul_add_op_info = TBERegOp("FusedMulAdd") \ .input(1, "x2", False, "required", "all") \ .input(2, "x3", False, "required", "all") \ .output(0, "y", False, "required", "all") \ + .op_pattern("dynamicFormat") \ 
.dtype_format(DataType.I32_Default, DataType.I32_Default, DataType.I32_Default, DataType.I32_Default) \ .dtype_format(DataType.I32_5HD, DataType.I32_5HD, DataType.I32_5HD, DataType.I32_5HD) \ .dtype_format(DataType.I32_FracZ, DataType.I32_FracZ, DataType.I32_FracZ, DataType.I32_FracZ) \ diff --git a/mindspore/ops/_op_impl/tbe/fused_mul_add_n_l2loss.py b/mindspore/ops/_op_impl/tbe/fused_mul_add_n_l2loss.py new file mode 100644 index 0000000000..e4f3f8be16 --- /dev/null +++ b/mindspore/ops/_op_impl/tbe/fused_mul_add_n_l2loss.py @@ -0,0 +1,53 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""FusedMulAddNL2loss op""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +fused_mul_add_n_l2loss_op_info = TBERegOp("FusedMulAddNL2loss") \ + .fusion_type("OPAQUE") \ + .async_flag(False) \ + .binfile_name("fused_mul_addn_l2loss.so") \ + .compute_cost(10) \ + .kernel_name("fused_mul_addn_l2loss") \ + .partial_flag(True) \ + .input(0, "x1", False, "required", "all") \ + .input(1, "x2", False, "required", "all") \ + .input(2, "x3", False, "required", "all") \ + .output(0, "y1", False, "required", "all") \ + .output(1, "y2", False, "required", "all") \ + .dtype_format(DataType.F16_5HD, DataType.F16_5HD, DataType.F16_Default, + DataType.F16_5HD, DataType.F16_Default) \ + .dtype_format(DataType.F16_C1HWNCoC0, DataType.F16_C1HWNCoC0, DataType.F16_Default, + DataType.F16_C1HWNCoC0, DataType.F16_Default) \ + .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, + DataType.F16_Default, DataType.F16_Default) \ + .dtype_format(DataType.F16_FracZ, DataType.F16_FracZ, DataType.F16_Default, + DataType.F16_FracZ, DataType.F16_Default) \ + .dtype_format(DataType.F32_5HD, DataType.F32_5HD, DataType.F32_Default, + DataType.F32_5HD, DataType.F32_Default) \ + .dtype_format(DataType.F32_C1HWNCoC0, DataType.F32_C1HWNCoC0, DataType.F32_Default, + DataType.F32_C1HWNCoC0, DataType.F32_Default) \ + .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, + DataType.F32_Default, DataType.F32_Default) \ + .dtype_format(DataType.F32_FracZ, DataType.F32_FracZ, DataType.F32_Default, + DataType.F32_FracZ, DataType.F32_Default) \ + .get_op_info() + + +@op_info_register(fused_mul_add_n_l2loss_op_info) +def _fused_mul_add_n_l2loss_tbe(): + """FusedMulAddNL2loss TBE register""" + return diff --git a/mindspore/ops/_op_impl/tbe/fused_mul_apply_momentum_extern.py b/mindspore/ops/_op_impl/tbe/fused_mul_apply_momentum_extern.py new file 
mode 100644 index 0000000000..37b0deec12 --- /dev/null +++ b/mindspore/ops/_op_impl/tbe/fused_mul_apply_momentum_extern.py @@ -0,0 +1,67 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""FusedMulApplyMomentumExtern op""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +fused_mul_apply_momentum_extern_op_info = TBERegOp("FusedMulApplyMomentumExtern") \ + .fusion_type("OPAQUE") \ + .async_flag(False) \ + .binfile_name("fused_mul_apply_momentum_extern.so") \ + .compute_cost(10) \ + .kernel_name("fused_mul_apply_momentum_extern") \ + .partial_flag(True) \ + .attr("use_nesterov", "optional", "bool", "true,false", "false") \ + .input(0, "var", False, "required", "all") \ + .input(1, "accum", False, "required", "all") \ + .input(2, "lr", False, "required", "all") \ + .input(3, "x1", False, "required", "all") \ + .input(4, "momentum", False, "required", "all") \ + .input(5, "x2", False, "required", "all") \ + .input(6, "var_copy", False, "required", "all") \ + .output(0, "var", False, "required", "all") \ + .output(1, "var_copy", False, "required", "all") \ + .output(2, "accum", False, "required", "all") \ + .dtype_format(DataType.F32_5HD, DataType.F16_5HD, DataType.F16_Default, DataType.F16_5HD, + DataType.F16_Default, DataType.F16_Default, DataType.F16_5HD, DataType.F32_5HD, + DataType.F16_5HD, 
DataType.F16_5HD) \ + .dtype_format(DataType.F32_C1HWNCoC0, DataType.F16_C1HWNCoC0, DataType.F16_Default, DataType.F16_C1HWNCoC0, + DataType.F16_Default, DataType.F16_Default, DataType.F16_C1HWNCoC0, DataType.F32_C1HWNCoC0, + DataType.F16_C1HWNCoC0, DataType.F16_C1HWNCoC0) \ + .dtype_format(DataType.F32_Default, DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, + DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, DataType.F32_Default, + DataType.F16_Default, DataType.F16_Default) \ + .dtype_format(DataType.F32_FracZ, DataType.F16_FracZ, DataType.F16_Default, DataType.F16_FracZ, + DataType.F16_Default, DataType.F16_Default, DataType.F16_FracZ, DataType.F32_FracZ, + DataType.F16_FracZ, DataType.F16_FracZ) \ + .dtype_format(DataType.F32_5HD, DataType.F32_5HD, DataType.F32_Default, DataType.F32_5HD, + DataType.F32_Default, DataType.F32_Default, DataType.F16_5HD, DataType.F32_5HD, + DataType.F16_5HD, DataType.F32_5HD) \ + .dtype_format(DataType.F32_C1HWNCoC0, DataType.F32_C1HWNCoC0, DataType.F32_Default, DataType.F32_C1HWNCoC0, + DataType.F32_Default, DataType.F32_Default, DataType.F16_C1HWNCoC0, DataType.F32_C1HWNCoC0, + DataType.F16_C1HWNCoC0, DataType.F32_C1HWNCoC0) \ + .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, + DataType.F32_Default, DataType.F32_Default, DataType.F16_Default, DataType.F32_Default, + DataType.F16_Default, DataType.F32_Default) \ + .dtype_format(DataType.F32_FracZ, DataType.F32_FracZ, DataType.F32_Default, DataType.F32_FracZ, + DataType.F32_Default, DataType.F32_Default, DataType.F16_FracZ, DataType.F32_FracZ, + DataType.F16_FracZ, DataType.F32_FracZ) \ + .get_op_info() + + +@op_info_register(fused_mul_apply_momentum_extern_op_info) +def _fused_mul_apply_momentum_extern_tbe(): + """FusedMulApplyMomentumExtern TBE register""" + return diff --git a/mindspore/ops/_op_impl/tbe/histogram_fixed_width.py b/mindspore/ops/_op_impl/tbe/histogram_fixed_width.py new file 
mode 100644 index 0000000000..32195f1f3c --- /dev/null +++ b/mindspore/ops/_op_impl/tbe/histogram_fixed_width.py @@ -0,0 +1,40 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""HistogramFixedWidth op""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +histogram_fixed_width_op_info = TBERegOp("HistogramFixedWidth") \ + .fusion_type("OPAQUE") \ + .async_flag(False) \ + .binfile_name("histogram_fixed_width_d.so") \ + .compute_cost(10) \ + .kernel_name("histogram_fixed_width_d") \ + .partial_flag(True) \ + .attr("nbins", "required", "int", "all") \ + .attr("dtype", "optional", "str", "all") \ + .input(0, "x", False, "required", "all") \ + .input(1, "range", False, "required", "all") \ + .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.I32_Default) \ + .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.I32_Default) \ + .dtype_format(DataType.I32_Default, DataType.I32_Default, DataType.I32_Default) \ + .get_op_info() + + +@op_info_register(histogram_fixed_width_op_info) +def _histogram_fixed_width_tbe(): + """HistogramFixedWidth TBE register""" + return diff --git a/mindspore/ops/_op_impl/tbe/in_top_k.py b/mindspore/ops/_op_impl/tbe/in_top_k.py new file mode 100644 index 0000000000..46d7258e2a --- /dev/null +++ 
b/mindspore/ops/_op_impl/tbe/in_top_k.py @@ -0,0 +1,37 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""InTopK op""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +in_top_k_op_info = TBERegOp("InTopK") \ + .fusion_type("OPAQUE") \ + .async_flag(False) \ + .binfile_name("in_top_k.so") \ + .compute_cost(10) \ + .kernel_name("in_top_k") \ + .partial_flag(True) \ + .attr("k", "required", "int", "all") \ + .input(0, "x1", False, "required", "all") \ + .input(1, "x2", False, "required", "all") \ + .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.F32_Default, DataType.I32_Default, DataType.BOOL_Default) \ + .get_op_info() + + +@op_info_register(in_top_k_op_info) +def _in_top_k_tbe(): + """InTopK TBE register""" + return diff --git a/mindspore/ops/_op_impl/tbe/inplace_add.py b/mindspore/ops/_op_impl/tbe/inplace_add.py new file mode 100644 index 0000000000..9a14fc9a63 --- /dev/null +++ b/mindspore/ops/_op_impl/tbe/inplace_add.py @@ -0,0 +1,39 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""InplaceAdd op""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +inplace_add_op_info = TBERegOp("InplaceAdd") \ + .fusion_type("ELEMWISE") \ + .async_flag(False) \ + .binfile_name("inplace_add_d.so") \ + .compute_cost(10) \ + .kernel_name("inplace_add_d") \ + .partial_flag(True) \ + .attr("indices", "required", "listInt", "all") \ + .input(0, "x", False, "required", "all") \ + .input(1, "v", False, "required", "all") \ + .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.I32_Default, DataType.I32_Default, DataType.I32_Default) \ + .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.F16_Default) \ + .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default) \ + .get_op_info() + + +@op_info_register(inplace_add_op_info) +def _inplace_add_tbe(): + """InplaceAdd TBE register""" + return diff --git a/mindspore/ops/_op_impl/tbe/inplace_sub.py b/mindspore/ops/_op_impl/tbe/inplace_sub.py new file mode 100644 index 0000000000..07f59e05fc --- /dev/null +++ b/mindspore/ops/_op_impl/tbe/inplace_sub.py @@ -0,0 +1,39 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""InplaceSub op""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +inplace_sub_op_info = TBERegOp("InplaceSub") \ + .fusion_type("ELEMWISE") \ + .async_flag(False) \ + .binfile_name("inplace_sub_d.so") \ + .compute_cost(10) \ + .kernel_name("inplace_sub_d") \ + .partial_flag(True) \ + .attr("indices", "required", "listInt", "all") \ + .input(0, "x", False, "required", "all") \ + .input(1, "v", False, "required", "all") \ + .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.I32_Default, DataType.I32_Default, DataType.I32_Default) \ + .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.F16_Default) \ + .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default) \ + .get_op_info() + + +@op_info_register(inplace_sub_op_info) +def _inplace_sub_tbe(): + """InplaceSub TBE register""" + return diff --git a/mindspore/ops/_op_impl/tbe/inplace_update.py b/mindspore/ops/_op_impl/tbe/inplace_update.py new file mode 100644 index 0000000000..b8c7454d77 --- /dev/null +++ b/mindspore/ops/_op_impl/tbe/inplace_update.py @@ -0,0 +1,39 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""InplaceUpdate op""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +inplace_update_op_info = TBERegOp("InplaceUpdate") \ + .fusion_type("INPLACE") \ + .async_flag(False) \ + .binfile_name("inplace_update_d.so") \ + .compute_cost(10) \ + .kernel_name("inplace_update_d") \ + .partial_flag(True) \ + .attr("indices", "required", "listInt", "all") \ + .input(0, "x", False, "required", "all") \ + .input(1, "v", False, "required", "all") \ + .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.F16_Default) \ + .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default) \ + .dtype_format(DataType.I32_Default, DataType.I32_Default, DataType.I32_Default) \ + .get_op_info() + + +@op_info_register(inplace_update_op_info) +def _inplace_update_tbe(): + """InplaceUpdate TBE register""" + return diff --git a/mindspore/ops/_op_impl/tbe/inv.py b/mindspore/ops/_op_impl/tbe/inv.py new file mode 100644 index 0000000000..e2b749a5aa --- /dev/null +++ b/mindspore/ops/_op_impl/tbe/inv.py @@ -0,0 +1,37 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Inv op""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +inv_op_info = TBERegOp("Inv") \ + .fusion_type("OPAQUE") \ + .async_flag(False) \ + .binfile_name("inv.so") \ + .compute_cost(10) \ + .kernel_name("inv") \ + .partial_flag(True) \ + .input(0, "x", False, "required", "all") \ + .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.I32_Default, DataType.I32_Default) \ + .dtype_format(DataType.F32_Default, DataType.F32_Default) \ + .dtype_format(DataType.F16_Default, DataType.F16_Default) \ + .get_op_info() + + +@op_info_register(inv_op_info) +def _inv_tbe(): + """Inv TBE register""" + return diff --git a/mindspore/ops/_op_impl/tbe/inv_grad.py b/mindspore/ops/_op_impl/tbe/inv_grad.py new file mode 100644 index 0000000000..70626b8808 --- /dev/null +++ b/mindspore/ops/_op_impl/tbe/inv_grad.py @@ -0,0 +1,39 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""InvGrad op""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +inv_grad_op_info = TBERegOp("InvGrad") \ + .fusion_type("ELEMWISE") \ + .async_flag(False) \ + .binfile_name("inv_grad.so") \ + .compute_cost(10) \ + .kernel_name("inv_grad") \ + .partial_flag(True) \ + .input(0, "x", False, "required", "all") \ + .input(1, "grad", False, "required", "all") \ + .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.F16_Default) \ + .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default) \ + .dtype_format(DataType.I32_Default, DataType.I32_Default, DataType.I32_Default) \ + .dtype_format(DataType.I8_Default, DataType.I8_Default, DataType.I8_Default) \ + .get_op_info() + + +@op_info_register(inv_grad_op_info) +def _inv_grad_tbe(): + """InvGrad TBE register""" + return diff --git a/mindspore/ops/_op_impl/tbe/invert.py b/mindspore/ops/_op_impl/tbe/invert.py new file mode 100644 index 0000000000..887eee45e7 --- /dev/null +++ b/mindspore/ops/_op_impl/tbe/invert.py @@ -0,0 +1,36 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""Invert op""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +invert_op_info = TBERegOp("Invert") \ + .fusion_type("OPAQUE") \ + .async_flag(False) \ + .binfile_name("invert.so") \ + .compute_cost(10) \ + .kernel_name("invert") \ + .partial_flag(True) \ + .input(0, "x", False, "required", "all") \ + .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.I16_Default, DataType.I16_Default) \ + .dtype_format(DataType.U16_Default, DataType.U16_Default) \ + .get_op_info() + + +@op_info_register(invert_op_info) +def _invert_tbe(): + """Invert TBE register""" + return diff --git a/mindspore/ops/_op_impl/tbe/l2_normalize_grad.py b/mindspore/ops/_op_impl/tbe/l2_normalize_grad.py index e164120c75..b6a099d286 100644 --- a/mindspore/ops/_op_impl/tbe/l2_normalize_grad.py +++ b/mindspore/ops/_op_impl/tbe/l2_normalize_grad.py @@ -27,7 +27,7 @@ l2_normalize_grad_op_info = TBERegOp("L2NormalizeGrad") \ .attr("epsilon", "required", "float", "all") \ .input(0, "x", False, "required", "all") \ .input(1, "y", False, "required", "all") \ - .input(2, "dy", False, "requried", "all") \ + .input(2, "dy", False, "required", "all") \ .output(0, "dx", True, "required", "all") \ .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, DataType.F16_Default) \ .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, DataType.F32_Default) \ diff --git a/mindspore/ops/_op_impl/tbe/lamb_next_right.py b/mindspore/ops/_op_impl/tbe/lamb_next_right.py new file mode 100644 index 0000000000..716c5a88fb --- /dev/null +++ b/mindspore/ops/_op_impl/tbe/lamb_next_right.py @@ -0,0 +1,44 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""LambNextRight op""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +lamb_next_right_op_info = TBERegOp("LambNextRight") \ + .fusion_type("ELEMWISE") \ + .async_flag(False) \ + .binfile_name("lamb_next_right.so") \ + .compute_cost(10) \ + .kernel_name("lamb_next_right") \ + .partial_flag(True) \ + .input(0, "input_square", False, "required", "all") \ + .input(1, "input_mul2", False, "required", "all") \ + .input(2, "mul2_x", False, "required", "all") \ + .input(3, "mul3_x", False, "required", "all") \ + .input(4, "truediv1_recip", False, "required", "all") \ + .input(5, "add2_y", False, "required", "all") \ + .output(0, "y1", False, "required", "all") \ + .output(1, "y2", False, "required", "all") \ + .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, + DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, DataType.F16_Default) \ + .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, + DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, DataType.F32_Default) \ + .get_op_info() + + +@op_info_register(lamb_next_right_op_info) +def _lamb_next_right_tbe(): + """LambNextRight TBE register""" + return diff --git a/mindspore/ops/_op_impl/tbe/layer_norm.py b/mindspore/ops/_op_impl/tbe/layer_norm.py index c52be2d4ef..03ddd2dc6c 100644 --- a/mindspore/ops/_op_impl/tbe/layer_norm.py +++ 
b/mindspore/ops/_op_impl/tbe/layer_norm.py @@ -32,6 +32,7 @@ layer_norm_op_info = TBERegOp("LayerNorm") \ .output(0, "y", False, "required", "all") \ .output(1, "mean", False, "required", "all") \ .output(2, "variance", False, "required", "all") \ + .op_pattern("dynamicFormat") \ .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, DataType.F16_Default) \ .dtype_format(DataType.F16_5HD, DataType.F16_5HD, DataType.F16_5HD, DataType.F16_5HD, diff --git a/mindspore/ops/_op_impl/tbe/layer_norm_beta_gamma_backprop.py b/mindspore/ops/_op_impl/tbe/layer_norm_beta_gamma_backprop.py index ef254465bc..deca384032 100644 --- a/mindspore/ops/_op_impl/tbe/layer_norm_beta_gamma_backprop.py +++ b/mindspore/ops/_op_impl/tbe/layer_norm_beta_gamma_backprop.py @@ -30,6 +30,7 @@ layer_norm_beta_gamma_backprop_op_info = TBERegOp("LayerNormBetaGammaBackprop") .input(3, "mean", False, "required", "all") \ .output(0, "pd_gamma", False, "required", "all") \ .output(1, "pd_beta", False, "required", "all") \ + .op_pattern("dynamicFormat") \ .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, DataType.F32_Default, DataType.F32_Default) \ .dtype_format(DataType.F16_5HD, DataType.F16_5HD, DataType.F16_5HD, DataType.F16_5HD, diff --git a/mindspore/ops/_op_impl/tbe/layer_norm_x_backprop.py b/mindspore/ops/_op_impl/tbe/layer_norm_x_backprop.py index bbab66816d..1d4f1ef231 100644 --- a/mindspore/ops/_op_impl/tbe/layer_norm_x_backprop.py +++ b/mindspore/ops/_op_impl/tbe/layer_norm_x_backprop.py @@ -29,6 +29,7 @@ layer_norm_x_backprop_op_info = TBERegOp("LayerNormXBackprop") \ .input(3, "mean", False, "required", "all") \ .input(4, "gamma", False, "required", "all") \ .output(0, "pd_x", False, "required", "all") \ + .op_pattern("dynamicFormat") \ .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, 
DataType.F16_Default) \ .dtype_format(DataType.F16_5HD, DataType.F16_5HD, DataType.F16_5HD, DataType.F16_5HD, diff --git a/mindspore/ops/_op_impl/tbe/lin_space.py b/mindspore/ops/_op_impl/tbe/lin_space.py new file mode 100644 index 0000000000..aed41e80d4 --- /dev/null +++ b/mindspore/ops/_op_impl/tbe/lin_space.py @@ -0,0 +1,40 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""LinSpace op""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +lin_space_op_info = TBERegOp("LinSpace") \ + .fusion_type("OPAQUE") \ + .async_flag(False) \ + .binfile_name("lin_space.so") \ + .compute_cost(10) \ + .kernel_name("lin_space") \ + .partial_flag(True) \ + .op_pattern("broadcast") \ + .input(0, "assist", False, "required", "all") \ + .input(1, "start", False, "required", "all") \ + .input(2, "stop", False, "required", "all") \ + .input(3, "num", False, "required", "all") \ + .output(0, "output", False, "required", "all") \ + .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, DataType.I32_Default, + DataType.F32_Default) \ + .get_op_info() + + +@op_info_register(lin_space_op_info) +def _lin_space_tbe(): + """LinSpace TBE register""" + return diff --git a/mindspore/ops/_op_impl/tbe/matmul.py b/mindspore/ops/_op_impl/tbe/matmul.py index c29378f721..7784d5e222 100644 --- 
a/mindspore/ops/_op_impl/tbe/matmul.py +++ b/mindspore/ops/_op_impl/tbe/matmul.py @@ -17,22 +17,32 @@ from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType matmul_op_info = TBERegOp("MatMul") \ - .fusion_type("OPAQUE") \ + .fusion_type("DYNAMIC") \ .async_flag(False) \ .binfile_name("matmul.so") \ .compute_cost(10) \ .kernel_name("matmul") \ .partial_flag(True) \ - .attr("transpose_a", "required", "bool", "all") \ - .attr("transpose_b", "required", "bool", "all") \ + .attr("transpose_x1", "required", "bool", "all") \ + .attr("transpose_x2", "required", "bool", "all") \ + .attr("offset_x", "optional", "int", "all") \ .input(0, "x1", False, "required", "all") \ .input(1, "x2", False, "required", "all") \ - .input(2, "x3", False, "optional", "all") \ + .input(2, "bias", False, "optional", "all") \ + .input(3, "offset_w", False, "optional", "all") \ .output(0, "y", False, "required", "all") \ - .dtype_format(DataType.I32_Default, DataType.I32_Default, DataType.I32_Default, DataType.I32_Default) \ - .dtype_format(DataType.F16_FracNZ, DataType.F16_FracNZ, DataType.F16_Default, DataType.F16_FracNZ) \ - .dtype_format(DataType.F16_FracNZ, DataType.F16_FracNZ, DataType.F32_Default, DataType.F32_FracNZ) \ - .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, DataType.F32_Default) \ + .dtype_format(DataType.I32_Default, DataType.I32_Default, DataType.I32_Default, DataType.I8_Default, + DataType.I32_Default) \ + .dtype_format(DataType.F16_FracNZ, DataType.F16_FracNZ, DataType.F16_Default, DataType.I8_Default, + DataType.F16_FracNZ) \ + .dtype_format(DataType.F16_FracNZ, DataType.F16_FracNZ, DataType.F32_Default, DataType.I8_Default, + DataType.F32_FracNZ) \ + .dtype_format(DataType.F32_NHWC, DataType.F32_NHWC, DataType.F32_NHWC, DataType.I8_Default, + DataType.F32_NHWC) \ + .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, DataType.I8_Default, + DataType.F32_Default) \ + 
.dtype_format(DataType.I32_NHWC, DataType.I32_NHWC, DataType.I32_NHWC, DataType.I8_Default, + DataType.I32_NHWC) \ .get_op_info() diff --git a/mindspore/ops/_op_impl/tbe/matrix_diag.py b/mindspore/ops/_op_impl/tbe/matrix_diag.py new file mode 100644 index 0000000000..9d080e34a2 --- /dev/null +++ b/mindspore/ops/_op_impl/tbe/matrix_diag.py @@ -0,0 +1,45 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""MatrixDiagD op""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +matrix_diag_d_op_info = TBERegOp("MatrixDiag") \ + .fusion_type("ELEMWISE") \ + .async_flag(False) \ + .binfile_name("matrix_diag_d.so") \ + .compute_cost(10) \ + .kernel_name("matrix_diag_d") \ + .partial_flag(True) \ + .input(0, "x", False, "required", "all") \ + .input(1, "assist", False, "required", "all") \ + .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.F16_5HD, DataType.F16_5HD, DataType.F16_5HD) \ + .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.F16_Default) \ + .dtype_format(DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD) \ + .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default) \ + .dtype_format(DataType.I32_5HD, DataType.I32_5HD, DataType.I32_5HD) \ + .dtype_format(DataType.I32_Default, DataType.I32_Default, DataType.I32_Default) \ + 
.dtype_format(DataType.I8_5HD, DataType.I8_5HD, DataType.I8_5HD) \ + .dtype_format(DataType.I8_Default, DataType.I8_Default, DataType.I8_Default) \ + .dtype_format(DataType.U8_5HD, DataType.U8_5HD, DataType.U8_5HD) \ + .dtype_format(DataType.U8_Default, DataType.U8_Default, DataType.U8_Default) \ + .get_op_info() + + +@op_info_register(matrix_diag_d_op_info) +def _matrix_diag_tbe(): + """MatrixDiagD TBE register""" + return diff --git a/mindspore/ops/_op_impl/tbe/matrix_diag_part.py b/mindspore/ops/_op_impl/tbe/matrix_diag_part.py new file mode 100644 index 0000000000..1cb320bbce --- /dev/null +++ b/mindspore/ops/_op_impl/tbe/matrix_diag_part.py @@ -0,0 +1,45 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""MatrixDiagPartD op""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +matrix_diag_part_d_op_info = TBERegOp("MatrixDiagPart") \ + .fusion_type("ELEMWISE") \ + .async_flag(False) \ + .binfile_name("matrix_diag_part_d.so") \ + .compute_cost(10) \ + .kernel_name("matrix_diag_part_d") \ + .partial_flag(True) \ + .input(0, "x", False, "required", "all") \ + .input(1, "assist", False, "required", "all") \ + .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.F16_5HD, DataType.F16_5HD, DataType.F16_5HD) \ + .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.F16_Default) \ + .dtype_format(DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD) \ + .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default) \ + .dtype_format(DataType.I32_5HD, DataType.I32_5HD, DataType.I32_5HD) \ + .dtype_format(DataType.I32_Default, DataType.I32_Default, DataType.I32_Default) \ + .dtype_format(DataType.I8_5HD, DataType.I8_5HD, DataType.I8_5HD) \ + .dtype_format(DataType.I8_Default, DataType.I8_Default, DataType.I8_Default) \ + .dtype_format(DataType.U8_5HD, DataType.U8_5HD, DataType.U8_5HD) \ + .dtype_format(DataType.U8_Default, DataType.U8_Default, DataType.U8_Default) \ + .get_op_info() + + +@op_info_register(matrix_diag_part_d_op_info) +def _matrix_diag_part_tbe(): + """MatrixDiagPartD TBE register""" + return diff --git a/mindspore/ops/_op_impl/tbe/matrix_set_diag.py b/mindspore/ops/_op_impl/tbe/matrix_set_diag.py new file mode 100644 index 0000000000..db0b460084 --- /dev/null +++ b/mindspore/ops/_op_impl/tbe/matrix_set_diag.py @@ -0,0 +1,46 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""MatrixSetDiagD op""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +matrix_diag_d_op_info = TBERegOp("MatrixSetDiag") \ + .fusion_type("ELEMWISE") \ + .async_flag(False) \ + .binfile_name("matrix_set_diag_d.so") \ + .compute_cost(10) \ + .kernel_name("matrix_set_diag_d") \ + .partial_flag(True) \ + .input(0, "x", False, "required", "all") \ + .input(1, "diagonal", False, "required", "all") \ + .input(2, "assist", False, "required", "all") \ + .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.F16_5HD, DataType.F16_5HD, DataType.F16_5HD, DataType.F16_5HD) \ + .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, DataType.F16_Default) \ + .dtype_format(DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD) \ + .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, DataType.F32_Default) \ + .dtype_format(DataType.I32_5HD, DataType.I32_5HD, DataType.I32_5HD, DataType.I32_5HD) \ + .dtype_format(DataType.I32_Default, DataType.I32_Default, DataType.I32_Default, DataType.I32_Default) \ + .dtype_format(DataType.I8_5HD, DataType.I8_5HD, DataType.I8_5HD, DataType.I8_5HD) \ + .dtype_format(DataType.I8_Default, DataType.I8_Default, DataType.I8_Default, DataType.I8_Default) \ + .dtype_format(DataType.U8_5HD, DataType.U8_5HD, DataType.U8_5HD, DataType.U8_5HD) \ + .dtype_format(DataType.U8_Default, DataType.U8_Default, DataType.U8_Default, DataType.U8_Default) \ + 
.get_op_info() + + +@op_info_register(matrix_diag_d_op_info) +def _matrix_set_diag_tbe(): + """MatrixSetDiagD TBE register""" + return diff --git a/mindspore/ops/_op_impl/tbe/mul.py b/mindspore/ops/_op_impl/tbe/mul.py index fa74c88de3..5433bf0b53 100644 --- a/mindspore/ops/_op_impl/tbe/mul.py +++ b/mindspore/ops/_op_impl/tbe/mul.py @@ -26,21 +26,8 @@ mul_op_info = TBERegOp("Mul") \ .input(0, "x", False, "required", "all") \ .input(1, "y", False, "required", "all") \ .output(0, "output", False, "required", "all") \ - .dtype_format(DataType.I32_Default, DataType.I32_Default, DataType.I32_Default) \ - .dtype_format(DataType.I32_5HD, DataType.I32_5HD, DataType.I32_5HD) \ - .dtype_format(DataType.I32_FracZ, DataType.I32_FracZ, DataType.I32_FracZ) \ - .dtype_format(DataType.I32_FracNZ, DataType.I32_FracNZ, DataType.I32_FracNZ) \ - .dtype_format(DataType.I32_C1HWNCoC0, DataType.I32_C1HWNCoC0, DataType.I32_C1HWNCoC0) \ - .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.F16_Default) \ - .dtype_format(DataType.F16_5HD, DataType.F16_5HD, DataType.F16_5HD) \ - .dtype_format(DataType.F16_FracZ, DataType.F16_FracZ, DataType.F16_FracZ) \ - .dtype_format(DataType.F16_FracNZ, DataType.F16_FracNZ, DataType.F16_FracNZ) \ - .dtype_format(DataType.F16_C1HWNCoC0, DataType.F16_C1HWNCoC0, DataType.F16_C1HWNCoC0) \ - .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default) \ - .dtype_format(DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD) \ - .dtype_format(DataType.F32_FracZ, DataType.F32_FracZ, DataType.F32_FracZ) \ - .dtype_format(DataType.F32_FracNZ, DataType.F32_FracNZ, DataType.F32_FracNZ) \ - .dtype_format(DataType.F32_C1HWNCoC0, DataType.F32_C1HWNCoC0, DataType.F32_C1HWNCoC0) \ + .op_pattern("dynamicFormat") \ + .dtype_format(DataType.None_None, DataType.None_None, DataType.None_None) \ .get_op_info() diff --git a/mindspore/ops/_op_impl/tbe/range.py b/mindspore/ops/_op_impl/tbe/range.py new file mode 100644 index 0000000000..257c087b40 
--- /dev/null +++ b/mindspore/ops/_op_impl/tbe/range.py @@ -0,0 +1,39 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Range op""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +range_op_info = TBERegOp("Range") \ + .fusion_type("OPAQUE") \ + .async_flag(False) \ + .binfile_name("range_d.so") \ + .compute_cost(10) \ + .kernel_name("range_d") \ + .partial_flag(True) \ + .attr("start", "required", "float", "all") \ + .attr("limit", "required", "float", "all") \ + .attr("delta", "required", "float", "all") \ + .input(0, "x", False, "required", "all") \ + .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.F32_Default, DataType.F32_Default) \ + .dtype_format(DataType.I32_Default, DataType.I32_Default) \ + .get_op_info() + + +@op_info_register(range_op_info) +def _range_tbe(): + """Range TBE register""" + return diff --git a/mindspore/ops/_op_impl/tbe/real_div.py b/mindspore/ops/_op_impl/tbe/real_div.py index b39948971d..9c6d9e0b27 100644 --- a/mindspore/ops/_op_impl/tbe/real_div.py +++ b/mindspore/ops/_op_impl/tbe/real_div.py @@ -26,10 +26,9 @@ realdiv_op_info = TBERegOp("RealDiv") \ .input(0, "x", False, "required", "all") \ .input(1, "y", False, "required", "all") \ .output(0, "z", False, "required", "all") \ - .dtype_format(DataType.F16_Default, DataType.F16_Default, 
DataType.F16_Default) \ - .dtype_format(DataType.F16_5HD, DataType.F16_5HD, DataType.F16_5HD) \ - .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default) \ - .dtype_format(DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD) \ + .op_pattern("broadcast") \ + .dtype_format(DataType.F16_None, DataType.F16_None, DataType.F16_None) \ + .dtype_format(DataType.F32_None, DataType.F32_None, DataType.F32_None) \ .get_op_info() diff --git a/mindspore/ops/_op_impl/tbe/reciprocal.py b/mindspore/ops/_op_impl/tbe/reciprocal.py index dfa126384c..77f3bfac27 100644 --- a/mindspore/ops/_op_impl/tbe/reciprocal.py +++ b/mindspore/ops/_op_impl/tbe/reciprocal.py @@ -25,6 +25,7 @@ reciprocal_op_info = TBERegOp("Reciprocal") \ .partial_flag(True) \ .input(0, "x", False, "required", "all") \ .output(0, "y", False, "required", "all") \ + .op_pattern("dynamicFormat") \ .dtype_format(DataType.F16_Default, DataType.F16_Default) \ .dtype_format(DataType.F16_5HD, DataType.F16_5HD) \ .dtype_format(DataType.F16_NHWC, DataType.F16_NHWC) \ diff --git a/mindspore/ops/_op_impl/tbe/reduce_mean.py b/mindspore/ops/_op_impl/tbe/reduce_mean.py index 67b96933a1..b01fd3bebd 100644 --- a/mindspore/ops/_op_impl/tbe/reduce_mean.py +++ b/mindspore/ops/_op_impl/tbe/reduce_mean.py @@ -27,11 +27,11 @@ reduce_mean_op_info = TBERegOp("ReduceMean") \ .attr("keep_dims", "optional", "bool", "all") \ .input(0, "x", False, "required", "all") \ .output(0, "y", False, "required", "all") \ - .dtype_format(DataType.I8_Default, DataType.I8_Default) \ - .dtype_format(DataType.U8_Default, DataType.U8_Default) \ - .dtype_format(DataType.F16_Default, DataType.F16_Default) \ - .dtype_format(DataType.F32_Default, DataType.F32_Default) \ - .dtype_format(DataType.F16_5HD, DataType.F16_5HD) \ + .op_pattern("reduce") \ + .dtype_format(DataType.I8_None, DataType.I8_None) \ + .dtype_format(DataType.U8_None, DataType.U8_None) \ + .dtype_format(DataType.F16_None, DataType.F16_None) \ + 
.dtype_format(DataType.F32_None, DataType.F32_None) \ .get_op_info() diff --git a/mindspore/ops/_op_impl/tbe/relu_grad_v2.py b/mindspore/ops/_op_impl/tbe/relu_grad_v2.py index 93d7dede62..e5f82c8b78 100644 --- a/mindspore/ops/_op_impl/tbe/relu_grad_v2.py +++ b/mindspore/ops/_op_impl/tbe/relu_grad_v2.py @@ -24,7 +24,7 @@ relu_grad_v2_op_info = TBERegOp("ReluGradV2") \ .kernel_name("relu_grad_v2") \ .partial_flag(True) \ .input(0, "gradients", False, "required", "all") \ - .input(1, "mask", False, "rerequired", "all") \ + .input(1, "mask", False, "required", "all") \ .output(0, "backprops", True, "required", "all") \ .dtype_format(DataType.F16_5HD, DataType.U8_Default, DataType.F16_5HD) \ .dtype_format(DataType.F32_5HD, DataType.U8_Default, DataType.F32_5HD) \ diff --git a/mindspore/ops/_op_impl/tbe/scatter_nd_update.py b/mindspore/ops/_op_impl/tbe/scatter_nd_update.py index df0996f26f..74fb7c9b72 100644 --- a/mindspore/ops/_op_impl/tbe/scatter_nd_update.py +++ b/mindspore/ops/_op_impl/tbe/scatter_nd_update.py @@ -31,7 +31,7 @@ scatter_nd_update_op_info = TBERegOp("ScatterNdUpdate") \ .dtype_format(DataType.F16_Default, DataType.I32_Default, DataType.F16_Default, DataType.F16_Default) \ .dtype_format(DataType.F32_Default, DataType.I32_Default, DataType.F32_Default, DataType.F32_Default) \ .dtype_format(DataType.I8_Default, DataType.I32_Default, DataType.I8_Default, DataType.I8_Default) \ - .dtype_format(DataType.U8_Default, DataType.I32_Default, DataType.U8_Default, DataType.U8_Default,) \ + .dtype_format(DataType.U8_Default, DataType.I32_Default, DataType.U8_Default, DataType.U8_Default) \ .dtype_format(DataType.BOOL_Default, DataType.I32_Default, DataType.BOOL_Default, DataType.BOOL_Default) \ .get_op_info() diff --git a/mindspore/ops/_op_impl/tbe/scatter_update.py b/mindspore/ops/_op_impl/tbe/scatter_update.py index 3c330fe435..244b8ab21f 100644 --- a/mindspore/ops/_op_impl/tbe/scatter_update.py +++ b/mindspore/ops/_op_impl/tbe/scatter_update.py @@ -31,7 +31,7 @@ 
scatter_update_op_info = TBERegOp("ScatterUpdate") \ .dtype_format(DataType.F16_Default, DataType.I32_Default, DataType.F16_Default, DataType.F16_Default) \ .dtype_format(DataType.F32_Default, DataType.I32_Default, DataType.F32_Default, DataType.F32_Default) \ .dtype_format(DataType.I8_Default, DataType.I32_Default, DataType.I8_Default, DataType.I8_Default) \ - .dtype_format(DataType.U8_Default, DataType.I32_Default, DataType.U8_Default, DataType.U8_Default,) \ + .dtype_format(DataType.U8_Default, DataType.I32_Default, DataType.U8_Default, DataType.U8_Default) \ .dtype_format(DataType.BOOL_Default, DataType.I32_Default, DataType.BOOL_Default, DataType.BOOL_Default) \ .get_op_info() diff --git a/mindspore/ops/_op_impl/tbe/select.py b/mindspore/ops/_op_impl/tbe/select.py index 4af4325312..e924f05021 100644 --- a/mindspore/ops/_op_impl/tbe/select.py +++ b/mindspore/ops/_op_impl/tbe/select.py @@ -27,6 +27,7 @@ select_op_info = TBERegOp("Select") \ .input(1, "x1", False, "required", "all") \ .input(2, "x2", False, "required", "all") \ .output(0, "y", False, "required", "all") \ + .op_pattern("dynamicFormat") \ .dtype_format(DataType.BOOL_Default, DataType.I8_Default, DataType.I8_Default, DataType.I8_Default) \ .dtype_format(DataType.BOOL_Default, DataType.U8_Default, DataType.U8_Default, DataType.U8_Default) \ .dtype_format(DataType.BOOL_Default, DataType.I32_Default, DataType.I32_Default, DataType.I32_Default) \ diff --git a/mindspore/ops/_op_impl/tbe/sign.py b/mindspore/ops/_op_impl/tbe/sign.py index 823715aa9f..99f7970316 100644 --- a/mindspore/ops/_op_impl/tbe/sign.py +++ b/mindspore/ops/_op_impl/tbe/sign.py @@ -27,11 +27,8 @@ sign_op_info = TBERegOp("Sign") \ .input(0, "x", None, "required", None) \ .output(0, "y", True, "required", "all") \ .dtype_format(DataType.F16_Default, DataType.F16_Default) \ - .dtype_format(DataType.F16_5HD, DataType.F16_5HD) \ .dtype_format(DataType.F32_Default, DataType.F32_Default) \ - .dtype_format(DataType.F32_5HD, DataType.F32_5HD) \ 
.dtype_format(DataType.I32_Default, DataType.I32_Default) \ - .dtype_format(DataType.I32_5HD, DataType.I32_5HD) \ .get_op_info() diff --git a/mindspore/ops/_op_impl/tbe/sin.py b/mindspore/ops/_op_impl/tbe/sin.py index 187c0f0f32..f01f687926 100644 --- a/mindspore/ops/_op_impl/tbe/sin.py +++ b/mindspore/ops/_op_impl/tbe/sin.py @@ -26,7 +26,9 @@ sin_op_info = TBERegOp("Sin") \ .op_pattern("formatAgnostic") \ .input(0, "x", False, "required", "all") \ .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.F16_Default, DataType.F16_Default) \ .dtype_format(DataType.F16_5HD, DataType.F16_5HD) \ + .dtype_format(DataType.F32_Default, DataType.F32_Default) \ .dtype_format(DataType.F32_5HD, DataType.F32_5HD) \ .get_op_info() diff --git a/mindspore/ops/_op_impl/tbe/sinh.py b/mindspore/ops/_op_impl/tbe/sinh.py new file mode 100644 index 0000000000..27eb66d274 --- /dev/null +++ b/mindspore/ops/_op_impl/tbe/sinh.py @@ -0,0 +1,37 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""Sinh op""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +sinh_op_info = TBERegOp("Sinh") \ + .fusion_type("ELEMWISE") \ + .async_flag(False) \ + .binfile_name("sinh.so") \ + .compute_cost(10) \ + .kernel_name("sinh") \ + .partial_flag(True) \ + .op_pattern("formatAgnostic") \ + .input(0, "x", False, "required", "all") \ + .output(0, "y", True, "required", "all") \ + .dtype_format(DataType.F16_5HD, DataType.F16_5HD) \ + .dtype_format(DataType.F32_5HD, DataType.F32_5HD) \ + .get_op_info() + + +@op_info_register(sinh_op_info) +def _sinh_tbe(): + """Sinh TBE register""" + return diff --git a/mindspore/ops/_op_impl/tbe/softmax_grad_ext.py b/mindspore/ops/_op_impl/tbe/softmax_grad_ext.py index 51060d717b..d43183dcb7 100644 --- a/mindspore/ops/_op_impl/tbe/softmax_grad_ext.py +++ b/mindspore/ops/_op_impl/tbe/softmax_grad_ext.py @@ -24,12 +24,13 @@ softmax_grad_ext_op_info = TBERegOp("SoftmaxGradExt") \ .kernel_name("softmax_grad_ext") \ .partial_flag(True) \ .dynamic_format(True) \ - .attr("axes", "required", "listInt", "all") \ - .attr("keep_dims", "required", "bool", "all") \ + .attr("axis", "required", "listInt", "all") \ + .attr("keepdims", "required", "bool", "all") \ .input(0, "grad", False, "required", "all") \ .input(1, "x1", False, "required", "all") \ .input(2, "x2", False, "required", "all") \ .output(0, "y", True, "required", "all") \ + .op_pattern("dynamicFormat") \ .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, DataType.F16_Default) \ .dtype_format(DataType.F16_5HD, DataType.F16_5HD, diff --git a/mindspore/ops/_op_impl/tbe/softplus.py b/mindspore/ops/_op_impl/tbe/softplus.py index d362cd06db..92261d91ef 100644 --- a/mindspore/ops/_op_impl/tbe/softplus.py +++ b/mindspore/ops/_op_impl/tbe/softplus.py @@ -27,9 +27,7 @@ softplus_op_info = TBERegOp("Softplus") \ .input(0, "x", False, "required", "all") \ 
.output(0, "y", False, "required", "all") \ .dtype_format(DataType.F16_Default, DataType.F16_Default) \ - .dtype_format(DataType.F16_5HD, DataType.F16_5HD) \ .dtype_format(DataType.F32_Default, DataType.F32_Default) \ - .dtype_format(DataType.F32_5HD, DataType.F32_5HD) \ .get_op_info() diff --git a/mindspore/ops/_op_impl/tbe/softplus_grad.py b/mindspore/ops/_op_impl/tbe/softplus_grad.py index 4bf7a82440..3dc0e7ee0c 100644 --- a/mindspore/ops/_op_impl/tbe/softplus_grad.py +++ b/mindspore/ops/_op_impl/tbe/softplus_grad.py @@ -28,9 +28,7 @@ softplus_grad_op_info = TBERegOp("SoftplusGrad") \ .input(1, "features", False, "required", "all") \ .output(0, "backprops", False, "required", "all") \ .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.F16_Default) \ - .dtype_format(DataType.F16_5HD, DataType.F16_5HD, DataType.F16_5HD) \ .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default) \ - .dtype_format(DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD) \ .get_op_info() diff --git a/mindspore/ops/_op_impl/tbe/sparse_apply_adagrad.py b/mindspore/ops/_op_impl/tbe/sparse_apply_adagrad.py index ca77a5eaed..c1083af9f6 100644 --- a/mindspore/ops/_op_impl/tbe/sparse_apply_adagrad.py +++ b/mindspore/ops/_op_impl/tbe/sparse_apply_adagrad.py @@ -13,15 +13,15 @@ # limitations under the License. 
# ============================================================================ -"""SparseApplyAdagrad op""" +"""SparseApplyAdagradD op""" from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType -sparse_apply_adagrad_op_info = TBERegOp("SparseApplyAdagrad") \ +sparse_apply_adagrad_d_op_info = TBERegOp("SparseApplyAdagrad") \ .fusion_type("OPAQUE") \ .async_flag(False) \ - .binfile_name("sparse_apply_adagrad.so") \ + .binfile_name("sparse_apply_adagrad_d.so") \ .compute_cost(10) \ - .kernel_name("sparse_apply_adagrad") \ + .kernel_name("sparse_apply_adagrad_d") \ .partial_flag(True) \ .attr("lr", "required", "float", "all") \ .attr("update_slots", "optional", "bool", "all") \ @@ -31,14 +31,17 @@ sparse_apply_adagrad_op_info = TBERegOp("SparseApplyAdagrad") \ .input(2, "grad", False, "required", "all") \ .input(3, "indices", False, "required", "all") \ .output(0, "var", False, "required", "all") \ - .dtype_format(DataType.F32_NCHW, DataType.F32_NCHW, DataType.F32_NCHW, DataType.I32_NCHW, DataType.F32_NCHW) \ - .dtype_format(DataType.F32_NHWC, DataType.F32_NHWC, DataType.F32_NHWC, DataType.I32_NHWC, DataType.F32_NHWC) \ + .output(1, "accum", False, "required", "all") \ + .dtype_format(DataType.F32_NCHW, DataType.F32_NCHW, DataType.F32_NCHW, DataType.I32_NCHW, + DataType.F32_NCHW, DataType.F32_NCHW) \ + .dtype_format(DataType.F32_NHWC, DataType.F32_NHWC, DataType.F32_NHWC, DataType.I32_NHWC, + DataType.F32_NHWC, DataType.F32_NHWC) \ .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, DataType.I32_Default, - DataType.F32_Default) \ + DataType.F32_Default, DataType.F32_Default) \ .get_op_info() -@op_info_register(sparse_apply_adagrad_op_info) +@op_info_register(sparse_apply_adagrad_d_op_info) def _sparse_apply_adagrad_tbe(): - """SparseApplyAdagrad TBE register""" + """SparseApplyAdagradD TBE register""" return diff --git a/mindspore/ops/_op_impl/tbe/sparse_apply_proximal_adagrad.py 
b/mindspore/ops/_op_impl/tbe/sparse_apply_proximal_adagrad.py index f665890c55..782be983fa 100644 --- a/mindspore/ops/_op_impl/tbe/sparse_apply_proximal_adagrad.py +++ b/mindspore/ops/_op_impl/tbe/sparse_apply_proximal_adagrad.py @@ -13,10 +13,10 @@ # limitations under the License. # ============================================================================ -"""SparseApplyProximalAdagrad op""" +"""SparseApplyProximalAdagradD op""" from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType -sparse_apply_proximal_adagrad_op_info = TBERegOp("SparseApplyProximalAdagrad") \ +sparse_apply_proximal_adagrad_d_op_info = TBERegOp("SparseApplyProximalAdagrad") \ .fusion_type("OPAQUE") \ .async_flag(False) \ .binfile_name("sparse_apply_proximal_adagrad.so") \ @@ -32,70 +32,101 @@ sparse_apply_proximal_adagrad_op_info = TBERegOp("SparseApplyProximalAdagrad") \ .input(5, "grad", False, "required", "all") \ .input(6, "indices", False, "required", "all") \ .output(0, "var", False, "required", "all") \ + .output(1, "accum", False, "required", "all") \ .dtype_format(DataType.F32_NCHW, DataType.F32_NCHW, DataType.F32_NCHW, DataType.F32_NCHW, - DataType.F32_NCHW, DataType.F32_NCHW, DataType.I16_NCHW, DataType.F32_NCHW) \ + DataType.F32_NCHW, DataType.F32_NCHW, DataType.I16_NCHW, DataType.F32_NCHW, + DataType.F32_NCHW) \ .dtype_format(DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD, - DataType.F32_5HD, DataType.F32_5HD, DataType.I16_5HD, DataType.F32_5HD) \ + DataType.F32_5HD, DataType.F32_5HD, DataType.I16_5HD, DataType.F32_5HD, + DataType.F32_5HD) \ .dtype_format(DataType.F32_NHWC, DataType.F32_NHWC, DataType.F32_NHWC, DataType.F32_NHWC, - DataType.F32_NHWC, DataType.F32_NHWC, DataType.I16_NHWC, DataType.F32_NHWC) \ + DataType.F32_NHWC, DataType.F32_NHWC, DataType.I16_NHWC, DataType.F32_NHWC, + DataType.F32_NHWC) \ .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, - DataType.F32_Default, 
DataType.F32_Default, DataType.I16_Default, DataType.F32_Default) \ + DataType.F32_Default, DataType.F32_Default, DataType.I16_Default, DataType.F32_Default, + DataType.F32_Default) \ .dtype_format(DataType.F32_FracZ, DataType.F32_FracZ, DataType.F32_FracZ, DataType.F32_FracZ, - DataType.F32_FracZ, DataType.F32_FracZ, DataType.I16_FracZ, DataType.F32_FracZ) \ + DataType.F32_FracZ, DataType.F32_FracZ, DataType.I16_FracZ, DataType.F32_FracZ, + DataType.F32_FracZ) \ .dtype_format(DataType.F32_NCHW, DataType.F32_NCHW, DataType.F32_NCHW, DataType.F32_NCHW, - DataType.F32_NCHW, DataType.F32_NCHW, DataType.I32_NCHW, DataType.F32_NCHW) \ + DataType.F32_NCHW, DataType.F32_NCHW, DataType.I32_NCHW, DataType.F32_NCHW, + DataType.F32_NCHW) \ .dtype_format(DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD, - DataType.F32_5HD, DataType.F32_5HD, DataType.I32_5HD, DataType.F32_5HD) \ + DataType.F32_5HD, DataType.F32_5HD, DataType.I32_5HD, DataType.F32_5HD, + DataType.F32_5HD) \ .dtype_format(DataType.F32_NHWC, DataType.F32_NHWC, DataType.F32_NHWC, DataType.F32_NHWC, - DataType.F32_NHWC, DataType.F32_NHWC, DataType.I32_NHWC, DataType.F32_NHWC) \ + DataType.F32_NHWC, DataType.F32_NHWC, DataType.I32_NHWC, DataType.F32_NHWC, + DataType.F32_NHWC) \ .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, - DataType.F32_Default, DataType.F32_Default, DataType.I32_Default, DataType.F32_Default) \ + DataType.F32_Default, DataType.F32_Default, DataType.I32_Default, DataType.F32_Default, + DataType.F32_Default) \ .dtype_format(DataType.F32_FracZ, DataType.F32_FracZ, DataType.F32_FracZ, DataType.F32_FracZ, - DataType.F32_FracZ, DataType.F32_FracZ, DataType.I32_FracZ, DataType.F32_FracZ) \ + DataType.F32_FracZ, DataType.F32_FracZ, DataType.I32_FracZ, DataType.F32_FracZ, + DataType.F32_FracZ) \ .dtype_format(DataType.F32_NCHW, DataType.F32_NCHW, DataType.F32_NCHW, DataType.F32_NCHW, - DataType.F32_NCHW, DataType.F32_NCHW, 
DataType.I64_NCHW, DataType.F32_NCHW) \ + DataType.F32_NCHW, DataType.F32_NCHW, DataType.I64_NCHW, DataType.F32_NCHW, + DataType.F32_NCHW) \ .dtype_format(DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD, - DataType.F32_5HD, DataType.F32_5HD, DataType.I64_5HD, DataType.F32_5HD) \ + DataType.F32_5HD, DataType.F32_5HD, DataType.I64_5HD, DataType.F32_5HD, + DataType.F32_5HD) \ .dtype_format(DataType.F32_NHWC, DataType.F32_NHWC, DataType.F32_NHWC, DataType.F32_NHWC, - DataType.F32_NHWC, DataType.F32_NHWC, DataType.I64_NHWC, DataType.F32_NHWC) \ + DataType.F32_NHWC, DataType.F32_NHWC, DataType.I64_NHWC, DataType.F32_NHWC, + DataType.F32_NHWC) \ .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, - DataType.F32_Default, DataType.F32_Default, DataType.I64_Default, DataType.F32_Default) \ + DataType.F32_Default, DataType.F32_Default, DataType.I64_Default, DataType.F32_Default, + DataType.F32_Default) \ .dtype_format(DataType.F32_FracZ, DataType.F32_FracZ, DataType.F32_FracZ, DataType.F32_FracZ, - DataType.F32_FracZ, DataType.F32_FracZ, DataType.I64_FracZ, DataType.F32_FracZ) \ + DataType.F32_FracZ, DataType.F32_FracZ, DataType.I64_FracZ, DataType.F32_FracZ, + DataType.F32_FracZ) \ .dtype_format(DataType.F32_NCHW, DataType.F32_NCHW, DataType.F32_NCHW, DataType.F32_NCHW, - DataType.F32_NCHW, DataType.F32_NCHW, DataType.U16_NCHW, DataType.F32_NCHW) \ + DataType.F32_NCHW, DataType.F32_NCHW, DataType.U16_NCHW, DataType.F32_NCHW, + DataType.F32_NCHW) \ .dtype_format(DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD, - DataType.F32_5HD, DataType.F32_5HD, DataType.U16_5HD, DataType.F32_5HD) \ + DataType.F32_5HD, DataType.F32_5HD, DataType.U16_5HD, DataType.F32_5HD, + DataType.F32_5HD) \ .dtype_format(DataType.F32_NHWC, DataType.F32_NHWC, DataType.F32_NHWC, DataType.F32_NHWC, - DataType.F32_NHWC, DataType.F32_NHWC, DataType.U16_NHWC, DataType.F32_NHWC) \ + DataType.F32_NHWC, 
DataType.F32_NHWC, DataType.U16_NHWC, DataType.F32_NHWC, + DataType.F32_NHWC) \ .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, - DataType.F32_Default, DataType.F32_Default, DataType.U16_Default, DataType.F32_Default) \ + DataType.F32_Default, DataType.F32_Default, DataType.U16_Default, DataType.F32_Default, + DataType.F32_Default) \ .dtype_format(DataType.F32_FracZ, DataType.F32_FracZ, DataType.F32_FracZ, DataType.F32_FracZ, - DataType.F32_FracZ, DataType.F32_FracZ, DataType.U16_FracZ, DataType.F32_FracZ) \ + DataType.F32_FracZ, DataType.F32_FracZ, DataType.U16_FracZ, DataType.F32_FracZ, + DataType.F32_FracZ) \ .dtype_format(DataType.F32_NCHW, DataType.F32_NCHW, DataType.F32_NCHW, DataType.F32_NCHW, - DataType.F32_NCHW, DataType.F32_NCHW, DataType.U32_NCHW, DataType.F32_NCHW) \ + DataType.F32_NCHW, DataType.F32_NCHW, DataType.U32_NCHW, DataType.F32_NCHW, + DataType.F32_NCHW) \ .dtype_format(DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD, - DataType.F32_5HD, DataType.F32_5HD, DataType.U32_5HD, DataType.F32_5HD) \ + DataType.F32_5HD, DataType.F32_5HD, DataType.U32_5HD, DataType.F32_5HD, + DataType.F32_5HD) \ .dtype_format(DataType.F32_NHWC, DataType.F32_NHWC, DataType.F32_NHWC, DataType.F32_NHWC, - DataType.F32_NHWC, DataType.F32_NHWC, DataType.U32_NHWC, DataType.F32_NHWC) \ + DataType.F32_NHWC, DataType.F32_NHWC, DataType.U32_NHWC, DataType.F32_NHWC, + DataType.F32_NHWC) \ .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, - DataType.F32_Default, DataType.F32_Default, DataType.U32_Default, DataType.F32_Default) \ + DataType.F32_Default, DataType.F32_Default, DataType.U32_Default, DataType.F32_Default, + DataType.F32_Default) \ .dtype_format(DataType.F32_FracZ, DataType.F32_FracZ, DataType.F32_FracZ, DataType.F32_FracZ, - DataType.F32_FracZ, DataType.F32_FracZ, DataType.U32_FracZ, DataType.F32_FracZ) \ + DataType.F32_FracZ, 
DataType.F32_FracZ, DataType.U32_FracZ, DataType.F32_FracZ, + DataType.F32_FracZ) \ .dtype_format(DataType.F32_NCHW, DataType.F32_NCHW, DataType.F32_NCHW, DataType.F32_NCHW, - DataType.F32_NCHW, DataType.F32_NCHW, DataType.U64_NCHW, DataType.F32_NCHW) \ + DataType.F32_NCHW, DataType.F32_NCHW, DataType.U64_NCHW, DataType.F32_NCHW, + DataType.F32_NCHW) \ .dtype_format(DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD, - DataType.F32_5HD, DataType.F32_5HD, DataType.U64_5HD, DataType.F32_5HD) \ + DataType.F32_5HD, DataType.F32_5HD, DataType.U64_5HD, DataType.F32_5HD, + DataType.F32_5HD) \ .dtype_format(DataType.F32_NHWC, DataType.F32_NHWC, DataType.F32_NHWC, DataType.F32_NHWC, - DataType.F32_NHWC, DataType.F32_NHWC, DataType.U64_NHWC, DataType.F32_NHWC) \ + DataType.F32_NHWC, DataType.F32_NHWC, DataType.U64_NHWC, DataType.F32_NHWC, + DataType.F32_NHWC) \ .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, - DataType.F32_Default, DataType.F32_Default, DataType.U64_Default, DataType.F32_Default) \ + DataType.F32_Default, DataType.F32_Default, DataType.U64_Default, DataType.F32_Default, + DataType.F32_Default) \ .dtype_format(DataType.F32_FracZ, DataType.F32_FracZ, DataType.F32_FracZ, DataType.F32_FracZ, - DataType.F32_FracZ, DataType.F32_FracZ, DataType.U64_FracZ, DataType.F32_FracZ) \ + DataType.F32_FracZ, DataType.F32_FracZ, DataType.U64_FracZ, DataType.F32_FracZ, + DataType.F32_FracZ) \ .get_op_info() -@op_info_register(sparse_apply_proximal_adagrad_op_info) +@op_info_register(sparse_apply_proximal_adagrad_d_op_info) def _sparse_apply_proximal_adagrad(): - """SparseApplyProximalAdagrad TBE register""" + """SparseApplyProximalAdagradD TBE register""" return diff --git a/mindspore/ops/_op_impl/tbe/sparse_gather_v2.py b/mindspore/ops/_op_impl/tbe/sparse_gather_v2.py new file mode 100644 index 0000000000..b824836312 --- /dev/null +++ b/mindspore/ops/_op_impl/tbe/sparse_gather_v2.py @@ -0,0 +1,66 @@ +# 
Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""SparseGatherV2 op""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +sparse_gather_v2_op_info = TBERegOp("SparseGatherV2") \ + .fusion_type("OPAQUE") \ + .async_flag(False) \ + .binfile_name("gather_v2_d.so") \ + .compute_cost(10) \ + .kernel_name("gather_v2_d") \ + .partial_flag(True) \ + .attr("axis", "optional", "int", "all") \ + .input(0, "x", False, "required", "all") \ + .input(1, "indices", False, "required", "all") \ + .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.I8_Default, DataType.I32_Default, DataType.I8_Default) \ + .dtype_format(DataType.I8_Default, DataType.I64_Default, DataType.I8_Default) \ + .dtype_format(DataType.I8_5HD, DataType.I32_5HD, DataType.I8_5HD) \ + .dtype_format(DataType.I8_5HD, DataType.I64_5HD, DataType.I8_5HD) \ + .dtype_format(DataType.I8_FracZ, DataType.I32_FracZ, DataType.I8_FracZ) \ + .dtype_format(DataType.I8_FracZ, DataType.I64_FracZ, DataType.I8_FracZ) \ + .dtype_format(DataType.U8_Default, DataType.I32_Default, DataType.U8_Default) \ + .dtype_format(DataType.U8_Default, DataType.I64_Default, DataType.U8_Default) \ + .dtype_format(DataType.U8_5HD, DataType.I32_5HD, DataType.U8_5HD) \ + .dtype_format(DataType.U8_5HD, DataType.I64_5HD, DataType.U8_5HD) \ + .dtype_format(DataType.U8_FracZ, 
DataType.I32_FracZ, DataType.U8_FracZ) \ + .dtype_format(DataType.U8_FracZ, DataType.I64_FracZ, DataType.U8_FracZ) \ + .dtype_format(DataType.I32_Default, DataType.I32_Default, DataType.I32_Default) \ + .dtype_format(DataType.I32_Default, DataType.I64_Default, DataType.I32_Default) \ + .dtype_format(DataType.I32_5HD, DataType.I32_5HD, DataType.I32_5HD) \ + .dtype_format(DataType.I32_5HD, DataType.I64_5HD, DataType.I32_5HD) \ + .dtype_format(DataType.I32_FracZ, DataType.I32_FracZ, DataType.I32_FracZ) \ + .dtype_format(DataType.I32_FracZ, DataType.I64_FracZ, DataType.I32_FracZ) \ + .dtype_format(DataType.F16_Default, DataType.I32_Default, DataType.F16_Default) \ + .dtype_format(DataType.F16_Default, DataType.I64_Default, DataType.F16_Default) \ + .dtype_format(DataType.F16_5HD, DataType.I32_5HD, DataType.F16_5HD) \ + .dtype_format(DataType.F16_5HD, DataType.I64_5HD, DataType.F16_5HD) \ + .dtype_format(DataType.F16_FracZ, DataType.I32_FracZ, DataType.F16_FracZ) \ + .dtype_format(DataType.F16_FracZ, DataType.I64_FracZ, DataType.F16_FracZ) \ + .dtype_format(DataType.F32_Default, DataType.I32_Default, DataType.F32_Default) \ + .dtype_format(DataType.F32_Default, DataType.I64_Default, DataType.F32_Default) \ + .dtype_format(DataType.F32_5HD, DataType.I32_5HD, DataType.F32_5HD) \ + .dtype_format(DataType.F32_5HD, DataType.I64_5HD, DataType.F32_5HD) \ + .dtype_format(DataType.F32_FracZ, DataType.I32_FracZ, DataType.F32_FracZ) \ + .dtype_format(DataType.F32_FracZ, DataType.I64_FracZ, DataType.F32_FracZ) \ + .get_op_info() + + +@op_info_register(sparse_gather_v2_op_info) +def _sparse_gather_v2_tbe(): + """SparseGatherV2 TBE register""" + return diff --git a/mindspore/ops/_op_impl/tbe/split_d.py b/mindspore/ops/_op_impl/tbe/split_d.py index dcc8219fd4..d2faf31096 100644 --- a/mindspore/ops/_op_impl/tbe/split_d.py +++ b/mindspore/ops/_op_impl/tbe/split_d.py @@ -27,6 +27,7 @@ split_d_op_info = TBERegOp("Split") \ .attr("output_num", "required", "int", "all") \ .input(0, "value", 
False, "required", "all") \ .output(0, "output", False, "dynamic", "all") \ + .op_pattern("dynamicFormat") \ .dtype_format(DataType.BOOL_Default, DataType.BOOL_Default) \ .dtype_format(DataType.BOOL_NHWC, DataType.BOOL_NHWC) \ .dtype_format(DataType.I8_Default, DataType.I8_Default) \ diff --git a/mindspore/ops/_op_impl/tbe/splitv.py b/mindspore/ops/_op_impl/tbe/splitv.py new file mode 100644 index 0000000000..29f65c7e87 --- /dev/null +++ b/mindspore/ops/_op_impl/tbe/splitv.py @@ -0,0 +1,60 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""SplitV op""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +split_v_op_info = TBERegOp("SplitV") \ + .fusion_type("ELEMWISE") \ + .async_flag(False) \ + .binfile_name("split_v_d.so") \ + .compute_cost(10) \ + .kernel_name("split_v_d") \ + .partial_flag(True) \ + .attr("size_splits", "required", "listInt", "all") \ + .attr("split_dim", "required", "int", "all") \ + .attr("num_split", "required", "int", "all") \ + .input(0, "x", False, "required", "all") \ + .output(0, "y", False, "dynamic", "all") \ + .op_pattern("dynamicFormat") \ + .dtype_format(DataType.BOOL_Default, DataType.BOOL_Default) \ + .dtype_format(DataType.BOOL_NHWC, DataType.BOOL_NHWC) \ + .dtype_format(DataType.I8_Default, DataType.I8_Default) \ + .dtype_format(DataType.I8_NHWC, DataType.I8_NHWC) \ + .dtype_format(DataType.U8_Default, DataType.U8_Default) \ + .dtype_format(DataType.U8_NHWC, DataType.U8_NHWC) \ + .dtype_format(DataType.I16_Default, DataType.I16_Default) \ + .dtype_format(DataType.I16_NHWC, DataType.I16_NHWC) \ + .dtype_format(DataType.U16_Default, DataType.U16_Default) \ + .dtype_format(DataType.U16_NHWC, DataType.U16_NHWC) \ + .dtype_format(DataType.I32_Default, DataType.I32_Default) \ + .dtype_format(DataType.I32_NHWC, DataType.I32_NHWC) \ + .dtype_format(DataType.U32_Default, DataType.U32_Default) \ + .dtype_format(DataType.U32_NHWC, DataType.U32_NHWC) \ + .dtype_format(DataType.I64_Default, DataType.I64_Default) \ + .dtype_format(DataType.I64_NHWC, DataType.I64_NHWC) \ + .dtype_format(DataType.U64_Default, DataType.U64_Default) \ + .dtype_format(DataType.U64_NHWC, DataType.U64_NHWC) \ + .dtype_format(DataType.F16_Default, DataType.F16_Default) \ + .dtype_format(DataType.F16_NHWC, DataType.F16_NHWC) \ + .dtype_format(DataType.F32_Default, DataType.F32_Default) \ + .dtype_format(DataType.F32_NHWC, DataType.F32_NHWC) \ + .get_op_info() + + 
+@op_info_register(split_v_op_info) +def _split_v_tbe(): + """SplitV TBE register""" + return diff --git a/mindspore/ops/_op_impl/tbe/strided_read.py b/mindspore/ops/_op_impl/tbe/strided_read.py new file mode 100644 index 0000000000..1ebd29f8f2 --- /dev/null +++ b/mindspore/ops/_op_impl/tbe/strided_read.py @@ -0,0 +1,38 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""StridedRead op""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +strided_read_op_info = TBERegOp("StridedRead") \ + .fusion_type("OPAQUE") \ + .async_flag(False) \ + .binfile_name("strided_read.so") \ + .compute_cost(10) \ + .kernel_name("strided_read") \ + .partial_flag(True) \ + .attr("axis", "required", "int", "all") \ + .attr("stride", "required", "int", "all") \ + .input(0, "x", False, "required", "all") \ + .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.F16_5HD, DataType.F16_5HD) \ + .dtype_format(DataType.I8_5HD, DataType.I8_5HD) \ + .get_op_info() + + +@op_info_register(strided_read_op_info) +def _strided_read_tbe(): + """StridedRead TBE register""" + return diff --git a/mindspore/ops/_op_impl/tbe/strided_write.py b/mindspore/ops/_op_impl/tbe/strided_write.py new file mode 100644 index 0000000000..feda752b28 --- /dev/null +++ b/mindspore/ops/_op_impl/tbe/strided_write.py @@ -0,0 +1,38 @@ +# Copyright 2020 
Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""StridedWrite op""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +strided_write_op_info = TBERegOp("StridedWrite") \ + .fusion_type("OPAQUE") \ + .async_flag(False) \ + .binfile_name("strided_write.so") \ + .compute_cost(10) \ + .kernel_name("strided_write") \ + .partial_flag(True) \ + .attr("axis", "required", "int", "all") \ + .attr("stride", "required", "int", "all") \ + .input(0, "x", False, "required", "all") \ + .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.F16_5HD, DataType.F16_5HD) \ + .dtype_format(DataType.I8_5HD, DataType.I8_5HD) \ + .get_op_info() + + +@op_info_register(strided_write_op_info) +def _strided_write_tbe(): + """StridedWrite TBE register""" + return diff --git a/mindspore/ops/_op_impl/tbe/tensor_add.py b/mindspore/ops/_op_impl/tbe/tensor_add.py index 255c1b1278..a1f21bee77 100644 --- a/mindspore/ops/_op_impl/tbe/tensor_add.py +++ b/mindspore/ops/_op_impl/tbe/tensor_add.py @@ -26,6 +26,7 @@ tensor_add_op_info = TBERegOp("TensorAdd") \ .input(0, "x1", False, "required", "all") \ .input(1, "x2", False, "required", "all") \ .output(0, "y", False, "required", "all") \ + .op_pattern("dynamicFormat") \ .dtype_format(DataType.I32_Default, DataType.I32_Default, DataType.I32_Default) \ .dtype_format(DataType.F16_Default, 
DataType.F16_Default, DataType.F16_Default) \ .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default) \ diff --git a/mindspore/ops/_op_impl/tbe/tensor_scatter_update.py b/mindspore/ops/_op_impl/tbe/tensor_scatter_update.py new file mode 100644 index 0000000000..46d6b20357 --- /dev/null +++ b/mindspore/ops/_op_impl/tbe/tensor_scatter_update.py @@ -0,0 +1,41 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""TensorScatterUpdate op""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +tensor_scatter_update_op_info = TBERegOp("TensorScatterUpdate") \ + .fusion_type("ELEMWISE") \ + .async_flag(False) \ + .binfile_name("tensor_scatter_update.so") \ + .compute_cost(10) \ + .kernel_name("tensor_scatter_update") \ + .partial_flag(True) \ + .input(0, "x", False, "required", "all") \ + .input(1, "indices", False, "required", "all") \ + .input(2, "updates", False, "required", "all") \ + .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.F16_Default, DataType.I32_Default, DataType.F16_Default, DataType.F16_Default) \ + .dtype_format(DataType.F32_Default, DataType.I32_Default, DataType.F32_Default, DataType.F32_Default) \ + .dtype_format(DataType.I8_Default, DataType.I32_Default, DataType.I8_Default, DataType.I8_Default) \ + .dtype_format(DataType.U8_Default,
DataType.I32_Default, DataType.U8_Default, DataType.U8_Default) \ + .dtype_format(DataType.I32_Default, DataType.I32_Default, DataType.I32_Default, DataType.I32_Default) \ + .get_op_info() + + +@op_info_register(tensor_scatter_update_op_info) +def _tensor_scatter_update_tbe(): + """TensorScatterUpdate TBE register""" + return diff --git a/mindspore/ops/_op_impl/tbe/trans_data.py b/mindspore/ops/_op_impl/tbe/trans_data.py index f961491b37..c0cce302cd 100644 --- a/mindspore/ops/_op_impl/tbe/trans_data.py +++ b/mindspore/ops/_op_impl/tbe/trans_data.py @@ -23,41 +23,112 @@ trans_data_op_info = TBERegOp("TransData") \ .compute_cost(10) \ .kernel_name("trans_data") \ .partial_flag(True) \ - .attr("src_format", "required", "str", "DefaultFormat,NC1HWC0,FracZ,FRACTAL_NZ,HWCN,C1HWNCoC0")\ - .attr("dst_format", "required", "str", "DefaultFormat,NC1HWC0,FracZ,FRACTAL_NZ,HWCN,C1HWNCoC0")\ + .attr("src_format", "required", "str", "DefaultFormat, NC1HWC0, FracZ, FRACTAL_NZ, HWCN, C1HWNCoC0, NDHWC, NHWC") \ + .attr("dst_format", "required", "str", "DefaultFormat, NC1HWC0, FracZ, FRACTAL_NZ, HWCN, C1HWNCoC0, NDHWC, NHWC") \ .input(0, "src", False, "required", "all") \ .output(0, "dst", False, "required", "all") \ - .dtype_format(DataType.U16_Default, DataType.U16_5HD) \ - .dtype_format(DataType.U16_Default, DataType.U16_FracZ) \ - .dtype_format(DataType.U16_Default, DataType.U16_FracNZ) \ - .dtype_format(DataType.U16_FracZ, DataType.U16_Default) \ - .dtype_format(DataType.U16_FracZ, DataType.U16_HWCN) \ - .dtype_format(DataType.U16_FracNZ, DataType.U16_Default) \ - .dtype_format(DataType.U16_5HD, DataType.U16_Default) \ - .dtype_format(DataType.U16_HWCN, DataType.U16_FracZ) \ - .dtype_format(DataType.U16_HWCN, DataType.U16_C1HWNCoC0) \ - .dtype_format(DataType.U16_C1HWNCoC0, DataType.U16_HWCN) \ - .dtype_format(DataType.BOOL_Default, DataType.BOOL_5HD) \ - .dtype_format(DataType.F16_Default, DataType.F16_5HD) \ + .dtype_format(DataType.F32_NHWC, DataType.F32_5HD) \ + 
.dtype_format(DataType.F32_Default, DataType.F32_5HD) \ + .dtype_format(DataType.F32_5HD, DataType.F32_NHWC) \ + .dtype_format(DataType.F32_5HD, DataType.F32_Default) \ + .dtype_format(DataType.F32_FracZ, DataType.F32_Default) \ + .dtype_format(DataType.F32_Default, DataType.F32_FracZ) \ + .dtype_format(DataType.F32_HWCN, DataType.F32_FracZ) \ + .dtype_format(DataType.F32_FracZ, DataType.F32_HWCN) \ + .dtype_format(DataType.F32_C1HWNCoC0, DataType.F32_HWCN) \ + .dtype_format(DataType.F32_HWCN, DataType.F32_C1HWNCoC0) \ .dtype_format(DataType.F16_Default, DataType.F16_FracZ) \ - .dtype_format(DataType.F16_Default, DataType.F16_FracNZ) \ - .dtype_format(DataType.F16_FracZ, DataType.F16_Default) \ - .dtype_format(DataType.F16_FracZ, DataType.F16_HWCN) \ - .dtype_format(DataType.F16_FracNZ, DataType.F16_Default) \ + .dtype_format(DataType.F16_NHWC, DataType.F16_FracZ) \ + .dtype_format(DataType.F16_HWCN, DataType.F16_FracZ) \ + .dtype_format(DataType.F16_Default, DataType.F16_5HD) \ + .dtype_format(DataType.F16_NHWC, DataType.F16_5HD) \ + .dtype_format(DataType.F16_HWCN, DataType.F16_5HD) \ + .dtype_format(DataType.F16_5HD, DataType.F16_NHWC) \ .dtype_format(DataType.F16_5HD, DataType.F16_Default) \ + .dtype_format(DataType.F16_FracZ, DataType.F16_Default) \ + .dtype_format(DataType.F16_Default, DataType.F16_FracZ) \ .dtype_format(DataType.F16_HWCN, DataType.F16_FracZ) \ - .dtype_format(DataType.F16_HWCN, DataType.F16_C1HWNCoC0) \ + .dtype_format(DataType.F16_FracZ, DataType.F16_HWCN) \ .dtype_format(DataType.F16_C1HWNCoC0, DataType.F16_HWCN) \ - .dtype_format(DataType.F32_Default, DataType.F32_5HD) \ - .dtype_format(DataType.F32_Default, DataType.F32_FracZ) \ + .dtype_format(DataType.F16_HWCN, DataType.F16_5HD) \ + .dtype_format(DataType.F16_Default, DataType.F16_FracNZ) \ .dtype_format(DataType.F32_Default, DataType.F32_FracNZ) \ - .dtype_format(DataType.F32_FracZ, DataType.F32_Default) \ - .dtype_format(DataType.F32_FracZ, DataType.F32_HWCN) \ + 
.dtype_format(DataType.F16_FracNZ, DataType.F16_Default) \ .dtype_format(DataType.F32_FracNZ, DataType.F32_Default) \ - .dtype_format(DataType.F32_5HD, DataType.F32_Default) \ - .dtype_format(DataType.F32_HWCN, DataType.F32_FracZ) \ - .dtype_format(DataType.F32_HWCN, DataType.F32_C1HWNCoC0) \ - .dtype_format(DataType.F32_C1HWNCoC0, DataType.F32_HWCN) \ + .dtype_format(DataType.BOOL_NHWC, DataType.BOOL_5HD) \ + .dtype_format(DataType.BOOL_Default, DataType.BOOL_5HD) \ + .dtype_format(DataType.BOOL_5HD, DataType.BOOL_NHWC) \ + .dtype_format(DataType.BOOL_5HD, DataType.BOOL_Default) \ + .dtype_format(DataType.F16_Default, DataType.F16_NHWC) \ + .dtype_format(DataType.F16_Default, DataType.F16_HWCN) \ + .dtype_format(DataType.F16_NHWC, DataType.F16_Default) \ + .dtype_format(DataType.F16_NHWC, DataType.F16_HWCN) \ + .dtype_format(DataType.F16_HWCN, DataType.F16_Default) \ + .dtype_format(DataType.F16_HWCN, DataType.F16_NHWC) \ + .dtype_format(DataType.F32_Default, DataType.F32_NHWC) \ + .dtype_format(DataType.F32_Default, DataType.F32_HWCN) \ + .dtype_format(DataType.F32_NHWC, DataType.F32_Default) \ + .dtype_format(DataType.F32_NHWC, DataType.F32_HWCN) \ + .dtype_format(DataType.F32_HWCN, DataType.F32_Default) \ + .dtype_format(DataType.F32_HWCN, DataType.F32_NHWC) \ + .dtype_format(DataType.I8_Default, DataType.I8_FracNZ) \ + .dtype_format(DataType.I8_Default, DataType.I8_FracZ) \ + .dtype_format(DataType.I8_Default, DataType.I8_NHWC) \ + .dtype_format(DataType.I8_Default, DataType.I8_HWCN) \ + .dtype_format(DataType.I8_NHWC, DataType.I8_Default) \ + .dtype_format(DataType.I8_NHWC, DataType.I8_HWCN) \ + .dtype_format(DataType.I8_HWCN, DataType.I8_Default) \ + .dtype_format(DataType.I8_HWCN, DataType.I8_NHWC) \ + .dtype_format(DataType.I16_Default, DataType.I16_NHWC) \ + .dtype_format(DataType.I16_Default, DataType.I16_HWCN) \ + .dtype_format(DataType.I16_NHWC, DataType.I16_Default) \ + .dtype_format(DataType.I16_NHWC, DataType.I16_HWCN) \ + 
.dtype_format(DataType.I16_HWCN, DataType.I16_Default) \ + .dtype_format(DataType.I16_HWCN, DataType.I16_NHWC) \ + .dtype_format(DataType.I32_Default, DataType.I32_NHWC) \ + .dtype_format(DataType.I32_Default, DataType.I32_HWCN) \ + .dtype_format(DataType.I32_NHWC, DataType.I32_Default) \ + .dtype_format(DataType.I32_NHWC, DataType.I32_HWCN) \ + .dtype_format(DataType.I32_HWCN, DataType.I32_Default) \ + .dtype_format(DataType.I32_HWCN, DataType.I32_NHWC) \ + .dtype_format(DataType.I64_Default, DataType.I64_NHWC) \ + .dtype_format(DataType.I64_Default, DataType.I64_HWCN) \ + .dtype_format(DataType.I64_NHWC, DataType.I64_Default) \ + .dtype_format(DataType.I64_NHWC, DataType.I64_HWCN) \ + .dtype_format(DataType.I64_HWCN, DataType.I64_Default) \ + .dtype_format(DataType.I64_HWCN, DataType.I64_NHWC) \ + .dtype_format(DataType.U8_Default, DataType.U8_NHWC) \ + .dtype_format(DataType.U8_Default, DataType.U8_HWCN) \ + .dtype_format(DataType.U8_NHWC, DataType.U8_Default) \ + .dtype_format(DataType.U8_NHWC, DataType.U8_HWCN) \ + .dtype_format(DataType.U8_HWCN, DataType.U8_Default) \ + .dtype_format(DataType.U8_HWCN, DataType.U8_NHWC) \ + .dtype_format(DataType.U16_Default, DataType.U16_NHWC) \ + .dtype_format(DataType.U16_Default, DataType.U16_HWCN) \ + .dtype_format(DataType.U16_NHWC, DataType.U16_Default) \ + .dtype_format(DataType.U16_NHWC, DataType.U16_HWCN) \ + .dtype_format(DataType.U16_HWCN, DataType.U16_Default) \ + .dtype_format(DataType.U16_HWCN, DataType.U16_NHWC) \ + .dtype_format(DataType.U32_Default, DataType.U32_NHWC) \ + .dtype_format(DataType.U32_Default, DataType.U32_HWCN) \ + .dtype_format(DataType.U32_NHWC, DataType.U32_Default) \ + .dtype_format(DataType.U32_NHWC, DataType.U32_HWCN) \ + .dtype_format(DataType.U32_HWCN, DataType.U32_Default) \ + .dtype_format(DataType.U32_HWCN, DataType.U32_NHWC) \ + .dtype_format(DataType.U64_Default, DataType.U64_NHWC) \ + .dtype_format(DataType.U64_Default, DataType.U64_HWCN) \ + .dtype_format(DataType.U64_NHWC, 
DataType.U64_Default) \ + .dtype_format(DataType.U64_NHWC, DataType.U64_HWCN) \ + .dtype_format(DataType.U64_HWCN, DataType.U64_Default) \ + .dtype_format(DataType.U64_HWCN, DataType.U64_NHWC) \ + .dtype_format(DataType.I32_FracNZ, DataType.I32_Default) \ + .dtype_format(DataType.F16_NDHWC, DataType.F16_5HD) \ + .dtype_format(DataType.F16_5HD, DataType.F16_NDHWC) \ + .dtype_format(DataType.I8_HWCN, DataType.I8_C1HWNCoC0) \ + .dtype_format(DataType.F16_HWCN, DataType.F16_FracZ) \ + .dtype_format(DataType.F16_FracZ, DataType.F16_HWCN) \ + .dtype_format(DataType.F16_HWCN, DataType.F16_FracNZ) \ + .dtype_format(DataType.F32_HWCN, DataType.F16_FracNZ) \ .get_op_info() diff --git a/mindspore/ops/_op_impl/tbe/unsorted_segment_sum.py b/mindspore/ops/_op_impl/tbe/unsorted_segment_sum.py index 5dc07dd59f..b1f81b72b0 100644 --- a/mindspore/ops/_op_impl/tbe/unsorted_segment_sum.py +++ b/mindspore/ops/_op_impl/tbe/unsorted_segment_sum.py @@ -27,6 +27,7 @@ unsorted_segment_sum_op_info = TBERegOp("UnsortedSegmentSum") \ .input(0, "x", False, "required", "all") \ .input(1, "segment_ids", False, "required", "all") \ .output(0, "y", False, "required", "all") \ + .op_pattern("dynamicFormat") \ .dtype_format(DataType.I8_Default, DataType.I32_Default, DataType.I8_Default) \ .dtype_format(DataType.I8_5HD, DataType.I32_5HD, DataType.I8_5HD) \ .dtype_format(DataType.U8_Default, DataType.I32_Default, DataType.U8_Default) \ diff --git a/mindspore/ops/_op_impl/tbe/zeros_like.py b/mindspore/ops/_op_impl/tbe/zeros_like.py index 144b0c95cb..7e15a19996 100644 --- a/mindspore/ops/_op_impl/tbe/zeros_like.py +++ b/mindspore/ops/_op_impl/tbe/zeros_like.py @@ -25,18 +25,13 @@ zeros_like_op_info = TBERegOp("ZerosLike") \ .partial_flag(True) \ .input(0, "x", False, "required", "all") \ .output(0, "y", False, "required", "all") \ - .dtype_format(DataType.BOOL_Default, DataType.BOOL_Default) \ - .dtype_format(DataType.BOOL_5HD, DataType.BOOL_5HD) \ - .dtype_format(DataType.I8_Default, 
DataType.I8_Default) \ - .dtype_format(DataType.I8_5HD, DataType.I8_5HD) \ - .dtype_format(DataType.U8_Default, DataType.U8_Default) \ - .dtype_format(DataType.U8_5HD, DataType.U8_5HD) \ - .dtype_format(DataType.I32_Default, DataType.I32_Default) \ - .dtype_format(DataType.I32_5HD, DataType.I32_5HD) \ - .dtype_format(DataType.F16_Default, DataType.F16_Default) \ - .dtype_format(DataType.F16_5HD, DataType.F16_5HD) \ - .dtype_format(DataType.F32_Default, DataType.F32_Default) \ - .dtype_format(DataType.F32_5HD, DataType.F32_5HD) \ + .op_pattern("formatAgnostic") \ + .dtype_format(DataType.BOOL_None, DataType.BOOL_None) \ + .dtype_format(DataType.I8_None, DataType.I8_None) \ + .dtype_format(DataType.U8_None, DataType.U8_None) \ + .dtype_format(DataType.I32_None, DataType.I32_None) \ + .dtype_format(DataType.F16_None, DataType.F16_None) \ + .dtype_format(DataType.F32_None, DataType.F32_None) \ .get_op_info() diff --git a/mindspore/ops/_selected_grad_ops.py b/mindspore/ops/_selected_grad_ops.py new file mode 100644 index 0000000000..5da1d53abf --- /dev/null +++ b/mindspore/ops/_selected_grad_ops.py @@ -0,0 +1,50 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +""" resolved grad ops """ +from mindspore.ops.op_selector import new_ops_selector + +op_selector = new_ops_selector( + "mindspore.ops.operations._grad_ops", "mindspore.nn.graph_kernels") + + +@op_selector +class MaximumGrad: + def __call__(self, *args): + pass + + +@op_selector +class MinimumGrad: + def __call__(self, *args): + pass + + +@op_selector +class AbsGrad: + def __call__(self, *args): + pass + + +@op_selector +class BiasAddGrad: + def __call__(self, *args): + pass + + +@op_selector +class TanhGrad: + def __call__(self, *args): + pass diff --git a/mindspore/ops/_selected_ops.py b/mindspore/ops/_selected_ops.py new file mode 100644 index 0000000000..5e125025c9 --- /dev/null +++ b/mindspore/ops/_selected_ops.py @@ -0,0 +1,108 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +""" resolve ops """ +from mindspore.ops.op_selector import new_ops_selector + +op_selector = new_ops_selector( + "mindspore.ops.operations", "mindspore.nn.graph_kernels") +opt_selector = new_ops_selector( + "mindspore.nn.optim", "mindspore.nn.graph_kernels") +nn_selector = new_ops_selector( + "mindspore.nn", "mindspore.nn.graph_kernels") + + +@nn_selector +class BatchNorm2d: + def __call__(self, *args): + pass + + +@op_selector +class ReLU: + def __call__(self, *args): + pass + + +@op_selector +class ReduceMean: + def __call__(self, *args): + pass + + +@op_selector +class BiasAdd: + def __call__(self, *args): + pass + + +@op_selector +class FusedBatchNorm: + def __call__(self, *args): + pass + + +@op_selector +class ApplyMomentum: + def __call__(self, *args): + pass + + +@op_selector +class SoftmaxCrossEntropyWithLogits: + def __call__(self, *args): + pass + + +@op_selector +class LogSoftmax: + def __call__(self, *args): + pass + + +@op_selector +class Tanh: + def __call__(self, *args): + pass + + +@op_selector +class Gelu: + def __call__(self, *args): + pass + + +@op_selector +class LayerNorm: + def __call__(self, *args): + pass + + +@op_selector +class Softmax: + def __call__(self, *args): + pass + + +@op_selector +class LambUpdateWithLR: + def __call__(self, *args): + pass + + +@op_selector +class LambNextMV: + def __call__(self, *args): + pass diff --git a/mindspore/ops/composite/__init__.py b/mindspore/ops/composite/__init__.py index e4c6e35d3a..a531503d94 100644 --- a/mindspore/ops/composite/__init__.py +++ b/mindspore/ops/composite/__init__.py @@ -20,7 +20,7 @@ Pre-defined combination of operators. 
""" -from .base import GradOperation, HyperMap, MultitypeFuncGraph, add_flags, \ +from .base import GradOperation, HyperMap, Map, MultitypeFuncGraph, add_flags, \ grad, grad_all, grad_all_with_sens, grad_by_list, grad_by_list_with_sens, grad_with_sens, \ core, env_get, tail, zip_operation from .clip_ops import clip_by_value diff --git a/mindspore/ops/composite/base.py b/mindspore/ops/composite/base.py index 4306e0c8cb..e283867684 100644 --- a/mindspore/ops/composite/base.py +++ b/mindspore/ops/composite/base.py @@ -18,15 +18,16 @@ """Basic composite operations.""" from functools import partial -from ..._c_expression import EnvInstance_, GradOperation_, HyperMap_, MultitypeFuncGraph_, Tail_, TensorSlice_, \ +from mindspore import context +from ..._c_expression import EnvInstance_, GradOperation_, HyperMap_, Map_, MultitypeFuncGraph_, Tail_, \ TupleAdd_, TupleSlice_, UnpackCall_, ZipOperation_, ListAppend_, TupleGetItemTensor_ from ...common import dtype as mstype -from ...common.api import ms_function +from ...common.api import ms_function, _pynative_exec, _wrap_func from .. import functional as F -from .. 
import operations as P from ...common.parameter import Parameter -__all__ = [EnvInstance_, TensorSlice_, TupleAdd_, TupleSlice_, UnpackCall_, TupleGetItemTensor_] + +__all__ = [EnvInstance_, TupleAdd_, TupleSlice_, UnpackCall_, TupleGetItemTensor_] def add_flags(fn, **flags): @@ -105,14 +106,35 @@ class GradOperation(GradOperation_): GradOperation_.__init__(self, name, get_all, get_by_list, sens_param) self.grad_fn = None self.fn = None + self.need_forward = False def __call__(self, fn, weights=None): grad_ = GradOperation('grad', self.get_all, self.get_by_list, self.sens_param) if self.grad_fn is None or self.fn != fn: if self.get_by_list: - @ms_function(obj=fn) - def after_grad(*args): - return grad_(fn, weights)(*args) + if context.get_context("mode") == context.GRAPH_MODE: + @ms_function(obj=fn) + def after_grad(*args): + return grad_(fn, weights)(*args) + else: + @_wrap_func + def after_grad(*args): + if fn.is_run and not fn.requires_grad: + raise ValueError("obj must set_grad.") + if not fn.is_run: + self.need_forward = True + print("already has forward run before grad by user") + if self.need_forward: + fn.set_grad() + if self.sens_param: + f_args = args[:-1] + fn(*f_args) + else: + fn(*args) + _pynative_exec.grad(grad_, fn, weights, *args) + out = _pynative_exec(*args) + _pynative_exec.clear() + return out else: @ms_function(obj=fn) def after_grad(*args): @@ -219,6 +241,45 @@ class HyperMap(HyperMap_): return func(*args_list) return tuple(map(hypermap, *args_list)) + +class Map(Map_): + """ + Map will apply the set operation on input sequences. + + Which will apply the operations of every elements of the sequence. + + Args: + ops (Union[MultitypeFuncGraph, None]): `ops` is the operation to apply. If `ops` is `None`, + the operations should be putted in the first input of the instance. + + Inputs: + - **args** (Tuple[sequence]) - If `ops` is not `None`, all the inputs should be the same length sequences, + and each row of the sequences. e.g. 
If args length is 2, and for `i` in length of each sequence + `(args[0][i], args[1][i])` will be the input of the operation. + + If `ops` is not `None`, the first input is the operation, and the other is inputs. + + Outputs: + sequence, the output will be same type and same length of sequence from input and the value of each element + is the result of operation apply each row of element. e.g. `operation(args[0][i], args[1][i])`. + """ + + def __init__(self, ops=None): + self.ops = ops + if ops: + Map_.__init__(self, ops) + else: + Map_.__init__(self) + + def __call__(self, *args): + func = self.ops + args_list = args + if self.ops is None: + func = args[0] + args_list = args[1:] + return tuple(map(func, *args_list)) + + class _ListAppend(ListAppend_): """ A metafuncgraph class that append one element to list. @@ -274,33 +335,4 @@ env_get = MultitypeFuncGraph("env_get") @env_get.register("EnvType", "Tensor") def _tensor_env_get(env, parameter): """Used to get env.""" - return F.env_getitem(env, F.ref_to_embed(parameter), F.zeros_like_tensor(parameter)) - - -_mp_cast_helper = MultitypeFuncGraph('mixed_precision_cast_helper') - - -@_mp_cast_helper.register("TypeType", "Number") -@core -def _mixed_precision_cast_helper_1(type_, x): - """if x is float cast to type.""" - # type_ is place holder - return x - - -@_mp_cast_helper.register("TypeType", "Tensor") -@core -def _mixed_precision_cast_helper_2(type_, x): - """if x is float cast to type.""" - if F.issubclass_(F.dtype(x), mstype.float_): - return P.Cast()(x, type_) - return x - -@_mp_cast_helper.register("TypeType", "Tuple") -@core -def _mixed_precision_cast_helper_3(type_, x): - """if x is a tuple""" - t = () - for item in x: - t = t + (_mp_cast_helper(type_, item),) - return t + return F.env_getitem(env, F.ref_to_embed(parameter), F.zeros_like(parameter)) diff --git a/mindspore/ops/composite/multitype_ops/_compile_utils.py b/mindspore/ops/composite/multitype_ops/_compile_utils.py index 8954470b76..906d74948a 100644 
--- a/mindspore/ops/composite/multitype_ops/_compile_utils.py +++ b/mindspore/ops/composite/multitype_ops/_compile_utils.py @@ -18,13 +18,15 @@ from . import _constexpr_utils as const_utils from ... import functional as F from ... import operations as P from ...composite import base +from ....common.tensor import Tensor from ....common import dtype as mstype +from ....common._register_for_tensor import tensor_operator_registry hyper_map = base.HyperMap() pack = P.Pack(axis=-1) -def broadcast(broadcast_shape, x): +def _broadcast(broadcast_shape, x): """Broadcast tensor to the required shape.""" if F.shape(x) == broadcast_shape: return x @@ -34,13 +36,13 @@ def broadcast(broadcast_shape, x): return x -def transform_indexing_tensor(broadcast_shape, final_shape, new_shape, x): +def _transform_indexing_tensor(broadcast_shape, final_shape, new_shape, x): """Transform indexing tensor to the required.""" - x = broadcast(broadcast_shape, x) - return broadcast(final_shape, F.reshape(x, new_shape)) + x = _broadcast(broadcast_shape, x) + return _broadcast(final_shape, F.reshape(x, new_shape)) -def generate_indices_from_tuple_of_tensor(data, tuple_index, op_name): +def _generate_indices_from_tuple_of_tensor(data, tuple_index, op_name): """Generate an indices tensor from a tuple of tensor.""" indices = None check_index_tensor_number = const_utils.check_number_of_index_tensor(F.shape(data), len(tuple_index), op_name) @@ -50,26 +52,31 @@ def generate_indices_from_tuple_of_tensor(data, tuple_index, op_name): if check_dtypes: shape_tuple = hyper_map(F.shape, tuple_index) broadcast_shape = const_utils.generate_broadcast_shape(shape_tuple, op_name) - broadcast_tensors = hyper_map(F.partial(broadcast, broadcast_shape), tuple_index) + broadcast_tensors = hyper_map(F.partial(_broadcast, broadcast_shape), tuple_index) indices = pack(broadcast_tensors) return indices -def generate_indices_from_tuple_of_mixed_tensors(data, tuple_index, op_name): +def 
_generate_indices_from_tuple_of_mixed_tensors(data, tuple_index, op_name): """Generate an indices tensor from a tuple that contains slice, int, ellipsis, tensor.""" indexes_types = hyper_map(F.typeof, tuple_index) int_positions = const_utils.get_pos_of_int_index(indexes_types) - for i in int_positions: - tuple_index = F.tuple_setitem(tuple_index, i, F.scalar_to_tensor(tuple_index[i], mstype.int32)) - indexes_types = hyper_map(F.typeof, tuple_index) + tuple_index_new = () + tuple_len = len(tuple_index) + for i in range(tuple_len): + if i in int_positions: + tuple_index_new = tuple_index_new + (F.scalar_to_tensor(tuple_index[i], mstype.int32),) + else: + tuple_index_new = tuple_index_new + (tuple_index[i],) + indexes_types = hyper_map(F.typeof, tuple_index_new) tensor_positions, slice_positions, ellipsis_position = \ const_utils.separate_mixed_tensors_index(indexes_types, op_name) tensor_indexes = [] slice_indexes = [] for i in tensor_positions: - tensor_indexes.append(tuple_index[i]) + tensor_indexes.append(tuple_index_new[i]) for j in slice_positions: - slice_indexes.append(tuple_index[j]) + slice_indexes.append(tuple_index_new[j]) data_shape = F.shape(data) tensor_indexes_shapes = hyper_map(F.shape, tensor_indexes) tensor_indexes_dtypes = hyper_map(F.dtype, tensor_indexes) @@ -83,14 +90,14 @@ def generate_indices_from_tuple_of_mixed_tensors(data, tuple_index, op_name): slice_number = 0 final_index_tensors = [] - tuple_index_size = len(tuple_index) + tuple_index_size = len(tuple_index_new) index_tensor_new_shape = const_utils.compute_new_shape(broadcast_shape, indexes_shapes_info) for i in range(tuple_index_size): if i in tensor_positions: - transform_tensor = transform_indexing_tensor(broadcast_shape, - final_shape, - index_tensor_new_shape, - tuple_index[i]) + transform_tensor = _transform_indexing_tensor(broadcast_shape, + final_shape, + index_tensor_new_shape, + tuple_index_new[i]) final_index_tensors.append(transform_tensor) if i in slice_positions: 
slice_tensor = const_utils.convert_slice_to_tensor(slice_number, @@ -112,7 +119,7 @@ def generate_indices_from_tuple_of_mixed_tensors(data, tuple_index, op_name): return indices -def generate_updates_from_scalar(data, indices, value, op_type): +def _generate_updates_from_scalar(data, indices, value, op_type): """Generate an updates tensor from a scalar.""" data_shape = F.shape(data) indices_shape = F.shape(indices) @@ -120,7 +127,7 @@ def generate_updates_from_scalar(data, indices, value, op_type): return const_utils.convert_scalar_to_tensor(data_shape, data_dtype, indices_shape, value, op_type) -def generate_updates_from_tuple(data, index, value, op_type): +def _generate_updates_from_tuple(data, index, value, op_type): """Generate an updates tensor from a tuple.""" value_types = hyper_map(F.typeof, value) data_dtype = F.dtype(data) @@ -130,14 +137,14 @@ def generate_updates_from_tuple(data, index, value, op_type): shapes_same = const_utils.check_shapes_same(value_shapes, const_utils.TENSOR_SETITEM) if shapes_same: value = F.pack(value) - return generate_updates_from_tensor(data, index, value, op_type) + return _generate_updates_from_tensor(data, index, value, op_type) data_shape = F.shape(data) index_shape = F.shape(index) return const_utils.convert_tuple_of_scalar_to_tensor(data_shape, data_dtype, index_shape, value, op_type) -def generate_updates_from_tensor(data, index, value, op_type): +def _generate_updates_from_tensor(data, index, value, op_type): """Generate an updates tensor from a tensor.""" data_shape = F.shape(data) index_shape = F.shape(index) @@ -150,5 +157,410 @@ def generate_updates_from_tensor(data, index, value, op_type): updates_shape = const_utils.generate_updates_shape(data_shape, index_shape, op_type) need_broadcast = const_utils.check_two_shapes_need_broadcast(updates_shape, value_shape) if need_broadcast: - return broadcast(updates_shape, value) + return _broadcast(updates_shape, value) return value + + +def _tensor_getitem(self, index): + 
"""Handle tensor getitem""" + if isinstance(index, Tensor): + return tensor_index_by_tensor(self, index) + if isinstance(index, tuple): + return tensor_index_by_tuple(self, index) + if isinstance(index, int): + return _tensor_index_by_integer(self, index) + if isinstance(index, slice): + return tensor_index_by_slice(self, index) + if isinstance(index, bool): + return _tensor_index_by_bool(self, index) + if index is None: + return F.expand_dims(self, 0) + if index is ...: + return self + raise IndexError(f"Only support integers, slices(`:`), ellipsis(`...`), None, bool and tensor with int32, " + f"got {index} with type {type(index)}.") + + +tensor_operator_registry.register("__getitem__", _tensor_getitem) + + +def _tensor_getitem_by_tuple_of_tensor(data, tuple_index): + """Tensor getitem by a tuple of tensor.""" + indices = _generate_indices_from_tuple_of_tensor(data, + tuple_index, + const_utils.TENSOR_GETITEM) + result = F.gather_nd(data, indices) + return result + + +def _tensor_getitem_by_tuple_of_mixed_tensors(data, tuple_index): + """Tensor getitem by a tuple of mixed tensor.""" + indices = _generate_indices_from_tuple_of_mixed_tensors(data, + tuple_index, + const_utils.TENSOR_GETITEM) + result = F.gather_nd(data, indices) + return result + + +def tensor_index_by_slice(data, slice_index): + """Tensor getitem by a single slice""" + shape = F.shape(data) + if not shape: + const_utils.raise_index_error("When tensor is indexed by a slice, the dimension of the tensor cannot be 0.") + begin_strides, end_strides, step_strides = const_utils.get_stride_info_from_slice(shape, slice_index) + return F.strided_slice(data, begin_strides, end_strides, step_strides) + + +def _tensor_index_by_integer(data, number): + """Tensor getitem by a single integer number""" + shape = F.shape(data) + if not shape: + const_utils.raise_index_error("When tensor is indexed by an integer, the dimension of the tensor cannot be 0.") + begin_strides, end_strides, step_strides = 
const_utils.get_stride_info_from_integer(shape, number) + shrink_axis_mask = 1 + return P.StridedSlice(0, 0, 0, 0, shrink_axis_mask)(data, begin_strides, end_strides, step_strides) + + +def _tensor_index_by_bool(data, bool_value): + """Tensor getitem by a single bool value""" + if bool_value: + return F.expand_dims(data, 0) + return const_utils.raise_index_error("When tensor is indexed by a bool object, the value only support 'True'.") + + +def tensor_index_by_number(data, number): + """Tensor getitem by a Number which may be integer/float/bool value""" + number_type = const_utils.check_number_index_type(number) + if number_type == const_utils.BOOL_: + return _tensor_index_by_bool(data, number) + if number_type == const_utils.INT_: + return _tensor_index_by_integer(data, number) + return const_utils.raise_index_error("Only support integers, slices(`:`), ellipsis(`...`), None and bool.") + + +def tensor_index_by_tensor(data, tensor_index): + """Tensor getitem by a single tensor""" + dtype_valid = const_utils.check_index_tensor_dtype(F.dtype(tensor_index), + const_utils.TENSOR_GETITEM) + if dtype_valid: + return F.gather(data, tensor_index, 0) + return const_utils.raise_index_error("For 'tensor getitem', " + "the index tensor data type only support mstype.int32.") + + +def _tensor_index_by_tuple_slice(data, t): + """Tensor getitem by a tuple of slice""" + shape = F.shape(data) + if len(t) > len(shape): + const_utils.raise_index_error("When tensor is indexed by a tuple, " + "the length of the tuple cannot be greater than the dimension of the tensor.") + begin_strides, end_strides, step_strides, shrink_axis_mask = \ + const_utils.get_stride_info_from_tuple(shape, t) + return P.StridedSlice(0, 0, 0, 0, shrink_axis_mask)(data, begin_strides, end_strides, step_strides) + + +def tensor_index_by_tuple(data, tuple_index): + """Tensor getitem by tuple of various types""" + indexes_types = hyper_map(F.typeof, tuple_index) + index_elements_type = 
const_utils.tuple_index_elements_type(indexes_types, const_utils.TENSOR_GETITEM) + if index_elements_type == const_utils.NO_TENSOR: + return _tensor_index_by_tuple_slice(data, tuple_index) + if index_elements_type == const_utils.ALL_TENSOR: + return _tensor_getitem_by_tuple_of_tensor(data, tuple_index) + return _tensor_getitem_by_tuple_of_mixed_tensors(data, tuple_index) + + +def _tensor_setitem(self, index, value): + """Handle tensor getitem""" + if isinstance(index, Tensor): + if isinstance(value, (int, float, bool)): + return tensor_setitem_by_tensor_with_number(self, index, value) + if isinstance(value, Tensor): + return tensor_setitem_by_tensor_with_tensor(self, index, value) + if isinstance(value, tuple): + return tensor_setitem_by_tensor_with_tuple(self, index, value) + if isinstance(index, tuple): + if isinstance(value, (int, float, bool)): + return tensor_setitem_by_tuple_with_number(self, index, value) + if isinstance(value, Tensor): + return tensor_setitem_by_tuple_with_tensor(self, index, value) + if isinstance(value, tuple): + return tensor_setitem_by_tuple_with_tuple(self, index, value) + if isinstance(index, int): + if isinstance(value, (int, float, bool)): + return tensor_setitem_by_number_with_number(self, index, value) + if isinstance(value, Tensor): + return tensor_setitem_by_number_with_tensor(self, index, value) + if isinstance(index, slice): + if isinstance(value, (int, float, bool)): + return tensor_setitem_by_slice_with_number(self, index, value) + if isinstance(value, Tensor): + return tensor_setitem_by_slice_with_tensor(self, index, value) + if isinstance(index, bool): + return _tensor_index_by_bool(self, index) + if index is ...: + if isinstance(value, (int, float, bool)): + return tensor_setitem_by_ellipsis_with_number(self, index, value) + if isinstance(value, Tensor): + return tensor_setitem_by_ellipsis_with_tensor(self, index, value) + raise IndexError("Tensor setitem index only support integers, slices(`:`), ellipsis(`...`), None, 
bool\ + and tensor with int32, got {} with type{}".format(index, type(index))) + + +tensor_operator_registry.register("__setitem__", _tensor_setitem) + + +def _tensor_setitem_by_int_tensor_with_tensor(data, index, value): + """Set a tensor item by a int tensor with a tensor.""" + updates = _generate_updates_from_tensor(data, index, value, + const_utils.SET_ITEM_BY_ONE_TENSOR) + index = F.expand_dims(index, -1) + return P.TensorScatterUpdate()(data, index, updates) + + +def _tensor_setitem_by_bool_tensor_with_tensor(data, index, value): + """Set a tensor item by a bool tensor with a tensor.""" + index_shape = F.shape(index) + data_shape = F.shape(data) + data_shape = const_utils.check_equal(data_shape, index_shape, + "The tensor(shape={}) and tensor index(shape={}) should be the same shape.") + size = F.size(value) + size = const_utils.check_equal(1, size, + "When assign value is a tensor, its size should be {}, but current size is {}.") + dtype = F.dtype(data) + u_cast = F.cast(value, dtype) + one_data = F.ones_like(data) + u = F.tensor_mul(one_data, u_cast) + result = F.select(index, u, data) + return result + + +def tensor_setitem_by_tensor_with_tensor(data, index, value_tensor): + """setitem by tensor index(dtype is int or bool) with tensor as value""" + index_dtype = F.dtype(index) + tensor_dtype = const_utils.get_index_tensor_dtype(index_dtype) + if tensor_dtype == const_utils.INT_: + return _tensor_setitem_by_int_tensor_with_tensor(data, index, value_tensor) + return _tensor_setitem_by_bool_tensor_with_tensor(data, index, value_tensor) + + +def _tensor_setitem_by_bool_tensor_with_scalar(data, index, value): + """Set a tensor item by a bool tensor with a scalar.""" + index_shape = F.shape(index) + shape = F.shape(data) + shape = const_utils.check_equal( + shape, index_shape, "The tensor(shape={}) and tensor index(shape={}) should be the same shape.") + dtype = F.dtype(data) + u = F.fill(dtype, shape, value) + return F.select(index, u, data) + + +def 
_tensor_setitem_by_int_tensor_with_scalar(data, index, value): + """Set a tensor item by a int tensor with a scalar.""" + updates = _generate_updates_from_scalar(data, index, value, + const_utils.SET_ITEM_BY_ONE_TENSOR) + index = F.expand_dims(index, -1) + return P.TensorScatterUpdate()(data, index, updates) + + +def tensor_setitem_by_tensor_with_number(data, index, value): + index_dtype = F.dtype(index) + tensor_dtype = const_utils.get_index_tensor_dtype(index_dtype) + if tensor_dtype == const_utils.BOOL_: + return _tensor_setitem_by_bool_tensor_with_scalar(data, index, value) + if tensor_dtype == const_utils.INT_: + return _tensor_setitem_by_int_tensor_with_scalar(data, index, value) + return const_utils.raise_index_error("For tensor setitem, indexing tensor dtype only supports bool/int") + + +def tensor_setitem_by_tensor_with_tuple(data, index, value): + """Assigns the tensor by tensor with tuple value.""" + index_dtype = F.dtype(index) + check_dtype = const_utils.check_index_tensor_dtype(index_dtype, const_utils.TENSOR_SETITEM) + result = None + if check_dtype: + result = _tensor_setitem_by_tensor_with_tuple(data, index, value) + return result + + +def _tensor_indices_number(data, data_shape, index, indices, value): + """Assigns a scalar value to the tensor.""" + data_size = F.size(data) + data_dtype = F.dtype(data) + indices_size = F.size(indices) + indices_size = const_utils.check_indices(indices_size, index) + update = F.fill(mstype.int32, (indices_size,), 1) + condition_1d = F.scatter_nd(indices, update, (data_size,)) + condition = F.reshape(condition_1d, data_shape) + condition = F.cast(condition, mstype.bool_) + value_fill = F.fill(data_dtype, (indices_size,), value) + value_1d = F.scatter_nd(indices, value_fill, (data_size,)) + u = F.reshape(value_1d, data_shape) + return F.select(condition, u, data) + + +def _tensor_setitem_by_tensor_with_tuple(data, index, value): + """Set a tensor item by a tensor with a tuple.""" + updates = 
_generate_updates_from_tuple(data, index, value, + const_utils.SET_ITEM_BY_ONE_TENSOR) + index = F.expand_dims(index, -1) + result = P.TensorScatterUpdate()(data, index, updates) + return result + + +def tensor_setitem_by_slice_with_number(data, input_slice, value): + """Givens a scalar assign to tensor by slice""" + check_result = const_utils.check_tensor_setitem_index(input_slice) + result = None + if check_result: + data_shape = F.shape(data) + indices = const_utils.slice2indices(input_slice, data_shape) + is_tuple_int = const_utils.tuple_element_is_int(input_slice) + if is_tuple_int: + indices = const_utils.integer_to_indices(input_slice, data_shape) + result = _tensor_indices_number(data, data_shape, input_slice, indices, value) + return result + + +def tensor_setitem_by_tuple_with_number(data, tuple_index, value): + """Assigns the tensor by tuple with number value.""" + indexes_types = hyper_map(F.typeof, tuple_index) + index_elements_type = const_utils.tuple_index_elements_type(indexes_types, const_utils.TENSOR_SETITEM) + + if index_elements_type == const_utils.NO_TENSOR: + return tensor_setitem_by_slice_with_number(data, tuple_index, value) + if index_elements_type == const_utils.ALL_TENSOR: + indices = _generate_indices_from_tuple_of_tensor(data, + tuple_index, + const_utils.TENSOR_SETITEM) + else: + indices = _generate_indices_from_tuple_of_mixed_tensors(data, + tuple_index, + const_utils.TENSOR_SETITEM) + updates = _generate_updates_from_scalar(data, + indices, + value, + const_utils.SET_ITEM_BY_TUPLE_OF_TENSOR) + return P.TensorScatterUpdate()(data, indices, updates) + + +def _tensor_indices_tensor(data, data_shape, index, indices, value): + """Assigns a tensor value to the tensor.""" + data_size = F.size(data) + data_dtype = F.dtype(data) + indices_size = F.size(indices) + indices_size = const_utils.check_indices(indices_size, index) + update = F.fill(mstype.int32, (indices_size,), 1) + condition_1d = F.scatter_nd(indices, update, (data_size,)) + 
condition = F.reshape(condition_1d, data_shape) + condition = F.cast(condition, mstype.bool_) + value_fill = None + value_size = F.size(value) + + value_size = const_utils.check_indices_value_size(indices_size, value_size) + if value_size == 1: + value_fill = F.fill(data_dtype, (indices_size,), 1) + value = F.cast(value, data_dtype) + value_fill = F.tensor_mul(value_fill, value) + elif value_size > 1: + value_fill = F.reshape(value, (indices_size,)) + value_1d = F.scatter_nd(indices, value_fill, (data_size,)) + u = F.reshape(value_1d, data_shape) + return F.select(condition, u, data) + + +def tensor_setitem_by_slice_with_tensor(data, input_slice, value): + """Assigns a tensor value to the tensor by slice.""" + result = None + check_result = const_utils.check_tensor_setitem_index(input_slice) + if check_result: + data_shape = F.shape(data) + indices = const_utils.slice2indices(input_slice, data_shape) + is_tuple_int = const_utils.tuple_element_is_int(input_slice) + if is_tuple_int: + indices = const_utils.integer_to_indices(input_slice, data_shape) + result = _tensor_indices_tensor(data, data_shape, input_slice, indices, value) + return result + + +def tensor_setitem_by_tuple_with_tensor(data, tuple_index, value): + """Assigns the tensor by tuple with tensor value.""" + indexes_types = hyper_map(F.typeof, tuple_index) + index_elements_type = const_utils.tuple_index_elements_type(indexes_types, const_utils.TENSOR_SETITEM) + + if index_elements_type == const_utils.NO_TENSOR: + return tensor_setitem_by_slice_with_tensor(data, tuple_index, value) + if index_elements_type == const_utils.ALL_TENSOR: + indices = _generate_indices_from_tuple_of_tensor(data, + tuple_index, + const_utils.TENSOR_SETITEM) + else: + indices = _generate_indices_from_tuple_of_mixed_tensors(data, + tuple_index, + const_utils.TENSOR_SETITEM) + updates = _generate_updates_from_tensor(data, + indices, + value, + const_utils.SET_ITEM_BY_TUPLE_OF_TENSOR) + return P.TensorScatterUpdate()(data, indices, 
updates) + + +def tensor_setitem_by_tuple_with_tuple(data, tuple_index, value): + """Assigns the tensor by tuple with tuple of value.""" + indexes_types = hyper_map(F.typeof, tuple_index) + index_elements_type = const_utils.tuple_index_elements_type(indexes_types, const_utils.TENSOR_SETITEM) + + if index_elements_type == const_utils.ALL_TENSOR: + indices = _generate_indices_from_tuple_of_tensor(data, + tuple_index, + const_utils.TENSOR_SETITEM) + else: + indices = _generate_indices_from_tuple_of_mixed_tensors(data, + tuple_index, + const_utils.TENSOR_SETITEM) + updates = _generate_updates_from_tuple(data, + indices, + value, + const_utils.SET_ITEM_BY_TUPLE_OF_TENSOR) + return P.TensorScatterUpdate()(data, indices, updates) + + +def tensor_setitem_by_number_with_number(data, index, value): + """Assigns the tensor by number with number value.""" + data_shape = F.shape(data) + indices = const_utils.integer_to_indices(index, data_shape) + return _tensor_indices_number(data, data_shape, index, indices, value) + + +def tensor_setitem_by_number_with_tensor(data, index, value): + """Assigns the tensor by number with tensor value.""" + data_shape = F.shape(data) + indices = const_utils.integer_to_indices(index, data_shape) + return _tensor_indices_tensor(data, data_shape, index, indices, value) + + +def tensor_setitem_by_ellipsis_with_number(data, index, value): + """Assigns the tensor by ellipsis with number value.""" + data_shape = F.shape(data) + data_dtype = F.dtype(data) + return F.fill(data_dtype, data_shape, value) + + +def tensor_setitem_by_ellipsis_with_tensor(data, index, value): + """Assigns the tensor by ellipsis with tensor value.""" + result = None + data_shape = F.shape(data) + data_dtype = F.dtype(data) + data_size = F.size(data) + value_shape = F.shape(value) + value_size = F.size(value) + check_result = const_utils.check_ellipsis_shape_size(data_shape, value_shape, data_size, value_size) + if check_result: + if data_size == value_size: + result = 
F.reshape(value, data_shape) + result = F.cast(result, data_dtype) + elif value_size == 1: + param1 = F.fill(data_dtype, data_shape, 1) + param2 = F.cast(value, data_dtype) + result = F.tensor_mul(param1, param2) + return result diff --git a/mindspore/ops/composite/multitype_ops/_constexpr_utils.py b/mindspore/ops/composite/multitype_ops/_constexpr_utils.py index e4d42aed03..02756ffe56 100644 --- a/mindspore/ops/composite/multitype_ops/_constexpr_utils.py +++ b/mindspore/ops/composite/multitype_ops/_constexpr_utils.py @@ -20,7 +20,6 @@ import numpy as np from ...primitive import constexpr from .... import log as logger -from ...._extends.utils import Slice, Ellipsis_ from ....common import dtype as mstype from ....common.tensor import Tensor from ....ops import _utils as op_utils @@ -41,6 +40,11 @@ SET_ITEM_BY_ONE_TENSOR = 0 SET_ITEM_BY_TUPLE_OF_TENSOR = 1 +@constexpr +def raise_index_error(msg): + raise IndexError(msg) + + @constexpr def check_equal(param1, param2, msg="{},{}"): """Checks whether the two parameters are equal or not.""" @@ -54,7 +58,8 @@ def check_ellipsis_shape_size(data_shape, value_shape, data_size, value_size): """Checks the shape and size of the sensor and value.""" if data_shape == value_shape or data_size == value_size or value_size == 1: return True - raise ValueError("The value(shape={}), can not assign to tensor(shape={}).".format(value_shape, data_shape)) + raise ValueError("The value(shape={}), can not assign to tensor(shape={}).".format( + value_shape, data_shape)) @constexpr @@ -63,16 +68,18 @@ def check_tensor_setitem_index(index, element_type=None): if index is None: raise IndexError("Tensor's index cannot be None.") # eg. Tensor[Slice] = u - if isinstance(index, Slice): + if isinstance(index, slice): return True # eg. Tensor[tuple] = u if isinstance(index, tuple): if not index: raise IndexError("Tensor's index cannot be empty.") - # eg. 
Tensor[tuple(Slice...)] = u - if isinstance(index[0], (Slice, Ellipsis_, int)): - return True - raise IndexError("Index of type '{}' is not supported yet.".format(type(index[0]))) + # eg. Tensor[tuple(Slice,...)] = u + for item in index: + if not isinstance(item, (slice, type(...), int)): + raise IndexError( + "Index of type '{}' is not supported yet.".format(type(item))) + return True # eg. Tensor[Tensor[dtype=bool]] = u if isinstance(index, mstype.tensor_type): if element_type is None or element_type != mstype.bool_: @@ -81,7 +88,8 @@ def check_tensor_setitem_index(index, element_type=None): "{} type is not supported yet.".format(element_type)) return True - raise IndexError("Index of type '{}' is not supported yet.".format(type(index))) + raise IndexError( + "Index of type '{}' is not supported yet.".format(type(index))) @constexpr @@ -116,12 +124,12 @@ def slice_expand(input_slices, shape): index = 0 slices = None # Slice or tuple(Slice...) - if isinstance(input_slices, Slice): + if isinstance(input_slices, slice): slices = (input_slices,) - elif isinstance(input_slices, (tuple, list)) and input_slices and isinstance(input_slices[0], (Slice, Ellipsis_)): + elif isinstance(input_slices, (tuple, list)) and input_slices and isinstance(input_slices[0], (slice, type(...))): is_have_ellipsis = False for _, element in enumerate(input_slices): - if isinstance(element, Ellipsis_): + if isinstance(element, type(...)): is_have_ellipsis = True break if is_have_ellipsis: @@ -130,10 +138,9 @@ def slice_expand(input_slices, shape): slices = input_slices else: raise IndexError("Tensor's index type is not supported yet.") - for s in slices: start = 0 if (s.start is None) else s.start - stop = shape[index] if (s.end is None) else s.end + stop = shape[index] if (s.stop is None) else s.stop step = 1 if (s.step is None) else s.step begin.append(start) end.append(stop) @@ -151,11 +158,11 @@ def ellipsis2slice(input_, shape): """Converts ellipsis to slice.""" input_slice = input_ 
result = [] - if isinstance(input_, Ellipsis_): + if isinstance(input_, type(...)): input_slice = (input_,) ell_count = 0 for _, element in enumerate(input_slice): - if not isinstance(element, Ellipsis_): + if not isinstance(element, type(...)): result.append(element) continue ell_count += 1 @@ -163,7 +170,7 @@ def ellipsis2slice(input_, shape): raise IndexError("There cannot be more than one ellisis (...) in the index of the tensor, " "but it is currently {}".format(input_slice)) for _ in range(len(shape) - len(input_slice) + 1): - result.append(Slice(None, None, None)) + result.append(slice(None, None, None)) return tuple(result) @@ -196,7 +203,8 @@ def slice2indices(input_slices, shape): def check_indices(indices_size, index): """Checks indices whether is empty.""" if indices_size < 1: - raise IndexError("The tensor's index is unreasonable. index:{}".format(index)) + raise IndexError( + "The tensor's index is unreasonable. index:{}".format(index)) return indices_size @@ -230,7 +238,7 @@ def tuple_element_is_slice(indexs): raise IndexError("Tensor's index cannot be empty.") if isinstance(indexs, tuple): for _, ele in enumerate(indexs): - if not isinstance(ele, Slice): + if not isinstance(ele, slice): return False return True return False @@ -285,7 +293,8 @@ def check_value_elements(data_dtype, types): return ALL_TENSOR if scalars_number == len(types): return ALL_SCALAR - raise TypeError(f"For '{TENSOR_SETITEM}', the value does not support scalar and tensor mixing, but got {types}.") + raise TypeError( + f"For '{TENSOR_SETITEM}', the value does not support scalar and tensor mixing, but got {types}.") @constexpr @@ -295,7 +304,8 @@ def get_index_tensor_dtype(dtype): return INT_ if dtype == mstype.bool_: return BOOL_ - raise IndexError(f"For '{TENSOR_SETITEM}', the index tensor data type '{dtype}' is not supported.") + raise IndexError( + f"For '{TENSOR_SETITEM}', the index tensor data type '{dtype}' is not supported.") @constexpr @@ -313,7 +323,8 @@ def 
check_index_tensor_dtype(dtype, op_name): """Check a tensor data type.""" if dtype == mstype.int32: return True - raise IndexError(f"For '{op_name}', the index tensor data type should be mstype.int32, but got {dtype}.") + raise IndexError( + f"For '{op_name}', the index tensor data type should be mstype.int32, but got {dtype}.") @constexpr @@ -332,7 +343,8 @@ def generate_broadcast_shape(shapes, op_name): for i, shape in enumerate(shapes): logger.debug(f"Broadcasts the {i}th tensor, the shape is {shape}.") try: - broadcast_shape = op_utils.get_broadcast_shape(broadcast_shape, shape, op_name) + broadcast_shape = op_utils.get_broadcast_shape( + broadcast_shape, shape, op_name) except ValueError as ex: raise IndexError(ex) return tuple(broadcast_shape) @@ -398,7 +410,8 @@ def convert_ellipsis_to_tensors(slice_number, if isinstance(ele, tuple): shape.extend([1] * len(ele)) if array is None: - raise ValueError(f"For '{op_name}', generate tensors from ellipsis failed.") + raise ValueError( + f"For '{op_name}', generate tensors from ellipsis failed.") array = np.reshape(array, shape) reps = compute_multiples(shape, final_shape) tensor = Tensor(np.tile(array, reps)) @@ -428,7 +441,8 @@ def convert_slice_to_tensor(slice_number, final_shape, indexes_shapes_info, op_n else: shape.append(1) if array is None: - raise ValueError(f"For '{op_name}', generate tensor from 'slice' failed.") + raise ValueError( + f"For '{op_name}', generate tensor from 'slice' failed.") array = np.reshape(array, shape) reps = compute_multiples(shape, final_shape) tensor = Tensor(np.tile(array, reps)) @@ -523,14 +537,15 @@ def generate_index_info_from_tuple_of_mixed_tensors(data_shape, tensor_count += 1 elif isinstance(ele_type, mstype.slice_type): slice_obj = slice(slice_indexes[slice_count].start, - slice_indexes[slice_count].end, + slice_indexes[slice_count].stop, slice_indexes[slice_count].step) # Use list to represent slicing result. 
indexes_info[pos] = list(range(data_shape[pos]))[slice_obj] slice_count += 1 elif isinstance(ele_type, mstype.ellipsis_type): if ellipsis_num != 0: - raise IndexError(f"For '{op_name}', the index could only contain one ellipsis.") + raise IndexError( + f"For '{op_name}', the index could only contain one ellipsis.") ellipsis_occupied_dims = data_rank - indexes_size + 1 for j in range(pos, pos + ellipsis_occupied_dims): # Use list to represent slicing result. @@ -540,7 +555,8 @@ def generate_index_info_from_tuple_of_mixed_tensors(data_shape, raise IndexError(f"For '{op_name}', the index elements only support " f"'Tensor', 'int', 'Slice', 'Ellipsis', but got {ele_type}.") broadcast_shape, final_shape, indexes_shapes_info = \ - _derive_result_shape_info_from_tuple_of_mixed_tensors(indexes_info, index_tensors_info, op_name) + _derive_result_shape_info_from_tuple_of_mixed_tensors( + indexes_info, index_tensors_info, op_name) return broadcast_shape, final_shape, indexes_shapes_info, ellipsis_occupied_dims @@ -556,10 +572,12 @@ def _derive_result_shape_info_from_tuple_of_mixed_tensors(indexes_info, index_te """Derive the resulting shape information from the a tuple index of mixed tensors.""" index_tensor_info_key = list(index_tensors_info.keys()) index_tensor_info_value = list(index_tensors_info.values()) - broadcast_shape = generate_broadcast_shape(index_tensor_info_value, op_name) + broadcast_shape = generate_broadcast_shape( + index_tensor_info_value, op_name) final_shape = [] indexes_shapes_info = [] - mixed_tensors_continuous = _judge_tuple_of_mixed_tensors_continuous(index_tensor_info_key) + mixed_tensors_continuous = _judge_tuple_of_mixed_tensors_continuous( + index_tensor_info_key) if mixed_tensors_continuous: tensor_shape_dealt = False for ele in indexes_info.values(): @@ -638,3 +656,98 @@ def get_np_eps(input_dtype): nptype = mstype.dtype_to_nptype(input_dtype) eps = np.finfo(nptype).eps return float(eps) + + +@constexpr +def check_number_index_type(number): + 
"""Check if it is int or bool number""" + if isinstance(number, bool): + return BOOL_ + if isinstance(number, int): + return INT_ + raise IndexError("Only support integers, slices(`:`), ellipsis(`...`), None and bool, got {0} type is {1} " + .format(number, type(number))) + + +@constexpr +def get_stride_info_from_slice(data_shape, slice_index): + """Get stride info from a python slice""" + begin, end, step = get_slice_stride(data_shape[0], slice_index) + begin_strides = [begin] + end_strides = [end] + step_strides = [step] + for end in data_shape[1:]: + begin_strides.append(0) + end_strides.append(end) + step_strides.append(1) + return tuple(begin_strides), tuple(end_strides), tuple(step_strides) + + +@constexpr +def get_stride_info_from_integer(data_shape, number): + """Get stride info from a integer""" + begin_strides = [number] + end_strides = [number+1] + step_strides = [1] + for end in data_shape[1:]: + begin_strides.append(0) + end_strides.append(end) + step_strides.append(1) + return tuple(begin_strides), tuple(end_strides), tuple(step_strides) + + +def get_slice_stride(dim_size, index_slice): + """Get slice stride info""" + step = 1 if index_slice.step is None else index_slice.step + start_default = 0 + stop_default = dim_size + if step < 0: + start_default = -1 + stop_default = -(dim_size+1) + start = start_default if index_slice.start is None else index_slice.start + stop = stop_default if index_slice.stop is None else index_slice.stop + return start, stop, step + + +@constexpr +def get_stride_info_from_tuple(data_shape, index_tuple): + """Get stride info from a tuple""" + begin_strides = [] + end_strides = [] + step_strides = [] + index_size = len(index_tuple) + data_shape_size = len(data_shape) + shrink_axis = 0 + index_count = 0 + ellipsis_count = 0 + for idx, item in enumerate(index_tuple): + if isinstance(item, slice): + start, stop, step = get_slice_stride(data_shape[idx], item) + begin_strides.append(start) + end_strides.append(stop) + 
step_strides.append(step) + index_count = index_count + 1 + elif isinstance(item, int): + begin_strides.append(item) + end_strides.append(item + 1) + step_strides.append(1) + shrink_axis = shrink_axis + (1 << index_count) + index_count = index_count + 1 + elif item is ...: + ellipsis_count = ellipsis_count + 1 + if ellipsis_count > 1: + raise IndexError("An index can have only one ellipsis (...)") + ellipsis_range_size = data_shape_size - (index_size - 1) + begin_strides.extend([0] * (ellipsis_range_size)) + end_strides.extend( + [i for i in data_shape[index_count: index_count + (ellipsis_range_size)]]) + step_strides.extend([1] * (ellipsis_range_size)) + index_count = index_count + ellipsis_range_size + else: + raise IndexError("Not supported index data type, got ", + item, " type is ", type(item)) + for item in range(index_count, data_shape_size): + begin_strides.append(0) + end_strides.append(data_shape[item]) + step_strides.append(1) + return tuple(begin_strides), tuple(end_strides), tuple(step_strides), shrink_axis diff --git a/mindspore/ops/composite/multitype_ops/div_impl.py b/mindspore/ops/composite/multitype_ops/div_impl.py index c37fcb9c36..85a4e035c0 100644 --- a/mindspore/ops/composite/multitype_ops/div_impl.py +++ b/mindspore/ops/composite/multitype_ops/div_impl.py @@ -47,8 +47,8 @@ def _div_tensor(x, y): Two tensors divide by element. Args: - x (Tensor): x - y (Tensor): The dtype is same as x. + x (Tensor): The first input tensor. + y (Tensor): The second input tensor. Returns: Tensor, has the same dtype as x. 
diff --git a/mindspore/ops/composite/multitype_ops/floordiv_impl.py b/mindspore/ops/composite/multitype_ops/floordiv_impl.py index c1a47f881f..8e9e941309 100644 --- a/mindspore/ops/composite/multitype_ops/floordiv_impl.py +++ b/mindspore/ops/composite/multitype_ops/floordiv_impl.py @@ -34,7 +34,7 @@ def _floordiv_scalar(x, y): @floordiv.register("Tensor", "Tensor") def _floordiv_tensor(x, y): - """Returns x // y where x and y are all tensors and have save dtype.""" + """Returns x // y where x and y are all tensors.""" return F.tensor_floordiv(x, y) diff --git a/mindspore/ops/composite/multitype_ops/getitem_impl.py b/mindspore/ops/composite/multitype_ops/getitem_impl.py index 1295aba87e..ffd5ea4d62 100644 --- a/mindspore/ops/composite/multitype_ops/getitem_impl.py +++ b/mindspore/ops/composite/multitype_ops/getitem_impl.py @@ -15,7 +15,6 @@ """Implementation for getitem.""" from . import _compile_utils as compile_utils -from . import _constexpr_utils as const_utils from .. import base from ... import functional as F @@ -50,29 +49,6 @@ _tuple_slice = _TupleSlice('tuple_slice') """_tuple_slice is an metafuncgraph object which will slice a tuple.""" -class _TensorSlice(base.TensorSlice_): - """ - Slices a tensor. - - Inputs: - data (Tensor): A tensor to be sliced. - s (slice): The index to slice tuple data. - - Outputs: - Tensor, consists of some elements of data. - """ - - def __init__(self, name): - base.TensorSlice_.__init__(self, name) - - def __call__(self, *args): - pass - - -_tensor_slice = _TensorSlice('tensor_slice') -"""_tensor_slice is an metafuncgraph object which will slice a tensor.""" - - class _TupleGetItemTensor(base.TupleGetItemTensor_): """ Getting item of tuple by tensor index. @@ -182,13 +158,13 @@ def _tensor_getitem_by_number(data, number_index): Outputs: Tensor, element type is as same as the element type of data. 
""" - return _tensor_slice(data, number_index) + return compile_utils.tensor_index_by_number(data, number_index) @getitem.register("Tensor", "None") def _tensor_getitem_by_none(data, index): """ - Getting item of tensor by None. + For none indexing , expand data with one dim. Inputs: data (Tensor): A tensor. @@ -197,7 +173,7 @@ def _tensor_getitem_by_none(data, index): Outputs: Tensor, element type is as same as the element type of data. """ - return _tensor_slice(data, index) + return F.expand_dims(data, 0) @getitem.register("Tensor", "Slice") @@ -212,13 +188,13 @@ def _tensor_getitem_by_slice(data, slice_index): Outputs: Tensor, element type is same as the element type of data. """ - return _tensor_slice(data, slice_index) + return compile_utils.tensor_index_by_slice(data, slice_index) @getitem.register("Tensor", "Tensor") def _tensor_getitem_by_tensor(data, tensor_index): """ - Getting item of tensor by slice. + Getting item of tensor by tensor indice. Inputs: data (Tensor): A tensor. @@ -227,18 +203,13 @@ def _tensor_getitem_by_tensor(data, tensor_index): Outputs: Tensor, element type is same as the element type of data. """ - check_dtypes = const_utils.check_index_tensor_dtype(F.dtype(tensor_index), - const_utils.TENSOR_GETITEM) - result = None - if check_dtypes: - result = F.gather(data, tensor_index, 0) - return result + return compile_utils.tensor_index_by_tensor(data, tensor_index) @getitem.register("Tensor", "Tuple") def _tensor_getitem_by_tuple(data, tuple_index): """ - Getting item of tensor by slice tuple. + Getting item of tensor by tuple. Inputs: data (Tensor): A tensor. @@ -247,13 +218,7 @@ def _tensor_getitem_by_tuple(data, tuple_index): Outputs: Tensor, element type is same as the element type of data. 
""" - indexes_types = compile_utils.hyper_map(F.typeof, tuple_index) - index_elements_type = const_utils.tuple_index_elements_type(indexes_types, const_utils.TENSOR_GETITEM) - if index_elements_type == const_utils.NO_TENSOR: - return _tensor_slice(data, tuple_index) - if index_elements_type == const_utils.ALL_TENSOR: - return _tensor_getitem_by_tuple_of_tensor(data, tuple_index) - return _tensor_getitem_by_tuple_of_mixed_tensors(data, tuple_index) + return compile_utils.tensor_index_by_tuple(data, tuple_index) @getitem.register("Tensor", "Ellipsis") @@ -268,22 +233,4 @@ def _tensor_getitem_by_ellipsis(data, ellipsis_index): Outputs: Tensor, same as data. """ - return _tensor_slice(data, ellipsis_index) - - -def _tensor_getitem_by_tuple_of_tensor(data, tuple_index): - """Tensor getitem by a tuple of tensor.""" - indices = compile_utils.generate_indices_from_tuple_of_tensor(data, - tuple_index, - const_utils.TENSOR_GETITEM) - result = F.gather_nd(data, indices) - return result - - -def _tensor_getitem_by_tuple_of_mixed_tensors(data, tuple_index): - """Tensor getitem by a tuple of mixed tensor.""" - indices = compile_utils.generate_indices_from_tuple_of_mixed_tensors(data, - tuple_index, - const_utils.TENSOR_GETITEM) - result = F.gather_nd(data, indices) - return result + return data diff --git a/mindspore/ops/composite/multitype_ops/greater_equal_impl.py b/mindspore/ops/composite/multitype_ops/greater_equal_impl.py index 2073abb762..93f1acbc54 100644 --- a/mindspore/ops/composite/multitype_ops/greater_equal_impl.py +++ b/mindspore/ops/composite/multitype_ops/greater_equal_impl.py @@ -25,7 +25,7 @@ greater_equal = base.MultitypeFuncGraph("greater_equal") @greater_equal.register("Number", "Number") def _greater_equal_scala(x, y): """ - Determine whether x is greater equal than y + Determine whether x is greater equal than y. Args: x(Number): Number. 
diff --git a/mindspore/ops/composite/multitype_ops/greater_impl.py b/mindspore/ops/composite/multitype_ops/greater_impl.py index 7bbf53da49..2f3a2dbb83 100644 --- a/mindspore/ops/composite/multitype_ops/greater_impl.py +++ b/mindspore/ops/composite/multitype_ops/greater_impl.py @@ -23,7 +23,7 @@ greater = base.MultitypeFuncGraph("greater") @greater.register("Number", "Number") -def _greater_scala(x, y): +def _greater_scalar(x, y): """ Determine whether two numbers are greater. @@ -48,6 +48,6 @@ def _greater_tensor(x, y): y(Tensor): Tensor. Returns: - tensor, return operation of x and y by P.Greater + tensor, return operation of x and y by P.Greater. """ return F.tensor_gt(x, y) diff --git a/mindspore/ops/composite/multitype_ops/less_equal_impl.py b/mindspore/ops/composite/multitype_ops/less_equal_impl.py index dc1438da2c..5927c4b349 100644 --- a/mindspore/ops/composite/multitype_ops/less_equal_impl.py +++ b/mindspore/ops/composite/multitype_ops/less_equal_impl.py @@ -25,7 +25,7 @@ less_equal = base.MultitypeFuncGraph("less_equal") @less_equal.register("Number", "Number") def _less_equal_scala(x, y): """ - Determine whether x is less equal than y + Determine whether x is less equal than y. Args: x(Number): Number. @@ -41,7 +41,7 @@ def _less_equal_scala(x, y): @less_equal.register("Tensor", "Tensor") def _less_equal_tensor(x, y): """ - Determine whether tensor x is less equal than tensor y elementwise + Determine whether tensor x is less equal than tensor y elementwise. Args: x(Tensor): Tensor. 
diff --git a/mindspore/ops/composite/multitype_ops/logic_not_impl.py b/mindspore/ops/composite/multitype_ops/logic_not_impl.py index 35ae766433..6705145a64 100644 --- a/mindspore/ops/composite/multitype_ops/logic_not_impl.py +++ b/mindspore/ops/composite/multitype_ops/logic_not_impl.py @@ -25,13 +25,13 @@ logical_not = base.MultitypeFuncGraph("logical_not") @logical_not.register("Number") def _logical_not_scala(x): """ - Return logical not operation result of x + Return logical not operation result of x. Args: x(Number): Number. Returns: - bool, Return logical not operation result of x + bool, Return logical not operation result of x. """ return F.bool_not(x.__bool__()) @@ -39,10 +39,24 @@ def _logical_not_scala(x): @logical_not.register("Tensor") def _logical_not_tensor(x): """ - Return logical not operation result of x + Return logical not operation result of x. Args: x(Tensor): Tensor. Returns: - Tensor, Return logical not operation result of x + Tensor, Return logical not operation result of x. """ - return F.logical_not(x) + return F.logical_not(x) + + +@logical_not.register("Tuple") +def _logical_not_tuple(x): + """ + Return logical not operation result of a tuple object. + + Args: + x(Tuple): The input tuple. + + Returns: + bool, Return logical not operation result of x. + """ + return F.bool_not(x.__bool__()) diff --git a/mindspore/ops/composite/multitype_ops/logical_and_impl.py b/mindspore/ops/composite/multitype_ops/logical_and_impl.py index 324ce3a78d..79001f43e8 100644 --- a/mindspore/ops/composite/multitype_ops/logical_and_impl.py +++ b/mindspore/ops/composite/multitype_ops/logical_and_impl.py @@ -25,14 +25,14 @@ logical_and = base.MultitypeFuncGraph("logical_and") @logical_and.register("Number", "Number") def _logical_and_scala(x, y): """ - Return logical and operation result of x and y + Return logical and operation result of x and y. Args: x(Number): Number. y(Number): Number. 
Returns: - bool, Return logical and operation result of x and y + bool, Return logical and operation result of x and y. """ return F.bool_and(x.__bool__(), y.__bool__()) @@ -40,13 +40,13 @@ def _logical_and_scala(x, y): @logical_and.register("Tensor", "Tensor") def _logical_and_tensor(x, y): """ - Return logical and operation result of x and y + Return logical and operation result of x and y. Args: x(Tensor): Tensor. y(Tensor): Tensor. Returns: - Tensor, Return logical and operation result of x and y + Tensor, Return logical and operation result of x and y. """ return F.logical_and(x, y) diff --git a/mindspore/ops/composite/multitype_ops/logical_or_impl.py b/mindspore/ops/composite/multitype_ops/logical_or_impl.py index fd106f7685..6d070d5cbf 100644 --- a/mindspore/ops/composite/multitype_ops/logical_or_impl.py +++ b/mindspore/ops/composite/multitype_ops/logical_or_impl.py @@ -25,14 +25,14 @@ logical_or = base.MultitypeFuncGraph("logical_or") @logical_or.register("Number", "Number") def _logical_or_scala(x, y): """ - Return logical or operation result of x and y + Return logical or operation result of x and y. Args: x(Number): Number. y(Number): Number. Returns: - bool, Return logical or operation result of x and y + bool, Return logical or operation result of x and y. """ return F.bool_or(x.__bool__(), y.__bool__()) @@ -40,13 +40,13 @@ def _logical_or_scala(x, y): @logical_or.register("Tensor", "Tensor") def _logical_or_tensor(x, y): """ - Return logical operation or result of x and y + Return logical operation or result of x and y. Args: x(Tensor): Tensor. y(Tensor): Tensor. Returns: - Tensor, Return logical operation or result of x and y + Tensor, Return logical operation or result of x and y. 
""" - return F.logical_or(x, y) + return F.logical_or(x, y) diff --git a/mindspore/ops/composite/multitype_ops/mod_impl.py b/mindspore/ops/composite/multitype_ops/mod_impl.py index e9947677ac..4b6a13bbc8 100644 --- a/mindspore/ops/composite/multitype_ops/mod_impl.py +++ b/mindspore/ops/composite/multitype_ops/mod_impl.py @@ -34,7 +34,7 @@ def _mod_scalar(x, y): @mod.register("Tensor", "Tensor") def _mod_tensor(x, y): - """Returns x % y where x and y are all tensors and have save dtype.""" + """Returns x % y where x and y are all tensors.""" return F.tensor_mod(x, y) diff --git a/mindspore/ops/composite/multitype_ops/mul_impl.py b/mindspore/ops/composite/multitype_ops/mul_impl.py index ce9ec391af..b5535df135 100644 --- a/mindspore/ops/composite/multitype_ops/mul_impl.py +++ b/mindspore/ops/composite/multitype_ops/mul_impl.py @@ -40,7 +40,7 @@ def _mul_scalar(x, y): @mul.register("Tensor", "Tensor") def _mul_tensor(x, y): """ - Returns x * y by element-wise where x and y are all tensors and have same dtype. + Returns x * y by element-wise where x and y are all tensors. Outputs: Tensor, has the same dtype as x. diff --git a/mindspore/ops/composite/multitype_ops/setitem_impl.py b/mindspore/ops/composite/multitype_ops/setitem_impl.py index 53659c6205..38cf0141f0 100644 --- a/mindspore/ops/composite/multitype_ops/setitem_impl.py +++ b/mindspore/ops/composite/multitype_ops/setitem_impl.py @@ -16,10 +16,8 @@ """Implementation for setitem.""" from . import _compile_utils as compile_utils -from . import _constexpr_utils as const_utils from ... import functional as F from ...composite import base -from ....common import dtype as mstype setitem = base.MultitypeFuncGraph('setitem') @@ -139,11 +137,7 @@ def _tensor_setitem_by_tensor_with_tensor(data, index, value_tensor): Outputs: Tensor, element type and shape is same as data. 
""" - index_dtype = F.dtype(index) - tensor_dtype = const_utils.get_index_tensor_dtype(index_dtype) - if tensor_dtype == const_utils.INT_: - return _tensor_setitem_by_int_tensor_with_tensor(data, index, value_tensor) - return _tensor_setitem_by_bool_tensor_with_tensor(data, index, value_tensor) + return compile_utils.tensor_setitem_by_tensor_with_tensor(data, index, value_tensor) @setitem.register("Tensor", "Tensor", "Number") @@ -166,11 +160,7 @@ def _tensor_setitem_by_tensor_with_number(data, index, value): Outputs: Tensor, element type and shape is same as data. """ - index_dtype = F.dtype(index) - tensor_dtype = const_utils.get_index_tensor_dtype(index_dtype) - if tensor_dtype == const_utils.BOOL_: - return _tensor_setitem_by_bool_tensor_with_scalar(data, index, value) - return _tensor_setitem_by_int_tensor_with_scalar(data, index, value) + return compile_utils.tensor_setitem_by_tensor_with_number(data, index, value) @setitem.register("Tensor", "Tuple", "Number") @@ -191,24 +181,7 @@ def _tensor_setitem_by_tuple_with_number(data, tuple_index, value): Outputs: Tensor, element type and shape is same as data. 
""" - indexes_types = compile_utils.hyper_map(F.typeof, tuple_index) - index_elements_type = const_utils.tuple_index_elements_type(indexes_types, const_utils.TENSOR_SETITEM) - - if index_elements_type == const_utils.NO_TENSOR: - return _tensor_assgin_number(data, tuple_index, value) - if index_elements_type == const_utils.ALL_TENSOR: - indices = compile_utils.generate_indices_from_tuple_of_tensor(data, - tuple_index, - const_utils.TENSOR_SETITEM) - else: - indices = compile_utils.generate_indices_from_tuple_of_mixed_tensors(data, - tuple_index, - const_utils.TENSOR_SETITEM) - updates = compile_utils.generate_updates_from_scalar(data, - indices, - value, - const_utils.SET_ITEM_BY_TUPLE_OF_TENSOR) - return F.scatter_nd_update(data, indices, updates) + return compile_utils.tensor_setitem_by_tuple_with_number(data, tuple_index, value) @setitem.register("Tensor", "Tuple", "Tensor") @@ -229,24 +202,7 @@ def _tensor_setitem_by_tuple_with_tensor(data, tuple_index, value): Outputs: Tensor, element type and shape is same as data. 
""" - indexes_types = compile_utils.hyper_map(F.typeof, tuple_index) - index_elements_type = const_utils.tuple_index_elements_type(indexes_types, const_utils.TENSOR_SETITEM) - - if index_elements_type == const_utils.NO_TENSOR: - return _tensor_assgin_tensor(data, tuple_index, value) - if index_elements_type == const_utils.ALL_TENSOR: - indices = compile_utils.generate_indices_from_tuple_of_tensor(data, - tuple_index, - const_utils.TENSOR_SETITEM) - else: - indices = compile_utils.generate_indices_from_tuple_of_mixed_tensors(data, - tuple_index, - const_utils.TENSOR_SETITEM) - updates = compile_utils.generate_updates_from_tensor(data, - indices, - value, - const_utils.SET_ITEM_BY_TUPLE_OF_TENSOR) - return F.scatter_nd_update(data, indices, updates) + return compile_utils.tensor_setitem_by_tuple_with_tensor(data, tuple_index, value) @setitem.register("Tensor", "Tuple", "Tuple") @@ -268,22 +224,7 @@ def _tensor_setitem_by_tuple_with_tuple(data, tuple_index, value): Outputs: Tensor, element type and shape is same as data. """ - indexes_types = compile_utils.hyper_map(F.typeof, tuple_index) - index_elements_type = const_utils.tuple_index_elements_type(indexes_types, const_utils.TENSOR_SETITEM) - - if index_elements_type == const_utils.ALL_TENSOR: - indices = compile_utils.generate_indices_from_tuple_of_tensor(data, - tuple_index, - const_utils.TENSOR_SETITEM) - else: - indices = compile_utils.generate_indices_from_tuple_of_mixed_tensors(data, - tuple_index, - const_utils.TENSOR_SETITEM) - updates = compile_utils.generate_updates_from_tuple(data, - indices, - value, - const_utils.SET_ITEM_BY_TUPLE_OF_TENSOR) - return F.scatter_nd_update(data, indices, updates) + return compile_utils.tensor_setitem_by_tuple_with_tuple(data, tuple_index, value) @setitem.register("Tensor", "Tensor", "Tuple") @@ -299,12 +240,7 @@ def _tensor_setitem_by_tensor_v2(data, index, value): Outputs: Tensor, element type and shape is same as data. 
""" - index_dtype = F.dtype(index) - check_dtype = const_utils.check_index_tensor_dtype(index_dtype, const_utils.TENSOR_SETITEM) - result = None - if check_dtype: - result = _tensor_setitem_by_tensor_with_tuple(data, index, value) - return result + return compile_utils.tensor_setitem_by_tensor_with_tuple(data, index, value) @setitem.register("Tensor", "Slice", "Tensor") @@ -326,7 +262,7 @@ def _tensor_setitem_with_slice_v3(data, input_slice, value): Outputs: Tensor, element type and shape is same as data. """ - return _tensor_assgin_tensor(data, input_slice, value) + return compile_utils.tensor_setitem_by_slice_with_tensor(data, input_slice, value) @setitem.register("Tensor", "Slice", "Number") @@ -348,168 +284,28 @@ def _tensor_setitem_with_slice_v1(data, input_slice, value): Outputs: Tensor, element type and shape is same as data. """ - return _tensor_assgin_number(data, input_slice, value) - - -def _tensor_assgin_number(data, input_slice, value): - """Givens a scalar assign to tensor by slice""" - check_result = const_utils.check_tensor_setitem_index(input_slice) - result = None - if check_result: - data_shape = F.shape(data) - indices = const_utils.slice2indices(input_slice, data_shape) - is_tuple_int = const_utils.tuple_element_is_int(input_slice) - if is_tuple_int: - indices = const_utils.integer_to_indices(input_slice, data_shape) - result = _tensor_indices_number(data, data_shape, input_slice, indices, value) - return result + return compile_utils.tensor_setitem_by_slice_with_number(data, input_slice, value) @setitem.register("Tensor", "Number", "Number") def _tensor_setitem_with_int_v1(data, index, value): """Syntax: A[1] = 3""" - data_shape = F.shape(data) - indices = const_utils.integer_to_indices(index, data_shape) - return _tensor_indices_number(data, data_shape, index, indices, value) + return compile_utils.tensor_setitem_by_number_with_number(data, index, value) @setitem.register("Tensor", "Number", "Tensor") def _tensor_setitem_with_int_v2(data, 
index, value): """Syntax: A[1] = Tensor""" - data_shape = F.shape(data) - indices = const_utils.integer_to_indices(index, data_shape) - return _tensor_indices_tensor(data, data_shape, index, indices, value) + return compile_utils.tensor_setitem_by_number_with_tensor(data, index, value) @setitem.register("Tensor", "Ellipsis", "Number") def _tensor_setitem_with_ellipsis_v1(data, index, value): """Syntax: A[...] = number.""" - data_shape = F.shape(data) - data_dtype = F.dtype(data) - return F.fill(data_dtype, data_shape, value) + return compile_utils.tensor_setitem_by_ellipsis_with_number(data, index, value) @setitem.register("Tensor", "Ellipsis", "Tensor") def _tensor_setitem_with_ellipsis_v2(data, index, value): """Syntax: A[...] = Tensor.""" - result = None - data_shape = F.shape(data) - data_dtype = F.dtype(data) - data_size = F.size(data) - value_shape = F.shape(value) - value_size = F.size(value) - check_result = const_utils.check_ellipsis_shape_size(data_shape, value_shape, data_size, value_size) - if check_result: - if data_size == value_size: - result = F.reshape(value, data_shape) - result = F.cast(result, data_dtype) - elif value_size == 1: - param1 = F.fill(data_dtype, data_shape, 1) - param2 = F.cast(value, data_dtype) - result = F.tensor_mul(param1, param2) - return result - - -def _tensor_assgin_tensor(data, input_slice, value): - """Assigns a tensor value to the tensor by slice.""" - result = None - check_result = const_utils.check_tensor_setitem_index(input_slice) - if check_result: - data_shape = F.shape(data) - indices = const_utils.slice2indices(input_slice, data_shape) - is_tuple_int = const_utils.tuple_element_is_int(input_slice) - if is_tuple_int: - indices = const_utils.integer_to_indices(input_slice, data_shape) - result = _tensor_indices_tensor(data, data_shape, input_slice, indices, value) - return result - - -def _tensor_indices_tensor(data, data_shape, index, indices, value): - """Assigns a tensor value to the tensor.""" - data_size = 
F.size(data) - data_dtype = F.dtype(data) - indices_size = F.size(indices) - indices_size = const_utils.check_indices(indices_size, index) - update = F.fill(mstype.int32, (indices_size,), 1) - condition_1d = F.scatter_nd(indices, update, (data_size,)) - condition = F.reshape(condition_1d, data_shape) - condition = F.cast(condition, mstype.bool_) - value_fill = None - value_size = F.size(value) - - value_size = const_utils.check_indices_value_size(indices_size, value_size) - if value_size == 1: - value_fill = F.fill(data_dtype, (indices_size,), 1) - value = F.cast(value, data_dtype) - value_fill = F.tensor_mul(value_fill, value) - elif value_size > 1: - value_fill = F.reshape(value, (indices_size,)) - value_1d = F.scatter_nd(indices, value_fill, (data_size,)) - u = F.reshape(value_1d, data_shape) - return F.select(condition, u, data) - - -def _tensor_indices_number(data, data_shape, index, indices, value): - """Assigns a scalar value to the tensor.""" - data_size = F.size(data) - data_dtype = F.dtype(data) - indices_size = F.size(indices) - indices_size = const_utils.check_indices(indices_size, index) - update = F.fill(mstype.int32, (indices_size,), 1) - condition_1d = F.scatter_nd(indices, update, (data_size,)) - condition = F.reshape(condition_1d, data_shape) - condition = F.cast(condition, mstype.bool_) - value_fill = F.fill(data_dtype, (indices_size,), value) - value_1d = F.scatter_nd(indices, value_fill, (data_size,)) - u = F.reshape(value_1d, data_shape) - return F.select(condition, u, data) - - -def _tensor_setitem_by_tensor_with_tuple(data, index, value): - """Set a tensor item by a tensor with a tuple.""" - updates = compile_utils.generate_updates_from_tuple(data, index, value, - const_utils.SET_ITEM_BY_ONE_TENSOR) - result = F.scatter_update(data, index, updates) - return result - - -def _tensor_setitem_by_int_tensor_with_scalar(data, index, value): - """Set a tensor item by a int tensor with a scalar.""" - updates = 
compile_utils.generate_updates_from_scalar(data, index, value, - const_utils.SET_ITEM_BY_ONE_TENSOR) - return F.scatter_update(data, index, updates) - - -def _tensor_setitem_by_bool_tensor_with_scalar(data, index, value): - """Set a tensor item by a bool tensor with a scalar.""" - index_shape = F.shape(index) - shape = F.shape(data) - shape = const_utils.check_equal( - shape, index_shape, "The tensor(shape={}) and tensor index(shape={}) should be the same shape.") - dtype = F.dtype(data) - u = F.fill(dtype, shape, value) - return F.select(index, u, data) - - -def _tensor_setitem_by_int_tensor_with_tensor(data, index, value): - """Set a tensor item by a int tensor with a tensor.""" - updates = compile_utils.generate_updates_from_tensor(data, index, value, - const_utils.SET_ITEM_BY_ONE_TENSOR) - return F.scatter_update(data, index, updates) - - -def _tensor_setitem_by_bool_tensor_with_tensor(data, index, value): - """Set a tensor item by a bool tensor with a tensor.""" - index_shape = F.shape(index) - data_shape = F.shape(data) - data_shape = const_utils.check_equal(data_shape, index_shape, - "The tensor(shape={}) and tensor index(shape={}) should be the same shape.") - size = F.size(value) - size = const_utils.check_equal(1, size, - "When assign value is a tensor, its size should be {}, but current size is {}.") - dtype = F.dtype(data) - u_cast = F.cast(value, dtype) - one_data = F.ones_like(data) - u = F.tensor_mul(one_data, u_cast) - result = F.select(index, u, data) - return result + return compile_utils.tensor_setitem_by_ellipsis_with_tensor(data, index, value) diff --git a/mindspore/ops/composite/multitype_ops/sub_impl.py b/mindspore/ops/composite/multitype_ops/sub_impl.py index 431a58b991..864b8678d4 100644 --- a/mindspore/ops/composite/multitype_ops/sub_impl.py +++ b/mindspore/ops/composite/multitype_ops/sub_impl.py @@ -34,7 +34,7 @@ def _sub_scalar(x, y): @sub.register("Tensor", "Tensor") def _sub_tensor(x, y): - """Returns x - y where x and y are all 
tensors and have save dtype.""" + """Returns x - y where x and y are all tensors.""" return F.tensor_sub(x, y) diff --git a/mindspore/ops/composite/multitype_ops/zeros_like_impl.py b/mindspore/ops/composite/multitype_ops/zeros_like_impl.py index 1308bfd62a..9732d84fdc 100644 --- a/mindspore/ops/composite/multitype_ops/zeros_like_impl.py +++ b/mindspore/ops/composite/multitype_ops/zeros_like_impl.py @@ -57,7 +57,7 @@ def _zeros_like_func(x): @zeros_like_leaf.register("Tensor") def _zeros_like_tensor(x): """Returns a tensor with the same shape and dtype as x and all elements ars 1.""" - return F.zeros_like_tensor(x) + return F.zeros_like(x) @zeros_like_leaf.register("TypeType") diff --git a/mindspore/ops/functional.py b/mindspore/ops/functional.py index 6559d9b2ab..5637274bfb 100644 --- a/mindspore/ops/functional.py +++ b/mindspore/ops/functional.py @@ -21,11 +21,13 @@ from mindspore.common._register_for_tensor import tensor_operator_registry from .primitive import Primitive from . import operations as P from .operations import _grad_ops +from .._extends import builtin_operations as BP typeof = Primitive('typeof') hastype = Primitive('hastype') cast = P.Cast() dtype = P.DType() +isconstant = Primitive('is_constant') issubclass_ = P.IsSubClass() @@ -76,6 +78,9 @@ gather_nd = P.GatherNd() scatter_update = P.ScatterUpdate() scatter_nd_update = P.ScatterNdUpdate() pack = P.Pack() +partial = P.Partial() +# depend: mount a node to another node +depend = P.Depend() tuple_setitem = Primitive('tuple_setitem') @@ -126,15 +131,13 @@ is_ = Primitive("is_") is_not = Primitive("is_not") in_dict = Primitive("in_dict") not_in_dict = Primitive("not_in_dict") +mixed_precision_cast = Primitive("mixed_precision_cast") broadcast_gradient_args = Primitive('BroadcastGradientArgs') dot = Primitive('dot') array_reduce = Primitive('array_reduce') -partial = Primitive('partial') -zeros_like_tensor = Primitive('zeros_like_tensor') +zeros_like = P.ZerosLike() identity = Primitive('identity') 
distribute = Primitive('distribute') -# depend: mount a node to another node -depend = Primitive('depend') embed = Primitive('embed') ref_to_embed = _grad_ops.RefToEmbed() env_setitem = Primitive('env_setitem') @@ -151,7 +154,17 @@ shape_mul = Primitive("shape_mul") stop_gradient = Primitive("stop_gradient") tensor_operator_registry.register('__add__', tensor_add) +tensor_operator_registry.register('__sub__', tensor_sub) tensor_operator_registry.register('__mul__', tensor_mul) -tensor_operator_registry.register('__div__', tensor_div) +tensor_operator_registry.register('__truediv__', tensor_div) #ms cannot support Tensor(True) compare tensor_operator_registry.register('__eq__', equal) +tensor_operator_registry.register('__ne__', not_equal) +tensor_operator_registry.register('__neg__', neg_tensor) +tensor_operator_registry.register('__lt__', tensor_lt) +tensor_operator_registry.register('__le__', tensor_le) +tensor_operator_registry.register('__gt__', tensor_gt) +tensor_operator_registry.register('__ge__', tensor_ge) +tensor_operator_registry.register('shape', shape) +#support GE backend for no compare operators +tensor_operator_registry.register('vm_compare', BP.vm_compare) diff --git a/mindspore/ops/op_info_register.py b/mindspore/ops/op_info_register.py index 3096e90250..a7a60b7181 100644 --- a/mindspore/ops/op_info_register.py +++ b/mindspore/ops/op_info_register.py @@ -97,6 +97,7 @@ class RegOp: """ if not isinstance(value, str): raise TypeError("%s value must be str" % str(value)) + return True def _is_int(self, value): """ @@ -110,6 +111,7 @@ class RegOp: """ if not isinstance(value, int): raise TypeError("%s value must be int" % str(value)) + return True def _is_bool(self, value): """ @@ -123,6 +125,7 @@ class RegOp: """ if not isinstance(value, bool): raise TypeError("%s value must be bool" % str(value)) + return True def _check_param(self, param_list, key_list, fn_list, kwargs): """ @@ -494,6 +497,7 @@ class DataType: The current list below maybe not 
completed. If necessary, please add it. """ + None_None = ("", "") BOOL_None = ("bool", "") BOOL_Default = ("bool", "DefaultFormat") BOOL_5HD = ("bool", "NC1HWC0") diff --git a/mindspore/ops/op_selector.py b/mindspore/ops/op_selector.py new file mode 100644 index 0000000000..bdd00ac7f1 --- /dev/null +++ b/mindspore/ops/op_selector.py @@ -0,0 +1,120 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +""" +A factory class that create op selector instance to config switch on a class, +which can be used to control the switch of op type: GraphKernel or Primitive. +""" +import importlib +import inspect +from mindspore import context + + +class _OpSelector: + """ + A helper class, which can be used to choose different type of operator. + + When an instance of this class is called, we return the right operator + according to the context['enable_graph_kernel'] and the name of the + parameter. returned operator will be a GraphKernel op ora Primitive op. 
+ + Args: + op (class): an empty class has an operator name as its class name + config_optype (str): operator type, which must be either 'GraphKernel' + or 'Primitive' + graph_kernel_pkg (str): real operator's package name + primitive_pkg (str): graph kernel operator's package name + + Examples: + >>> class A: pass + >>> selected_op = _OpSelector(A, "GraphKernel", + >>> "graph_kernel.ops.pkg", "primitive.ops.pkg") + >>> # selected_op() will call graph_kernel.ops.pkg.A() + """ + GRAPH_KERNEL = "GraphKernel" + PRIMITIVE = "Primitive" + DEFAULT_OP_TYPE = PRIMITIVE + KW_STR = "op_type" + + def __init__(self, op, config_optype, primitive_pkg, graph_kernel_pkg): + self.op_name = op.__name__ + self.config_optype = config_optype + self.graph_kernel_pkg = graph_kernel_pkg + self.primitive_pkg = primitive_pkg + + def __call__(self, *args, **kwargs): + _op_type = _OpSelector.DEFAULT_OP_TYPE + if context.get_context("enable_graph_kernel"): + if _OpSelector.KW_STR in kwargs: + _op_type = kwargs.get(_OpSelector.KW_STR) + kwargs.pop(_OpSelector.KW_STR, None) + elif self.config_optype is not None: + _op_type = self.config_optype + if _op_type == _OpSelector.GRAPH_KERNEL: + pkg = self.graph_kernel_pkg + else: + pkg = self.primitive_pkg + op = getattr(importlib.import_module(pkg, __package__), self.op_name) + return op(*args, **kwargs) + + +def new_ops_selector(primitive_pkg, graph_kernel_pkg): + """ + A factory method to return an op selector + + When the GraphKernel switch is on: + `context.get_context('enable_graph_kernel') == True`, we have 2 ways to control the op type: + (1). call the real op with an extra parameter `op_type='Primitive'` or `op_type='GraphKernel'` + (2). pass a parameter to the op selector, like `@op_selector('Primitive')` or + `@op_selector('GraphKernel')` + (3). default op type is PRIMITIVE + The order of the highest priority to lowest priority is (1), (2), (3) + If the GraphKernel switch is off, then op_type will always be PRIMITIVE. 
+ + Args: + primitive_pkg (str): primitive op's package name + graph_kernel_pkg (str): graph kernel op's package name + + Returns: + returns an op selector, which can control what operator should be actually called. + + Examples: + >>> op_selector = new_ops_selector("primitive_pkg.some.path", + >>> "graph_kernel_pkg.some.path") + >>> @op_selector + >>> class ReduceSum: pass + """ + + def op_selector(cls_or_optype): + + _primitive_pkg = primitive_pkg + _graph_kernel_pkg = graph_kernel_pkg + + def direct_op_type(): + darg = None + if cls_or_optype is None: + pass + elif not inspect.isclass(cls_or_optype): + darg = cls_or_optype + return darg + + if direct_op_type() is not None: + def deco_cls(_real_cls): + return _OpSelector(_real_cls, direct_op_type(), _primitive_pkg, _graph_kernel_pkg) + return deco_cls + + return _OpSelector(cls_or_optype, direct_op_type(), _primitive_pkg, _graph_kernel_pkg) + + return op_selector diff --git a/mindspore/ops/operations/__init__.py b/mindspore/ops/operations/__init__.py index 47c14f592c..beed99f713 100644 --- a/mindspore/ops/operations/__init__.py +++ b/mindspore/ops/operations/__init__.py @@ -22,16 +22,16 @@ A collection of operators to build nerual networks or computing functions. 
from .image_ops import (CropAndResize) from .array_ops import (Argmax, Argmin, Cast, Concat, Pack, Unpack, Diag, DiagPart, DType, ExpandDims, Eye, - Fill, GatherNd, GatherV2, InvertPermutation, + Fill, GatherNd, GatherV2, SparseGatherV2, InvertPermutation, IsInstance, IsSubClass, ArgMaxWithValue, OnesLike, ZerosLike, - Rank, Reshape, ResizeNearestNeighbor, ArgMinWithValue, Range, + Rank, Reshape, ResizeNearestNeighbor, ArgMinWithValue, SameTypeShape, ScatterAdd, ScatterMax, ScatterUpdate, ScalarToArray, ScalarToTensor, ScatterNd, ScatterNdUpdate, Select, - Shape, Size, Slice, Split, EmbeddingLookup, - Squeeze, StridedSlice, Tile, + Shape, Size, Slice, Split, + Squeeze, StridedSlice, Tile, TensorScatterUpdate, Transpose, TruncatedNormal, TupleToArray, UnsortedSegmentMin, UnsortedSegmentSum, SpaceToDepth, DepthToSpace, SpaceToBatch, BatchToSpace, - SpaceToBatchND, BatchToSpaceND, ReverseSequence) + SpaceToBatchND, BatchToSpaceND, BroadcastTo, InplaceUpdate, ReverseSequence) from .comm_ops import (AllGather, AllReduce, _AlltoAll, ReduceScatter, Broadcast, _MirrorOperator, ReduceOp, _VirtualDataset, _VirtualDiv, _GetTensorSlice, @@ -41,27 +41,29 @@ from .debug_ops import (ImageSummary, InsertGradientOf, HookBackward, ScalarSumm from .control_ops import ControlDepend, GeSwitch, Merge from .inner_ops import ScalarCast -from .math_ops import (Abs, ACos, Asin, Asinh, AddN, AssignAdd, AssignSub, Atan2, BatchMatMul, BitwiseAnd, BitwiseOr, BitwiseXor, +from .math_ops import (Abs, ACos, Asin, Asinh, AddN, AccumulateNV2, AssignAdd, AssignSub, Atan2, BatchMatMul, BitwiseAnd, BitwiseOr, + BitwiseXor, Inv, Invert, ApproximateEqual, InplaceAdd, InplaceSub, ReduceMax, ReduceMin, ReduceMean, ReduceSum, ReduceAll, ReduceProd, CumProd, - Cos, Div, Equal, EqualCount, Exp, Erf, Erfc, Floor, FloorDiv, FloorMod, Acosh, - Greater, GreaterEqual, Less, LessEqual, Log, Log1p, LogicalAnd, + Cos, Div, DivNoNan, Equal, EqualCount, Exp, Expm1, Erf, Erfc, Floor, FloorDiv, FloorMod, Ceil, + Acosh, 
Greater, GreaterEqual, Less, LessEqual, Log, Log1p, LogicalAnd, LogicalNot, LogicalOr, MatMul, Maximum, Minimum, Mul, Neg, NMSWithMask, NotEqual, NPUAllocFloatStatus, NPUClearFloatStatus, NPUGetFloatStatus, Pow, RealDiv, IsNan, IsInf, IsFinite, FloatStatus, - Reciprocal, CumSum, + Reciprocal, CumSum, HistogramFixedWidth, Sin, Sqrt, Rsqrt, BesselI0e, BesselI1e, - Square, Sub, TensorAdd, Sign, Round, SquareSumAll, Atan, Atanh) -from .random_ops import (RandomChoiceWithMask, RandomCategorical) -from .nn_ops import (LSTM, SGD, Adam, ApplyMomentum, BatchNorm, + Square, Sub, TensorAdd, Sign, Round, SquareSumAll, Atan, Atanh, Cosh, Sinh, Eps) + +from .random_ops import (RandomChoiceWithMask, Normal, RandomCategorical) +from .nn_ops import (LSTM, SGD, Adam, SparseApplyAdam, SparseApplyLazyAdam, ApplyMomentum, BatchNorm, BiasAdd, Conv2D, DepthwiseConv2dNative, DropoutDoMask, DropoutGrad, Dropout, - DropoutGenMask, Flatten, FusedBatchNorm, + DropoutGenMask, Flatten, FusedBatchNorm, BNTrainingReduce, BNTrainingUpdate, Gelu, Elu, GetNext, L2Normalize, LayerNorm, L2Loss, CTCLoss, LogSoftmax, - MaxPool, + MaxPool, DataFormatDimMap, AvgPool, Conv2DBackpropInput, ConfusionMulGrad, MaxPoolWithArgmax, OneHot, Pad, MirrorPad, PReLU, ReLU, ReLU6, ReLUV2, HSwish, HSigmoid, ResizeBilinear, Sigmoid, @@ -72,19 +74,24 @@ from .nn_ops import (LSTM, SGD, Adam, ApplyMomentum, BatchNorm, SparseSoftmaxCrossEntropyWithLogits, Tanh, TopK, BinaryCrossEntropy, SparseApplyAdagrad, LARSUpdate, ApplyFtrl, SparseApplyFtrl, ApplyProximalAdagrad, SparseApplyProximalAdagrad, - ApplyRMSProp, ApplyCenteredRMSProp, BasicLSTMCell) -from .other_ops import Assign, IOU, BoundingBoxDecode, BoundingBoxEncode, CheckValid, MakeRefKey, CheckBprop + ApplyAdaMax, ApplyAdadelta, ApplyAdagrad, ApplyAdagradV2, + ApplyRMSProp, ApplyCenteredRMSProp, BasicLSTMCell, InTopK) +from .other_ops import (Assign, IOU, BoundingBoxDecode, BoundingBoxEncode, + CheckValid, MakeRefKey, Partial, Depend, CheckBprop) from . 
import _quant_ops from ._quant_ops import * from .thor_ops import * __all__ = [ + 'ReverseSequence', + 'CropAndResize', 'TensorAdd', 'Argmax', 'Argmin', 'ArgMaxWithValue', 'ArgMinWithValue', 'AddN', + 'AccumulateNV2', 'Sub', 'CumSum', 'MatMul', @@ -92,6 +99,7 @@ __all__ = [ 'Mul', 'Pow', 'Exp', + 'Expm1', 'Rsqrt', 'Sqrt', 'Square', @@ -99,10 +107,14 @@ __all__ = [ 'Flatten', 'MaxPoolWithArgmax', 'FusedBatchNorm', + 'BNTrainingReduce', + 'BNTrainingUpdate', 'BatchNorm', 'MaxPool', 'TopK', 'Adam', + 'SparseApplyAdam', + 'SparseApplyLazyAdam', 'Softplus', 'Softmax', 'LogSoftmax', @@ -121,6 +133,7 @@ __all__ = [ 'Transpose', 'OneHot', 'GatherV2', + 'SparseGatherV2', 'Concat', 'Pack', 'Unpack', @@ -132,14 +145,15 @@ __all__ = [ 'StridedSlice', 'ReduceSum', 'ReduceMean', - 'Range', 'LayerNorm', - 'EmbeddingLookup', 'Rank', 'Less', 'LessEqual', 'RealDiv', 'Div', + 'DivNoNan', + 'Inv', + 'Invert', 'TruncatedNormal', 'Fill', 'OnesLike', @@ -148,7 +162,6 @@ __all__ = [ 'Split', 'ReLU', 'ReLU6', - 'ReLUV2', 'Elu', 'Erf', 'Erfc', @@ -157,6 +170,7 @@ __all__ = [ 'HSigmoid', 'Tanh', 'RandomChoiceWithMask', + 'Normal', 'RandomCategorical', 'ResizeBilinear', 'ScalarSummary', @@ -173,6 +187,8 @@ __all__ = [ 'DropoutGrad', 'Dropout', 'Neg', + 'InplaceAdd', + 'InplaceSub', 'Slice', 'DType', 'NPUAllocFloatStatus', @@ -204,15 +220,19 @@ __all__ = [ 'ScatterNd', 'ScatterMax', 'ResizeNearestNeighbor', + 'HistogramFixedWidth', 'Pad', 'MirrorPad', 'GatherNd', + 'TensorScatterUpdate', 'ScatterUpdate', 'ScatterNdUpdate', 'Floor', 'NMSWithMask', 'IOU', 'MakeRefKey', + 'Partial', + 'Depend', 'AvgPool', # Back Primitive 'Equal', @@ -245,10 +265,12 @@ __all__ = [ 'SigmoidCrossEntropyWithLogits', 'FloorDiv', 'FloorMod', + 'Ceil', 'Acosh', 'Asinh', "PReLU", "Cos", + "Cosh", "ACos", "Diag", "DiagPart", @@ -257,6 +279,7 @@ __all__ = [ 'AssignAdd', 'AssignSub', "Sin", + "Sinh", "Asin", "LSTM", "Abs", @@ -268,11 +291,16 @@ __all__ = [ "Sign", "LARSUpdate", "Round", + "Eps", "ApplyFtrl", 
"SpaceToBatch", "SparseApplyFtrl", "ApplyProximalAdagrad", "SparseApplyProximalAdagrad", + "ApplyAdaMax", + "ApplyAdadelta", + "ApplyAdagrad", + "ApplyAdagradV2", "BatchToSpace", "Atan2", "ApplyRMSProp", @@ -289,8 +317,12 @@ __all__ = [ "Atan", "Atanh", "BasicLSTMCell", + "BroadcastTo", + "DataFormatDimMap", + "ApproximateEqual", + "InplaceUpdate", + "InTopK", "CropAndResize" ] -__all__.extend(_quant_ops.__all__) __all__.sort() diff --git a/mindspore/ops/operations/_grad_ops.py b/mindspore/ops/operations/_grad_ops.py index 008f5f0edb..c3f97b9f33 100644 --- a/mindspore/ops/operations/_grad_ops.py +++ b/mindspore/ops/operations/_grad_ops.py @@ -21,6 +21,7 @@ from ..primitive import Primitive, PrimitiveWithInfer, prim_attr_register from ..._checkparam import Validator as validator, Rel from .._utils import get_concat_offset from ...common import dtype as mstype +from .. import functional as F class AbsGrad(PrimitiveWithInfer): @@ -404,6 +405,33 @@ class FusedBatchNormGrad(Primitive): def __call__(self, dy, x, scale, save_mean, save_inv_variance): raise NotImplementedError +class BNTrainingReduceGrad(PrimitiveWithInfer): + """Gradients of FusedBatchNorm operation.""" + + @prim_attr_register + def __init__(self, epsilon=0.0001): + _inputs = ['grads', 'x', 'diff_scale', 'diff_offset', 'scale', 'batch_mean', 'batch_variance'] + self.init_prim_io_names(inputs=_inputs, outputs=['y']) + + def infer_shape(self, grads, x, diff_scale, diff_offset, scale, batch_mean, batch_variance): + return grads + + def infer_dtype(self, grads, x, diff_scale, diff_offset, scale, batch_mean, batch_variance): + return grads + +class BNTrainingUpdateGrad(PrimitiveWithInfer): + """Gradients of FusedBatchNorm operation.""" + + @prim_attr_register + def __init__(self, epsilon=0.0001): + self.init_prim_io_names(inputs=['grads', 'x', 'batch_mean', 'batch_variance'], + outputs=['diff_scale', 'diff_offset']) + + def infer_shape(self, grads, x, batch_mean, batch_variance): + return (batch_mean, 
batch_variance) + + def infer_dtype(self, grads, x, batch_mean, batch_variance): + return (batch_mean, batch_variance) class GeluGrad(PrimitiveWithInfer): """Gradients of Gelu operation.""" @@ -1065,6 +1093,18 @@ class StridedSliceGrad(PrimitiveWithInfer): self.init_prim_io_names(inputs=['dy', 'shapex', 'begin', 'end', 'strides'], outputs=['output']) def __infer__(self, dy, shapex, begin, end, strides): + args = {"dy": dy['dtype']} + validator.check_tensor_type_same(args, mstype.number_type, self.name) + + for idx, item in enumerate(shapex['value']): + validator.check_value_type("shapex[%d]" % idx, item, [int], self.name) + for idx, item in enumerate(begin['value']): + validator.check_value_type("begin[%d]" % idx, item, [int], self.name) + for idx, item in enumerate(end['value']): + validator.check_value_type("end[%d]" % idx, item, [int], self.name) + for idx, item in enumerate(strides['value']): + validator.check_value_type("strides[%d]" % idx, item, [int], self.name) + return {'shape': shapex['value'], 'dtype': dy['dtype'], 'value': None} @@ -1121,6 +1161,37 @@ class MirrorPadGrad(PrimitiveWithInfer): 'value': None} +class EmbeddingLookupCommGrad(PrimitiveWithInfer): + """ + Perform the gradient for the communication part of EmbeddingLookup operator. + + This works ONLY when 'reduce_scatter_flag' is True in 'EmbeddingLookup'. Roughly speaking, + this primitive is implemented by StridedSlice --> HostAllGather --> Concat. This primitive runs on host. + """ + @prim_attr_register + def __init__(self): + self.init_prim_io_names(inputs=['dy', 'split_num'], outputs=['output']) + self.add_prim_attr('primitive_target', 'CPU') + + def __infer__(self, dy, split_num): + """ + This primitive is implemented by three steps: + 1) Split the 'dy' along dimension 0 into 'split_num' parts. + 2) For each part, perform HostAllGather((0, 1, 2, 3, 4, 5, 6, 7)) on the host. + 3) After HostAllGather, there are still 'split_num' parts in each process. 
Then, perform Concat on them + along dimension 0. + + The output shape of this primitive: shape(output)[0] == shape(dy)[0] * 8 + """ + dy_shape = tuple(dy['shape']) + split_num_value = split_num['value'] + validator.check_value_type("split_num_value", split_num_value, [int], self.name) + dy_shape_all = F.tuple_setitem(dy_shape, 0, dy_shape[0] * 8) + return {'shape': dy_shape_all, + 'dtype': dy['dtype'], + 'value': None} + + class RefToEmbed(Primitive): r""" Make a key from Ref. @@ -1276,3 +1347,20 @@ class BasicLSTMCellInputGrad(PrimitiveWithInfer): validator.check_type_name("dgate", dgate_dtype, [mstype.float16, mstype.float32], self.name) validator.check_type_name("w", w_dtype, [mstype.float16, mstype.float32], self.name) return (dgate_dtype, dgate_dtype) + + +class InvGrad(PrimitiveWithInfer): + """Computes gradients for inv operation.""" + + @prim_attr_register + def __init__(self): + pass + + def infer_shape(self, x, grad): + validator.check("x_shape", x, "grad_shape", grad, Rel.EQ, self.name) + return x + + def infer_dtype(self, x, grad): + validator.check_type_name("dgate", x, [mstype.float16, mstype.float32, mstype.int32, mstype.int8], self.name) + validator.check_type_name("grad", grad, [mstype.float16, mstype.float32, mstype.int32, mstype.int8], self.name) + return x diff --git a/mindspore/ops/operations/_inner_ops.py b/mindspore/ops/operations/_inner_ops.py index 2f9970eb0c..49834fc168 100644 --- a/mindspore/ops/operations/_inner_ops.py +++ b/mindspore/ops/operations/_inner_ops.py @@ -15,9 +15,10 @@ """Inner operators.""" +from ..._checkparam import Rel from ..._checkparam import Validator as validator from ...common import dtype as mstype -from ..primitive import PrimitiveWithInfer, prim_attr_register +from ..primitive import PrimitiveWithInfer, prim_attr_register class ExtractImagePatches(PrimitiveWithInfer): @@ -98,6 +99,167 @@ class ExtractImagePatches(PrimitiveWithInfer): return input_x +class Range(PrimitiveWithInfer): + r""" + Creates a sequence 
of numbers. + Set `input_x` as :math:`x_i` for each element, `output` as follows: + + .. math:: + \text{output}(x_i) = x_i * \text{delta} + \text{start} + + Args: + start (float): If `limit` is `None`, the value acts as limit in the range and first entry + defaults to `0`. Otherwise, it acts as first entry in the range. + limit (float): Acts as upper limit of sequence. If `None`, defaults to the value of `start` + while set the first entry of the range to `0`. It can not be equal to `start`. + delta (float): Increment of the range. It can not be equal to zero. Default: 1.0. + + Inputs: + - **input_x** (Tensor) - The assistant data. A `1-D` tensor of type float32 or int32. + + Outputs: + Tensor, has the same shape and dtype as `input_x`. + + Examples: + >>> range = P.Range(1.0, 8.0, 2.0) + >>> x = Tensor(np.array([1, 2, 3, 2]), mindspore.int32) + >>> range(x) + [3, 5, 7, 5] + """ + + @prim_attr_register + def __init__(self, start, limit=None, delta=1.0): + self.init_prim_io_names(inputs=['x'], outputs=['y']) + self.delta = validator.check_value_type("delta", delta, [float], self.name) + validator.check_value_type("start", start, [float], self.name) + if limit is None: + self.start = 0.0 + self.limit = start + self.add_prim_attr("start", self.start) + self.add_prim_attr("limit", self.limit) + else: + validator.check_value_type("limit", limit, [float], self.name) + validator.check('start', self.start, 'limit', self.limit, Rel.NE, self.name) + if self.delta == 0.0: + raise ValueError("The input of `delta` can not be equal to zero.") + if self.delta > 0.0 and self.start > self.limit: + raise ValueError(f"Limit should be greater than start when delta:{self.delta} is more than zero, " + f"but got start:{self.start}, limit:{self.limit}") + if self.delta < 0.0 and self.start < self.limit: + raise ValueError(f"Start should be greater than limit when delta:{self.delta} is less than zero, " + f"but got start:{self.start}, limit:{self.limit}") + + def infer_shape(self, 
x_shape): + return x_shape + + def infer_dtype(self, x_dtype): + validator.check_tensor_type_same({'x_dtype': x_dtype}, [mstype.float32, mstype.int32], self.name) + return x_dtype + + +class AscendQuant(PrimitiveWithInfer): + r""" + Returns the quantized value of input_x. + + If `sqrt_mode` is False: + + .. math:: + y = round(scale * x + offset) + + If `sqrt_mode` is True: + + .. math:: + y = round(scale * x * scale + offset) + + Note: + This operation only support Ascend 310 inference environment. + + Args: + scale (float) : Specifies the scaling ratio. + offset (float): Specifies the offset. + sqrt_mode (bool) : Specifies whether to perform square root on `scale`. Default: False. + round_mode (str): Specifies the way to round. Should be one of ["Round", "Floor", "Ceil", "Trunc"]. + Default: "Round". + + Inputs: + - **input_x** (Tensor) : Input tensor. Its data type should be mindspore.float16 or mindspore.float32. + + Outputs: + - Tensor: The quantized output tensor of type mindspore.int8. + + Examples: + >>> input_x = Tensor([100.0, 150.0], mstype.float32) + >>> quant = P.AscendQuant(80.0, 0.0, False, "Round") + >>> y = quant(input_x) + """ + + @prim_attr_register + def __init__(self, scale, offset, sqrt_mode=False, round_mode="Round"): + self.scale = validator.check_value_type("scale", scale, [float], self.name) + self.offset = validator.check_value_type("offset", offset, [float], self.name) + self.sqrt_mode = validator.check_value_type("sqrt_mode", sqrt_mode, [bool], self.name) + self.round_mode = validator.check_string("round_mode", round_mode, + ["Round", "Floor", "Ceil", "Trunc"], self.name) + + def infer_shape(self, x_shape): + return x_shape + + def infer_dtype(self, x_type): + validator.check_subclass("input_x", x_type, mstype.tensor, self.name) + validator.check_type_name("input_x", x_type, [mstype.float16, mstype.float32], self.name) + return mstype.int8 + + +class AscendDequant(PrimitiveWithInfer): + r""" + Returns the dequantized value of input_x. 
+ This operation will do ReLU to the dequantized value if `relu_flag` is True. + + If `sqrt_mode` is False: + + .. math:: + y = x * deq\_scale + + If `sqrt_mode` is True: + + .. math:: + y = x * deq\_scale * deq\_scale + + Note: + This operation only support Ascend 310 inference environment. + + Args: + sqrt_mode (bool) : Specifies whether to perform square root on `scale`. Default: False. + relu_flag (bool): Specifies whether to perform ReLU. Default: False. + + Inputs: + - **input_x** (Tensor) : Input tensor. Should be mindspore.int32. + - **deq_scale** (Tensor) : Specifies the scaling ratio. + Data type should be mindspore.float16 or mindspore.uint64 + + Outputs: + - Tensor: The quantized output tensor of type mindspore.float16. + + Examples: + >>> input_x = Tensor([100.0, 150.0], mstype.float32) + >>> dequant = P.AscendDequant(False, False) + >>> y = dequant(input_x) + """ + @prim_attr_register + def __init__(self, sqrt_mode=False, relu_flag=False): + self.sqrt_mode = validator.check_value_type("sqrt_mode", sqrt_mode, [bool], self.name) + self.relu_flag = validator.check_value_type("relu_flag", relu_flag, [bool], self.name) + + def infer_shape(self, x_shape, deq_scale_shape): + return x_shape + + def infer_dtype(self, x_type, deq_scale_type): + validator.check_subclass("x", x_type, mstype.tensor, self.name) + validator.check_type_name("x", x_type, [mstype.int32], self.name) + validator.check_type_name("deq_scale", deq_scale_type, [mstype.float16, mstype.uint64], self.name) + return mstype.float16 + + class EmbeddingLookup(PrimitiveWithInfer): """ Returns a slice of input tensor based on the specified indices. @@ -166,3 +328,183 @@ class EmbeddingLookup(PrimitiveWithInfer): 'dtype': params['dtype'], 'value': None} return out + + +class LinSpace(PrimitiveWithInfer): + r""" + Generates values in an interval. And return the corresponding interpolation accroding to assist. + + Inputs: + - **assist** (Tensor[float32]) - The assist value, With shape of 0-D or 1-D. 
+ - **start** (Tensor[float32]) - The start of interval, With shape of 0-D. + - **stop** (Tensor[float32]) - The end of interval, With shape of 0-D. + - **num** (Tensor[int32]) - ticks number in the interval, the ticks include start and stop value. + With shape of 0-D. + + Outputs: + Tensor, has the same shape as `assist`. + + Examples: + >>> linspace = P.LinSpace() + >>> assist = Tensor([5, 5.5], mindspore.float32) + >>> start = Tensor(1, mindspore.float32) + >>> stop = Tensor(10, mindspore.float32) + >>> num = Tensor(5, mindspore.int32) + >>> output = linspace(assist, start, stop, num) + [12.25, 13.375] + """ + + @prim_attr_register + def __init__(self): + pass + + def infer_shape(self, assist, start, stop, num): + return assist + + def infer_dtype(self, assist, start, stop, num): + args = {"num": num} + validator.check_tensor_type_same(args, (mstype.int32,), self.name) + args = {"assist": assist, "start": start, "stop": stop} + validator.check_tensor_type_same(args, (mstype.float32,), self.name) + return assist + + +class MatrixDiag(PrimitiveWithInfer): + """ + Returns a batched diagonal tensor with a given batched diagonal values. + + Inputs: + - **x** (Tensor) - A tensor which to be element-wise multi by `assist`. It can be of the following data types: + float32, float16, int32, int8, uint8. + - **assist** (Tensor) - A eye tensor of the same type as `x`. It's rank must greater than or equal to 2 and + it's last dimension must equal to the second to last dimension. + + Outputs: + Tensor, has the same type and shape as input `assist`. + + Examples: + >>> x = Tensor(np.array([1, -1]), mstype.float32) + >>> assist = Tensor(np.arange(-12, 0).reshape(3, 2, 2), mindspore.float32) + >>> matrix_diag = P.MatrixDiag() + >>> result = matrix_diag(x, assist) + [[[-12. 11.] + [-10. 9.]] + [[ -8. 7.] + [ -6. 5.]] + [[ -4. 3.] + [ -2. 
1.]]] + """ + + @prim_attr_register + def __init__(self): + """init MatrixDiag""" + + def infer_dtype(self, x_dtype, assist_dtype): + valid_type = [mstype.float16, mstype.float32, mstype.int32, mstype.int8, mstype.uint8] + args = {"x": x_dtype, "assist": assist_dtype} + validator.check_tensor_type_same(args, valid_type, self.name) + return x_dtype + + def infer_shape(self, x_shape, assist_shape): + validator.check_integer("assist rank", len(assist_shape), 2, Rel.GE, self.name) + validator.check('rank of x', len(x_shape)+1, + 'rank of assist', len(assist_shape), Rel.LE, self.name) + validator.check('assist\'s penultimate dimension', assist_shape[-2], 'assist\'s last dimension', + assist_shape[-1], Rel.EQ, self.name) + + r_end_dim = -len(x_shape) + r_idx = -1 + while r_idx >= r_end_dim: + if x_shape[r_idx] != 1: + validator.check("reverse x dim %d" % r_idx, x_shape[r_idx], "reverse assist dim %d" % + assist_shape[r_idx-1], assist_shape[r_idx-1], Rel.EQ, self.name) + r_idx = r_idx - 1 + + return assist_shape + + +class MatrixDiagPart(PrimitiveWithInfer): + r""" + Returns the batched diagonal part of a batched tensor. + + Inputs: + - **x** (Tensor) - The batched tensor. It can be of the following data types: + float32, float16, int32, int8, uint8. + - **assist** (Tensor) - A eye tensor of the same type as `x`. With shape same as `x`. + + Outputs: + Tensor, data type same as input `x`. The shape should be x.shape[:-2] + [min(x.shape[-2:])]. 
+ + Examples: + >>> x = Tensor([[[-1, 0], [0, 1]], [-1, 0], [0, 1]], [[-1, 0], [0, 1]]], mindspore.float32) + >>> assist = Tensor(np.arange(-12, 0).reshape(3, 2, 2), mindspore.float32) + >>> matrix_diag_part = P.MatrixDiagPart() + >>> result = matrix_diag_part(x, assist) + [[12., -9.], [8., -5.], [4., -1.]] + """ + + @prim_attr_register + def __init__(self): + """init MatrixDiagPart""" + + def infer_dtype(self, x_dtype, assist_dtype): + valid_type = [mstype.float16, mstype.float32, mstype.int32, mstype.int8, mstype.uint8] + args = {"x": x_dtype, "assist": assist_dtype} + validator.check_tensor_type_same(args, valid_type, self.name) + return x_dtype + + def infer_shape(self, x_shape, assist_shape): + validator.check_integer("x rank", len(x_shape), 2, Rel.GE, self.name) + validator.check("x shape", x_shape, "assist shape", assist_shape, Rel.EQ, self.name) + + if assist_shape[-2] < assist_shape[-1]: + out_shape = assist_shape[:-1] + else: + out_shape = assist_shape[:-2] + assist_shape[-1:] + return out_shape + + +class MatrixSetDiag(PrimitiveWithInfer): + r""" + Modify the batched diagonal part of a batched tensor. + + Inputs: + - **x** (Tensor) - The batched tensor. It can be of the following data types: + float32, float16, int32, int8, uint8. + - **assist** (Tensor) - A eye tensor of the same type as `x`. With shape same as `x`. + - **diagonal** (Tensor) - The diagonal values. + + Outputs: + Tensor, data type same as input `x`. The shape same as `x`. 
+ + Examples: + >>> x = Tensor([[[-1, 0], [0, 1]], [-1, 0], [0, 1]], [[-1, 0], [0, 1]]], mindspore.float32) + >>> diagonal = Tensor([[-1., 2.], [-1., 1.], [-1., 1.]], mindspore.float32) + >>> matrix_set_diag = P.MatrixSetDiag() + >>> result = matrix_set_diag(x, diagonal) + [[[-1, 0], [0, 2]], [-1, 0], [0, 1]], [[-1, 0], [0, 1]]] + + """ + + @prim_attr_register + def __init__(self): + """init MatrixSetDiag""" + + def infer_dtype(self, x_dtype, diagonal_dtype, assist_dtype): + valid_type = [mstype.float16, mstype.float32, mstype.int32, mstype.int8, mstype.uint8] + args = {"x": x_dtype, "diagonal": diagonal_dtype, "assist": assist_dtype} + validator.check_tensor_type_same(args, valid_type, self.name) + return x_dtype + + def infer_shape(self, x_shape, diagonal_shape, assist_shape): + validator.check_integer("x rank", len(x_shape), 2, Rel.GE, self.name) + validator.check("x shape", x_shape, "assist shape", assist_shape, Rel.EQ, self.name) + + if x_shape[-2] < x_shape[-1]: + validator.check("x shape excluding the last dimension", x_shape[:-1], "diagnoal shape", + diagonal_shape, Rel.EQ, self.name) + else: + validator.check("x shape excluding the second to last dimension", x_shape[:-2]+x_shape[-1:], + "diagonal shape", diagonal_shape, Rel.EQ, self.name) + + return assist_shape diff --git a/mindspore/ops/operations/_quant_ops.py b/mindspore/ops/operations/_quant_ops.py index 705968be65..42c2406906 100644 --- a/mindspore/ops/operations/_quant_ops.py +++ b/mindspore/ops/operations/_quant_ops.py @@ -15,38 +15,161 @@ """Operators for quantization.""" +import mindspore.context as context from ..._checkparam import Validator as validator from ..._checkparam import Rel from ..primitive import PrimitiveWithInfer, prim_attr_register from ...common import dtype as mstype -__all__ = ["FakeQuantWithMinMax", - "FakeQuantWithMinMaxGrad", - "FakeQuantWithMinMaxPerChannel", - "FakeQuantWithMinMaxPerChannelGrad", +__all__ = ["MinMaxUpdatePerLayer", + "MinMaxUpdatePerChannel", + 
"FakeQuantPerLayer", + "FakeQuantPerLayerGrad", + "FakeQuantPerChannel", + "FakeQuantPerChannelGrad", "BatchNormFold", "BatchNormFoldGrad", "CorrectionMul", "CorrectionMulGrad", + "CorrectionMulGradReduce", "BatchNormFold2", "BatchNormFold2Grad", "BatchNormFoldD", - "BNTrainingReduce", + "BatchNormFoldGradD", "BatchNormFold2_D", - "FakeQuantWithMinMaxUpdate", + "BatchNormFold2GradD", + "BatchNormFold2GradReduce" ] -class FakeQuantWithMinMax(PrimitiveWithInfer): +class MinMaxUpdatePerLayer(PrimitiveWithInfer): + r""" + Update min and max per layer. + + Args: + ema (bool): Use EMA algorithm update value min and max. Default: False. + ema_decay (int) : EMA algorithm decay parameter. Default: 0.999. + + Inputs: + - **x** (Tensor) : float32 Tensor representing the shape of the output tensor. + - **min** (Tensor) : Value of the min range of the input data x. + - **max** (Tensor) : Value of the max range of the input data x. + + Outputs: + - Tensor: Simulate quantize tensor of x. + + Examples: + >>> input_tensor = Tensor(np.random.rand(3, 16, 5, 5), mstype.float32) + >>> min_tensor = Tensor(np.array([-6]), mstype.float32) + >>> max_tensor = Tensor(np.array([6]), mstype.float32) + >>> output_tensor = MinMaxUpdatePerLayer(num_bits=8)(input_tensor, min_tensor, max_tensor) + """ + support_quant_bit = [4, 7, 8] + + @prim_attr_register + def __init__(self, ema=False, ema_decay=0.999): + """init FakeQuantMinMaxPerLayerUpdate OP""" + if context.get_context('device_target') == "Ascend": + from mindspore.ops._op_impl._custom_op import minmax_update_perlayer + if ema and not ema_decay: + raise ValueError( + f"For '{self.name}' attr \'ema\' and \'ema_decay\' should set together.") + + self.ema = validator.check_value_type('ema', ema, (bool,), self.name) + self.ema_decay = validator.check_number_range( + 'ema_decay', ema_decay, 0, 1, Rel.INC_BOTH, self.name) + self.init_prim_io_names(inputs=['x', 'min', 'max'], + outputs=['min_up', 'max_up']) + + def infer_shape(self, x_shape, 
min_shape, max_shape): + validator.check_integer("x rank", len(x_shape), 1, Rel.GE, self.name) + validator.check("min shape", min_shape, "max shape", + max_shape, Rel.EQ, self.name) + validator.check_integer("min shape", len( + min_shape), 1, Rel.EQ, self.name) + return min_shape, max_shape + + def infer_dtype(self, x_type, min_type, max_type): + valid_types = (mstype.float16, mstype.float32) + validator.check_tensor_type_same({"x": x_type}, valid_types, self.name) + validator.check_tensor_type_same( + {"min": min_type}, valid_types, self.name) + validator.check_tensor_type_same( + {"max": max_type}, valid_types, self.name) + return min_type, max_type + + +class MinMaxUpdatePerChannel(PrimitiveWithInfer): + r""" + Update min and max per channel. + + Args: + ema (bool): Use EMA algorithm update value min and max. Default: False. + ema_decay (int) : EMA algorithm decay parameter. Default: 0.999. + channel_axis (int): Channel asis for per channel compute. Default: 1. + + Inputs: + - **x** (Tensor) : float32 Tensor representing the shape of the output tensor. + - **min** (Tensor) : Value of the min range of the input data x. + - **max** (Tensor) : Value of the max range of the input data x. + + Outputs: + - Tensor: Simulate quantize tensor of x. 
+ + Examples: + >>> x = Tensor(np.random.rand(3, 16, 5, 5), mstype.float32) + >>> min = Tensor(np.random.uniform(-1, 1, size=16), mstype.float32) + >>> max = Tensor(np.random.uniform(-1, 1, size=16), mstype.float32) + >>> output_tensor = MinMaxUpdatePerChannel(num_bits=8)(x, min, max) + """ + support_quant_bit = [4, 7, 8] + + @prim_attr_register + def __init__(self, ema=False, ema_decay=0.999, channel_axis=1): + """init FakeQuantPerChannelUpdate OP for Ascend""" + if context.get_context('device_target') == "Ascend": + from mindspore.ops._op_impl._custom_op import minmax_update_perchannel + if ema and not ema_decay: + raise ValueError( + f"For '{self.name}' attr \'ema\' and \'ema_decay\' should set together.") + + self.ema = validator.check_value_type('ema', ema, (bool,), self.name) + self.ema_decay = validator.check_number_range( + 'ema_decay', ema_decay, 0, 1, Rel.INC_BOTH, self.name) + self.channel_axis = validator.check_integer( + 'channel axis', channel_axis, 0, Rel.GE, self.name) + self.init_prim_io_names( + inputs=['x', 'min', 'max'], outputs=['min_up', 'max_up']) + + def infer_shape(self, x_shape, min_shape, max_shape): + validator.check_integer("x rank", len(x_shape), 1, Rel.GT, self.name) + validator.check("min shape", min_shape, "max shape", + max_shape, Rel.EQ, self.name) + validator.check_integer("min shape", len( + min_shape), 1, Rel.EQ, self.name) + return min_shape, max_shape + + def infer_dtype(self, x_type, min_type, max_type): + valid_types = (mstype.float16, mstype.float32) + validator.check_tensor_type_same( + {"x": x_type}, valid_types, self.name) + validator.check_tensor_type_same( + {"min": min_type}, valid_types, self.name) + validator.check_tensor_type_same( + {"max": max_type}, valid_types, self.name) + return min_type, max_type + + +class FakeQuantPerLayer(PrimitiveWithInfer): r""" Simulate the quantize and dequantize operations in training time. Args: - num_bits (int) : Number bits for aware quantilization. Default: 8. 
+ num_bits (int) : Number bits for quantization aware. Default: 8. ema (bool): Use EMA algorithm update value min and max. Default: False. ema_decay (int) : EMA algorithm decay parameter. Default: 0.999. quant_delay (int): Quantilization delay parameter. Before delay step in training time not update - simulate aware quantize funcion. After delay step in training time begin simulate the aware + simulate quantization aware funcion. After delay step in training time begin simulate the aware quantize funcion. Default: 0. symmetric (bool): Quantization algorithm use symmetric or not. Default: False. narrow_range (bool): Quantization algorithm use narrow range or not. Default: False. @@ -64,82 +187,120 @@ class FakeQuantWithMinMax(PrimitiveWithInfer): >>> input_tensor = Tensor(np.random.rand(3, 16, 5, 5), mstype.float32) >>> min_tensor = Tensor(np.array([-6]), mstype.float32) >>> max_tensor = Tensor(np.array([6]), mstype.float32) - >>> output_tensor = P.FakeQuantWithMinMax(num_bits=8)(input_tensor, min_tensor, max_tensor) + >>> output_tensor = FakeQuantPerLayer(num_bits=8)(input_tensor, min_tensor, max_tensor) """ support_quant_bit = [4, 7, 8] @prim_attr_register - def __init__(self, num_bits=8, ema=False, ema_decay=0.999, quant_delay=0, symmetric=False, narrow_range=False, + def __init__(self, + num_bits=8, + ema=False, + ema_decay=0.999, + quant_delay=0, + symmetric=False, + narrow_range=False, training=True): - """init FakeQuantWithMinMax OP""" + """init FakeQuantPerLayer OP""" + if context.get_context('device_target') == "Ascend": + from mindspore.ops._op_impl._custom_op import fake_quant_perlayer if num_bits not in self.support_quant_bit: - raise ValueError(f"For '{self.name}' attr \'num_bits\' is not support.") + raise ValueError( + f"For '{self.name}' attr \'num_bits\' is not support.") if ema and not ema_decay: - raise ValueError(f"For '{self.name}' attr \'ema\' and \'ema_decay\' should set together.") + raise ValueError( + f"For '{self.name}' attr \'ema\' and 
\'ema_decay\' should set together.") self.ema = validator.check_value_type('ema', ema, (bool,), self.name) - self.symmetric = validator.check_value_type('symmetric', symmetric, (bool,), self.name) - self.narrow_range = validator.check_value_type('narrow_range', narrow_range, (bool,), self.name) - self.training = validator.check_value_type('training', training, (bool,), self.name) - self.ema_decay = validator.check_number_range('ema_decay', ema_decay, 0, 1, Rel.INC_BOTH, self.name) - self.num_bits = validator.check_integer('num_bits', num_bits, 0, Rel.GT, self.name) - self.quant_delay = validator.check_value_type('quant_delay', quant_delay, (int,), self.name) + self.symmetric = validator.check_value_type( + 'symmetric', symmetric, (bool,), self.name) + self.narrow_range = validator.check_value_type( + 'narrow_range', narrow_range, (bool,), self.name) + self.training = validator.check_value_type( + 'training', training, (bool,), self.name) + self.ema_decay = validator.check_number_range( + 'ema_decay', ema_decay, 0, 1, Rel.INC_BOTH, self.name) + self.num_bits = validator.check_integer( + 'num_bits', num_bits, 0, Rel.GT, self.name) + self.quant_delay = validator.check_value_type( + 'quant_delay', quant_delay, (int,), self.name) self.init_prim_io_names(inputs=['x', 'min', 'max'], outputs=['out']) def infer_shape(self, x_shape, min_shape, max_shape): - validator.check_integer("x rank", len(x_shape), 1, Rel.GT, self.name) + validator.check_integer("x rank", len(x_shape), 1, Rel.GE, self.name) validator.check("min shape", min_shape, "max shape", max_shape, Rel.EQ, self.name) - validator.check_integer("min rank", len(min_shape), 1, Rel.EQ, self.name) + validator.check_integer("min shape", len(min_shape), 1, Rel.EQ, self.name) return x_shape def infer_dtype(self, x_type, min_type, max_type): valid_types = (mstype.float16, mstype.float32) validator.check_tensor_type_same({"x": x_type}, valid_types, self.name) - validator.check_tensor_type_same({"min": min_type}, valid_types, 
self.name) - validator.check_tensor_type_same({"max": max_type}, valid_types, self.name) + validator.check_tensor_type_same( + {"min": min_type}, valid_types, self.name) + validator.check_tensor_type_same( + {"max": max_type}, valid_types, self.name) return x_type -class FakeQuantWithMinMaxGrad(PrimitiveWithInfer): +class FakeQuantPerLayerGrad(PrimitiveWithInfer): r""" - Performs grad of FakeQuantWithMinMax operation. + Performs grad of FakeQuantPerLayerGrad operation. Examples: - >>> fake_min_max_grad = P.FakeQuantWithMinMaxGrad() + >>> fake_min_max_grad = FakeQuantPerLayerGrad() >>> dout = Tensor(np.array([[-2.3, 1.2], [5.7, 0.2]]), mindspore.float32) >>> input_x = Tensor(np.array([[18, -23], [0.2, 6]]), mindspore.float32) >>> _min = Tensor(np.array([-4]), mindspore.float32) >>> _max = Tensor(np.array([2]), mindspore.float32) >>> result = fake_min_max_grad(dout, input_x, _min, _max) """ - support_quant_bit = [4, 8] + support_quant_bit = [4, 7, 8] @prim_attr_register - def __init__(self, num_bits=8, quant_delay=0): + def __init__(self, + num_bits=8, + quant_delay=0, + symmetric=False, + narrow_range=False): + if context.get_context('device_target') == "Ascend": + from mindspore.ops._op_impl._custom_op import fake_quant_perlayer_grad if num_bits not in self.support_quant_bit: - raise ValueError(f"For '{self.name}' attr \'num_bits\' is not support.") - - self.quant_delay = validator.check_value_type('quant_delay', quant_delay, (int,), self.name) - self.num_bits = validator.check_integer('num_bits', num_bits, 0, Rel.GT, self.name) - self.init_prim_io_names(inputs=['dout', 'x', 'min', 'max'], outputs=['dx']) + raise ValueError( + f"For '{self.name}' attr \'num_bits\' is not support.") + + self.num_bits = validator.check_integer( + 'num_bits', num_bits, 0, Rel.GT, self.name) + self.quant_delay = validator.check_value_type( + 'quant_delay', quant_delay, (int,), self.name) + self.symmetric = validator.check_value_type( + 'symmetric', symmetric, (bool,), self.name) + 
self.narrow_range = validator.check_value_type( + 'narrow_range', narrow_range, (bool,), self.name) + self.init_prim_io_names( + inputs=['dout', 'x', 'min', 'max'], outputs=['dx']) def infer_shape(self, dout_shape, x_shape, min_shape, max_shape): - validator.check("dout shape", dout_shape, "x shape", x_shape, Rel.EQ, self.name) - validator.check("min shape", min_shape, "max shape", max_shape, Rel.EQ, self.name) - validator.check_integer("min rank", len(min_shape), 1, Rel.EQ, self.name) + validator.check("dout shape", dout_shape, "x shape", + x_shape, Rel.EQ, self.name) + validator.check("min shape", min_shape, "max shape", + max_shape, Rel.EQ, self.name) + validator.check_integer("min shape", len( + min_shape), 1, Rel.EQ, self.name) return dout_shape def infer_dtype(self, dout_type, x_type, min_type, max_type): valid_types = (mstype.float16, mstype.float32) - validator.check_tensor_type_same({"dout": dout_type}, valid_types, self.name) + validator.check_tensor_type_same( + {"dout": dout_type}, valid_types, self.name) validator.check_tensor_type_same({"x": x_type}, valid_types, self.name) - validator.check_tensor_type_same({"min": min_type}, valid_types, self.name) - validator.check_tensor_type_same({"max": max_type}, valid_types, self.name) + validator.check_tensor_type_same( + {"min": min_type}, valid_types, self.name) + validator.check_tensor_type_same( + {"max": max_type}, valid_types, self.name) return dout_type -class FakeQuantWithMinMaxPerChannel(PrimitiveWithInfer): +class FakeQuantPerChannel(PrimitiveWithInfer): r""" Simulate the quantize and dequantize operations in training time base on per channel. @@ -163,70 +324,110 @@ class FakeQuantWithMinMaxPerChannel(PrimitiveWithInfer): - Tensor, has the same type as input. 
Examples: - >>> fake_quant = P.FakeQuantWithMinMaxPerChannel() + >>> fake_quant = FakeQuantPerChannel() >>> input_x = Tensor(np.array([3, 4, 5, -2, -3, -1]).reshape(3, 2), mindspore.float32) >>> _min = Tensor(np.linspace(-2, 2, 12).reshape(3, 2, 2), mindspore.float32) >>> _max = Tensor(np.linspace(8, 12, 12).reshape(3, 2, 2), mindspore.float32) >>> result = fake_quant(input_x, _min, _max) """ - support_quant_bit = [4, 8] - channel_axis = 0 + support_quant_bit = [4, 7, 8] @prim_attr_register - def __init__(self, num_bits=8, ema=False, ema_decay=0.999, quant_delay=0, symmetric=False, narrow_range=False, - training=True): - """init FakeQuantWithMinMaxPerChannel OP""" + def __init__(self, + num_bits=8, + ema=False, + ema_decay=0.999, + quant_delay=0, + symmetric=False, + narrow_range=False, + training=True, + channel_axis=1): + """init FakeQuantPerChannel OP""" + if context.get_context('device_target') == "Ascend": + from mindspore.ops._op_impl._custom_op import fake_quant_perchannel if num_bits not in self.support_quant_bit: - raise ValueError(f"For '{self.name}' Attr \'num_bits\' is not support.") + raise ValueError( + f"For '{self.name}' Attr \'num_bits\' is not support.") if ema and not ema_decay: - raise ValueError(f"For '{self.name}' attr \'ema\' and \'ema_decay\' should set together.") + raise ValueError( + f"For '{self.name}' attr \'ema\' and \'ema_decay\' should set together.") self.ema = validator.check_value_type('ema', ema, (bool,), self.name) - self.symmetric = validator.check_value_type('symmetric', symmetric, (bool,), self.name) - self.narrow_range = validator.check_value_type('narrow_range', narrow_range, (bool,), self.name) - self.training = validator.check_value_type('training', training, (bool,), self.name) - self.ema_decay = validator.check_number_range('ema_decay', ema_decay, 0, 1, Rel.INC_BOTH, self.name) - self.num_bits = validator.check_integer('num_bits', num_bits, 0, Rel.GT, self.name) - self.quant_delay = 
validator.check_value_type('quant_delay', quant_delay, (int,), self.name) + self.symmetric = validator.check_value_type( + 'symmetric', symmetric, (bool,), self.name) + self.narrow_range = validator.check_value_type( + 'narrow_range', narrow_range, (bool,), self.name) + self.training = validator.check_value_type( + 'training', training, (bool,), self.name) + self.ema_decay = validator.check_number_range( + 'ema_decay', ema_decay, 0, 1, Rel.INC_BOTH, self.name) + self.num_bits = validator.check_integer( + 'num_bits', num_bits, 0, Rel.GT, self.name) + self.quant_delay = validator.check_value_type( + 'quant_delay', quant_delay, (int,), self.name) + self.channel_axis = validator.check_integer( + 'channel_axis', channel_axis, 0, Rel.GE, self.name) self.init_prim_io_names(inputs=['x', 'min', 'max'], outputs=['out']) def infer_shape(self, x_shape, min_shape, max_shape): - validator.check_integer("x rank", len(x_shape), 1, Rel.GT, self.name) - validator.check_integer("min shape[0]", min_shape[0], x_shape[self.channel_axis], Rel.EQ, self.name) - validator.check_integer("max shape[0]", max_shape[0], x_shape[self.channel_axis], Rel.EQ, self.name) + validator.check_integer("x rank", len(x_shape), 1, Rel.GE, self.name) + validator.check("min shape", min_shape, "max shape", max_shape, Rel.EQ, self.name) + validator.check_integer( + "min shape", min_shape[0], x_shape[self.channel_axis], Rel.EQ, self.name) + validator.check_integer( + "max shape", max_shape[0], x_shape[self.channel_axis], Rel.EQ, self.name) return x_shape def infer_dtype(self, x_type, min_type, max_type): valid_types = (mstype.float16, mstype.float32) validator.check_tensor_type_same({"x": x_type}, valid_types, self.name) - validator.check_tensor_type_same({"min": min_type}, valid_types, self.name) - validator.check_tensor_type_same({"max": max_type}, valid_types, self.name) + validator.check_tensor_type_same( + {"min": min_type}, valid_types, self.name) + validator.check_tensor_type_same( + {"max": max_type}, 
valid_types, self.name) return x_type -class FakeQuantWithMinMaxPerChannelGrad(PrimitiveWithInfer): +class FakeQuantPerChannelGrad(PrimitiveWithInfer): r""" - Performs grad of FakeQuantWithMinMaxPerChannel operation. + Performs grad of FakeQuantPerChannelGrad operation. Examples: - >>> fqmmpc_grad = P.FakeQuantWithMinMaxPerChannelGrad() + >>> fqmmpc_grad = FakeQuantPerChannelGrad() >>> input_x = Tensor(np.random.randint(-4, 4, (2, 3, 4)), mindspore.float32) >>> dout = Tensor(np.random.randint(-2, 2, (2, 3, 4)), mindspore.float32) >>> _min = Tensor(np.random.randint(-8, 2, (2, 3, 4)), mindspore.float32) >>> _max = Tensor(np.random.randint(-2, 8, (2, 3, 4)), mindspore.float32) >>> result = fqmmpc_grad(dout, input_x, _min, _max) """ - support_quant_bit = [4, 8] + support_quant_bit = [4, 7, 8] @prim_attr_register - def __init__(self, num_bits=8, quant_delay=0): - """init FakeQuantWithMinMaxPerChannel Fill""" + def __init__(self, + num_bits=8, + quant_delay=0, + symmetric=False, + narrow_range=False, + channel_axis=1): + """init FakeQuantPerChannelGrad Fill""" + if context.get_context('device_target') == "Ascend": + from mindspore.ops._op_impl._custom_op import fake_quant_perchannel_grad if num_bits not in self.support_quant_bit: - raise ValueError(f"For '{self.name}' attr \'num_bits\' is not support.") - - self.quant_delay = validator.check_value_type('quant_delay', quant_delay, (int,), self.name) - self.num_bits = validator.check_integer('num_bits', num_bits, 0, Rel.GT, self.name) - self.init_prim_io_names(inputs=['dout', 'x', 'min', 'max'], outputs=['dx']) + raise ValueError( + f"For '{self.name}' attr \'num_bits\' is not support.") + + self.num_bits = validator.check_integer( + 'num_bits', num_bits, 0, Rel.GT, self.name) + self.quant_delay = validator.check_value_type( + 'quant_delay', quant_delay, (int,), self.name) + self.symmetric = validator.check_value_type( + 'symmetric', symmetric, (bool,), self.name) + self.narrow_range = validator.check_value_type( + 
'narrow_range', narrow_range, (bool,), self.name) + self.channel_axis = validator.check_integer( + 'channel axis', channel_axis, 0, Rel.GE, self.name) + self.init_prim_io_names( + inputs=['dout', 'x', 'min', 'max'], outputs=['dx']) def infer_shape(self, dout_shape, x_shape, min_shape, max_shape): validator.check("dout shape", dout_shape, "x shape", x_shape) @@ -235,10 +436,13 @@ class FakeQuantWithMinMaxPerChannelGrad(PrimitiveWithInfer): def infer_dtype(self, dout_type, x_type, min_type, max_type): valid_types = (mstype.float16, mstype.float32) - validator.check_tensor_type_same({"dout": dout_type}, valid_types, self.name) + validator.check_tensor_type_same( + {"dout": dout_type}, valid_types, self.name) validator.check_tensor_type_same({"x": x_type}, valid_types, self.name) - validator.check_tensor_type_same({"min": min_type}, valid_types, self.name) - validator.check_tensor_type_same({"max": max_type}, valid_types, self.name) + validator.check_tensor_type_same( + {"min": min_type}, valid_types, self.name) + validator.check_tensor_type_same( + {"max": max_type}, valid_types, self.name) return dout_type @@ -247,7 +451,7 @@ class BatchNormFold(PrimitiveWithInfer): Batch normalization folded. Args: - momentum (float): Momentum value should be [0, 1]. Default: 0.1. + momentum (float): Momentum value should be [0, 1]. Default: 0.9. epsilon (float): A small float number to avoid dividing by 0. 1e-5 if dtype in float32 else 1e-3. Default: 1e-5. is_training (bool): In training mode set True, else set False. Default: True. 
@@ -279,7 +483,7 @@ class BatchNormFold(PrimitiveWithInfer): channel_axis = 1 @prim_attr_register - def __init__(self, momentum=0.1, epsilon=1e-5, is_training=True, freeze_bn=0): + def __init__(self, momentum=0.9, epsilon=1e-5, is_training=True, freeze_bn=0): """init batch norm fold layer""" self.momentum = validator.check_number_range('momentum', momentum, 0, 1, Rel.INC_BOTH, self.name) self.epsilon = validator.check_float_positive('epsilon', epsilon, self.name) @@ -292,7 +496,7 @@ class BatchNormFold(PrimitiveWithInfer): def infer_shape(self, x_shape, mean_shape, variance_shape, global_step_shape): validator.check("mean shape", mean_shape, "gamma_shape", variance_shape, Rel.EQ, self.name) validator.check("mean_shape[0]", mean_shape[0], "input channel", x_shape[self.channel_axis], Rel.EQ, self.name) - validator.check_integer("global_step rank", len(global_step_shape), 1, Rel.EQ, self.name) + validator.check_integer("global step shape len", len(global_step_shape), 1, Rel.EQ, self.name) return mean_shape, mean_shape, mean_shape, mean_shape def infer_dtype(self, x_type, mean_type, variance_type, global_step_type): @@ -339,7 +543,7 @@ class BatchNormFoldGrad(PrimitiveWithInfer): "batch_std shape", batch_std_shape, Rel.EQ, self.name) validator.check("d_batch_mean_shape[0]", d_batch_mean_shape[0], "input channel", x_shape[self.channel_axis], Rel.EQ, self.name) - validator.check_integer("global_step rank", len(global_step_shape), 1, Rel.EQ, self.name) + validator.check_integer("global step shape len", len(global_step_shape), 1, Rel.EQ, self.name) return x_shape def infer_dtype(self, d_batch_mean_type, d_batch_std_type, x_type, batch_mean_type, batch_std_type, @@ -376,6 +580,8 @@ class CorrectionMul(PrimitiveWithInfer): @prim_attr_register def __init__(self, channel_axis=0): """init correction mul layer""" + if context.get_context('device_target') == "Ascend": + from mindspore.ops._op_impl._custom_op import correction_mul self.channel_axis = channel_axis 
self.init_prim_io_names(inputs=['x', 'batch_std', 'running_std'], outputs=['out']) @@ -408,9 +614,11 @@ class CorrectionMulGrad(PrimitiveWithInfer): @prim_attr_register def __init__(self, channel_axis=0): """init correction mul layer""" + if context.get_context('device_target') == "Ascend": + from mindspore.ops._op_impl._custom_op import correction_mul_grad self.channel_axis = channel_axis self.init_prim_io_names(inputs=['dout', 'x', 'gamma', 'running_std'], - outputs=['dx', 'd_gamma']) + outputs=['dx', 'mul_dx']) def infer_shape(self, dout_shape, x_shape, gamma_shape, running_std_shape): validator.check("dout shape", dout_shape, "x_shape x", x_shape, Rel.EQ, self.name) @@ -418,12 +626,45 @@ class CorrectionMulGrad(PrimitiveWithInfer): Rel.EQ, self.name) validator.check("running_std_shape[0]", running_std_shape[0], "dout channel size", dout_shape[self.channel_axis], Rel.EQ, self.name) + if context.get_context('device_target') == "Ascend": + return x_shape, x_shape return x_shape, gamma_shape def infer_dtype(self, dout_type, x_type, gamma_type, running_std_type): args = {"dout": dout_type, "x": x_type, "gamma": gamma_type, "running_std": running_std_type} validator.check_tensor_type_same(args, (mstype.float16, mstype.float32), self.name) - return x_type, x_type + if context.get_context('device_target') == "Ascend": + return x_type, x_type + return x_type, gamma_type + + +class CorrectionMulGradReduce(PrimitiveWithInfer): + r""" + Performs grad reduce of CorrectionMul operation. 
+ + Examples: + >>> correction_mul_grad_rd = P.CorrectionMulGradReduce() + >>> dout = Tensor(np.array([1.5, -2.2, 0.7, -3, 1.6, 2.8]).reshape(2, 1, 1, 3), mindspore.float32) + >>> input_x = Tensor(np.random.randint(0, 256, (2, 1, 1, 3)), mindspore.float32) + >>> gamma = Tensor(np.array([0.2, -0.2, 2.5, -1.]).reshape(2, 1, 2), mindspore.float32) + >>> running_std = Tensor(np.array([1.2, 0.1, 0.7, 2.3]).reshape(2, 1, 2), mindspore.float32) + >>> result = correction_mul_grad_rd(dout, input_x, gamma, running_std) + """ + + @prim_attr_register + def __init__(self, channel_axis=0): + """init correction mul reduce layer""" + if context.get_context('device_target') == "Ascend": + from mindspore.ops._op_impl._custom_op import correction_mul_grad + self.channel_axis = channel_axis + self.init_prim_io_names(inputs=['mul_dx'], + outputs=['d_gamma']) + + def infer_shape(self, mul_dx_shape): + return [mul_dx_shape[self.channel_axis]] + + def infer_dtype(self, mul_dx_type): + return mul_dx_type class BatchNormFold2(PrimitiveWithInfer): @@ -477,7 +718,7 @@ class BatchNormFold2(PrimitiveWithInfer): validator.check("batch_std shape", batch_std_shape, "batch_mean shape", gamma_shape, Rel.EQ, self.name) validator.check("batch_std_shape[0]", batch_std_shape[0], "x_shape channel size", x_shape[self.channel_axis], Rel.EQ, self.name) - validator.check_integer("global_step rank", len(global_step_shape), 1, Rel.EQ, self.name) + validator.check_integer("global step shape len", len(global_step_shape), 1, Rel.EQ, self.name) return x_shape def infer_dtype(self, x_type, beta_type, gamma_type, batch_std_type, running_std_type, batch_mean_type, @@ -525,7 +766,7 @@ class BatchNormFold2Grad(PrimitiveWithInfer): validator.check("batch_std shape", batch_std_shape, "gamma shape", gamma_shape, Rel.EQ, self.name) validator.check("batch_std size", batch_std_shape[0], "dout channel size", dout_shape[self.channel_axis], Rel.EQ, self.name) - validator.check_integer("global_step rank", len(global_step_shape), 
1, Rel.EQ, self.name) + validator.check_integer("global step shape len", len(global_step_shape), 1, Rel.EQ, self.name) return gamma_shape, gamma_shape, gamma_shape, gamma_shape, x_shape def infer_dtype(self, dout_type, x_type, gamma_type, @@ -607,32 +848,6 @@ class BatchNormFoldGradD(PrimitiveWithInfer): return x_type -class BNTrainingReduce(PrimitiveWithInfer): - """ - reduce sum at axis [0, 2, 3]. - - Inputs: - - **x** (Tensor) - Tensor of shape :math:`(N, C)`. - - Outputs: - - **x_sum** (Tensor) - Tensor has the same shape as x. - - **x_square_sum** (Tensor) - Tensor has the same shape as x. - - """ - - @prim_attr_register - def __init__(self): - """init _BNTrainingReduce layer""" - self.init_prim_io_names(inputs=['x'], - outputs=['x_sum', 'x_square_sum']) - - def infer_shape(self, x_shape): - return [x_shape[1]], [x_shape[1]] - - def infer_dtype(self, x_type): - return x_type, x_type - - class BatchNormFold2_D(PrimitiveWithInfer): """ Scale the bias with a correction factor to the long term statistics @@ -735,70 +950,3 @@ class BatchNormFold2GradReduce(PrimitiveWithInfer): def infer_dtype(self, dout_type, x_type): validator.check("dout type", dout_type, "x type", x_type) return dout_type, dout_type - - -class FakeQuantWithMinMaxUpdate(PrimitiveWithInfer): - r""" - Simulate the quantize and dequantize operations in training time. - - Args: - num_bits (int) : Number bits for aware quantilization. Default: 8. - ema (bool): Use EMA algorithm update value min and max. Default: False. - ema_decay (int) : EMA algorithm decay parameter. Default: 0.999. - quant_delay (int): Quantilization delay parameter. Before delay step in training time not update - simulate aware quantize funcion. After delay step in training time begin simulate the aware - quantize funcion. Default: 0. - symmetric (bool): Quantization algorithm use symmetric or not. Default: False. - narrow_range (bool): Quantization algorithm use narrow range or not. Default: False. 
- training (bool): Training the network or not. Default: True. - - Inputs: - - **x** (Tensor) : float32 Tensor representing the shape of the output tensor. - - **min** (Tensor) : Value of the min range of the input data x. - - **max** (Tensor) : Value of the max range of the input data x. - - Outputs: - - Tensor: Simulate quantize tensor of x. - - Examples: - >>> input_tensor = Tensor(np.random.rand(3, 16, 5, 5), mstype.float32) - >>> min_tensor = Tensor(np.array([-6]), mstype.float32) - >>> max_tensor = Tensor(np.array([6]), mstype.float32) - >>> output_tensor = P.FakeQuantWithMinMax(num_bits=8)(input_tensor, min_tensor, max_tensor) - """ - support_quant_bit = [4, 7, 8] - - @prim_attr_register - def __init__(self, num_bits=8, ema=False, ema_decay=0.999, quant_delay=0, symmetric=False, narrow_range=False, - training=True): - """init FakeQuantWithMinMax OP""" - from mindspore.ops._op_impl._custom_op import correction_mul, correction_mul_grad - from mindspore.ops._op_impl._custom_op import fake_quant_with_min_max, fake_quant_with_min_max_grad - from mindspore.ops._op_impl._custom_op import fake_quant_with_min_max_update - if num_bits not in self.support_quant_bit: - raise ValueError(f"For '{self.name}' attr \'num_bits\' is not support.") - if ema and not ema_decay: - raise ValueError(f"For '{self.name}' attr \'ema\' and \'ema_decay\' should set together.") - - self.ema = validator.check_value_type('ema', ema, (bool,), self.name) - self.symmetric = validator.check_value_type('symmetric', symmetric, (bool,), self.name) - self.narrow_range = validator.check_value_type('narrow_range', narrow_range, (bool,), self.name) - self.training = validator.check_value_type('training', training, (bool,), self.name) - self.ema_decay = validator.check_number_range('ema_decay', ema_decay, 0, 1, Rel.INC_BOTH, self.name) - self.num_bits = validator.check_integer('num_bits', num_bits, 0, Rel.GT, self.name) - self.quant_delay = validator.check_value_type('quant_delay', quant_delay, (int,), 
self.name) - self.init_prim_io_names(inputs=['x', 'min', 'max'], - outputs=['min_up', 'max_up']) - - def infer_shape(self, x_shape, min_shape, max_shape): - validator.check_integer("x rank", len(x_shape), 1, Rel.GT, self.name) - validator.check("min shape", min_shape, "max shape", max_shape, Rel.EQ, self.name) - validator.check_integer("min rank", len(min_shape), 1, Rel.EQ, self.name) - return min_shape, max_shape - - def infer_dtype(self, x_type, min_type, max_type): - valid_types = (mstype.float16, mstype.float32) - validator.check_tensor_type_same({"x": x_type}, valid_types, self.name) - validator.check_tensor_type_same({"min": min_type}, valid_types, self.name) - validator.check_tensor_type_same({"max": max_type}, valid_types, self.name) - return min_type, max_type diff --git a/mindspore/ops/operations/array_ops.py b/mindspore/ops/operations/array_ops.py index 79a92ed7c8..1bb39d1547 100644 --- a/mindspore/ops/operations/array_ops.py +++ b/mindspore/ops/operations/array_ops.py @@ -28,12 +28,14 @@ from ..._checkparam import Validator as validator from ..._checkparam import Rel from ...common import dtype as mstype from ...common.tensor import Tensor +from ...common.parameter import Parameter from ..operations.math_ops import _infer_shape_reduce from .._utils import get_concat_offset -from ..primitive import Primitive, PrimitiveWithInfer, prim_attr_register +from ..primitive import Primitive, PrimitiveWithInfer, prim_attr_register, _run_op from ..._c_expression import signature_rw as sig_rw from ..._c_expression import signature_kind as sig_kind from ..._c_expression import signature_dtype as sig_dtype +from ..._c_expression import typing def _check_infer_attr_reduce(axis, keep_dims, prim_name): validator.check_value_type('keep_dims', keep_dims, [bool], prim_name) @@ -81,12 +83,17 @@ class ExpandDims(PrimitiveWithInfer): axis_v = axis['value'] rank = len(x_shape) validator.check_int_range('axis', axis_v, -rank - 1, rank, Rel.INC_BOTH, self.name) + value = None + 
if x['value'] is not None: + value = x['value'].asnumpy() + value = np.expand_dims(value, axis_v) + value = Tensor(value) if axis_v < 0: axis_v = rank + 1 + axis_v x_shape.insert(axis_v, 1) out = {'shape': x_shape, 'dtype': x['dtype'], - 'value': None} + 'value': value} return out @@ -122,7 +129,8 @@ class SameTypeShape(PrimitiveWithInfer): Checks whether data type and shape of two tensors are the same. Raises: - ValueError: If not the same. + TypeError: If data type not the same. + ValueError: If shape of two tensors not the same. Inputs: - **input_x** (Tensor) - The shape of tensor is :math:`(x_1, x_2, ..., x_R)`. @@ -144,10 +152,10 @@ class SameTypeShape(PrimitiveWithInfer): def __call__(self, x, y): """run in PyNative mode""" - validator.check_subclass('x', x.dtype(), mstype.tensor, self.name) - validator.check_subclass('y', y.dtype(), mstype.tensor, self.name) - validator.check('x dtype', x.dtype(), 'y dtype', y.dtype(), Rel.EQ, self.name, TypeError) - validator.check('x shape', x.shape(), 'y shape', y.shape(), Rel.EQ, self.name) + validator.check_value_type('x', x, Tensor, self.name) + validator.check_value_type('y', y, Tensor, self.name) + validator.check('x dtype', x.dtype, 'y dtype', y.dtype, Rel.EQ, self.name, TypeError) + validator.check('x shape', x.shape, 'y shape', y.shape, Rel.EQ, self.name) return x def __infer__(self, x, y): @@ -184,6 +192,18 @@ class Cast(PrimitiveWithInfer): """init Cast""" self.init_prim_io_names(inputs=['x', 'dst_type'], outputs=['output']) + def check_elim(self, x, dtype): + if isinstance(x, (Tensor, numbers.Number, Parameter)): + if isinstance(x, Tensor) and x.dtype == dtype: + return (True, x) + if isinstance(x, numbers.Number): + return (True, Tensor(x, dtype=dtype)) + if isinstance(x, Parameter): + data = x.default_input + if data.dtype == dtype: + return (True, x) + return (False, None) + def __infer__(self, x, t): src_type = x['dtype'] dst_type = t['value'] @@ -490,7 +510,7 @@ class GatherV2(PrimitiveWithInfer): The 
original Tensor. - **input_indices** (Tensor) - The shape of tensor is :math:`(y_1, y_2, ..., y_S)`. Specifies the indices of elements of the original Tensor. Must be in the range - `[0, input_param.shape()[axis])`. + `[0, input_param.shape[axis])`. - **axis** (int) - Specifies the dimension index to gather indices. Outputs: @@ -525,99 +545,27 @@ class GatherV2(PrimitiveWithInfer): return out -class Range(PrimitiveWithInfer): - r""" - Creates a sequence of numbers. - Set `input_x` as :math:`x_i` for each element, `output` as follows: - - .. math:: - \text{output}(x_i) = x_i * \text{delta} + \text{start} - - Args: - start (float): If `limit` is `None`, the value acts as limit in the range and first entry - defaults to `0`. Otherwise, it acts as first entry in the range. - limit (float): Acts as upper limit of sequence. If `None`, defaults to the value of `start` - while set the first entry of the range to `0`. - delta (float): Increment of the range. Default: 1.0. - - Inputs: - - **input_x** (Tensor) - The assistant data. A `1-D` tensor of type float32 or int32. - - Outputs: - Tensor, has the same shape and dtype as `input_x`. 
- - Examples: - >>> range = P.Range(1.0, 8.0, 2.0) - >>> x = Tensor(np.array([1, 2, 3, 2]), mindspore.int32) - >>> range(x) - [3, 5, 7, 5] +class SparseGatherV2(GatherV2): """ - - @prim_attr_register - def __init__(self, start, limit=None, delta=1.0): - self.init_prim_io_names(inputs=['x'], outputs=['y']) - self.delta = validator.check_value_type("delta", delta, [float], self.name) - validator.check_value_type("start", start, [float], self.name) - if limit is None: - self.start = 0.0 - self.limit = start - self.add_prim_attr("start", self.start) - self.add_prim_attr("limit", self.limit) - else: - validator.check_value_type("limit", limit, [float], self.name) - - def infer_shape(self, x_shape): - return x_shape - - def infer_dtype(self, x_dtype): - validator.check_tensor_type_same({'x_dtype': x_dtype}, [mstype.float32, mstype.int32], self.name) - return x_dtype - - -class EmbeddingLookup(PrimitiveWithInfer): - """ - Returns a slice of input tensor based on the specified indices and axis. This Primitive has the similar - functionality as GatherV2, but has one more inputs: `offset`. - This primitive runs on the acipu devices. + Returns a slice of input tensor based on the specified indices and axis. Inputs: - - **params** (Tensor) - The shape of tensor is :math:`(x_1, x_2, ..., x_R)`. - The Tensor slice, instead of the entire Tensor. - - **indices** (Tensor) - The shape of tensor is :math:`(y_1, y_2, ..., y_S)`. - Specifies the indices of elements of the original Tensor. Values can be out of range of `params`, - and the exceeding part will be filled with 0 in the output. - The indices to do lookup operation whose data type should be mindspore.int32 or mindspore.int64. - - **offset** (int) - Specifies the offset value of this `params` slice. Thus the real indices - are equal to `indices` minus `offset`. - + - **input_params** (Tensor) - The shape of tensor is :math:`(x_1, x_2, ..., x_R)`. + The original Tensor. 
+ - **input_indices** (Tensor) - The shape of tensor is :math:`(y_1, y_2, ..., y_S)`. + Specifies the indices of elements of the original Tensor. Must be in the range + `[0, input_param.shape[axis])`. + - **axis** (int) - Specifies the dimension index to gather indices. Outputs: Tensor, the shape of tensor is :math:`(z_1, z_2, ..., z_N)`. Examples: - >>> params = Tensor(np.array([[8, 9], [10, 11], [12, 13], [14, 15]]), mindspore.float32) - >>> indices = Tensor(np.array([[5, 2], [8, 5]]), mindspore.int32) - >>> offset = 4 - >>> out = P.EmbeddingLookup()(params, indices, offset) - [[[10, 11], [0 ,0]], [[0, 0], [10, 11]]] + >>> input_params = Tensor(np.array([[1, 2, 7, 42], [3, 4, 54, 22], [2, 2, 55, 3]]), mindspore.float32) + >>> input_indices = Tensor(np.array([1, 2]), mindspore.int32) + >>> axis = 1 + >>> out = P.GatherV2()(input_params, input_indices, axis) """ - @prim_attr_register - def __init__(self): - """init index_select""" - self.init_prim_io_names(inputs=['params', 'indices', 'offset'], - outputs=['output']) - - def __infer__(self, params, indices, offset): - validator.check_subclass("params", params['dtype'], mstype.tensor, self.name) - valid_types = (mstype.int32, mstype.int64) - validator.check_tensor_type_same({"indices": indices['dtype']}, valid_types, self.name) - validator.check_subclass("offset", offset['dtype'], mstype.int_, self.name) - params_shp = params['shape'] - out_shape = indices['shape'] + params_shp[1:] - out = {'shape': out_shape, - 'dtype': params['dtype'], - 'value': None} - return out class Split(PrimitiveWithInfer): @@ -629,7 +577,7 @@ class Split(PrimitiveWithInfer): output_num (int): The number of output tensors. Default: 1. Raises: - ValueError: If axis is out of the range [-len(input_x.shape()), len(input_x.shape())), + ValueError: If axis is out of the range [-len(input_x.shape), len(input_x.shape)), or if the output_num is less than or equal to 0, or if the dimension which to split cannot be evenly divided by output_num. 
@@ -919,9 +867,16 @@ class TupleToArray(PrimitiveWithInfer): ret = np.array(x, np.int32) else: ret = np.array(x, np.float32) - return Tensor(ret) + def __call__(self, x): + args = list() + if isinstance(x, range): + args.append(tuple(x)) + else: + args.append(x) + return _run_op(self, self.name, args) + class ScalarToArray(PrimitiveWithInfer): """ @@ -1000,7 +955,7 @@ class InvertPermutation(PrimitiveWithInfer): - **input_x** (Union(tuple[int], Tensor[int])) - The input tuple is constructed by multiple integers, i.e., :math:`(y_1, y_2, ..., y_S)` representing the indices. The values must include 0. There can be no duplicate values or negative values. - If the input is Tensor, it must be 1-d and the dtype is int. + If the input is Tensor, it must be 1-d and the dtype is int. Only constant value is allowed. Outputs: @@ -1020,6 +975,8 @@ class InvertPermutation(PrimitiveWithInfer): def __infer__(self, x): x_shp = x['shape'] x_value = x['value'] + if x_value is None: + raise ValueError(f'For \'{self.name}\' the input value must be const.') validator.check_value_type("shape", x_shp, [tuple, list], self.name) if mstype.issubclass_(x['dtype'], mstype.tensor): validator.check('x dimension', len(x_shp), '', 1, Rel.EQ, self.name) @@ -1028,6 +985,12 @@ class InvertPermutation(PrimitiveWithInfer): z = [x_value[i] for i in range(len(x_value))] z.sort() + for i in range(1, len(z)): + if z[i-1] == z[i]: + raise ValueError(f"For {self.name}, {z[i]} is duplicated in the input.") + validator.check(f'value min', min(x_value), '', 0, Rel.EQ, self.name) + validator.check(f'value max', max(x_value), '', len(x_value)-1, Rel.EQ, self.name) + y = [None] * len(x_value) for i, value in enumerate(x_value): validator.check_value_type("input[%d]" % i, value, [int], self.name) @@ -1273,14 +1236,20 @@ class Tile(PrimitiveWithInfer): """init Tile""" self.init_prim_io_names(inputs=['x', 'multiples'], outputs=['output']) + def check_elim(self, base_tensor, multiplier): + if (not 
isinstance(base_tensor, Tensor)) or (not isinstance(multiplier, tuple)): + raise TypeError("Expecting (Tensor, tuple), got: ({}, {})".format(base_tensor, multiplier)) + if all(v == 1 for v in multiplier): + return (True, base_tensor) + return (False, None) + def __infer__(self, x, multiples): multiples_v = multiples['value'] x_shp = x['shape'] validator.check_value_type("shape", multiples_v, [tuple], self.name) for i, multiple in enumerate(multiples_v): validator.check_value_type("multiples[%d]" % i, multiple, [int], self.name) - valid_types = [mstype.int16, mstype.int32, mstype.bool_, mstype.float16, mstype.float32] - validator.check_tensor_type_same({'x': x['dtype']}, valid_types, self.name) + validator.check_value_type("x[\'dtype\']", x["dtype"], typing.TensorType, self.name) len_sub = len(multiples_v) - len(x_shp) multiples_w = None if len_sub == 0: @@ -1323,7 +1292,7 @@ class UnsortedSegmentSum(PrimitiveWithInfer): Tensor, the shape is :math:`(z, x_{N+1}, ..., x_R)`. Examples: - >>> input_x = Tensor([1, 2, 3, 4], mindspore.float) + >>> input_x = Tensor([1, 2, 3, 4], mindspore.float32) >>> segment_ids = Tensor([0, 0, 1, 2], mindspore.int32) >>> num_segments = 4 >>> P.UnsortedSegmentSum()(input_x, segment_ids, num_segments) @@ -1556,7 +1525,7 @@ class Unpack(PrimitiveWithInfer): A tuple of Tensors, the shape of each objects is same. Raises: - ValueError: If axis is out of the range [-len(input_x.shape()), len(input_x.shape())). + ValueError: If axis is out of the range [-len(input_x.shape), len(input_x.shape)). Examples: >>> unpack = P.Unpack() @@ -1697,6 +1666,7 @@ class Select(PrimitiveWithInfer): @prim_attr_register def __init__(self): """init""" + self.init_prim_io_names(inputs=['condition', 'x', 'y'], outputs=['output']) def infer_shape(self, cond_shape, x_shape, y_shape): if cond_shape != x_shape or x_shape != y_shape: @@ -1712,6 +1682,16 @@ class Select(PrimitiveWithInfer): raise TypeError('\'%s\' the x_type %s must be the same as y_type %s.' 
% (self.name, x_type, y_type)) return x_type + def infer_value(self, cond, x, y): + if cond is not None and x is not None and y is not None: + cond = cond.asnumpy() + x = x.asnumpy() + y = y.asnumpy() + out = np.where(cond, x, y) + return Tensor(out) + return None + + class StridedSlice(PrimitiveWithInfer): r""" @@ -1762,7 +1742,7 @@ class StridedSlice(PrimitiveWithInfer): >>> [[5, 5, 5], [6, 6, 6]]], mindspore.float32) >>> slice = P.StridedSlice() >>> output = slice(input_x, (1, 0, 0), (2, 1, 3), (1, 1, 1)) - >>> output.shape() + >>> output.shape (1, 1, 3) >>> output [[[3, 3, 3]]] @@ -1886,7 +1866,7 @@ class Diag(PrimitiveWithInfer): if x is None: return None # do constant-folding only when x rank is 1 - if len(x.shape()) != 1: + if len(x.shape) != 1: return None ret = np.diag(x.asnumpy()) return Tensor(ret) @@ -1938,7 +1918,7 @@ class DiagPart(PrimitiveWithInfer): if x is None: return None # do constant-folding only when x rank is 2 - if len(x.shape()) != 2: + if len(x.shape) != 2: return None ret = np.diag(x.asnumpy()) return Tensor(ret) @@ -1952,7 +1932,7 @@ class Eye(PrimitiveWithInfer): Inputs: - **n** (int) - Number of rows of returned tensor - **m** (int) - Number of columns of returned tensor - - **t** (mindspore.dtype) - Mindspore's dtype, The data type of the returned tensor. + - **t** (mindspore.dtype) - MindSpore's dtype, The data type of the returned tensor. Outputs: Tensor, a tensor with ones on the diagonal and zeros elsewhere. @@ -1983,7 +1963,7 @@ class ScatterNd(PrimitiveWithInfer): Creates an empty tensor, and set values by scattering the update tensor depending on indices. Inputs: - - **indices** (Tensor) - The index of scattering in the new tensor. + - **indices** (Tensor) - The index of scattering in the new tensor. With int32 data type. - **update** (Tensor) - The source Tensor to be scattered. - **shape** (tuple[int]) - Define the shape of the output tensor. Has the same type as indices. 
@@ -2006,7 +1986,7 @@ class ScatterNd(PrimitiveWithInfer): def __infer__(self, indices, update, shape): shp = shape['value'] validator.check_subclass("update_dtype", update['dtype'], mstype.tensor, self.name) - validator.check_tensor_type_same({"indices": indices['dtype']}, mstype.int_type, self.name) + validator.check_tensor_type_same({"indices": indices['dtype']}, [mstype.int32], self.name) validator.check_value_type("shape", shp, [tuple], self.name) for i, x in enumerate(shp): validator.check_integer("shape[%d]" % i, x, 0, Rel.GT, self.name) @@ -2099,6 +2079,47 @@ class GatherNd(PrimitiveWithInfer): return x_dtype +class TensorScatterUpdate(PrimitiveWithInfer): + """ + Update tensor value by using input indices and value. + + Using given values to update tensor value, along with the input indices. + + Inputs: + - **input_x** (Tensor) - The target tensor. + - **indices** (Tensor) - The index of input tensor whose data type is int32. + - **update** (Tensor) - The tensor to update the input tensor, has the same type as input, + and update.shape = indices.shape + input_x.shape[1:]. + + Outputs: + Tensor, has the same shape and type as `input_x`. 
+ + Examples: + >>> input_x = Tensor(np.array([[-0.1, 0.3, 3.6], [0.4, 0.5, -3.2]]), mindspore.float32) + >>> indices = Tensor(np.array([[0, 0], [1, 1]]), mindspore.int32) + >>> update = Tensor(np.array([1.0, 2.2]), mindspore.float32) + >>> op = P.TensorScatterUpdate() + >>> output = op(input_x, indices, update) + """ + @prim_attr_register + def __init__(self): + """Init TensorScatterUpdate""" + self.init_prim_io_names(inputs=['x', 'indices', 'value'], outputs=['y']) + + def infer_shape(self, x_shape, indices_shape, value_shape): + validator.check('the dimension of x', len(x_shape), + 'the dimension of indices', indices_shape[-1], Rel.GE) + if indices_shape[:-1] + x_shape[indices_shape[-1]:] != value_shape: + raise ValueError("For 'TensorScatterUpdate', input value are not match with input indices.") + return x_shape + + def infer_dtype(self, x_dtype, indices_dtype, value_dtype): + validator.check_tensor_type_same({'indices': indices_dtype}, [mstype.int32], self.name) + args = {"x": x_dtype, "value": value_dtype} + validator.check_tensor_type_same(args, (mstype.bool_,) + mstype.number_type, self.name) + return x_dtype + + class ScatterUpdate(PrimitiveWithInfer): """ Update tensor value by using input indices and value. @@ -2110,7 +2131,7 @@ class ScatterUpdate(PrimitiveWithInfer): Inputs: - **input_x** (Parameter) - The target tensor, with data type of Parameter. - - **indices** (Tensor) - The index of input tensor. + - **indices** (Tensor) - The index of input tensor. With int32 data type. - **update** (Tensor) - The tensor to update the input tensor, has the same type as input, and update.shape = indices.shape + input_x.shape[1:]. @@ -2118,9 +2139,11 @@ class ScatterUpdate(PrimitiveWithInfer): Tensor, has the same shape and type as `input_x`. 
Examples: - >>> input_x = mindspore.Parameter(Tensor(np.array([[-0.1, 0.3, 3.6], [0.4, 0.5, -3.2]]), mindspore.float32)) + >>> np_x = np.array([[-0.1, 0.3, 3.6], [0.4, 0.5, -3.2]]) + >>> input_x = mindspore.Parameter(Tensor(np_x, mindspore.float32), name="x") >>> indices = Tensor(np.array([[0, 0], [1, 1]]), mindspore.int32) - >>> update = Tensor(np.array([1.0, 2.2]), mindspore.float32) + >>> np_update = np.array([[[1.0, 2.2, 1.0], [2.0, 1.2, 1.0]], [[2.0, 2.2, 1.0], [3.0, 1.2, 1.0]]]) + >>> update = Tensor(np_update, mindspore.float32) >>> op = P.ScatterUpdate() >>> output = op(input_x, indices, update) """ @@ -2132,15 +2155,16 @@ class ScatterUpdate(PrimitiveWithInfer): @prim_attr_register def __init__(self, use_locking=True): """Init ScatterUpdate""" + validator.check_value_type('use_locking', use_locking, [bool], self.name) self.init_prim_io_names(inputs=['x', 'indices', 'value'], outputs=['y']) def infer_shape(self, x_shape, indices_shape, value_shape): if indices_shape + x_shape[1:] != value_shape: - raise ValueError('Input value are not match with input indices.') + raise ValueError("For 'ScatterUpdate', input value are not match with input indices.") return x_shape def infer_dtype(self, x_dtype, indices_dtype, value_dtype): - validator.check_tensor_type_same({'indices': indices_dtype}, mstype.int_type, self.name) + validator.check_tensor_type_same({'indices': indices_dtype}, [mstype.int32], self.name) args = {"x": x_dtype, "value": value_dtype} validator.check_tensor_type_same(args, (mstype.bool_,) + mstype.number_type, self.name) return x_dtype @@ -2157,14 +2181,15 @@ class ScatterNdUpdate(PrimitiveWithInfer): Inputs: - **input_x** (Parameter) - The target tensor, with data type of Parameter. - - **indices** (Tensor) - The index of input tensor. + - **indices** (Tensor) - The index of input tensor, with int32 data type. - **update** (Tensor) - The tensor to add to the input tensor, has the same type as input. 
Outputs: Tensor, has the same shape and type as `input_x`. Examples: - >>> input_x = mindspore.Parameter(Tensor(np.array([[-0.1, 0.3, 3.6], [0.4, 0.5, -3.2]]), mindspore.float32)) + >>> np_x = np.array([[-0.1, 0.3, 3.6], [0.4, 0.5, -3.2]]) + >>> input_x = mindspore.Parameter(Tensor(np_x, mindspore.float32), name="x") >>> indices = Tensor(np.array([[0, 0], [1, 1]]), mindspore.int32) >>> update = Tensor(np.array([1.0, 2.2]), mindspore.float32) >>> op = P.ScatterNdUpdate() @@ -2178,17 +2203,18 @@ class ScatterNdUpdate(PrimitiveWithInfer): @prim_attr_register def __init__(self, use_locking=True): """Init ScatterNdUpdate""" + validator.check_value_type('use_locking', use_locking, [bool], self.name) self.init_prim_io_names(inputs=['x', 'indices', 'value'], outputs=['y']) def infer_shape(self, x_shape, indices_shape, value_shape): validator.check('the dimension of x', len(x_shape), 'the dimension of indices', indices_shape[-1], Rel.GE) if indices_shape[:-1] + x_shape[indices_shape[-1]:] != value_shape: - raise ValueError('Input value are not match with input indices.') + raise ValueError("For 'ScatterNdUpdate', input value are not match with input indices.") return x_shape def infer_dtype(self, x_dtype, indices_dtype, value_dtype): - validator.check_tensor_type_same({'indices': indices_dtype}, mstype.int_type, self.name) + validator.check_tensor_type_same({'indices': indices_dtype}, [mstype.int32], self.name) args = {"x": x_dtype, "value": value_dtype} validator.check_tensor_type_same(args, (mstype.bool_,) + mstype.number_type, self.name) return x_dtype @@ -2204,7 +2230,8 @@ class ScatterMax(PrimitiveWithInfer): """ Update the value of the input tensor through the max operation. - Using given values to update tensor value through the max operation, along with the input indices,. + Using given values to update tensor value through the max operation, along with the input indices. 
+ This operation outputs the `input_x` after the update is done, which makes it convenient to use the updated value. Args: use_locking (bool): Whether protect the assignment by a lock. Default: True. @@ -2216,7 +2243,7 @@ class ScatterMax(PrimitiveWithInfer): the data type is same as `input_x`, the shape is `indices_shape + x_shape[1:]`. Outputs: - Tensor, has the same shape and data type as `input_x`. + Parameter, the updated `input_x`. Examples: >>> input_x = Parameter(Tensor(np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]), mindspore.float32), name="input_x") @@ -2249,6 +2276,7 @@ class ScatterAdd(PrimitiveWithInfer): Update the value of the input tensor through the add operation. Using given values to update tensor value through the add operation, along with the input indices. + This operation outputs the `input_x` after the update is done, which makes it convenient to use the updated value. Args: use_locking (bool): Whether protect the assignment by a lock. Default: False. @@ -2260,7 +2288,7 @@ class ScatterAdd(PrimitiveWithInfer): the data type is same as `input_x`, the shape is `indices_shape + x_shape[1:]`. Outputs: - Tensor, has the same shape and data type as `input_x`. + Parameter, the updated `input_x`. Examples: >>> input_x = Parameter(Tensor(np.array([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]), mindspore.float32), name="x") @@ -2460,8 +2488,7 @@ class SpaceToBatch(PrimitiveWithInfer): validator.check_integer('rank of input_x', len(x_shape), 4, Rel.EQ, self.name) out_shape = copy.deepcopy(x_shape) for i in range(2): - padded = out_shape[i + 2] + self.paddings[i][0] + \ - self.paddings[i][1] + padded = out_shape[i + 2] + self.paddings[i][0] + self.paddings[i][1] if padded % self.block_size != 0: raise ValueError(f'For \'{self.name}\' padded[{i}] {padded} should be divisible by ' f'block_size {self.block_size}') @@ -2479,7 +2506,7 @@ class BatchToSpace(PrimitiveWithInfer): dimension and block_size with given amount to crop from dimension, respectively. 
Args: - block_size (int): The block size of dividing block with value >= 1. + block_size (int): The block size of dividing block with value >= 2. crops (list): The crop value for H and W dimension, containing 2 sub list, each containing 2 int value. All values must be >= 0. crops[i] specifies the crop values for spatial dimension i, which corresponds to input dimension i+2. It is required that input_shape[i+2]*block_size >= crops[i][0]+crops[i][1]. @@ -2513,7 +2540,7 @@ class BatchToSpace(PrimitiveWithInfer): def __init__(self, block_size, crops): """Init BatchToSpace""" validator.check_value_type('block_size', block_size, [int], self.name) - validator.check('block_size', block_size, '', 1, Rel.GE, self.name) + validator.check('block_size', block_size, '', 2, Rel.GE, self.name) self.block_size = block_size validator.check('crops shape', np.array(crops).shape, '', (2, 2)) for elem in itertools.chain(*crops): @@ -2592,6 +2619,8 @@ class SpaceToBatchND(PrimitiveWithInfer): for elem in block_shape: validator.check('block_shape element', elem, '', 1, Rel.GE, self.name) + validator.check_value_type('block_shape element', elem, [int], self.name) + self.block_shape = block_shape validator.check('paddings shape', np.array(paddings).shape, '', (block_rank, 2), Rel.EQ, self.name) @@ -2634,7 +2663,7 @@ class BatchToSpaceND(PrimitiveWithInfer): The length of block_shape is M correspoding to the number of spatial dimensions. crops (list): The crop value for H and W dimension, containing 2 sub list, each containing 2 int value. All values must be >= 0. crops[i] specifies the crop values for spatial dimension i, which corresponds to - input dimension i+2. It is required that input_shape[i+2]*block_size[i] >= crops[i][0]+crops[i][1]. + input dimension i+2. It is required that input_shape[i+2]*block_shape[i] > crops[i][0]+crops[i][1]. Inputs: - **input_x** (Tensor) - The input tensor. 
@@ -2670,6 +2699,8 @@ class BatchToSpaceND(PrimitiveWithInfer): for elem in block_shape: validator.check('block_shape element', elem, '', 1, Rel.GE, self.name) + validator.check_value_type('block_shape element', elem, [int], self.name) + self.block_shape = block_shape validator.check('crops shape', np.array(crops).shape, '', (block_rank, 2), Rel.EQ, self.name) @@ -2701,32 +2732,144 @@ class BatchToSpaceND(PrimitiveWithInfer): return out_shape +class BroadcastTo(PrimitiveWithInfer): + """ + Broadcasts input tensor to a given shape. + Input shape can be broadcast to target shape if for each dimension pair they are either equal or input is one. + When input shape is broadcast to target shape, it starts with the trailing dimensions. + + Args: + shape (tuple): The target shape to broadcast. + + Inputs: + - **input_x** (Tensor) - The input tensor. + + Outputs: + Tensor, with the given `shape` and the same data type as `input_x`. + + Examples: + >>> shape = (2, 3) + >>> input_x = Tensor(np.array([1, 2, 3]).astype(np.float32)) + >>> broadcast_to = P.BroadcastTo(shape) + >>> broadcast_to(input_x) + [[1.0, 2.0, 3.0], [1.0, 2.0, 3.0]] + """ + + @prim_attr_register + def __init__(self, shape): + """Init BroadcastTo""" + validator.check_value_type("shape", shape, (tuple), self.name) + validator.check("shape length", len(shape), "", 0, Rel.GT, self.name) + for i in shape: + validator.check_integer("shape element", i, 0, Rel.GT, self.name) + self.shape = shape + + def infer_shape(self, x_shape): + validator.check("input_x shape length", len(x_shape), "target shape", len(self.shape), Rel.LE, self.name) + + reversed_x_shape = tuple(reversed(x_shape)) + reversed_target = tuple(reversed(self.shape)) + for i, v in enumerate(reversed_x_shape): + if v not in (reversed_target[i], 1): + raise ValueError(f"Not supported shapes for broadcast, " + f"x_shape: {tuple(x_shape)}, target shape {self.shape}.") + return self.shape + + def infer_dtype(self, x_dtype): + 
validator.check_subclass("input_x", x_dtype, mstype.tensor, self.name) + return x_dtype + + +class InplaceUpdate(PrimitiveWithInfer): + r""" + Updates specified rows with values in `v`. + + Args: + indices (Union[int, tuple]): Indices into the left-most dimension of `x`. + + Inputs: + - **x** (Tensor) - A tensor which to be inplace updated. It can be of the following data types: + float32, float16, int32. + - **v** (Tensor) - A tensor of the same type as `x`. Same dimension size as `x` except + the first dimension, which must be the same as the size of `indices`. + + Outputs: + Tensor, with the same type and shape as the input `x`. + + Examples: + >>> x = Tensor(np.arange(24).reshape(3, 4, 2), mindspore.float32) + >>> v = Tensor(np.arange(-8, 8).reshape(2, 4, 2), mindspore.float32) + >>> inplace_update = P.InplaceUpdate((0, 2)) + >>> result = inplace_update(x, v) + [[[-8. -7.] + [-6. -5.] + [-4. -3.] + [-2. -1.]] + [[ 8. 9.] + [10. 11.] + [12. 13.] + [14. 15.]] + [[ 0. 1.] + [ 2. 3.] + [ 4. 5.] + [ 6. 
7.]]] + """ + @prim_attr_register + def __init__(self, indices): + """Init InplaceUpdate""" + self.init_prim_io_names(inputs=['x', 'indices', 'v'], outputs=['y']) + validator.check_value_type("indices", indices, [int, tuple], self.name) + if isinstance(indices, int): + self.add_prim_attr('indices', (indices,)) + for item in self.indices: + validator.check_value_type("item of indices", item, [int], self.name) + + def infer_dtype(self, x_dtype, v_dtype): + valid_type = [mstype.int32, mstype.float16, mstype.float32] + validator.check_tensor_type_same( + { + "x": x_dtype, + "v": v_dtype + }, valid_type, self.name) + + return x_dtype + + def infer_shape(self, x_shape, v_shape): + validator.check("x", len(x_shape), "v", len(v_shape), Rel.EQ, self.name) + + x_rank = len(x_shape) + for idx in range(x_rank)[1:]: + validator.check("x dim %d" % idx, x_shape[idx], 'v dim %d' % idx, v_shape[idx], Rel.EQ, self.name) + + validator.check("size of indices", len(self.indices), "v's first dimension", v_shape[0], + Rel.EQ, self.name) + + return x_shape + + class ReverseSequence(PrimitiveWithInfer): """ Reverses variable length slices. - Note: - If the specified axis is a negative number, the index is counted - backward from the end and starts at 1. - - Raises: - ValueError: If axis is not an integer or not in the valid range. Args: - seq_dim (int): The dimension which is partially reversed. Required. - batch_dim (int): The dimension along which reversal is performed. Default: 0 + seq_dim (int): The dimension along which reversal is performed. Required. + batch_dim (int): The input is sliced along this dimmension. Default: 0. Inputs: - - **x** (Tensor) - The input to reverse. - - **seq_lengths** (int) - Must be 1-D vector with types: int32, int64 + - **x** (Tensor) - The input to reverse, support all number types including bool. + - **seq_lengths** (Tensor) - Must be 1-D vector with types: int32, int64. Outputs: - Reversed tensor with the same shape and data type as x. 
+ Reversed tensor with the same shape and data type as input. Examples: >>> x = Tensor(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), mindspore.float32) >>> seq_lengths = Tensor(np.array([1, 2, 3])) >>> reverse_sequence = P.ReverseSequence(seq_dim=1) >>> output = reverse_sequence(x, seq_lengths) + [[1 2 3] + [5 4 6] + [9 8 7]] """ @prim_attr_register @@ -2748,5 +2891,6 @@ class ReverseSequence(PrimitiveWithInfer): return x def infer_dtype(self, x, seq_lengths): + validator.check_tensor_type_same({"x_dtype": x}, mstype.number_type + (mstype.bool_,), self.name) validator.check_tensor_type_same({"seq_lengths_dtype": seq_lengths}, [mstype.int32, mstype.int64], self.name) return x diff --git a/mindspore/ops/operations/comm_ops.py b/mindspore/ops/operations/comm_ops.py index 6e0c22f584..dc690b5f6e 100644 --- a/mindspore/ops/operations/comm_ops.py +++ b/mindspore/ops/operations/comm_ops.py @@ -68,7 +68,11 @@ class AllReduce(PrimitiveWithInfer): Examples: >>> from mindspore.communication import init + >>> from mindspore import Tensor + >>> from mindspore.ops.operations.comm_ops import ReduceOp + >>> import mindspore.nn as nn >>> import mindspore.ops.operations as P + >>> >>> init('nccl') >>> class Net(nn.Cell): >>> def __init__(self): @@ -131,8 +135,11 @@ class AllGather(PrimitiveWithInfer): then the shape of output is :math:`(N, x_1, x_2, ..., x_R)`. Examples: - >>> from mindspore.communication import init >>> import mindspore.ops.operations as P + >>> import mindspore.nn as nn + >>> from mindspore.communication import init + >>> from mindspore import Tensor + >>> >>> init('nccl') >>> class Net(nn.Cell): >>> def __init__(self): @@ -175,14 +182,16 @@ class HostAllGather(PrimitiveWithInfer): Note: Tensor must have the same shape and format in all processes participating in the collective. + HostAllGather is a host-side operator, it depends on OpenMPI and must use build option -M on + to enable it. 
Using mpirun command to run it: + mpirun -output-filename log -merge-stderr-to-stdout -np 3 python test_host_all_gather.py Args: group (Union[tuple[int],list[int]]): The rand_ids of communication group to work on. Raises: TypeError: If group is not a list nor tuple, or elements of group are not int. - ValueError: If the local rank id of the calling process not in group, - or rank_id from group not in [0, 7]. + ValueError: If group is not set, or rank_id from group not in [0, 7]. Inputs: - **input_x** (Tensor) - The shape of tensor is :math:`(x_1, x_2, ..., x_R)`. @@ -192,9 +201,14 @@ class HostAllGather(PrimitiveWithInfer): then the shape of output is :math:`(N, x_1, x_2, ..., x_R)`. Examples: - >>> from mindspore.communication import init + >>> import mindspore.nn as nn + >>> import mindspore.context as context >>> import mindspore.ops.operations as P - >>> init('nccl') + >>> from mindspore import Tensor + >>> + >>> context.set_context(mode=context.GRAPH_MODE, device_target='CPU') + >>> context.set_mpi_config(enable_mpi=True) + >>> >>> class Net(nn.Cell): >>> def __init__(self): >>> super(Net, self).__init__() @@ -218,8 +232,6 @@ class HostAllGather(PrimitiveWithInfer): validator.check_int_range("rank_id", r, 0, 7, Rel.INC_BOTH, self.name) validator.check_value_type("rank_id", r, (int,), self.name) self.group_size = len(group) - self.rank = get_rank() - validator.check('rank', self.rank, 'group', self.group, Rel.IN, self.name) self.add_prim_attr('group', group) def infer_shape(self, x_shape): @@ -253,8 +265,12 @@ class ReduceScatter(PrimitiveWithInfer): ValueError: If the first dimension of input can not be divided by rank size. 
Examples: + >>> from mindspore import Tensor >>> from mindspore.communication import init + >>> from mindspore.ops.operations.comm_ops import ReduceOp + >>> import mindspore.nn as nn >>> import mindspore.ops.operations as P + >>> >>> init('nccl') >>> class Net(nn.Cell): >>> def __init__(self): @@ -264,7 +280,7 @@ class ReduceScatter(PrimitiveWithInfer): >>> def construct(self, x): >>> return self.reducescatter(x) >>> - >>> input_ = Tensor(np.ones([2, 8]).astype(np.float32)) + >>> input_ = Tensor(np.ones([8, 8]).astype(np.float32)) >>> net = Net() >>> output = net(input_) """ @@ -298,6 +314,9 @@ class HostReduceScatter(PrimitiveWithInfer): Note: Tensor must have the same shape and format in all processes participating in the collective. + HostReduceScatter is a host-side operator, it depends on OpenMPI and must use build option + -M on to enable it. Using mpirun command to run it: + mpirun -output-filename log -merge-stderr-to-stdout -np 3 python test_host_reduce_scatter.py Args: op (str): Specifies an operation used for element-wise reductions, @@ -307,13 +326,19 @@ class HostReduceScatter(PrimitiveWithInfer): Raises: TypeError: If op is not a string and group is not a list nor tuple, or elements of group are not int. - ValueError: If the first dimension of input can not be divided by rank size, - or group is not set, or rank_id not in [1, 7]. + ValueError: If the first dimension of input can not be divided by group size, + or group is not set, or rank_id not in [0, 7]. 
Examples: - >>> from mindspore.communication import init + >>> import mindspore.nn as nn + >>> import mindspore.context as context >>> import mindspore.ops.operations as P - >>> init('nccl') + >>> from mindspore import Tensor + >>> from mindspore.ops.operations.comm_ops import ReduceOp + >>> + >>> context.set_context(mode=context.GRAPH_MODE, device_target='CPU') + >>> context.set_mpi_config(enable_mpi=True) + >>> >>> class Net(nn.Cell): >>> def __init__(self): >>> super(Net, self).__init__() @@ -322,7 +347,7 @@ class HostReduceScatter(PrimitiveWithInfer): >>> def construct(self, x): >>> return self.hostreducescatter(x) >>> - >>> input_ = Tensor(np.ones([2, 8]).astype(np.float32)) + >>> input_ = Tensor(np.ones([8, 8]).astype(np.float32)) >>> net = Net() >>> output = net(input_) """ @@ -377,8 +402,11 @@ class Broadcast(PrimitiveWithInfer): TypeError: If root_rank is not a integer or group is not a string. Examples: + >>> from mindspore import Tensor >>> from mindspore.communication import init + >>> import mindspore.nn as nn >>> import mindspore.ops.operations as P + >>> >>> init('nccl') >>> class Net(nn.Cell): >>> def __init__(self): diff --git a/mindspore/ops/operations/control_ops.py b/mindspore/ops/operations/control_ops.py index 2c804c483f..e7ac4572ce 100644 --- a/mindspore/ops/operations/control_ops.py +++ b/mindspore/ops/operations/control_ops.py @@ -144,7 +144,7 @@ class Merge(PrimitiveWithInfer): One and only one of the inputs should be selected as the output Inputs: - - **inputs** (Tuple) - The data to be merged. All tuple elements should have same data type. + - **inputs** (Union(Tuple, List)) - The data to be merged. All tuple elements should have same data type. Outputs: tuple. Output is tuple(`data`, `output_index`). The `data` has the same shape of `inputs` element. 
diff --git a/mindspore/ops/operations/debug_ops.py b/mindspore/ops/operations/debug_ops.py index f1b56b2850..c6b635a69f 100644 --- a/mindspore/ops/operations/debug_ops.py +++ b/mindspore/ops/operations/debug_ops.py @@ -191,7 +191,7 @@ class InsertGradientOf(PrimitiveWithInfer): f (Function): MindSpore's Function. Callback function. Inputs: - - **input_x** (Tensor) - The graph node to attach to. + - **input_x** (Any) - The graph node to attach to. Outputs: Tensor, returns `input_x` directly. `InsertGradientOf` does not affect the forward result. @@ -286,12 +286,6 @@ class HookBackward(PrimitiveWithInfer): self.register_hook(hook_fn) self.cell_id = cell_id - def __call__(self, *inputs): - """run in PyNative mode.""" - if len(inputs) == 1: - return inputs[0] - return inputs - def infer_shape(self, *inputs_shape): if len(inputs_shape) == 1: return inputs_shape[0] diff --git a/mindspore/ops/operations/math_ops.py b/mindspore/ops/operations/math_ops.py index 9afdc50caa..08cd481582 100644 --- a/mindspore/ops/operations/math_ops.py +++ b/mindspore/ops/operations/math_ops.py @@ -15,6 +15,7 @@ """Operators for math.""" +import copy import numpy as np from ... import context from ..._c_expression import signature_rw as sig_rw @@ -119,18 +120,20 @@ class TensorAdd(_MathBinaryOp): Adds two input tensors element-wise. The inputs must be two tensors or one tensor and one scalar. - When the inputs are two tensors, the shapes of them could be broadcast, - and the data types of them should be same. - When the inputs are one tensor and one scalar, the scalar cannot be a parameter, only can be a constant, - and the type of the scalar is the same as the data type of the tensor. + When the inputs are two tensors, + both dtypes cannot be bool, and the shapes of them could be broadcast. + When the inputs are one tensor and one scalar, + the scalar only could be a constant. Inputs: - - **input_x** (Union[Tensor, Number]) - The first input is a tensor whose data type is number or a number. 
- - **input_y** (Union[Tensor, Number]) - The second input is a tensor whose data type is same as 'input_x' or - a number. + - **input_x** (Union[Tensor, Number, bool]) - The first input is a number or + a bool or a tensor whose data type is number or bool. + - **input_y** (Union[Tensor, Number, bool]) - The second input is a number or + a bool when the first input is a tensor or a tensor whose data type is number or bool. Outputs: - Tensor, the shape is same as the shape after broadcasting, and the data type is same as 'input_x'. + Tensor, the shape is same as the shape after broadcasting, + and the data type is the one with high precision or high digits among the two inputs. Examples: >>> add = P.TensorAdd() @@ -140,6 +143,15 @@ class TensorAdd(_MathBinaryOp): [5,7,9] """ + def infer_value(self, x, y): + if x is not None and y is not None: + x = x.asnumpy() + y = y.asnumpy() + out = x + y + out = np.array(out, x.dtype) + return Tensor(out) + return None + class AssignAdd(PrimitiveWithInfer): """ @@ -198,14 +210,14 @@ class AssignSub(PrimitiveWithInfer): >>> def __init__(self): >>> super(Net, self).__init__() >>> self.AssignSub = P.AssignSub() - >>> self.variable = mindspore.Parameter(initializer(1, [1], mindspore.int64), name="global_step") + >>> self.variable = mindspore.Parameter(initializer(1, [1], mindspore.int32), name="global_step") >>> >>> def construct(self, x): >>> self.AssignSub(self.variable, x) >>> return self.variable >>> >>> net = Net() - >>> value = Tensor(np.ones([1]).astype(np.int64)*100) + >>> value = Tensor(np.ones([1]).astype(np.int32)*100) >>> net(value) """ @@ -253,15 +265,35 @@ class _Reduce(PrimitiveWithInfer): return output def do_infer(self, input_x, axis, valid_dtype=mstype.number_type): + """ return meta infos of input parameters """ axis_v = axis['value'] input_shp = input_x['shape'] args = {'input_x': input_x['dtype']} validator.check_tensor_type_same(args, valid_dtype, self.name) + if axis_v is None: + raise ValueError(f"For 
{self.name}, axis must be const.") input_shp = _infer_shape_reduce(input_shp, axis_v, self.keep_dims, self.name) + value = None + if input_x['value'] is not None: + prim_map = { + 'ReduceSum': np.sum, + 'ReduceMax': np.max, + 'ReduceMin': np.min, + } + np_reduce_func = prim_map.get(self.name, None) + + if np_reduce_func is not None: + value = input_x['value'].asnumpy() + if not axis_v: + axis_v = [i for i in range(len(input_x['shape']))] + axis_v = tuple(axis_v) + value = np_reduce_func(value, axis_v, keepdims=self.keep_dims) + value = np.array(value) + value = Tensor(value) return {'shape': input_shp, 'dtype': input_x['dtype'], - 'value': None} + 'value': value} def __infer__(self, input_x, axis): return self.do_infer(input_x, axis) @@ -330,6 +362,12 @@ class ReduceSum(_Reduce): >>> output = op(input_x, 1) """ + @prim_attr_register + def __init__(self, keep_dims=False): + """init ReduceSum""" + super(ReduceSum, self).__init__(keep_dims) + self.__setattr_flag__ = True + class ReduceAll(_Reduce): """ @@ -399,6 +437,12 @@ class ReduceMax(_Reduce): >>> output = op(input_x, 1) """ + @prim_attr_register + def __init__(self, keep_dims=False): + """ReduceMax""" + super(ReduceMax, self).__init__(keep_dims) + self.__setattr_flag__ = True + class ReduceMin(_Reduce): """ @@ -445,8 +489,9 @@ class ReduceProd(_Reduce): Default : False, don't keep these reduced dimensions. Inputs: - - **input_x** (Tensor[Number]) - The input tensor. - - **axis** (Union[int, tuple(int), list(int)]) - The dimensions to reduce. Default: (), reduce all dimensions. + - **input_x** (Tensor[Number]) - The input tensor. + - **axis** (Union[int, tuple(int), list(int)]) - The dimensions to reduce. Default: (), reduce all dimensions. + Only constant value is allowed. Outputs: Tensor, has the same dtype as the 'input_x'. @@ -474,8 +519,9 @@ class CumProd(PrimitiveWithInfer): reverse (bool): If True, reverse the result along axis. Default: False Inputs: - - **input_x** (Tensor[Number]) - The input tensor. 
- - **axis** (int) - The dimensions to compute the cumulative product. + - **input_x** (Tensor[Number]) - The input tensor. + - **axis** (int) - The dimensions to compute the cumulative product. + Only constant value is allowed. Outputs: Tensor, has the same shape and dtype as the 'input_x'. @@ -507,6 +553,10 @@ class CumProd(PrimitiveWithInfer): validator.check_subclass("axis", axis_type, mstype.int_, cls_name) return x_type + def infer_value(self, x, axis): + if axis is None: + raise ValueError(f"For {self.name}, axis must be const.") + class MatMul(PrimitiveWithInfer): """ @@ -574,6 +624,8 @@ class MatMul(PrimitiveWithInfer): def infer_dtype(self, x, y): args = {"x": x, "y": y} validator.check_tensor_type_same(args, mstype.float_type + mstype.int_type, self.name) + if x.element_type() == mstype.int8: + return mstype.tensor_type(mstype.int32) return x @@ -662,6 +714,8 @@ class CumSum(PrimitiveWithInfer): def __infer__(self, x, axis): cls_name = self.name x_shp = x['shape'] + if axis['value'] is None: + raise ValueError(f"For {self.name}, axis must be const.") validator.check_value_type('axis', axis['value'], [int], cls_name) valid_types = [mstype.uint8, mstype.int8, mstype.int32, mstype.float16, mstype.float32] validator.check_tensor_type_same({'x': x['dtype']}, valid_types, cls_name) @@ -703,6 +757,85 @@ class AddN(PrimitiveWithInfer): def __init__(self): self.init_prim_io_names(inputs=["inputs"], outputs=["sum"]) + def check_elim(self, inputs): + if len(inputs) != 1: + return (False, None) + if isinstance(inputs[0], Tensor): + return (True, inputs[0]) + raise TypeError("Expecting Tensor, got : {}".format(type(inputs[0]))) + + def infer_shape(self, inputs): + cls_name = self.name + validator.check_integer("inputs", len(inputs), 1, Rel.GE, cls_name) + self.add_prim_attr('n', len(inputs)) + shp0 = inputs[0] + for i, shp in enumerate(inputs): + validator.check(f"shape of inputs[{i}]", shp, 'shape of inputs[0]', shp0, Rel.EQ, cls_name) + return shp0 + + def 
infer_dtype(self, inputs): + cls_name = self.name + validator.check_value_type("inputs", inputs, [tuple, list], cls_name) + validator.check_integer("inputs", len(inputs), 1, Rel.GE, cls_name) + args = {} + for i, dtype in enumerate(inputs): + args[f"inputs[{i}]"] = dtype + validator.check_tensor_type_same(args, mstype.number_type + (mstype.bool_,), cls_name) + return inputs[0] + + def infer_value(self, inputs): + if inputs is None: + return None + + for x in inputs: + if x is None: + return None + + added = copy.deepcopy(inputs[0].asnumpy()) + for x in inputs[1:]: + added += x.asnumpy() + out = np.array(added, inputs[0].asnumpy().dtype) + return Tensor(out) + + +class AccumulateNV2(PrimitiveWithInfer): + """ + Computes accumulation of all input tensors element-wise. + + AccumulateNV2 is like AddN with a significant difference: AccumulateNV2 won't + wait for all of its inputs to be ready before beginning to sum. That is to say, + AccumulateNV2 will be able to save memory when inputs are ready at different + times since minimum temporary storage is proportional to the output size rather + than the inputs size. + + Inputs: + - **input_x** (Union(tuple[Tensor], list[Tensor])) - The input tuple or list + is made up of multiple tensors whose dtype is number to be added together. + + Outputs: + Tensor, has the same shape and dtype as each entry of the `input_x`. 
+ + Examples: + >>> class NetAccumulateNV2(nn.Cell): + >>> def __init__(self): + >>> super(NetAccumulateNV2, self).__init__() + >>> self.accumulateNV2 = P.AccumulateNV2() + >>> + >>> def construct(self, *z): + >>> return self.accumulateNV2(z) + >>> + >>> net = NetAccumulateNV2() + >>> input_x = Tensor(np.array([1, 2, 3]), mindspore.float32) + >>> input_y = Tensor(np.array([4, 5, 6]), mindspore.float32) + >>> net(input_x, input_y, input_x, input_y) + Tensor([10., 14., 18.], shape=(3,), dtype=mindspore.float32) + """ + + @prim_attr_register + def __init__(self): + self.__setattr_flag__ = True + self.init_prim_io_names(inputs=["inputs"], outputs=["sum"]) + def infer_shape(self, inputs): cls_name = self.name validator.check_integer("inputs", len(inputs), 1, Rel.GE, cls_name) @@ -752,24 +885,151 @@ class Neg(PrimitiveWithInfer): validator.check_tensor_type_same({"input_x": input_x}, mstype.number_type, self.name) return input_x + def infer_value(self, input_x): + if input_x is not None: + input_x = input_x.asnumpy() + return Tensor(-input_x) + + return None + + +class InplaceAdd(PrimitiveWithInfer): + """ + Adds v into specified rows of x. Computes y = x; y[i,] += v. + + Args: + indices (Union[int, tuple]): Indices into the left-most dimension of x, and determines which rows of x + to add with v. It is a int or tuple, whose value is in [0, the first dimension size of x). + + Inputs: + - **input_x** (Tensor) - The first input is a tensor whose data type is number. + - **input_v** (Tensor) - The second input is a tensor who has the same dimension sizes as x except + the first dimension, which must be the same as indices's size. + + Outputs: + Tensor, has the same shape and dtype as input. + + Examples: + >>> indices = (0, 1) + >>> input_x = Tensor(np.array([[1, 2], [3, 4], [5, 6]]), mindspore.float32) + >>> input_v = Tensor(np.array([[0.5, 1.0], [1.0, 1.5]]), mindspore.float32) + >>> inplaceAdd = P.InplaceAdd(indices) + >>> inplaceAdd(input_x, input_v) + [[1.5 3.] + [4. 
5.5] + [5. 6.]] + """ + + @prim_attr_register + def __init__(self, indices): + """init InplaceAdd""" + self.init_prim_io_names(inputs=['x', 'v'], outputs=['y']) + self.indices = indices + + def infer_shape(self, x_shape, v_shape): + validator.check("x", len(x_shape), "v", len(v_shape), Rel.EQ, self.name) + if isinstance(self.indices, int): + validator.check("size of indices", 1, "v's first dimension", v_shape[0], + Rel.EQ, self.name) + if self.indices < 0 or self.indices >= x_shape[0]: + raise ValueError(f'The value of indices must be in [0, {x_shape[0]}), but got {self.indices}.') + else: + validator.check("size of indices", len(self.indices), "v's first dimension", v_shape[0], + Rel.EQ, self.name) + for i in self.indices: + if i < 0 or i >= x_shape[0]: + raise ValueError(f'The value of indices must be in [0, {x_shape[0]}), but got {i}.') + if len(x_shape) > 1: + validator.check("x's ith dimension", x_shape[1:], "v's ith dimension", v_shape[1:], + Rel.EQ, self.name) + return x_shape + + def infer_dtype(self, x_dtype, v_dtype): + args = {'x': x_dtype, 'v': v_dtype} + valid_type = [mstype.int32, mstype.float16, mstype.float32] + validator.check_tensor_type_same(args, valid_type, self.name) + validator.check_value_type('indices', self.indices, [tuple, int], self.name) + return x_dtype + + +class InplaceSub(PrimitiveWithInfer): + """ + Subtracts v into specified rows of x. Computes y = x; y[i, :] -= v; return y. + + Args: + indices (Union[int, tuple]): Indices into the left-most dimension of x, and determines which rows of x + to sub with v. It is a int or tuple, whose value is in [0, the first dimension size of x). + + Inputs: + - **input_x** (Tensor) - The first input is a tensor whose data type is number. + - **input_v** (Tensor) - The second input is a tensor who has the same dimension sizes as x except + the first dimension, which must be the same as indices's size. + + Outputs: + Tensor, has the same shape and dtype as input. 
+ + Examples: + >>> indices = (0, 1) + >>> input_x = Tensor(np.array([[1, 2], [3, 4], [5, 6]]), mindspore.float32) + >>> input_v = Tensor(np.array([[0.5, 1.0], [1.0, 1.5]]), mindspore.float32) + >>> inplaceSub = P.InplaceSub(indices) + >>> inplaceSub(input_x, input_v) + [[0.5 1.] + [2. 2.5] + [5. 6.]] + """ + + @prim_attr_register + def __init__(self, indices): + """init InplaceSub""" + self.init_prim_io_names(inputs=['x', 'v'], outputs=['y']) + self.indices = indices + + def infer_shape(self, x_shape, v_shape): + validator.check("x", len(x_shape), "v", len(v_shape), Rel.EQ, self.name) + if isinstance(self.indices, int): + validator.check("size of indices", 1, "v's first dimension", v_shape[0], + Rel.EQ, self.name) + if self.indices < 0 or self.indices >= x_shape[0]: + raise ValueError(f'The value of indices must be in [0, {x_shape[0]}), but got {self.indices}.') + else: + validator.check("size of indices", len(self.indices), "v's first dimension", v_shape[0], + Rel.EQ, self.name) + for i in self.indices: + if i < 0 or i >= x_shape[0]: + raise ValueError(f'The value of indices must be in [0, {x_shape[0]}), but got {i}.') + if len(x_shape) > 1: + validator.check("x's ith dimension", x_shape[1:], "v's ith dimension", v_shape[1:], + Rel.EQ, self.name) + return x_shape + + def infer_dtype(self, x_dtype, v_dtype): + args = {'x': x_dtype, 'v': v_dtype} + valid_type = [mstype.int32, mstype.float16, mstype.float32] + validator.check_tensor_type_same(args, valid_type, self.name) + validator.check_value_type('indices', self.indices, [tuple, int], self.name) + return x_dtype + class Sub(_MathBinaryOp): """ Subtracts the second input tensor from the first input tensor element-wise. The inputs must be two tensors or one tensor and one scalar. - When the inputs are two tensors, the shapes of them could be broadcast, - and the data types of them should be same. 
- When the inputs are one tensor and one scalar, the scalar cannot be a parameter, only can be a constant, - and the type of the scalar is the same as the data type of the tensor. + When the inputs are two tensors, + both dtypes cannot be bool, and the shapes of them could be broadcast. + When the inputs are one tensor and one scalar, + the scalar only could be a constant. Inputs: - - **input_x** (Union[Tensor, Number]) - The first input is a tensor whose data type is number or a number. - - **input_y** (Union[Tensor, Number]) - The second input is a tensor whose data type is same as 'input_x' or - a number. + - **input_x** (Union[Tensor, Number, bool]) - The first input is a number or + a bool or a tensor whose data type is number or bool. + - **input_y** (Union[Tensor, Number, bool]) - The second input is a number or + a bool when the first input is a tensor or a tensor whose data type is number or bool. Outputs: - Tensor, the shape is same as the shape after broadcasting, and the data type is same as 'input_x'. + Tensor, the shape is same as the shape after broadcasting, + and the data type is the one with high precision or high digits among the two inputs. Examples: >>> input_x = Tensor(np.array([1, 2, 3]), mindspore.int32) @@ -779,24 +1039,35 @@ class Sub(_MathBinaryOp): [-3, -3, -3] """ + def infer_value(self, x, y): + if x is not None and y is not None: + x = x.asnumpy() + y = y.asnumpy() + out = x - y + out = np.array(out, x.dtype) + return Tensor(out) + return None + class Mul(_MathBinaryOp): """ Multiplies two tensors element-wise. The inputs must be two tensors or one tensor and one scalar. - When the inputs are two tensors, the shapes of them could be broadcast, - and the data types of them should be same. - When the inputs are one tensor and one scalar, the scalar cannot be a parameter, only can be a constant, - and the type of the scalar is the same as the data type of the tensor. 
+ When the inputs are two tensors, + both dtypes cannot be bool, and the shapes of them could be broadcast. + When the inputs are one tensor and one scalar, + the scalar only could be a constant. Inputs: - - **input_x** (Union[Tensor, Number]) - The first input is a tensor whose data type is number or a number. - - **input_y** (Union[Tensor, Number]) - The second input is a tensor whose data type is same as 'input_x' or - a number. + - **input_x** (Union[Tensor, Number, bool]) - The first input is a number or + a bool or a tensor whose data type is number or bool. + - **input_y** (Union[Tensor, Number, bool]) - The second input is a number or + a bool when the first input is a tensor or a tensor whose data type is number or bool. Outputs: - Tensor, the shape is same as the shape after broadcasting, and the data type is same as 'input_x'. + Tensor, the shape is same as the shape after broadcasting, + and the data type is the one with high precision or high digits among the two inputs. Examples: >>> input_x = Tensor(np.array([1.0, 2.0, 3.0]), mindspore.float32) @@ -835,6 +1106,7 @@ class Square(PrimitiveWithInfer): @prim_attr_register def __init__(self): """init Square""" + self.init_prim_io_names(inputs=['input_x'], outputs=['output']) def infer_shape(self, x_shape): return x_shape @@ -843,6 +1115,14 @@ class Square(PrimitiveWithInfer): validator.check_tensor_type_same({"x": x_type}, mstype.number_type, self.name) return x_type + def infer_value(self, x): + if x is not None: + x = x.asnumpy() + out = x * x + out = np.array(out, x.dtype) + return Tensor(out) + return None + class Rsqrt(PrimitiveWithInfer): """ @@ -864,6 +1144,7 @@ class Rsqrt(PrimitiveWithInfer): @prim_attr_register def __init__(self): """init Rsqrt""" + self.init_prim_io_names(inputs=['x'], outputs=['output']) def infer_shape(self, x_shape): return x_shape @@ -872,6 +1153,14 @@ class Rsqrt(PrimitiveWithInfer): validator.check_tensor_type_same({"x": x_type}, mstype.number_type, self.name) return 
x_type + def infer_value(self, x): + if x is not None: + x = x.asnumpy() + out = 1.0 / np.sqrt(x) + out = np.array(out, x.dtype) + return Tensor(out) + return None + class Sqrt(PrimitiveWithInfer): """ @@ -893,6 +1182,7 @@ class Sqrt(PrimitiveWithInfer): @prim_attr_register def __init__(self): """init Sqrt""" + self.init_prim_io_names(inputs=['x'], outputs=['output']) def infer_shape(self, x_shape): return x_shape @@ -901,6 +1191,14 @@ class Sqrt(PrimitiveWithInfer): validator.check_tensor_type_same({"x": x_type}, mstype.number_type, self.name) return x_type + def infer_value(self, x): + if x is not None: + x = x.asnumpy() + out = np.sqrt(x) + out = np.array(out, x.dtype) + return Tensor(out) + return None + class Reciprocal(PrimitiveWithInfer): """ @@ -931,33 +1229,34 @@ class Reciprocal(PrimitiveWithInfer): validator.check_subclass("x", x, mstype.tensor, self.name) return x + def infer_value(self, x): + if x is not None: + x = x.asnumpy() + out = 1.0 / x + out = np.array(out, x.dtype) + return Tensor(out) + return None + class Pow(_MathBinaryOp): """ Computes a tensor to the power of the second input. - The first input must be a tensor, and the second input should be a tensor or a number. - When the inputs are two tensors, the shapes of them could be broadcast, - and the data types of them should be the same. - When the inputs are one tensor and one scalar, the scalar could not be a parameter, - only could be a constant, and the type of the scalar is the same as the data type of the tensor. - - Inputs: - - **input_x** (Union[Tensor]) - The first input is a tensor whose data type is number. - - **input_y** (Union[Tensor, Number]) - The second input is a tensor whose data type is same as 'input_x' or - a number. - - Outputs: - Tensor, the shape is same as the shape after broadcasting, and the data type is same as 'input_x'. - + The inputs must be two tensors or one tensor and one scalar. 
+ When the inputs are two tensors, + both dtypes cannot be bool, and the shapes of them could be broadcast. + When the inputs are one tensor and one scalar, + the scalar only could be a constant. Inputs: - - **input_x** (Tensor) - The input tensor. - - **input_y** (Union[Tensor, Number]) - The exponent part. If exponent is a tensor, its shape must be able to - broadcast to the shape of the `input_x`. + - **input_x** (Union[Tensor, Number, bool]) - The first input is a number or + a bool or a tensor whose data type is number or bool. + - **input_y** (Union[Tensor, Number, bool]) - The second input is a number or + a bool when the first input is a tensor or a tensor whose data type is number or bool. Outputs: - Tensor, has the same shape as the `input_x`. + Tensor, the shape is same as the shape after broadcasting, + and the data type is the one with high precision or high digits among the two inputs. Examples: >>> input_x = Tensor(np.array([1.0, 2.0, 4.0]), mindspore.float32) @@ -973,6 +1272,15 @@ class Pow(_MathBinaryOp): [1.0, 16.0, 64.0] """ + def infer_value(self, x, power): + if x is not None and power is not None: + x = x.asnumpy() + power = power.asnumpy() + out = np.power(x, power) + out = np.array(out, x.dtype) + return Tensor(out) + return None + class Exp(PrimitiveWithInfer): """ @@ -1003,6 +1311,88 @@ class Exp(PrimitiveWithInfer): validator.check_subclass("x", x_type, mstype.tensor, self.name) return x_type + def infer_value(self, x): + if x is not None: + x = x.asnumpy() + out = np.exp(x) + out = np.array(out, x.dtype) + return Tensor(out) + return None + + +class Expm1(PrimitiveWithInfer): + """ + Returns exponential then minus 1 of a tensor element-wise. + + Inputs: + - **input_x** (Tensor) - The input tensor. + + Outputs: + Tensor, has the same shape as the `input_x`. 
+ + Examples: + >>> input_x = Tensor(np.array([0.0, 1.0, 2.0, 4.0]), mindspore.float32) + >>> expm1 = P.Expm1() + >>> expm1(input_x) + [ 0., 1.71828183, 6.3890561 , 53.59815003] + """ + + @prim_attr_register + def __init__(self): + """init Exp""" + self.init_prim_io_names(inputs=['x'], outputs=['y']) + + def infer_shape(self, x_shape): + return x_shape + + def infer_dtype(self, x_type): + validator.check_subclass("x", x_type, mstype.tensor, self.name) + return x_type + + +class HistogramFixedWidth(PrimitiveWithInfer): + """ + Returns a rank 1 histogram counting the number of entries in values that fall into every bin. The bins are equal + width and determined by the arguments range and nbins. + + Args: + dtype (string): An optional attribute. Must be one of the following types: "int32", "int64". Default: "int32". + nbins (Tensor): Number of histogram bins, the type is int32. + + Inputs: + - **x** (Tensor) - Numeric Tensor. Must be one of the following types: int32, float32, float16. + - **range** (Tensor) - Must have the same type as x. Shape [2] Tensor of same dtype as x. + x <= range[0] will be mapped to hist[0], x >= range[1] will be mapped to hist[-1]. + + Outputs: + Tensor, the type is int32. 
+ + Examples: + >>> x = Tensor([-1.0, 0.0, 1.5, 2.0, 5.0, 15], mindspore.float16) + >>> range = Tensor([0.0, 5.0], mindspore.float16) + >>> hist = P.HistogramFixedWidth(5) + >>> hist(x, range) + [2 1 1 0 2] + """ + + @prim_attr_register + def __init__(self, nbins, dtype='int32'): + self.nbins = validator.check_value_type("nbins", nbins, [int], self.name) + valid_values = ['int32', 'int64'] + self.dtype = validator.check_string("dtype", dtype, valid_values, self.name) + self.init_prim_io_names(inputs=['x', 'range'], outputs=['y']) + + def infer_shape(self, x_shape, range_shape): + return (self.nbins,) + + def infer_dtype(self, x_dtype, range_dtype): + validator.check_subclass("x", x_dtype, mstype.tensor, self.name) + valid_types = (mstype.float16, mstype.float32, mstype.int32) + validator.check_tensor_type_same({"x": x_dtype}, valid_types, self.name) + validator.check_tensor_type_same({"range": range_dtype}, valid_types, self.name) + y_dtype = mstype.int32 + return y_dtype + class Log(PrimitiveWithInfer): """ @@ -1032,6 +1422,14 @@ class Log(PrimitiveWithInfer): validator.check_subclass("x", x, mstype.tensor, self.name) return x + def infer_value(self, x): + if x is not None: + x = x.asnumpy() + out = np.log(x) + out = np.array(out, x.dtype) + return Tensor(out) + return None + class Log1p(PrimitiveWithInfer): """ @@ -1127,18 +1525,20 @@ class Minimum(_MathBinaryOp): Computes the element-wise minimum of input tensors. The inputs must be two tensors or one tensor and one scalar. - When the inputs are two tensors, the shapes of them could be broadcast, - and the data types of them should be same. - When the inputs are one tensor and one scalar, the scalar cannot be a parameter, only can be a constant, - and the type of the scalar is the same as the data type of the tensor. + When the inputs are two tensors, + both dtypes cannot be bool, and the shapes of them could be broadcast. + When the inputs are one tensor and one scalar, + the scalar only could be a constant. 
Inputs: - - **input_x** (Union[Tensor, Number]) - The first input is a tensor whose data type is number or a number. - - **input_y** (Union[Tensor, Number]) - The second input is a tensor whose data type is same as 'input_x' or - a number. + - **input_x** (Union[Tensor, Number, bool]) - The first input is a number or + a bool or a tensor whose data type is number or bool. + - **input_y** (Union[Tensor, Number, bool]) - The second input is a number or + a bool when the first input is a tensor or a tensor whose data type is number or bool. Outputs: - Tensor, the shape is same as the shape after broadcasting, and the data type is same as 'input_x'. + Tensor, the shape is same as the shape after broadcasting, + and the data type is the one with high precision or high digits among the two inputs. Examples: >>> input_x = Tensor(np.array([1.0, 5.0, 3.0]), mindspore.float32) @@ -1148,24 +1548,35 @@ class Minimum(_MathBinaryOp): [1.0, 2.0, 3.0] """ + def infer_value(self, x, y): + if x is not None and y is not None: + x = x.asnumpy() + y = y.asnumpy() + out = np.minimum(x, y) + out = np.array(out, x.dtype) + return Tensor(out) + return None + class Maximum(_MathBinaryOp): """ Computes the element-wise maximum of input tensors. The inputs must be two tensors or one tensor and one scalar. - When the inputs are two tensors, the shapes of them could be broadcast, - and the data types of them should be same. - When the inputs are one tensor and one scalar, the scalar cannot be a parameter, only can be a constant, - and the type of the scalar is the same as the data type of the tensor. + When the inputs are two tensors, + both dtypes cannot be bool, and the shapes of them could be broadcast. + When the inputs are one tensor and one scalar, + the scalar only could be a constant. Inputs: - - **input_x** (Union[Tensor, Number]) - The first input is a tensor whose data type is number or a number. 
- - **input_y** (Union[Tensor, Number]) - The second input is a tensor whose data type is same as 'input_x' or - a number. + - **input_x** (Union[Tensor, Number, bool]) - The first input is a number or + a bool or a tensor whose data type is number or bool. + - **input_y** (Union[Tensor, Number, bool]) - The second input is a number or + a bool when the first input is a tensor or a tensor whose data type is number or bool. Outputs: - Tensor, the shape is same as the shape after broadcasting, and the data type is same as 'input_x'. + Tensor, the shape is same as the shape after broadcasting, + and the data type is the one with high precision or high digits among the two inputs. Examples: >>> input_x = Tensor(np.array([1.0, 5.0, 3.0]), mindspore.float32) @@ -1175,24 +1586,34 @@ class Maximum(_MathBinaryOp): [4.0, 5.0, 6.0] """ + def infer_value(self, x, y): + if x is not None and y is not None: + x = x.asnumpy() + y = y.asnumpy() + out = np.maximum(x, y) + out = np.array(out, x.dtype) + return Tensor(out) + return None class RealDiv(_MathBinaryOp): """ Divide the first input tensor by the second input tensor in floating-point type element-wise. The inputs must be two tensors or one tensor and one scalar. - When the inputs are two tensors, the shapes of them could be broadcast, - and the data types of them should be same. - When the inputs are one tensor and one scalar, the scalar cannot be a parameter, only can be a constant, - and the type of the scalar is the same as the data type of the tensor. + When the inputs are two tensors, + both dtypes cannot be bool, and the shapes of them could be broadcast. + When the inputs are one tensor and one scalar, + the scalar only could be a constant. Inputs: - - **input_x** (Union[Tensor, Number]) - The first input is a tensor whose data type is number or a number. - - **input_y** (Union[Tensor, Number]) - The second input is a tensor whose data type is same as 'input_x' or - a number. 
+ - **input_x** (Union[Tensor, Number, bool]) - The first input is a number or + a bool or a tensor whose data type is number or bool. + - **input_y** (Union[Tensor, Number, bool]) - The second input is a number or + a bool when the first input is a tensor or a tensor whose data type is number or bool. Outputs: - Tensor, the shape is same as the shape after broadcasting, and the data type is same as 'input_x'. + Tensor, the shape is same as the shape after broadcasting, + and the data type is the one with high precision or high digits among the two inputs. Examples: >>> input_x = Tensor(np.array([1.0, 2.0, 3.0]), mindspore.float32) @@ -1217,18 +1638,20 @@ class Div(_MathBinaryOp): Computes the quotient of dividing the first input tensor by the second input tensor element-wise. The inputs must be two tensors or one tensor and one scalar. - When the inputs are two tensors, the shapes of them could be broadcast, - and the data types of them should be same. - When the inputs are one tensor and one scalar, the scalar cannot be a parameter, only can be a constant, - and the type of the scalar is the same as the data type of the tensor. + When the inputs are two tensors, + both dtypes cannot be bool, and the shapes of them could be broadcast. + When the inputs are one tensor and one scalar, + the scalar only could be a constant. Inputs: - - **input_x** (Union[Tensor, Number]) - The first input is a tensor whose data type is number or a number. - - **input_y** (Union[Tensor, Number]) - The second input is a tensor whose data type is same as 'input_x' or - a number. + - **input_x** (Union[Tensor, Number, bool]) - The first input is a number or + a bool or a tensor whose data type is number or bool. + - **input_y** (Union[Tensor, Number, bool]) - The second input is a number or + a bool when the first input is a tensor or a tensor whose data type is number or bool. Outputs: - Tensor, the shape is same as the shape after broadcasting, and the data type is same as 'input_x'. 
+ Tensor, the shape is same as the shape after broadcasting, + and the data type is the one with high precision or high digits among the two inputs. Raises: ValueError: When `input_x` and `input_y` are not the same dtype. @@ -1248,23 +1671,67 @@ class Div(_MathBinaryOp): return None +class DivNoNan(_MathBinaryOp): + """ + Computes a safe divide which returns 0 if the y is zero. + + The inputs must be two tensors or one tensor and one scalar. + When the inputs are two tensors, + both dtypes cannot be bool, and the shapes of them could be broadcast. + When the inputs are one tensor and one scalar, + the scalar only could be a constant. + + Inputs: + - **input_x** (Union[Tensor, Number, bool]) - The first input is a number or + a bool or a tensor whose data type is number or bool. + - **input_y** (Union[Tensor, Number, bool]) - The second input is a number or + a bool when the first input is a tensor or a tensor whose data type is number or bool. + + Outputs: + Tensor, the shape is same as the shape after broadcasting, + and the data type is the one with high precision or high digits among the two inputs. + + Raises: + ValueError: When `input_x` and `input_y` are not the same dtype. + + Examples: + >>> input_x = Tensor(np.array([-1.0, 0., 1.0, 5.0, 6.0]), mindspore.float32) + >>> input_y = Tensor(np.array([0., 0., 0., 2.0, 3.0]), mindspore.float32) + >>> div_no_nan = P.DivNoNan() + >>> div_no_nan(input_x, input_y) + [0., 0., 0., 2.5, 2.0] + """ + + def infer_value(self, x, y): + if x is not None and y is not None: + x = x.asnumpy() + y = y.asnumpy() + with np.errstate(divide='ignore', invalid='ignore'): + out = np.true_divide(x, y) + out[~np.isfinite(out)] = 0 + return out + return None + + class FloorDiv(_MathBinaryOp): """ Divide the first input tensor by the second input tensor element-wise and rounds down to the closest integer. The inputs must be two tensors or one tensor and one scalar. 
- When the inputs are two tensors, the shapes of them could be broadcast, - and the data types of them should be same. - When the inputs are one tensor and one scalar, the scalar cannot be a parameter, only can be a constant, - and the type of the scalar is the same as the data type of the tensor. + When the inputs are two tensors, + both dtypes cannot be bool, and the shapes of them could be broadcast. + When the inputs are one tensor and one scalar, + the scalar only could be a constant. Inputs: - - **input_x** (Union[Tensor, Number]) - The first input is a tensor whose data type is number or a number. - - **input_y** (Union[Tensor, Number]) - The second input is a tensor whose data type is same as 'input_x' or - a number. + - **input_x** (Union[Tensor, Number, bool]) - The first input is a number or + a bool or a tensor whose data type is number or bool. + - **input_y** (Union[Tensor, Number, bool]) - The second input is a number or + a bool when the first input is a tensor or a tensor whose data type is number or bool. Outputs: - Tensor, the shape is same as the shape after broadcasting, and the data type is same as 'input_x'. + Tensor, the shape is same as the shape after broadcasting, + and the data type is the one with high precision or high digits among the two inputs. Examples: >>> input_x = Tensor(np.array([2, 4, -1]), mindspore.int32) @@ -1309,18 +1776,20 @@ class FloorMod(_MathBinaryOp): Compute element-wise remainder of division. The inputs must be two tensors or one tensor and one scalar. - When the inputs are two tensors, the shapes of them could be broadcast, - and the data types of them should be same. - When the inputs are one tensor and one scalar, the scalar cannot be a parameter, only can be a constant, - and the type of the scalar is the same as the data type of the tensor. + When the inputs are two tensors, + both dtypes cannot be bool, and the shapes of them could be broadcast. 
+ When the inputs are one tensor and one scalar, + the scalar only could be a constant. Inputs: - - **input_x** (Union[Tensor, Number]) - The first input is a tensor whose data type is number or a number. - - **input_y** (Union[Tensor, Number]) - The second input is a tensor whose data type is same as 'input_x' or - a number. + - **input_x** (Union[Tensor, Number, bool]) - The first input is a number or + a bool or a tensor whose data type is number or bool. + - **input_y** (Union[Tensor, Number, bool]) - The second input is a number or + a bool when the first input is a tensor or a tensor whose data type is number or bool. Outputs: - Tensor, the shape is same as the shape after broadcasting, and the data type is same as 'input_x'. + Tensor, the shape is same as the shape after broadcasting, + and the data type is the one with high precision or high digits among the two inputs. Examples: >>> input_x = Tensor(np.array([2, 4, -1]), mindspore.int32) @@ -1331,6 +1800,35 @@ class FloorMod(_MathBinaryOp): """ +class Ceil(PrimitiveWithInfer): + """ + Round a tensor up to the closest integer element-wise. + + Inputs: + - **input_x** (Tensor) - The input tensor. Its element data type must be float. + + Outputs: + Tensor, has the same shape as `input_x`. + + Examples: + >>> input_x = Tensor(np.array([1.1, 2.5, -1.5]), mindspore.float32) + >>> ceil_op = P.Ceil() + >>> ceil_op(input_x) + [2.0, 3.0, -1.0] + """ + + @prim_attr_register + def __init__(self): + self.init_prim_io_names(inputs=['x'], outputs=['y']) + + def infer_shape(self, x_shape): + return x_shape + + def infer_dtype(self, x_dtype): + validator.check_tensor_type_same({"x": x_dtype}, mstype.float_type, self.name) + return x_dtype + + class Acosh(PrimitiveWithInfer): """ Compute inverse hyperbolic cosine of x element-wise. @@ -1359,6 +1857,35 @@ class Acosh(PrimitiveWithInfer): return x_dtype +class Cosh(PrimitiveWithInfer): + """ + Computes hyperbolic cosine of input element-wise. 
+ + Inputs: + - **input_x** (Tensor) - The shape of tensor is :math:`(x_1, x_2, ..., x_R)`. + + Outputs: + Tensor, has the same shape as `input_x`. + + Examples: + >>> cosh = P.Cosh() + >>> input_x = Tensor(np.array([0.24, 0.83, 0.31, 0.09]), mindspore.float32) + >>> output = cosh(input_x) + [1.0289385 1.364684 1.048436 1.4228927] + """ + + @prim_attr_register + def __init__(self): + """init Cosh""" + + def infer_shape(self, x_shape): + return x_shape + + def infer_dtype(self, x_dtype): + validator.check_tensor_type_same({'x': x_dtype}, mstype.number_type, self.name) + return x_dtype + + class Asinh(PrimitiveWithInfer): """ Compute inverse hyperbolic cosine of x element-wise. @@ -1376,7 +1903,6 @@ class Asinh(PrimitiveWithInfer): [-2.3212, 1.1976, 1.8184, 5.2983] """ - @prim_attr_register def __init__(self): """init Asinh""" @@ -1389,6 +1915,35 @@ class Asinh(PrimitiveWithInfer): return x_dtype +class Sinh(PrimitiveWithInfer): + """ + Computes hyperbolic sine of input element-wise. + + Inputs: + - **input_x** (Tensor) - The shape of tensor is :math:`(x_1, x_2, ..., x_R)`. + + Outputs: + Tensor, has the same shape as `input_x`. + + Examples: + >>> sinh = P.Sinh() + >>> input_x = Tensor(np.array([0.62, 0.28, 0.43, 0.62]), mindspore.float32) + >>> output = sinh(input_x) + [0.6604918 0.28367308 0.44337422 0.6604918] + """ + + @prim_attr_register + def __init__(self): + """init Sinh""" + + def infer_shape(self, x_shape): + return x_shape + + def infer_dtype(self, x_dtype): + validator.check_tensor_type_same({'x': x_dtype}, mstype.number_type, self.name) + return x_dtype + + class _LogicBinaryOp(_BinaryOp): """ Define logic binary operators. @@ -1409,19 +1964,17 @@ class Equal(_LogicBinaryOp): Computes the equivalence between two tensors element-wise. The inputs must be two tensors or one tensor and one scalar. - When the inputs are two tensors, the shapes of them could be broadcast, - and the data types of them should be same. 
- When the inputs are one tensor and one scalar, the scalar cannot be a parameter, only can be a constant, - and the type of the scalar is the same as the data type of the tensor. + When the inputs are two tensors, the shapes of them could be broadcast. + When the inputs are one tensor and one scalar, the scalar only could be a constant. Inputs: - - **input_x** (Union[Tensor, Number, bool]) - The first input is a tensor whose data type is number or bool, or - a number or a bool object. - - **input_y** (Union[Tensor, Number, bool]) - The second input tensor whose data type is same as 'input_x' or - a number or a bool object. + - **input_x** (Union[Tensor, Number, bool]) - The first input is a number or + a bool or a tensor whose data type is number or bool. + - **input_y** (Union[Tensor, Number, bool]) - The second input is a number or + a bool when the first input is a tensor or a tensor whose data type is number or bool. Outputs: - Tensor, the shape is same as the shape after broadcasting, and the data type is bool. + Tensor, the shape is same as the shape after broadcasting,and the data type is bool. Examples: >>> input_x = Tensor(np.array([1, 2, 3]), mindspore.float32) @@ -1440,6 +1993,44 @@ class Equal(_LogicBinaryOp): return _LogicBinaryOp.do_infer_dtype(x_dtype, y_dtype, mstype.number_type + (mstype.bool_,), self.name) +class ApproximateEqual(_LogicBinaryOp): + """ + Returns the truth value of abs(x1-x2) < tolerance element-wise. + + Args: + tolerance (float): The maximum deviation that two elements can be considered equal. Default: 1e-05. + + Inputs: + - **x1** (Tensor) - A tensor. Must be one of the following types: float32, float16. + - **x2** (Tensor) - A tensor of the same type and shape as 'x1'. + + Outputs: + Tensor, the shape is same as the shape of 'x1', and the data type is bool. 
+ + Examples: + >>> x1 = Tensor(np.array([1, 2, 3]), mindspore.float32) + >>> x2 = Tensor(np.array([2, 4, 6]), mindspore.float32) + >>> approximate_equal = P.ApproximateEqual(2.) + >>> result = approximate_equal(x1, x2) + [True True False] + """ + + @prim_attr_register + def __init__(self, tolerance=1e-05): + """Init ApproximateEqual""" + validator.check_value_type("tolerance", tolerance, [float], self.name) + + def infer_shape(self, x_shape, y_shape): + validator.check("x_shape", x_shape, "y_shape", y_shape, Rel.EQ, self.name) + return x_shape + + def infer_dtype(self, x_dtype, y_dtype): + args_dtype = {"x": x_dtype, "y": y_dtype} + valid_type = [mstype.float32, mstype.float16] + validator.check_tensor_type_same(args_dtype, valid_type, prim_name=self.name) + return mstype.tensor_type(mstype.bool_) + + class EqualCount(PrimitiveWithInfer): """ Computes the number of the same elements of two tensors. @@ -1482,19 +2073,17 @@ class NotEqual(_LogicBinaryOp): Computes the non-equivalence of two tensors element-wise. The inputs must be two tensors or one tensor and one scalar. - When the inputs are two tensors, the shapes of them could be broadcast, - and the data types of them should be same. - When the inputs are one tensor and one scalar, the scalar cannot be a parameter, only can be a constant, - and the type of the scalar is the same as the data type of the tensor. + When the inputs are two tensors, the shapes of them could be broadcast. + When the inputs are one tensor and one scalar, the scalar only could be a constant. Inputs: - - **input_x** (Union[Tensor, Number, bool]) - The first input is a tensor whose data type is number or bool, or - a number or a bool object. - - **input_y** (Union[Tensor, Number, bool]) - The second input tensor whose data type is same as `input_x` or - a number or a bool object. + - **input_x** (Union[Tensor, Number, bool]) - The first input is a number or + a bool or a tensor whose data type is number or bool. 
+ - **input_y** (Union[Tensor, Number, bool]) - The second input is a number or + a bool when the first input is a tensor or a tensor whose data type is number or bool. Outputs: - Tensor, the shape is same as the shape after broadcasting, and the data type is bool. + Tensor, the shape is same as the shape after broadcasting,and the data type is bool. Examples: >>> input_x = Tensor(np.array([1, 2, 3]), mindspore.float32) @@ -1518,18 +2107,19 @@ class Greater(_LogicBinaryOp): Computes the boolean value of :math:`x > y` element-wise. The inputs must be two tensors or one tensor and one scalar. - When the inputs are two tensors, the shapes of them could be broadcast, - and the data types of them should be same. - When the inputs are one tensor and one scalar, the scalar cannot be a parameter, only can be a constant, - and the type of the scalar is the same as the data type of the tensor. + When the inputs are two tensors, + both dtypes cannot be bool, and the shapes of them could be broadcast. + When the inputs are one tensor and one scalar, + the scalar only could be a constant. Inputs: - - **input_x** (Union[Tensor, Number]) - The first input is a tensor whose data type is number or a number. - - **input_y** (Union[Tensor, Number]) - The second input is a tensor whose data type is same as `input_x` or - a number. + - **input_x** (Union[Tensor, Number, bool]) - The first input is a number or + a bool or a tensor whose data type is number or bool. + - **input_y** (Union[Tensor, Number, bool]) - The second input is a number or + a bool when the first input is a tensor or a tensor whose data type is number or bool. Outputs: - Tensor, the shape is same as the shape after broadcasting, and the data type is bool. + Tensor, the shape is same as the shape after broadcasting,and the data type is bool. 
Examples: >>> input_x = Tensor(np.array([1, 2, 3]), mindspore.int32) @@ -1538,6 +2128,13 @@ class Greater(_LogicBinaryOp): >>> greater(input_x, input_y) [False, True, False] """ + def infer_value(self, x, y): + if x is not None and y is not None: + x = x.asnumpy() + y = y.asnumpy() + out = np.array(np.greater(x, y)) + return Tensor(out) + return None class GreaterEqual(_LogicBinaryOp): @@ -1545,18 +2142,19 @@ class GreaterEqual(_LogicBinaryOp): Computes the boolean value of :math:`x >= y` element-wise. The inputs must be two tensors or one tensor and one scalar. - When the inputs are two tensors, the shapes of them could be broadcast, - and the data types of them should be same. - When the inputs are one tensor and one scalar, the scalar cannot be a parameter, only can be a constant, - and the type of the scalar is the same as the data type of the tensor. + When the inputs are two tensors, + both dtypes cannot be bool, and the shapes of them could be broadcast. + When the inputs are one tensor and one scalar, + the scalar only could be a constant. Inputs: - - **input_x** (Union[Tensor, Number]) - The first input is a tensor whose data type is number or a number. - - **input_y** (Union[Tensor, Number]) - The second input is a tensor whose data type is same as `input_x` or - a number. + - **input_x** (Union[Tensor, Number, bool]) - The first input is a number or + a bool or a tensor whose data type is number or bool. + - **input_y** (Union[Tensor, Number, bool]) - The second input is a number or + a bool when the first input is a tensor or a tensor whose data type is number or bool. Outputs: - Tensor, the shape is same as the shape after broadcasting, and the data type is bool. + Tensor, the shape is same as the shape after broadcasting,and the data type is bool. 
Examples: >>> input_x = Tensor(np.array([1, 2, 3]), mindspore.int32) @@ -1565,6 +2163,13 @@ class GreaterEqual(_LogicBinaryOp): >>> greater_equal(input_x, input_y) [True, True, False] """ + def infer_value(self, x, y): + if x is not None and y is not None: + x = x.asnumpy() + y = y.asnumpy() + out = np.array(np.greater_equal(x, y)) + return Tensor(out) + return None class Less(_LogicBinaryOp): @@ -1572,18 +2177,19 @@ class Less(_LogicBinaryOp): Computes the boolean value of :math:`x < y` element-wise. The inputs must be two tensors or one tensor and one scalar. - When the inputs are two tensors, the shapes of them could be broadcast, - and the data types of them should be same. - When the inputs are one tensor and one scalar, the scalar cannot be a parameter, only can be a constant, - and the type of the scalar is the same as the data type of the tensor. + When the inputs are two tensors, + both dtypes cannot be bool, and the shapes of them could be broadcast. + When the inputs are one tensor and one scalar, + the scalar only could be a constant. Inputs: - - **input_x** (Union[Tensor, Number]) - The first input is a tensor whose data type is number or a number. - - **input_y** (Union[Tensor, Number]) - The second input is a tensor whose data type is same as `input_x` or - a number. + - **input_x** (Union[Tensor, Number, bool]) - The first input is a number or + a bool or a tensor whose data type is number or bool. + - **input_y** (Union[Tensor, Number, bool]) - The second input is a number or + a bool when the first input is a tensor or a tensor whose data type is number or bool. Outputs: - Tensor, the shape is same as the shape after broadcasting, and the data type is bool. + Tensor, the shape is same as the shape after broadcasting,and the data type is bool. 
Examples: >>> input_x = Tensor(np.array([1, 2, 3]), mindspore.int32) @@ -1592,6 +2198,13 @@ class Less(_LogicBinaryOp): >>> less(input_x, input_y) [False, False, True] """ + def infer_value(self, x, y): + if x is not None and y is not None: + x = x.asnumpy() + y = y.asnumpy() + out = np.array(np.less(x, y)) + return Tensor(out) + return None class LessEqual(_LogicBinaryOp): @@ -1599,18 +2212,19 @@ class LessEqual(_LogicBinaryOp): Computes the boolean value of :math:`x <= y` element-wise. The inputs must be two tensors or one tensor and one scalar. - When the inputs are two tensors, the shapes of them could be broadcast, - and the data types of them should be same. - When the inputs are one tensor and one scalar, the scalar cannot be a parameter, only can be a constant, - and the type of the scalar is the same as the data type of the tensor. + When the inputs are two tensors, + both dtypes cannot be bool, and the shapes of them could be broadcast. + When the inputs are one tensor and one scalar, + the scalar only could be a constant. Inputs: - - **input_x** (Union[Tensor, Number]) - The first input is a tensor whose data type is number or a number. - - **input_y** (Union[Tensor, Number]) - The second input is a tensor whose data type is same as `input_x` or - a number. + - **input_x** (Union[Tensor, Number, bool]) - The first input is a number or + a bool or a tensor whose data type is number or bool. + - **input_y** (Union[Tensor, Number, bool]) - The second input is a number or + a bool when the first input is a tensor or a tensor whose data type is number or bool. Outputs: - Tensor, the shape is same as the shape after broadcasting, and the data type is bool. + Tensor, the shape is same as the shape after broadcasting,and the data type is bool. 
Examples: >>> input_x = Tensor(np.array([1, 2, 3]), mindspore.int32) @@ -1619,6 +2233,13 @@ class LessEqual(_LogicBinaryOp): >>> less_equal(input_x, input_y) [True, False, True] """ + def infer_value(self, x, y): + if x is not None and y is not None: + x = x.asnumpy() + y = y.asnumpy() + out = np.array(np.less_equal(x, y)) + return Tensor(out) + return None class LogicalNot(PrimitiveWithInfer): @@ -1655,15 +2276,16 @@ class LogicalAnd(_LogicBinaryOp): """ Computes the "logical AND" of two tensors element-wise. - The inputs must be two tensors or one tensor and one bool object. + The inputs must be two tensors or one tensor and one bool. When the inputs are two tensors, the shapes of them could be broadcast, and the data types of them should be bool. - When the inputs are one tensor and one bool object, the bool object cannot be a parameter, only can be a constant, + When the inputs are one tensor and one bool, the bool object only could be a constant, and the data type of the tensor should be bool. Inputs: - - **input_x** (Union[Tensor, bool]) - The first input is a tensor whose data type is bool or a bool object. - - **input_y** (Union[Tensor, bool]) - The second input is a tensor whose data type is bool or a bool object. + - **input_x** (Union[Tensor, bool]) - The first input is a bool or a tensor whose data type is bool. + - **input_y** (Union[Tensor, bool]) - The second input is a bool when the first input is a tensor or + a tensor whose data type is bool. Outputs: Tensor, the shape is same as the shape after broadcasting, and the data type is bool. @@ -1684,18 +2306,19 @@ class LogicalOr(_LogicBinaryOp): """ Computes the "logical OR" of two tensors element-wise. - The inputs must be two tensors or one tensor and one bool object. + The inputs must be two tensors or one tensor and one bool. When the inputs are two tensors, the shapes of them could be broadcast, and the data types of them should be bool. 
- When the inputs are one tensor and one bool object, the bool object cannot be a parameter, only can be a constant, + When the inputs are one tensor and one bool, the bool object only could be a constant, and the data type of the tensor should be bool. Inputs: - - **input_x** (Union[Tensor, bool]) - The first input is a tensor whose data type is bool or a bool object. - - **input_y** (Union[Tensor, bool]) - The second input is a tensor whose data type is bool or a bool object. + - **input_x** (Union[Tensor, bool]) - The first input is a bool or a tensor whose data type is bool. + - **input_y** (Union[Tensor, bool]) - The second input is a bool when the first input is a tensor or + a tensor whose data type is bool. Outputs: - Tensor, the shape is same as the shape after broadcasting, and the data type is bool. + Tensor, the shape is same as the shape after broadcasting,and the data type is bool. Examples: >>> input_x = Tensor(np.array([True, False, True]), mindspore.bool_) @@ -2097,8 +2720,8 @@ class NMSWithMask(PrimitiveWithInfer): def infer_shape(self, bboxes_shape): cls_name = self.name validator.check_integer("bboxes rank", len(bboxes_shape), 2, Rel.EQ, cls_name) - validator.check_integer("bboxes.shape()[0]", bboxes_shape[0], 0, Rel.GT, cls_name) - validator.check_integer("bboxes.shape()[1]", bboxes_shape[1], 5, Rel.EQ, cls_name) + validator.check_integer("bboxes.shape[0]", bboxes_shape[0], 0, Rel.GT, cls_name) + validator.check_integer("bboxes.shape[1]", bboxes_shape[1], 5, Rel.EQ, cls_name) num = bboxes_shape[0] return (bboxes_shape, (num,), (num,)) @@ -2127,6 +2750,7 @@ class Abs(PrimitiveWithInfer): @prim_attr_register def __init__(self): """init Abs""" + self.init_prim_io_names(inputs=['input_x'], outputs=['output']) def infer_shape(self, x_shape): return x_shape @@ -2138,7 +2762,7 @@ class Abs(PrimitiveWithInfer): def infer_value(self, x): if x is not None: x = x.asnumpy() - out = np.abs(x, dtype=x.dtype) + out = np.array(np.abs(x, dtype=x.dtype)) return 
Tensor(out) return None @@ -2197,7 +2821,8 @@ class Round(PrimitiveWithInfer): @prim_attr_register def __init__(self): - pass + """init Round""" + self.init_prim_io_names(inputs=['input_x'], outputs=['output']) def infer_shape(self, x_shape): return x_shape @@ -2279,7 +2904,7 @@ class Atan2(_MathBinaryOp): - **input_y** (Tensor) - The input tensor. Outputs: - Tensor, the shape is same as the shape after broadcasting, and the data type is same as `input_x`. + Tensor, the shape is same as the shape after broadcasting, and the data type is same as `input_x`. Examples: >>> input_x = Tensor(np.array([[0, 1]]), mindspore.float32) @@ -2289,7 +2914,6 @@ class Atan2(_MathBinaryOp): [[0. 0.7853982]] """ - class SquareSumAll(PrimitiveWithInfer): """ Returns square sum all of a tensor element-wise @@ -2315,6 +2939,7 @@ class SquareSumAll(PrimitiveWithInfer): @prim_attr_register def __init__(self): """init SquareSumAll""" + def infer_shape(self, x_shape, y_shape): validator.check("x1_shape", x_shape, "x2_shape", y_shape, Rel.EQ, self.name) return [], [] @@ -2441,3 +3066,101 @@ class BesselI1e(PrimitiveWithInfer): def infer_dtype(self, x): validator.check_tensor_type_same({'x': x}, mstype.number_type, self.name) return x + + +class Inv(PrimitiveWithInfer): + """ + Computes Inv(Reciprocal) of input tensor element-wise. + + Inputs: + - **input_x** (Tensor) - The shape of tensor is :math:`(x_1, x_2, ..., x_R)`. + Must be one of the following types: float16, float32, int32. + + Outputs: + Tensor, has the same shape and data type as `input_x`.
+ + Examples: + >>> inv = P.Inv() + >>> input_x = Tensor(np.array([0.25, 0.4, 0.31, 0.52]), mindspore.float32) + >>> output = inv(input_x) + [4., 2.5, 3.2258065, 1.923077] + """ + + @prim_attr_register + def __init__(self): + pass + + def infer_shape(self, x_shape): + return x_shape + + def infer_dtype(self, x_dtype): + validator.check_tensor_type_same({'x_dtype': x_dtype}, [mstype.float16, mstype.float32, + mstype.int32], self.name) + return x_dtype + + +class Invert(PrimitiveWithInfer): + """ + Flips all bits of input tensor element-wise. + + Inputs: + - **input_x** (Tensor[int16], Tensor[uint16]) - The shape of tensor is :math:`(x_1, x_2, ..., x_R)`. + + Outputs: + Tensor, has the same shape as `input_x`. + + Examples: + >>> invert = P.Invert() + >>> input_x = Tensor(np.array([25, 4, 13, 9]), mindspore.int16) + >>> output = invert(input_x) + [-26, -5, -14, -10] + """ + + @prim_attr_register + def __init__(self): + pass + + def infer_shape(self, x_shape): + return x_shape + + def infer_dtype(self, x_dtype): + validator.check_tensor_type_same({'x_dtype': x_dtype}, [mstype.int16, mstype.uint16], self.name) + return x_dtype + + +class Eps(PrimitiveWithInfer): + """ + Creates a tensor filled with `input_x` dtype minimum val. + + Inputs: + - **input_x** (Tensor) - Input tensor. + + Outputs: + Tensor, has the same type and shape as `input_x`, but filled with `input_x` dtype minimum val. 
+ + Examples: + >>> out = P.Eps()(input_x) + """ + + @prim_attr_register + def __init__(self): + """init Eps""" + self.init_prim_io_names(inputs=['input_x'], outputs=['y']) + + def __infer__(self, input_x): + valid_types = [mstype.float16, mstype.float32] + validator.check_tensor_type_same({'input_x': input_x['dtype']}, valid_types, self.name) + + x_nptype = mstype.dtype_to_nptype(input_x['dtype'].element_type()) + if x_nptype == np.float16: + min_val = 2 ** (-14) + else: + min_val = 2 ** (-16) + + res = np.full(input_x['shape'], min_val, x_nptype) + out = { + 'value': Tensor(res), + 'shape': input_x['shape'], + 'dtype': input_x['dtype'], + } + return out diff --git a/mindspore/ops/operations/nn_ops.py b/mindspore/ops/operations/nn_ops.py index 027a9e9525..ce8536c001 100644 --- a/mindspore/ops/operations/nn_ops.py +++ b/mindspore/ops/operations/nn_ops.py @@ -78,7 +78,7 @@ class Flatten(PrimitiveWithInfer): >>> input_tensor = Tensor(np.ones(shape=[1, 2, 3, 4]), mindspore.float32) >>> flatten = P.Flatten() >>> output = flatten(input_tensor) - >>> assert output.shape() == (1, 24) + >>> assert output.shape == (1, 24) """ @prim_attr_register @@ -585,6 +585,50 @@ class FusedBatchNorm(Primitive): self.momentum = validator.check_number_range('momentum', momentum, 0, 1, Rel.INC_BOTH, self.name) +class BNTrainingReduce(PrimitiveWithInfer): + """ + reduce sum at axis [0, 2, 3]. + + Inputs: + - **x** (Tensor) - Tensor of shape :math:`(N, C)`. + + Outputs: + - **sum** (Tensor) - Tensor of shape :math:`(C,)`. + - **square_sum** (Tensor) - Tensor of shape :math:`(C,)`. 
+ + """ + + @prim_attr_register + def __init__(self): + self.init_prim_io_names(inputs=['x'], outputs=['sum', 'square_sum']) + + def infer_shape(self, x_shape): + validator.check_integer("x rank", len(x_shape), 4, Rel.EQ, self.name) + return ([x_shape[1]], [x_shape[1]]) + + def infer_dtype(self, x_type): + return (x_type, x_type) + + +class BNTrainingUpdate(PrimitiveWithInfer): + """ + primitive operator of bn_training_update's register and info descriptor + """ + @prim_attr_register + def __init__(self, isRef=True, epsilon=1e-5, factor=0.1): + self.init_prim_io_names(inputs=['x', 'sum', 'square_sum', 'scale', 'b', 'mean', 'variance'], + outputs=['y', 'running_mean', 'running_variance', 'save_mean', 'save_inv_variance']) + #self.isRef = validator.check_integer('isRef', isRef, [0, 1], Rel.IN) + self.epsilon = validator.check_number_range('epsilon', epsilon, 0, 1, Rel.INC_RIGHT, 'BNTrainingUpdate') + self.factor = validator.check_number_range('factor', factor, 0, 1, Rel.INC_BOTH, 'BNTrainingUpdate') + + def infer_shape(self, x, sum, square_sum, scale, b, mean, variance): + return (x, variance, variance, variance, variance) + + def infer_dtype(self, x, sum, square_sum, scale, b, mean, variance): + return (x, variance, variance, variance, variance) + + class BatchNorm(PrimitiveWithInfer): r""" Batch Normalization for input data and updated parameters. 
@@ -629,7 +673,7 @@ class BatchNorm(PrimitiveWithInfer): >>> mean = Tensor(np.ones([64]), mindspore.float32) >>> variance = Tensor(np.ones([64]), mindspore.float32) >>> batch_norm = P.BatchNorm() - >>> output = batch_norm(input_x, scale, bias, mean, variance + >>> output = batch_norm(input_x, scale, bias, mean, variance) """ @prim_attr_register @@ -756,7 +800,7 @@ class Conv2D(PrimitiveWithInfer): def infer_shape(self, x_shape, w_shape): validator.check_integer("weight rank", len(w_shape), 4, Rel.EQ, self.name) validator.check_integer("x rank", len(x_shape), 4, Rel.EQ, self.name) - validator.check("x_shape[1] / group", x_shape[1] // self.group, "w_shape[1]", w_shape[1], Rel.EQ, self.name) + validator.check(f"x_shape[1] / group", x_shape[1] // self.group, "w_shape[1]", w_shape[1], Rel.EQ, self.name) validator.check('out_channel', self.out_channel, 'w_shape[0]', w_shape[0], Rel.EQ, self.name) validator.check('kernel_size', self.kernel_size, 'w_shape[2:4]', tuple(w_shape[2:4]), Rel.EQ, self.name) @@ -786,9 +830,9 @@ class Conv2D(PrimitiveWithInfer): pad_top, pad_bottom, pad_left, pad_right = self.pad, self.pad, self.pad, self.pad h_out = 1 + (x_shape[2] + 2 * self.pad - kernel_size_h - (kernel_size_h - 1) * (dilation_h - 1)) \ - / stride_h + / stride_h w_out = 1 + (x_shape[3] + 2 * self.pad - kernel_size_w - (kernel_size_w - 1) * (dilation_w - 1)) \ - / stride_w + / stride_w h_out = math.floor(h_out) w_out = math.floor(w_out) @@ -802,6 +846,8 @@ class Conv2D(PrimitiveWithInfer): args = {'x': x_dtype, 'w': w_dtype} valid_types = [mstype.int8, mstype.int32, mstype.float16, mstype.float32] validator.check_tensor_type_same(args, valid_types, self.name) + if x_dtype.element_type() == mstype.int8: + return mstype.tensor_type(mstype.int32) return x_dtype @@ -840,7 +886,7 @@ class DepthwiseConv2dNative(PrimitiveWithInfer): >>> weight = Tensor(np.ones([1, 32, 3, 3]), mindspore.float32) >>> depthwise_conv2d = P.DepthwiseConv2dNative(channel_multiplier = 3, kernel_size = (3, 3)) 
>>> output = depthwise_conv2d(input, weight) - >>> assert output.shape() == (10, 96, 30, 30) + >>> assert output.shape == (10, 96, 30, 30) """ @prim_attr_register @@ -907,9 +953,9 @@ class DepthwiseConv2dNative(PrimitiveWithInfer): pad_top, pad_bottom, pad_left, pad_right = self.pad, self.pad, self.pad, self.pad h_out = 1 + (x_shape[2] + 2 * self.pad - kernel_size_h - (kernel_size_h - 1) * (dilation_h - 1)) \ - / stride_h + / stride_h w_out = 1 + (x_shape[3] + 2 * self.pad - kernel_size_w - (kernel_size_w - 1) * (dilation_w - 1)) \ - / stride_w + / stride_w h_out = math.floor(h_out) w_out = math.floor(w_out) @@ -1498,17 +1544,20 @@ class ApplyMomentum(PrimitiveWithInfer): ('accumulation', sig_rw.RW_WRITE, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T), ('learning_rate', sig_rw.RW_READ, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, - sig_dtype.T), + sig_dtype.T1), ('gradient', sig_rw.RW_READ, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T), - ('momentum', sig_rw.RW_READ, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T) + ('momentum', sig_rw.RW_READ, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T2) ) @prim_attr_register def __init__(self, use_nesterov=False, use_locking=False, gradient_scale=1.0): self.init_prim_io_names(inputs=['variable', 'accumulation', 'learning_rate', 'gradient', 'momentum'], outputs=['output']) + self.is_tbe = context.get_context("device_target") == "Ascend" def infer_shape(self, v_shape, a_shape, l_shape, g_shape, m_shape): + if self.is_tbe: + return v_shape, v_shape return v_shape def infer_dtype(self, v_dtype, a_dtype, l_dtype, g_dtype, m_dtype): @@ -1519,6 +1568,8 @@ class ApplyMomentum(PrimitiveWithInfer): validator.check_scalar_or_tensor_type_same({"l_dtype": l_dtype}, valid_types, self.name) validator.check_scalar_or_tensor_type_same({"g_dtype": g_dtype}, valid_types, self.name) 
validator.check_scalar_or_tensor_type_same({"m_dtype": m_dtype}, valid_types, self.name) + if self.is_tbe: + return g_dtype, g_dtype return g_dtype @@ -1608,6 +1659,44 @@ class L2Loss(PrimitiveWithInfer): return x_type +class DataFormatDimMap(PrimitiveWithInfer): + """ + Returns the dimension index in the destination data format given the one in the source data format. + + Args: + src_format (string): An optional value for source data format. Default: 'NHWC'. + dst_format (string): An optional value for destination data format. Default: 'NCHW'. + + Inputs: + - **input_x** (Tensor) - A Tensor with each element as a dimension index in source data format. + Must be in the range [-4, 4). It's type is int32. + + Outputs: + Tensor, has the same type as the `input_x`. + + Examples: + >>> x = Tensor([0, 1, 2, 3], mindspore.int32) + >>> dfdm = P.DataFormatDimMap() + >>> dfdm(x) + [0 3 1 2] + """ + + @prim_attr_register + def __init__(self, src_format='NHWC', dst_format='NCHW'): + valid_values = ['NHWC', 'NCHW'] + self.src_format = validator.check_string("src_format", src_format, valid_values, self.name) + self.dst_format = validator.check_string("dst_format", dst_format, valid_values, self.name) + self.init_prim_io_names(inputs=['input_x'], outputs=['output']) + + def infer_shape(self, x_shape): + return x_shape + + def infer_dtype(self, x_type): + validator.check_subclass("x", x_type, mstype.tensor, self.name) + valid_types = [mstype.int32] + validator.check_tensor_type_same({"x": x_type}, valid_types, self.name) + return x_type + class RNNTLoss(PrimitiveWithInfer): """ Computes the RNNTLoss and its gradient with respect to the softmax outputs. @@ -1762,9 +1851,9 @@ class ApplyRMSProp(PrimitiveWithInfer): - **moment** (Tensor) - Delta of `var`, must have the same type as `var`. - **learning_rate** (Union[Number, Tensor]) - Learning rate. - **grad** (Tensor) - Gradients, must have the same type as `var`. - - **decay** (float) - Decay rate. 
- - **momentum** (float) - Momentum. - - **epsilon** (float) - Ridge term. + - **decay** (float) - Decay rate. Only constant value is allowed. + - **momentum** (float) - Momentum. Only constant value is allowed. + - **epsilon** (float) - Ridge term. Only constant value is allowed. Outputs: Tensor, parameters to be update. @@ -1814,6 +1903,10 @@ class ApplyRMSProp(PrimitiveWithInfer): return var_dtype, var_dtype, var_dtype return var_dtype + def infer_value(self, var, mean_square, moment, learning_rate, grad, decay, momentum, epsilon): + if decay is None or momentum is None or epsilon is None: + raise ValueError(f"For {self.name}, decay, momentum, epsilon must be const.") + class ApplyCenteredRMSProp(PrimitiveWithInfer): """ @@ -1862,18 +1955,23 @@ class ApplyCenteredRMSProp(PrimitiveWithInfer): Examples: >>> centered_rms_prop = P.ApplyCenteredRMSProp() - >>> input_x = Tensor(1., mindspore.float32) - >>> mean_grad = Tensor(2., mindspore.float32) - >>> mean_square = Tensor(1., mindspore.float32) - >>> moment = Tensor(2., mindspore.float32) - >>> grad = Tensor(1., mindspore.float32) + >>> input_x = Tensor(np.arange(-6, 6).astype(np.float32).reshape(2, 3, 2), mindspore.float32) + >>> mean_grad = Tensor(np.arange(12).astype(np.float32).reshape(2, 3, 2), mindspore.float32) + >>> mean_square = Tensor(np.arange(-8, 4).astype(np.float32).reshape(2, 3, 2), mindspore.float32) + >>> moment = Tensor(np.arange(12).astype(np.float32).reshape(2, 3, 2), mindspore.float32) + >>> grad = Tensor(np.arange(12).astype(np.float32).reshape(2, 3, 2), mindspore.float32) + >>> learning_rate = Tensor(0.9, mindspore.float32) >>> decay = 0.0 >>> momentum = 1e-10 - >>> epsilon = 0.001 + >>> epsilon = 0.05 >>> result = centered_rms_prop(input_x, mean_grad, mean_square, moment, grad, >>> learning_rate, decay, momentum, epsilon) - -27.460497 + [[[ -6.
-9.024922] + [-12.049845 -15.074766] + [-18.09969 -21.124613]] + [[-24.149532 -27.174456] + [-30.199379 -33.2243 ] + [-36.249226 -39.274143]]] """ @prim_attr_register @@ -1910,7 +2008,7 @@ class LayerNorm(Primitive): `Layer Normalization `_. .. math:: - y = \frac{x - mean]}{\sqrt{variance + \epsilon}} * \gamma + \beta + y = \frac{x - mean}{\sqrt{variance + \epsilon}} * \gamma + \beta where :math:`\gamma` is scale, :math:`\beta` is bias, :math:`\epsilon` is epsilon. @@ -2059,7 +2157,7 @@ class DropoutDoMask(PrimitiveWithInfer): >>> dropout_do_mask = P.DropoutDoMask() >>> mask = dropout_gen_mask(shape, keep_prob) >>> output = dropout_do_mask(x, mask, keep_prob) - >>> assert output.shape() == (20, 16, 50) + >>> assert output.shape == (20, 16, 50) """ @prim_attr_register @@ -2113,10 +2211,10 @@ class ResizeBilinear(PrimitiveWithInfer): Tensor, resized image. Tensor of shape `(N_i, ..., N_n, new_height, new_width)` in `float32`. Examples: - >>> tensor = Tensor([[[[1, 2, 3, 4, 5], [1, 2, 3, 4, 5]]]], mindspore.int32) + >>> tensor = Tensor([[[[1, 2, 3, 4, 5], [1, 2, 3, 4, 5]]]], mindspore.float32) >>> resize_bilinear = P.ResizeBilinear((5, 5)) >>> result = resize_bilinear(tensor) - >>> assert result.shape() == (5, 5) + >>> assert result.shape == (1, 1, 5, 5) """ @prim_attr_register @@ -2132,6 +2230,7 @@ class ResizeBilinear(PrimitiveWithInfer): return out_shape def infer_dtype(self, input_dtype): + validator.check_tensor_type_same({'input_dtype': input_dtype}, [mstype.float16, mstype.float32], self.name) return mstype.tensor_type(mstype.float32) @@ -2701,9 +2800,25 @@ class Adam(PrimitiveWithInfer): - **v** (Tensor) - The same shape and data type as `v`. Examples: - Please refer to the usage in nn.Adam. 
+ >>> import numpy as np + >>> import mindspore.nn as nn + >>> from mindspore import Tensor, Parameter + >>> from mindspore.ops import operations as P + >>> class Net(nn.Cell): + >>> def __init__(self): + >>> super(Net, self).__init__() + >>> self.apply_adam = P.Adam() + >>> self.var = Parameter(Tensor(np.ones([3, 3, 3]).astype(np.float32)), name="var") + >>> self.m = Parameter(Tensor(np.ones([3, 3, 3]).astype(np.float32)), name="m") + >>> self.v = Parameter(Tensor(np.ones([3, 3, 3]).astype(np.float32)), name="v") + >>> def construct(self, beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad): + >>> out = self.apply_adam(self.var, self.m, self.v, beta1_power, beta2_power, lr, beta1, beta2, + >>> epsilon, grad) + >>> return out + >>> net = Net() + >>> gradient = Tensor(np.random.rand(3, 3, 3).astype(np.float32)) + >>> result = net(0.9, 0.999, 0.001, 0.9, 0.999, 1e-8, gradient) """ - @prim_attr_register def __init__(self, use_locking=False, use_nesterov=False): validator.check_value_type("use_locking", use_locking, [bool], self.name) @@ -2727,6 +2842,274 @@ class Adam(PrimitiveWithInfer): return var_dtype, m_dtype, v_dtype +class SparseApplyAdam(PrimitiveWithInfer): + r""" + Merge the duplicate value of the gradient and then updates parameters by Adaptive Moment Estimation (Adam) + algorithm. This operator is used when the gradient is sparse. + + The Adam algorithm is proposed in `Adam: A Method for Stochastic Optimization `_. + + The updating formulas are as follows, + + .. 
math:: + \begin{array}{ll} \\ + m = \beta_1 * m + (1 - \beta_1) * g \\ + v = \beta_2 * v + (1 - \beta_2) * g * g \\ + l = \alpha * \frac{\sqrt{1-\beta_2^t}}{1-\beta_1^t} \\ + w = w - l * \frac{m}{\sqrt{v} + \epsilon} + \end{array} + + :math:`m` represents the 1st moment vector, :math:`v` represents the 2nd moment vector, :math:`g` represents + `gradient`, :math:`l` represents scaling factor `lr`, :math:`\beta_1, \beta_2` represent `beta1` and `beta2`, + :math:`t` represents updating step while :math:`beta_1^t` and :math:`beta_2^t` represent `beta1_power` and + `beta2_power`, :math:`\alpha` represents `learning_rate`, :math:`w` represents `var`, :math:`\epsilon` represents + `epsilon`. + + Args: + use_locking (bool): Whether to enable a lock to protect updating variable tensors. + If True, updating of the var, m, and v tensors will be protected by a lock. + If False, the result is unpredictable. Default: False. + use_nesterov (bool): Whether to use Nesterov Accelerated Gradient (NAG) algorithm to update the gradients. + If True, updates the gradients using NAG. + If False, updates the gradients without using NAG. Default: False. + + Inputs: + - **var** (Parameter) - Parameters to be updated. With float32 data type. + - **m** (Parameter) - The 1st moment vector in the updating formula. Has the same type as `var`. With + float32 data type. + - **v** (Parameter) - The 2nd moment vector in the updating formula. Mean square gradients, + has the same type as `var`. With float32 data type. + - **beta1_power** (Tensor) - :math:`beta_1^t` in the updating formula. With float32 data type. + - **beta2_power** (Tensor) - :math:`beta_2^t` in the updating formula. With float32 data type. + - **lr** (Tensor) - :math:`l` in the updating formula. With float32 data type. + - **beta1** (Tensor) - The exponential decay rate for the 1st moment estimates. With float32 data type. + - **beta2** (Tensor) - The exponential decay rate for the 2nd moment estimates. With float32 data type. 
+ - **epsilon** (Tensor) - Term added to the denominator to improve numerical stability. With float32 data type. + - **gradient** (Tensor) - Gradient value. With float32 data type. + - **indices** (Tensor) - Gradient indices. With int32 data type. + + Outputs: + Tuple of 3 Tensor, the updated parameters. + + - **var** (Tensor) - The same shape and data type as `var`. + - **m** (Tensor) - The same shape and data type as `m`. + - **v** (Tensor) - The same shape and data type as `v`. + + Examples: + >>> import numpy as np + >>> import mindspore.nn as nn + >>> from mindspore import Tensor, Parameter + >>> from mindspore.ops import operations as P + >>> import mindspore.common.dtype as mstype + >>> class Net(nn.Cell): + >>> def __init__(self): + >>> super(Net, self).__init__() + >>> self.sparse_apply_adam = P.SparseApplyAdam() + >>> self.var = Parameter(Tensor(np.ones([3, 1, 2]).astype(np.float32)), name="var") + >>> self.m = Parameter(Tensor(np.ones([3, 1, 2]).astype(np.float32)), name="m") + >>> self.v = Parameter(Tensor(np.ones([3, 1, 2]).astype(np.float32)), name="v") + >>> def construct(self, beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad, indices): + >>> out = self.sparse_apply_adam(self.var, self.m, self.v, beta1_power, beta2_power, lr, beta1, beta2, + >>> epsilon, grad, indices) + >>> return out + >>> net = Net() + >>> beta1_power = Tensor(0.9, mstype.float32) + >>> beta2_power = Tensor(0.999, mstype.float32) + >>> lr = Tensor(0.001, mstype.float32) + >>> beta1 = Tensor(0.9, mstype.float32) + >>> beta2 = Tensor(0.999, mstype.float32) + >>> epsilon = Tensor(1e-8, mstype.float32) + >>> gradient = Tensor(np.random.rand(2, 1, 2), mstype.float32) + >>> indices = Tensor([0, 1], mstype.int32) + >>> result = net(beta1_power, beta2_power, lr, beta1, beta2, epsilon, gradient, indices) + """ + __mindspore_signature__ = ( + ('var', sig_rw.RW_WRITE, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T), + ('m', sig_rw.RW_WRITE, 
sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T), + ('v', sig_rw.RW_WRITE, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T), + ('beta1_power', sig_rw.RW_READ, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, + sig_dtype.T), + ('beta2_power', sig_rw.RW_READ, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, + sig_dtype.T), + ('lr', sig_rw.RW_READ, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, + sig_dtype.T), + ('beta1', sig_rw.RW_READ, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, + sig_dtype.T), + ('beta2', sig_rw.RW_READ, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, + sig_dtype.T), + ('epsilon', sig_rw.RW_READ, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, + sig_dtype.T), + ('grad', sig_rw.RW_READ, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T), + ('indices', sig_rw.RW_READ, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T1) + ) + + @prim_attr_register + def __init__(self, use_locking=False, use_nesterov=False): + validator.check_value_type("use_locking", use_locking, [bool], self.name) + validator.check_value_type("use_nesterov", use_nesterov, [bool], self.name) + self.init_prim_io_names(inputs=['var', 'm', 'v', 'beta1_power', 'beta2_power', 'lr', 'beta1', 'beta2', + 'epsilon', 'grad', 'indices'], + outputs=['var', 'm', 'v']) + + def infer_shape(self, var_shape, m_shape, v_shape, beta1_power_shape, beta2_power_shape, lr_shape, + beta1_shape, beta2_shape, epsilon_shape, grad_shape, indices_shape): + validator.check("var_shape", var_shape, "m_shape", m_shape, Rel.EQ, self.name) + validator.check("var_shape", var_shape, "v_shape", v_shape, Rel.EQ, self.name) + validator.check_integer("indices rank", len(indices_shape), 1, Rel.EQ, self.name) + validator.check('grad_shape[0]', grad_shape[0], 
'indices_shape[0]', indices_shape[0], Rel.EQ, self.name) + if len(var_shape) > 1 and grad_shape != indices_shape + var_shape[1:]: + raise ValueError(f"For '{self.name}', the shape of updates should be [] or " + f"grad_shape = indices_shape + var_shape[1:], but got var_shape: {var_shape}, " + f"indices_shape: {indices_shape}, grad_shape: {grad_shape}.") + return var_shape, m_shape, v_shape + + def infer_dtype(self, var_dtype, m_dtype, v_dtype, beta1_power_dtype, beta2_power_dtype, lr_dtype, + beta1_dtype, beta2_dtype, epsilon_dtype, grad_dtype, indices_dtype): + args = {"var": var_dtype, "m": m_dtype, "v": v_dtype, "grad": grad_dtype} + validator.check_tensor_type_same(args, mstype.number_type, self.name) + + args = {"beta1_power": beta1_power_dtype, "beta2_power": beta2_power_dtype, 'lr': lr_dtype, + "beta1": beta1_dtype, "beta2": beta2_dtype, "epsilon": epsilon_dtype} + validator.check_scalar_or_tensor_type_same(args, [mstype.float16, mstype.float32], self.name, True) + validator.check_tensor_type_same({"indices_dtype": indices_dtype}, [mstype.int32], self.name) + return var_dtype, m_dtype, v_dtype + + +class SparseApplyLazyAdam(PrimitiveWithInfer): + r""" + Merge the duplicate value of the gradient and then updates parameters by Adaptive Moment Estimation (Adam) + algorithm. This operator is used when the gradient is sparse. The behavior is not equivalent to the + original Adam algorithm, as only the current indices parameters will be updated. + + The Adam algorithm is proposed in `Adam: A Method for Stochastic Optimization `_. + + The updating formulas are as follows, + + .. 
math:: + \begin{array}{ll} \\ + m = \beta_1 * m + (1 - \beta_1) * g \\ + v = \beta_2 * v + (1 - \beta_2) * g * g \\ + l = \alpha * \frac{\sqrt{1-\beta_2^t}}{1-\beta_1^t} \\ + w = w - l * \frac{m}{\sqrt{v} + \epsilon} + \end{array} + + :math:`m` represents the 1st moment vector, :math:`v` represents the 2nd moment vector, :math:`g` represents + `gradient`, :math:`l` represents scaling factor `lr`, :math:`\beta_1, \beta_2` represent `beta1` and `beta2`, + :math:`t` represents updating step while :math:`beta_1^t` and :math:`beta_2^t` represent `beta1_power` and + `beta2_power`, :math:`\alpha` represents `learning_rate`, :math:`w` represents `var`, :math:`\epsilon` represents + `epsilon`. + + Args: + use_locking (bool): Whether to enable a lock to protect updating variable tensors. + If True, updating of the var, m, and v tensors will be protected by a lock. + If False, the result is unpredictable. Default: False. + use_nesterov (bool): Whether to use Nesterov Accelerated Gradient (NAG) algorithm to update the gradients. + If True, updates the gradients using NAG. + If False, updates the gradients without using NAG. Default: False. + + Inputs: + - **var** (Parameter) - Parameters to be updated. With float32 data type. + - **m** (Parameter) - The 1st moment vector in the updating formula. Has the same type as `var`. With + float32 data type. + - **v** (Parameter) - The 2nd moment vector in the updating formula. Mean square gradients, + has the same type as `var`. With float32 data type. + - **beta1_power** (Tensor) - :math:`beta_1^t` in the updating formula. With float32 data type. + - **beta2_power** (Tensor) - :math:`beta_2^t` in the updating formula. With float32 data type. + - **lr** (Tensor) - :math:`l` in the updating formula. With float32 data type. + - **beta1** (Tensor) - The exponential decay rate for the 1st moment estimates. With float32 data type. + - **beta2** (Tensor) - The exponential decay rate for the 2nd moment estimates. With float32 data type. 
+ - **epsilon** (Tensor) - Term added to the denominator to improve numerical stability. With float32 data type. + - **gradient** (Tensor) - Gradient value. With float32 data type. + - **indices** (Tensor) - Gradient indices. With int32 data type. + + Outputs: + Tuple of 3 Tensor, the updated parameters. + + - **var** (Tensor) - The same shape and data type as `var`. + - **m** (Tensor) - The same shape and data type as `m`. + - **v** (Tensor) - The same shape and data type as `v`. + + Examples: + >>> import numpy as np + >>> import mindspore.nn as nn + >>> from mindspore import Tensor, Parameter + >>> from mindspore.ops import operations as P + >>> import mindspore.common.dtype as mstype + >>> class Net(nn.Cell): + >>> def __init__(self): + >>> super(Net, self).__init__() + >>> self.sparse_apply_lazyadam = P.SparseApplyLazyAdam() + >>> self.var = Parameter(Tensor(np.ones([3, 1, 2]).astype(np.float32)), name="var") + >>> self.m = Parameter(Tensor(np.ones([3, 1, 2]).astype(np.float32)), name="m") + >>> self.v = Parameter(Tensor(np.ones([3, 1, 2]).astype(np.float32)), name="v") + >>> def construct(self, beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad, indices): + >>> out = self.sparse_apply_lazyadam(self.var, self.m, self.v, beta1_power, beta2_power, lr, beta1, + >>> beta2, epsilon, grad, indices) + >>> return out + >>> net = Net() + >>> beta1_power = Tensor(0.9, mstype.float32) + >>> beta2_power = Tensor(0.999, mstype.float32) + >>> lr = Tensor(0.001, mstype.float32) + >>> beta1 = Tensor(0.9, mstype.float32) + >>> beta2 = Tensor(0.999, mstype.float32) + >>> epsilon = Tensor(1e-8, mstype.float32) + >>> gradient = Tensor(np.random.rand(2, 1, 2), mstype.float32) + >>> indices = Tensor([0, 1], mstype.int32) + >>> result = net(beta1_power, beta2_power, lr, beta1, beta2, epsilon, gradient, indices) + """ + __mindspore_signature__ = ( + ('var', sig_rw.RW_WRITE, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T), + ('m', 
sig_rw.RW_WRITE, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T), + ('v', sig_rw.RW_WRITE, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T), + ('beta1_power', sig_rw.RW_READ, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, + sig_dtype.T), + ('beta2_power', sig_rw.RW_READ, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, + sig_dtype.T), + ('lr', sig_rw.RW_READ, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, + sig_dtype.T), + ('beta1', sig_rw.RW_READ, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, + sig_dtype.T), + ('beta2', sig_rw.RW_READ, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, + sig_dtype.T), + ('epsilon', sig_rw.RW_READ, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, + sig_dtype.T), + ('grad', sig_rw.RW_READ, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T), + ('indices', sig_rw.RW_READ, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T1) + ) + + @prim_attr_register + def __init__(self, use_locking=False, use_nesterov=False): + validator.check_value_type("use_locking", use_locking, [bool], self.name) + validator.check_value_type("use_nesterov", use_nesterov, [bool], self.name) + self.init_prim_io_names(inputs=['var', 'm', 'v', 'beta1_power', 'beta2_power', 'lr', 'beta1', 'beta2', + 'epsilon', 'grad', 'indices'], + outputs=['var', 'm', 'v']) + + def infer_shape(self, var_shape, m_shape, v_shape, beta1_power_shape, beta2_power_shape, lr_shape, + beta1_shape, beta2_shape, epsilon_shape, grad_shape, indices_shape): + validator.check("var_shape", var_shape, "m_shape", m_shape, Rel.EQ, self.name) + validator.check("var_shape", var_shape, "v_shape", v_shape, Rel.EQ, self.name) + validator.check_integer("indices rank", len(indices_shape), 1, Rel.EQ, self.name) + validator.check('grad_shape[0]', grad_shape[0], 
'indices_shape[0]', indices_shape[0], Rel.EQ, self.name) + if len(var_shape) > 1 and grad_shape != indices_shape + var_shape[1:]: + raise ValueError(f"For '{self.name}', the shape of updates should be [] or " + f"grad_shape = indices_shape + var_shape[1:], but got var_shape: {var_shape}, " + f"indices_shape: {indices_shape}, grad_shape: {grad_shape}.") + return var_shape, m_shape, v_shape + + def infer_dtype(self, var_dtype, m_dtype, v_dtype, beta1_power_dtype, beta2_power_dtype, lr_dtype, + beta1_dtype, beta2_dtype, epsilon_dtype, grad_dtype, indices_dtype): + args = {"var": var_dtype, "m": m_dtype, "v": v_dtype, "grad": grad_dtype} + validator.check_tensor_type_same(args, mstype.number_type, self.name) + + args = {"beta1_power": beta1_power_dtype, "beta2_power": beta2_power_dtype, 'lr': lr_dtype, + "beta1": beta1_dtype, "beta2": beta2_dtype, "epsilon": epsilon_dtype} + validator.check_scalar_or_tensor_type_same(args, [mstype.float16, mstype.float32], self.name, True) + + validator.check_tensor_type_same({"indices_dtype": indices_dtype}, [mstype.int32], self.name) + return var_dtype, m_dtype, v_dtype + + class BinaryCrossEntropy(PrimitiveWithInfer): r""" Computes the Binary Cross Entropy between the target and the output. @@ -2807,6 +3190,283 @@ class BinaryCrossEntropy(PrimitiveWithInfer): return x_type +class ApplyAdaMax(PrimitiveWithInfer): + r""" + Update relevant entries according to the adamax scheme. + + The updating formulas are as follows, + + .. 
math:: + \begin{array}{ll} \\ + m_{t} = \beta_1 * m_{t-1} + (1 - \beta_1) * g \\ + v_{t} = \max(\beta_2 * v_{t-1}, \left| g \right|) \\ + var = var - \frac{l}{1 - \beta_1^t} * \frac{m_{t}}{v_{t} + \epsilon} + \end{array} + + :math:`t` represents updating step while, :math:`m` represents the 1st moment vector, :math:`m_{t-1}` + is the last moment of :math:`m_{t}`, :math:`v` represents the 2nd moment vector, :math:`v_{t-1}` + is the last moment of :math:`v_{t}`, :math:`l` represents scaling factor `lr`, + :math:`g` represents `grad`, :math:`\beta_1, \beta_2` represent `beta1` and `beta2`, + :math:`beta_1^t` represent `beta1_power`, :math:`var` represents Variable to be updated, + :math:`\epsilon` represents `epsilon`. + + Inputs: + - **var** (Parameter) - Variable to be updated. + - **m** (Parameter) - The 1st moment vector in the updating formula. Has the same shape and type as `var`. + - **v** (Parameter) - The 2nd moment vector in the updating formula. Mean square gradients, + has the same shape and type as `var`. + - **beta1_power** (float) - :math:`beta_1^t` in the updating formula. + - **lr** (float) - Learning rate, :math:`l` in the updating formula. Has the same type as `var`. + - **beta1** (float) - The exponential decay rate for the 1st moment estimates. + - **beta2** (float) - The exponential decay rate for the 2nd moment estimates. + - **epsilon** (float) - A small value added for numerical stability. + - **grad** (Tensor) - A tensor for gradient. Has the same shape and type as `var`. + + Outputs: + Tuple of 3 Tensor, the updated parameters. + + - **var** (Tensor) - The same shape and data type as `var`. + - **m** (Tensor) - The same shape and data type as `m`. + - **v** (Tensor) - The same shape and data type as `v`.
+ + Examples: + >>> var = Parameter(Tensor(np.random.rand(3, 3).astype(np.float32)), name="var") + >>> m = Parameter(Tensor(np.random.rand(3, 3).astype(np.float32)), name="m") + >>> v = Parameter(Tensor(np.random.rand(3, 3).astype(np.float32)), name="v") + >>> grad = Tensor(np.random.rand(3, 3).astype(np.float32)) + >>> beta1_power = 0.9 + >>> lr = 0.001 + >>> beta1 = 0.9 + >>> beta2 = 0.99 + >>> epsilon = 1e-10 + >>> apply_ada_max = P.ApplyAdaMax() + >>> output = apply_ada_max(var, m, v, beta1_power, lr, beta1, beta2, epsilon, grad) + """ + + __mindspore_signature__ = ( + ('var', sig_rw.RW_WRITE, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T), + ('m', sig_rw.RW_WRITE, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T), + ('v', sig_rw.RW_WRITE, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T), + ('beta1_power', sig_rw.RW_READ, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, + sig_dtype.T), + ('lr', sig_rw.RW_READ, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T), + ('beta1', sig_rw.RW_READ, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T), + ('beta2', sig_rw.RW_READ, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T), + ('epsilon', sig_rw.RW_READ, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T), + ('grad', sig_rw.RW_READ, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T) + ) + + @prim_attr_register + def __init__(self): + """init ApplyAdaMax""" + + def infer_shape(self, var_shape, m_shape, v_shape, beta1_power_shape, lr_shape, + beta1_shape, beta2_shape, epsilon_shape, grad_shape): + validator.check("var_shape", var_shape, "m_shape", m_shape, Rel.EQ, self.name) + validator.check("var_shape", var_shape, "v_shape", v_shape, Rel.EQ, self.name) + validator.check("var_shape", var_shape, 
"grad_shape", grad_shape, Rel.EQ, self.name) + return var_shape, m_shape, v_shape + + def infer_dtype(self, var_dtype, m_dtype, v_dtype, beta1_power_dtype, lr_dtype, + beta1_dtype, beta2_dtype, epsilon_dtype, grad_dtype): + args = {"var": var_dtype, "m": m_dtype, "v": v_dtype, "grad": grad_dtype} + validator.check_tensor_type_same(args, mstype.number_type, self.name) + + scalar_args = {"beta1_power": beta1_power_dtype, 'lr': lr_dtype, "beta1": beta1_dtype, + "beta2": beta2_dtype, "epsilon": epsilon_dtype} + validator.check_scalar_or_tensor_type_same(scalar_args, [mstype.float16, mstype.float32], self.name, True) + return var_dtype, m_dtype, v_dtype + + +class ApplyAdadelta(PrimitiveWithInfer): + r""" + Update relevant entries according to the adadelta scheme. + + .. math:: + accum = \rho * accum + (1 - \rho) * grad^2 + .. math:: + \text{update} = \sqrt{\text{accum_update} + \epsilon} * \frac{grad}{\sqrt{accum + \epsilon}} + .. math:: + \text{accum_update} = \rho * \text{accum_update} + (1 - \rho) * update^2 + .. math:: + var -= lr * update + + Inputs: + - **var** (Parameter) - Weights to be updated. + - **accum** (Parameter) - Accum to be updated, has the same shape and type as `var`. + - **accum_update** (Parameter) - Accum_update to be updated, has the same shape and type as `var`. + - **lr** (float) - Learning rate, has the same type as `var`. + - **rho** (float) - Decay rate. + - **epsilon** (float) - A small value added for numerical stability. + - **grad** (Tensor) - Gradients, has the same shape and type as `var`. + + Outputs: + Tuple of 3 Tensor, the updated parameters. + + - **var** (Tensor) - The same shape and data type as `var`. + - **accum** (Tensor) - The same shape and data type as `accum`. + - **accum_update** (Tensor) - The same shape and data type as `accum_update`. 
+ + Examples: + >>> var = Parameter(Tensor(np.random.rand(3, 3).astype(np.float32)), name="var") + >>> accum = Parameter(Tensor(np.random.rand(3, 3).astype(np.float32)), name="accum") + >>> accum_update = Parameter(Tensor(np.random.rand(3, 3).astype(np.float32)), name="accum_update") + >>> grad = Tensor(np.random.rand(3, 3).astype(np.float32)) + >>> lr = 0.001 + >>> rho = 0.0 + >>> epsilon = 1e-6 + >>> apply_adadelta = P.ApplyAdadelta() + >>> output = apply_adadelta(var, accum, accum_update, lr, rho, epsilon, grad) + """ + + __mindspore_signature__ = ( + ('var', sig_rw.RW_WRITE, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T), + ('accum', sig_rw.RW_WRITE, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T), + ('accum_update', sig_rw.RW_WRITE, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, + sig_dtype.T), + ('lr', sig_rw.RW_READ, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T), + ('rho', sig_rw.RW_READ, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T), + ('epsilon', sig_rw.RW_READ, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T), + ('grad', sig_rw.RW_READ, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T) + ) + + @prim_attr_register + def __init__(self): + """init ApplyAdadelta""" + + def infer_shape(self, var_shape, accum_shape, accum_update_shape, lr_shape, rho_shape, + epsilon_shape, grad_shape): + validator.check("var_shape", var_shape, "accum_shape", accum_shape, Rel.EQ, self.name) + validator.check("var_shape", var_shape, "accum_update_shape", accum_update_shape, Rel.EQ, self.name) + validator.check("var_shape", var_shape, "grad_shape", grad_shape, Rel.EQ, self.name) + return var_shape, accum_shape, accum_update_shape + + def infer_dtype(self, var_dtype, accum_dtype, accum_update_dtype, lr_dtype, rho_shape, + epsilon_dtype, grad_dtype): + args = 
{"var": var_dtype, "accum": accum_dtype, "accum_update": accum_update_dtype, "grad": grad_dtype} + validator.check_tensor_type_same(args, mstype.number_type, self.name) + + scalar_args = {"lr": lr_dtype, "rho": rho_shape, "epsilon": epsilon_dtype} + validator.check_scalar_or_tensor_type_same(scalar_args, [mstype.float16, mstype.float32], self.name, True) + return var_dtype, accum_dtype, accum_update_dtype + + +class ApplyAdagrad(PrimitiveWithInfer): + r""" + Update relevant entries according to the adagrad scheme. + + .. math:: + accum += grad * grad + .. math:: + var -= lr * grad * \frac{1}{\sqrt{accum}} + + Args: + update_slots (bool): If `True`, `accum` will be updated. Default: True. + + Inputs: + - **var** (Parameter) - Variable to be updated. + - **accum** (Parameter) - Accum to be updated. The shape and dtype should be the same as `var`. + - **lr** (float): The learning rate value, has the same type as `var`. + - **grad** (Tensor) - A tensor for gradient. The shape and dtype should be the same as `var`. + + Outputs: + Tuple of 2 Tensor, the updated parameters. + + - **var** (Tensor) - The same shape and data type as `var`. + - **accum** (Tensor) - The same shape and data type as `accum`. 
+ + Examples: + >>> var = Parameter(Tensor(np.random.rand(3, 3).astype(np.float32)), name="var") + >>> accum = Parameter(Tensor(np.random.rand(3, 3).astype(np.float32)), name="accum") + >>> grad = Tensor(np.random.rand(3, 3).astype(np.float32)) + >>> lr = 0.01 + >>> apply_adagrad = P.ApplyAdagrad() + >>> output = apply_adagrad(var, accum, lr, grad) + """ + + __mindspore_signature__ = ( + ('var', sig_rw.RW_WRITE, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T), + ('accum', sig_rw.RW_WRITE, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T), + ('lr', sig_rw.RW_READ, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T), + ('grad', sig_rw.RW_READ, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T) + ) + + @prim_attr_register + def __init__(self, update_slots=True): + validator.check_value_type("update_slots", update_slots, [bool], self.name) + + def infer_shape(self, var_shape, accum_shape, lr_shape, grad_shape): + validator.check('var shape', var_shape, 'accum shape', accum_shape, Rel.EQ, self.name) + validator.check('var shape', var_shape, 'grad shape', grad_shape, Rel.EQ, self.name) + return var_shape, accum_shape + + def infer_dtype(self, var_dtype, accum_dtype, lr_dtype, grad_dtype): + args = {'var': var_dtype, 'accum': accum_dtype, 'grad': grad_dtype} + validator.check_tensor_type_same(args, mstype.number_type, self.name) + valid_types = [mstype.float16, mstype.float32] + validator.check_scalar_or_tensor_type_same({'lr': lr_dtype}, valid_types, self.name) + return var_dtype, accum_dtype + + +class ApplyAdagradV2(PrimitiveWithInfer): + r""" + Update relevant entries according to the adagradv2 scheme. + + .. math:: + accum += grad * grad + .. math:: + var -= lr * grad * \frac{1}{\sqrt{accum} + \epsilon} + + Args: + epsilon (float): A small value added for numerical stability. + update_slots (bool): If `True`, `accum` will be updated. 
Default: True. + + Inputs: + - **var** (Parameter) - Variable to be updated. + - **accum** (Parameter) - Accum to be updated. The shape and dtype should be the same as `var`. + - **lr** (float): The learning rate value, has the same type as `var`. + - **grad** (Tensor) - A tensor for gradient. The shape and dtype should be the same as `var`. + + Outputs: + Tuple of 2 Tensor, the updated parameters. + + - **var** (Tensor) - The same shape and data type as `var`. + - **accum** (Tensor) - The same shape and data type as `m`. + + Examples: + >>> var = Parameter(Tensor(np.random.rand(3, 3).astype(np.float32)), name="var") + >>> accum = Parameter(Tensor(np.random.rand(3, 3).astype(np.float32)), name="accum") + >>> grad = Tensor(np.random.rand(3, 3).astype(np.float32)) + >>> lr = 0.01 + >>> apply_adagrad_v2 = P.ApplyAdagradV2(epsilon=1e-6) + >>> output = apply_adagrad_v2(var, accum, lr, grad) + """ + + __mindspore_signature__ = ( + ('var', sig_rw.RW_WRITE, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T), + ('accum', sig_rw.RW_WRITE, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T), + ('lr', sig_rw.RW_READ, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T), + ('grad', sig_rw.RW_READ, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T) + ) + + @prim_attr_register + def __init__(self, epsilon, update_slots=True): + validator.check_value_type("epsilon", epsilon, [float], self.name) + validator.check_value_type("update_slots", update_slots, [bool], self.name) + + def infer_shape(self, var_shape, accum_shape, lr_shape, grad_shape): + validator.check('var shape', var_shape, 'accum shape', accum_shape, Rel.EQ, self.name) + validator.check('var shape', var_shape, 'grad shape', grad_shape, Rel.EQ, self.name) + return var_shape, accum_shape + + def infer_dtype(self, var_dtype, accum_dtype, lr_dtype, grad_dtype): + args = {'var': var_dtype, 'accum': 
accum_dtype, 'grad': grad_dtype} + validator.check_tensor_type_same(args, mstype.number_type, self.name) + valid_types = [mstype.float16, mstype.float32] + validator.check_scalar_or_tensor_type_same({'lr': lr_dtype}, valid_types, self.name) + return var_dtype, accum_dtype + + class SparseApplyAdagrad(PrimitiveWithInfer): r""" Update relevant entries according to the adagrad scheme. @@ -2818,11 +3478,12 @@ class SparseApplyAdagrad(PrimitiveWithInfer): Args: lr (float): Learning rate. + update_slots (bool): If `True`, `accum` will be updated. Default: True. use_locking (bool): If True, updating of the var and accum tensors will be protected. Default: False. Inputs: - - **var** (Tensor) - Variable to be updated. The type must be float32. - - **accum** (Tensor) - Accum to be updated. The shape must be the same as `var`'s shape, + - **var** (Parameter) - Variable to be updated. The type must be float32. + - **accum** (Parameter) - Accum to be updated. The shape must be the same as `var`'s shape, the type must be float32. - **grad** (Tensor) - Gradient. The shape must be the same as `var`'s shape except first dimension, the type must be float32. @@ -2830,21 +3491,45 @@ class SparseApplyAdagrad(PrimitiveWithInfer): The shape of `indices` must be the same as `grad` in first dimension, the type must be int32. Outputs: - Tensor, has the same shape and type as `var`. + Tuple of 2 Tensor, the updated parameters. + + - **var** (Tensor) - The same shape and data type as `var`. + - **accum** (Tensor) - The same shape and data type as `accum`. 
Examples: - >>> var = Tensor(np.random.random((3, 3)), mindspore.float32) - >>> accum = Tensor(np.random.random((3, 3)), mindspore.float32) - >>> grad = Tensor(np.random.random((3, 3)), mindspore.float32) - >>> indices = Tensor(np.ones((3,), np.int32)) - >>> sparse_apply_ada_grad = P.SparseApplyAdagrad(0.5) - >>> sparse_apply_ada_grad(var, accum, grad, indices) + >>> import numpy as np + >>> import mindspore.nn as nn + >>> from mindspore import Tensor, Parameter + >>> from mindspore.ops import operations as P + >>> import mindspore.common.dtype as mstype + >>> class Net(nn.Cell): + >>> def __init__(self): + >>> super(Net, self).__init__() + >>> self.sparse_apply_adagrad = P.SparseApplyAdagrad(lr=1e-8) + >>> self.var = Parameter(Tensor(np.ones([3, 3, 3]).astype(np.float32)), name="var") + >>> self.accum = Parameter(Tensor(np.ones([3, 3, 3]).astype(np.float32)), name="accum") + >>> def construct(self, grad, indices): + >>> out = self.sparse_apply_adagrad(self.var, self.accum, grad, indices) + >>> return out + >>> net = Net() + >>> grad = Tensor(np.random.rand(3, 3, 3).astype(np.float32)) + >>> indices = Tensor([0, 1, 2], mstype.int32) + >>> result = net(grad, indices) """ + __mindspore_signature__ = ( + ('var', sig_rw.RW_WRITE, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T), + ('accum', sig_rw.RW_WRITE, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T), + ('grad', sig_rw.RW_READ, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T), + ('indices', sig_rw.RW_READ, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T1) + ) + @prim_attr_register - def __init__(self, lr, use_locking=False): - self.lr = validator.check_value_type("lr", lr, [float], self.name) - self.use_locking = validator.check_value_type("use_locking", use_locking, [bool], self.name) + def __init__(self, lr, update_slots=True, use_locking=False): + validator.check_value_type("lr", 
lr, [float], self.name) + validator.check_number_range("lr", lr, float("-inf"), float("inf"), Rel.INC_NEITHER, self.name) + validator.check_value_type("update_slots", update_slots, [bool], self.name) + validator.check_value_type("use_locking", use_locking, [bool], self.name) def infer_shape(self, var_shape, accum_shape, grad_shape, indices_shape): validator.check('var shape', var_shape, 'accum shape', accum_shape, Rel.EQ, self.name) @@ -2853,13 +3538,13 @@ class SparseApplyAdagrad(PrimitiveWithInfer): validator.check('var_shape[1:]', var_shape[1:], 'grad_shape[1:]', grad_shape[1:], Rel.EQ, self.name) validator.check_integer("indices rank", len(indices_shape), 1, Rel.EQ, self.name) validator.check('grad_shape[0]', grad_shape[0], 'indices_shape[0]', indices_shape[0], Rel.EQ, self.name) - return var_shape + return var_shape, accum_shape def infer_dtype(self, var_type, accum_type, grad_type, indices_type): args = {'var': var_type, 'accum': accum_type, 'grad': grad_type} validator.check_tensor_type_same(args, (mstype.float32,), self.name) validator.check_tensor_type_same({'indices': indices_type}, [mstype.int32], self.name) - return var_type + return var_type, accum_type class ApplyProximalAdagrad(PrimitiveWithInfer): @@ -2869,38 +3554,61 @@ class ApplyProximalAdagrad(PrimitiveWithInfer): .. math:: accum += grad * grad .. math:: - prox_v = var - lr * grad * \frac{1}{\sqrt{accum}} + \text{prox_v} = var - lr * grad * \frac{1}{\sqrt{accum}} .. math:: - var = \frac{sign(prox_v)}{1 + lr * l2} * \max(\left| prox_v \right| - lr * l1, 0) + var = \frac{sign(\text{prox_v})}{1 + lr * l2} * \max(\left| \text{prox_v} \right| - lr * l1, 0) Args: use_locking (bool): If True, updating of the var and accum tensors will be protected. Default: False. Inputs: - - **var** (Tensor) - Variable to be updated. - - **accum** (Tensor) - Accum to be updated. The shape must be the same as `var`'s shape. - - **lr** (Union[Number, Tensor]): The learning rate value, must be positive. 
It should be - a scalar tensor or number. + - **var** (Parameter) - Variable to be updated. The data type should be float. + - **accum** (Parameter) - Accum to be updated. Must has the same shape and dtype as `var`. + - **lr** (Union[Number, Tensor]): The learning rate value. It should be a scalar tensor or number. + The data type should be float. - **l1** (Union[Number, Tensor]): l1 regularization strength, must be greater than or equal to zero. - It should be a scalar tensor or number. + It should be a scalar tensor or number. The data type should be float. - **l2** (Union[Number, Tensor]): l2 regularization strength, must be greater than or equal to zero. - It should be a scalar tensor or number. - - **grad** (Tensor) - Gradient. The shape must be the same as `var`'s shape. + It should be a scalar tensor or number. The data type should be float. + - **grad** (Tensor) - Gradient. Must has the same shape and dtype as `var`. Outputs: - Tensor, has the same shape and type as `var`. + Tuple of 2 Tensor, the updated parameters. + + - **var** (Tensor) - The same shape and data type as `var`. + - **accum** (Tensor) - The same shape and data type as `accum`. 
Examples: - >>> var = Tensor(np.random.random((3, 3)), mindspore.float32) - >>> accum = Tensor(np.random.random((3, 3)), mindspore.float32) - >>> grad = Tensor(np.random.random((3, 3)), mindspore.float32) - >>> lr = 0.01 - >>> l1 = 0.0 - >>> l2 = 0.0 - >>> apply_proximal_ada_grad = P.ApplyProximalAdagrad() - >>> output = apply_proximal_ada_grad(var, accum, lr, l1, l2, grad) + >>> import numpy as np + >>> import mindspore.nn as nn + >>> from mindspore import Tensor, Parameter + >>> from mindspore.ops import operations as P + >>> class Net(nn.Cell): + >>> def __init__(self): + >>> super(Net, self).__init__() + >>> self.apply_proximal_adagrad = P.ApplyProximalAdagrad() + >>> self.var = Parameter(Tensor(np.random.rand(3, 3).astype(np.float32)), name="var") + >>> self.accum = Parameter(Tensor(np.random.rand(3, 3).astype(np.float32)), name="accum") + >>> self.lr = 0.01 + >>> self.l1 = 0.0 + >>> self.l2 = 0.0 + >>> def construct(self, grad): + >>> out = self.apply_proximal_adagrad(self.var, self.accum, self.lr, self.l1, self.l2, grad) + >>> return out + >>> net = Net() + >>> grad = Tensor(np.random.rand(3, 3).astype(np.float32)) + >>> output = net(grad) """ + __mindspore_signature__ = ( + ('var', sig_rw.RW_WRITE, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T), + ('accum', sig_rw.RW_WRITE, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T), + ('lr', sig_rw.RW_READ, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T), + ('l1', sig_rw.RW_READ, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T), + ('l2', sig_rw.RW_READ, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T), + ('grad', sig_rw.RW_READ, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T) + ) + @prim_attr_register def __init__(self, use_locking=False): self.init_prim_io_names(inputs=['var', 'accum', 'lr', 'l1', 'l2', 'grad'], 
outputs=['output']) @@ -2909,7 +3617,7 @@ class ApplyProximalAdagrad(PrimitiveWithInfer): def infer_shape(self, var_shape, accum_shape, lr_shape, l1_shape, l2_shape, grad_shape): validator.check('var shape', var_shape, 'accum shape', accum_shape, Rel.EQ, self.name) validator.check('var shape', var_shape, 'grad shape', grad_shape, Rel.EQ, self.name) - return var_shape + return var_shape, accum_shape def infer_dtype(self, var_dtype, accum_dtype, lr_dtype, l1_dtype, l2_dtype, grad_dtype): valid_types = [mstype.float16, mstype.float32] @@ -2917,7 +3625,7 @@ class ApplyProximalAdagrad(PrimitiveWithInfer): validator.check_tensor_type_same(args, valid_types, self.name) scalar_args = {"lr": lr_dtype, "l1": l1_dtype, "l2": l2_dtype} validator.check_scalar_or_tensor_type_same(scalar_args, valid_types, self.name) - return var_dtype + return var_dtype, accum_dtype class SparseApplyProximalAdagrad(PrimitiveWithInfer): @@ -2928,40 +3636,65 @@ class SparseApplyProximalAdagrad(PrimitiveWithInfer): .. math:: accum += grad * grad .. math:: - prox_v = var - lr * grad * \frac{1}{\sqrt{accum}} + \text{prox_v} = var - lr * grad * \frac{1}{\sqrt{accum}} .. math:: - var = \frac{sign(prox_v)}{1 + lr * l2} * \max(\left| prox_v \right| - lr * l1, 0) + var = \frac{sign(\text{prox_v})}{1 + lr * l2} * \max(\left| \text{prox_v} \right| - lr * l1, 0) Args: use_locking (bool): If True, updating of the var and accum tensors will be protected. Default: False. Inputs: - - **var** (Tensor) - Variable tensor to be updated. - - **accum** (Tensor) - Variable tensor to be updated. The shape must be the same as `var`'s shape. - - **lr** (Union[Number, Tensor]): The learning rate value, must be positive. It should be - a scalar tensor or number. + - **var** (Parameter) - Variable tensor to be updated. The data type must be float32. + - **accum** (Parameter) - Variable tensor to be updated. Has the same dtype as `var`. + - **lr** (Union[Number, Tensor]): The learning rate value. 
It should be a scalar tensor or number. + The data type must be float32. - **l1** (Union[Number, Tensor]): l1 regularization strength, must be greater than or equal to zero. - It should be a scalar tensor or number. + It should be a scalar tensor or number. The data type must be float32. - **l2** (Union[Number, Tensor]): l2 regularization strength, must be greater than or equal to zero. - It should be a scalar tensor or number. - - **grad** (Tensor) - A tensor of the same type as `var`, for the gradient. + It should be a scalar tensor or number. The data type must be float32. + - **grad** (Tensor) - A tensor of the same type as `var`, for the gradient. The data type must be float32. - **indices** (Tensor) - A vector of indices into the first dimension of `var` and `accum`. Outputs: - Tensor, has the same shape and type as `var`. + Tuple of 2 Tensor, the updated parameters. + + - **var** (Tensor) - The same shape and data type as `var`. + - **accum** (Tensor) - The same shape and data type as `accum`. 
Examples: - >>> var = Tensor(np.random.random((3, 3)), mindspore.float32) - >>> accum = Tensor(np.random.random((3, 3)), mindspore.float32) - >>> grad = Tensor(np.random.random((3, 3)), mindspore.float32) + >>> import numpy as np + >>> import mindspore.nn as nn + >>> from mindspore import Tensor, Parameter + >>> from mindspore.ops import operations as P + >>> class Net(nn.Cell): + >>> def __init__(self): + >>> super(Net, self).__init__() + >>> self.sparse_apply_proximal_adagrad = P.SparseApplyProximalAdagrad() + >>> self.var = Parameter(Tensor(np.random.rand(3, 3).astype(np.float32)), name="var") + >>> self.accum = Parameter(Tensor(np.random.rand(3, 3).astype(np.float32)), name="accum") + >>> self.lr = 0.01 + >>> self.l1 = 0.0 + >>> self.l2 = 0.0 + >>> def construct(self, grad, indices): + >>> out = self.sparse_apply_proximal_adagrad(self.var, self.accum, self.lr, self.l1, + self.l2, grad, indices) + >>> return out + >>> net = Net() + >>> grad = Tensor(np.random.rand(3, 3).astype(np.float32)) >>> indices = Tensor(np.ones((3,), np.int32)) - >>> lr = 0.01 - >>> l1 = 0.0 - >>> l2 = 0.0 - >>> sparse_apply_proximal_ada_grad = P.SparseApplyProximalAdagrad() - >>> output = sparse_apply_proximal_ada_grad(var, accum, lr, l1, l2, grad, indices) + >>> output = net(grad, indices) """ + __mindspore_signature__ = ( + ('var', sig_rw.RW_WRITE, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T), + ('accum', sig_rw.RW_WRITE, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T), + ('lr', sig_rw.RW_READ, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T), + ('l1', sig_rw.RW_READ, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T), + ('l2', sig_rw.RW_READ, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T), + ('grad', sig_rw.RW_READ, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T), + ('indices', 
sig_rw.RW_READ, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T1) + ) + @prim_attr_register def __init__(self, use_locking=False): self.init_prim_io_names(inputs=['var', 'accum', 'lr', 'l1', 'l2', 'grad', 'indices'], @@ -2969,7 +3702,8 @@ class SparseApplyProximalAdagrad(PrimitiveWithInfer): self.use_locking = validator.check_value_type("use_locking", use_locking, [bool], self.name) def infer_shape(self, var_shape, accum_shape, lr_shape, l1_shape, l2_shape, grad_shape, indices_shape): - return var_shape + validator.check_integer("indices rank", len(indices_shape), 1, Rel.EQ, self.name) + return var_shape, accum_shape def infer_dtype(self, var_dtype, accum_dtype, lr_dtype, l1_dtype, l2_dtype, grad_dtype, indices_dtype): args = {'var': var_dtype, 'accum': accum_dtype, 'grad': grad_dtype} @@ -2979,7 +3713,7 @@ class SparseApplyProximalAdagrad(PrimitiveWithInfer): valid_types = [mstype.int16, mstype.int32, mstype.int64, mstype.uint16, mstype.uint32, mstype.uint64] validator.check_tensor_type_same({'indices': indices_dtype}, valid_types, self.name) - return var_dtype + return var_dtype, accum_dtype class LARSUpdate(PrimitiveWithInfer): @@ -3119,11 +3853,14 @@ class ApplyFtrl(PrimitiveWithInfer): self.init_prim_io_names(inputs=['var', 'accum', 'linear', 'grad', 'lr', 'l1', 'l2', 'lr_power'], outputs=['output']) self.use_locking = validator.check_value_type("use_locking", use_locking, [bool], self.name) + self.is_tbe = context.get_context("device_target") == "Ascend" def infer_shape(self, var_shape, accum_shape, linear_shape, grad_shape, lr_shape, l1_shape, l2_shape, lr_power_shape): validator.check('var shape', var_shape, 'accum shape', accum_shape, Rel.EQ, self.name) validator.check('var shape', var_shape, 'linear shape', linear_shape, Rel.EQ, self.name) + if self.is_tbe: + return var_shape, var_shape, var_shape return var_shape def infer_dtype(self, var_type, accum_type, linear_type, grad_type, lr_type, l1_type, l2_type, lr_power_type): 
@@ -3135,6 +3872,8 @@ class ApplyFtrl(PrimitiveWithInfer): validator.check_scalar_or_tensor_type_same({"l1": l1_type}, valid_types, self.name) validator.check_scalar_or_tensor_type_same({"l2": l2_type}, valid_types, self.name) validator.check_scalar_or_tensor_type_same({"lr_power": lr_power_type}, valid_types, self.name) + if self.is_tbe: + return var_type, var_type, var_type return var_type @@ -3174,17 +3913,17 @@ class SparseApplyFtrl(PrimitiveWithInfer): >>> def __init__(self): >>> super(SparseApplyFtrlNet, self).__init__() >>> self.sparse_apply_ftrl = P.SparseApplyFtrl(lr=0.01, l1=0.0, l2=0.0, lr_power=-0.5) - >>> self.var = Parameter(Tensor(np.random.random(3, 3).astype(np.float32)), name="var") - >>> self.accum = Parameter(Tensor(np.random.random(3, 3).astype(np.float32)), name="accum") - >>> self.linear = Parameter(Tensor(np.random.random(3, 3).astype(np.float32)), name="linear") + >>> self.var = Parameter(Tensor(np.random.rand(3, 3).astype(np.float32)), name="var") + >>> self.accum = Parameter(Tensor(np.random.rand(3, 3).astype(np.float32)), name="accum") + >>> self.linear = Parameter(Tensor(np.random.rand(3, 3).astype(np.float32)), name="linear") >>> >>> def construct(self, grad, indices): - >>> out = self.apply_ftrl(self.var, self.accum, self.linear, grad, indices) + >>> out = self.sparse_apply_ftrl(self.var, self.accum, self.linear, grad, indices) >>> return out >>> >>> net = SparseApplyFtrlNet() - >>> grad = Tensor(np.random.random(3, 3).astype(np.float32)) - >>> indices = Tnsor(np.ones([3]), mindspore.float32) + >>> grad = Tensor(np.random.rand(3, 3).astype(np.float32)) + >>> indices = Tensor(np.ones([3]), mindspore.int32) >>> output = net(grad, indices) """ @@ -3194,9 +3933,9 @@ class SparseApplyFtrl(PrimitiveWithInfer): validator.check_value_type("l1", l1, [float], self.name) validator.check_value_type("l2", l2, [float], self.name) validator.check_value_type("lr_power", lr_power, [float], self.name) - self.lr = validator.check_number("lr", lr, 0.0, 
Rel.GT, self.name) - self.l1 = validator.check_number("l1", l1, 0.0, Rel.GE, self.name) - self.l2 = validator.check_number("l2", l2, 0.0, Rel.GE, self.name) + self.lr = validator.check_number_range("lr", lr, 0.0, float("inf"), Rel.INC_NEITHER, self.name) + self.l1 = validator.check_number_range("l1", l1, 0.0, float("inf"), Rel.INC_LEFT, self.name) + self.l2 = validator.check_number_range("l2", l2, 0.0, float("inf"), Rel.INC_LEFT, self.name) self.lr_power = validator.check_number("lr_power", lr_power, 0, Rel.LE, self.name) self.use_locking = validator.check_value_type("use_locking", use_locking, [bool], self.name) @@ -3227,8 +3966,8 @@ class ConfusionMulGrad(PrimitiveWithInfer): axis (Union[int, tuple[int], list[int]]): The dimensions to reduce. Default:(), reduce all dimensions. Only constant value is allowed. keep_dims (bool): - - If true, keep these reduced dimensions and the length is 1. - - If false, don't keep these dimensions. Default:False. + - If True, keep these reduced dimensions and the length is 1. + - If False, don't keep these dimensions. Default:False. Inputs: - **input_0** (Tensor) - The input Tensor. @@ -3282,7 +4021,8 @@ class Dropout(PrimitiveWithInfer): During training, randomly zeroes some of the elements of the input tensor with probability. Args: - drop_prob (float): probability of an element to be zeroed. Default: 0. + keep_prob (float): The keep rate, between 0 and 1, e.g. keep_prob = 0.9, + means dropping out 10% of input units. Inputs: - **shape** (tuple[int]) - The shape of target mask. @@ -3291,14 +4031,14 @@ class Dropout(PrimitiveWithInfer): Tensor, the value of generated mask for input shape. 
Examples: - >>> dropout = P.Dropout(drop_prob=0.5) + >>> dropout = P.Dropout(keep_prob=0.5) >>> in = Tensor((20, 16, 50, 50)) >>> out = dropout(in) """ @prim_attr_register - def __init__(self, drop_prob=0): - self.drop_prob = validator.check_number_range("drop_prob", drop_prob, 0, 1, Rel.INC_BOTH, self.name) + def __init__(self, keep_prob=0.5): + self.keep_prob = validator.check_number_range("keep_prob", keep_prob, 0, 1, Rel.INC_RIGHT, self.name) def infer_shape(self, x_shape): validator.check_integer("x_shape", len(x_shape), 1, Rel.GE, self.name) @@ -3317,7 +4057,8 @@ class DropoutGrad(PrimitiveWithInfer): of the input tensor with probability. Args: - drop_prob (float): probability of an element to be zeroed. Default: 0. + keep_prob (float): The keep rate, between 0 and 1, e.g. keep_prob = 0.9, + means dropping out 10% of input units. Inputs: - **shape** (tuple[int]) - The shape of target mask. @@ -3326,14 +4067,14 @@ class DropoutGrad(PrimitiveWithInfer): Tensor, the value of generated mask for input shape. 
Examples: - >>> dropout_grad = P.DropoutGrad(drop_prob=0.5) + >>> dropout_grad = P.DropoutGrad(keep_prob=0.5) >>> in = Tensor((20, 16, 50, 50)) >>> out = dropout_grad(in) """ @prim_attr_register - def __init__(self, drop_prob=0): - self.drop_prob = validator.check_number_range("drop_prob", drop_prob, 0, 1, Rel.INC_BOTH, self.name) + def __init__(self, keep_prob=0.5): + self.keep_prob = validator.check_number_range("keep_prob", keep_prob, 0, 1, Rel.INC_RIGHT, self.name) def infer_shape(self, dy_shape, mask_shape): return dy_shape @@ -3383,7 +4124,7 @@ class CTCLoss(PrimitiveWithInfer): """ @prim_attr_register - def __init__(self, preprocess_collapse_repeated=False, ctc_merge_repeated=False, + def __init__(self, preprocess_collapse_repeated=False, ctc_merge_repeated=True, ignore_longer_outputs_than_inputs=False): self.init_prim_io_names(inputs=["inputs", "labels_indices", "labels_values", "sequence_length"], outputs=["loss", "gradient"]) @@ -3460,12 +4201,12 @@ class BasicLSTMCell(PrimitiveWithInfer): Outputs: - **ct** (Tensor) - Forward :math:`c_t` cache at moment `t`. Tensor of shape (`batch_size`, `hidden_size`). - **ht** (Tensor) - Cell output. Tensor of shape (`batch_size`, `hidden_size`). - - **it** (Tensor) - Forward :math:`i_t` cache at moment `t`. Tensor of shape (`batch_size`, `4 x hidden_size`). - - **jt** (Tensor) - Forward :math:`j_t` cache at moment `t`. Tensor of shape (`batch_size`, `4 x hidden_size`). - - **ft** (Tensor) - Forward :math:`f_t` cache at moment `t`. Tensor of shape (`batch_size`, `4 x hidden_size`). - - **ot** (Tensor) - Forward :math:`o_t` cache at moment `t`. Tensor of shape (`batch_size`, `4 x hidden_size`). + - **it** (Tensor) - Forward :math:`i_t` cache at moment `t`. Tensor of shape (`batch_size`, `hidden_size`). + - **jt** (Tensor) - Forward :math:`j_t` cache at moment `t`. Tensor of shape (`batch_size`, `hidden_size`). + - **ft** (Tensor) - Forward :math:`f_t` cache at moment `t`. Tensor of shape (`batch_size`, `hidden_size`). 
+ - **ot** (Tensor) - Forward :math:`o_t` cache at moment `t`. Tensor of shape (`batch_size`, `hidden_size`). - **tanhct** (Tensor) - Forward :math:`tanh c_t` cache at moment `t`. - Tensor of shape (`batch_size`, `4 x hidden_size`). + Tensor of shape (`batch_size`, `hidden_size`). Examples: 'block': P.BasicLSTMCell(keep_prob=1.0, forget_bias=1.0, state_is_tuple=True, activation='tanh'), @@ -3482,7 +4223,7 @@ class BasicLSTMCell(PrimitiveWithInfer): """ @prim_attr_register - def __init__(self, keep_prob=1.0, forget_bias=1.0, state_is_tuple=True, activation="tanh"): + def __init__(self, keep_prob=1.0, forget_bias=1.0, state_is_tuple=True, activation='tanh'): self.keep_prob = validator.check_value_type("keep_prob", keep_prob, [float], self.name) self.keep_prob = validator.check_number_range("keep_prob", keep_prob, 0.0, 1.0, Rel.INC_BOTH, self.name) self.forget_bias = validator.check_value_type("forget_bias", forget_bias, [float], self.name) @@ -3501,7 +4242,7 @@ class BasicLSTMCell(PrimitiveWithInfer): validator.check_integer("b rank", len(b_shape), 4, Rel.EQ, self.name) validator.check("w_shape[0]", w_shape[0], "4*h_shape[1]", 4 * h_shape[1], Rel.EQ, self.name) validator.check("w_shape[1]", w_shape[1], "x_shape[1]+h_shape[1]", x_shape[1] + h_shape[1], Rel.EQ, self.name) - validator.check("b_shape[0]", b_shape[0], "4*h_shape[1]", 4*h_shape[1], Rel.EQ, self.name) + validator.check("b_shape[0]", b_shape[0], "4*h_shape[1]", 4 * h_shape[1], Rel.EQ, self.name) ct_shape = c_shape ht_shape = h_shape it_shape = h_shape @@ -3524,3 +4265,44 @@ class BasicLSTMCell(PrimitiveWithInfer): validator.check_type_name("w", w_dtype, [mstype.float16, mstype.float32], self.name) validator.check_type_name("b", b_dtype, [mstype.float16, mstype.float32], self.name) return (x_dtype, x_dtype, x_dtype, x_dtype, x_dtype, x_dtype, x_dtype) + + +class InTopK(PrimitiveWithInfer): + r""" + Says whether the targets are in the top `k` predictions. 
+ + Args: + k (int): Specifies the number of top elements to look at for computing precision. + + Inputs: + - **x1** (Tensor) - A 2D Tensor define the predictions of a batch of samples with float32 data type. + - **x2** (Tensor) - A 1D Tensor define the labels of a batch of samples with int32 data type. + + Outputs: + Tensor, which is 1 dimension of type bool and has same shape with `x2`. For label of sample `i` in `x2`, + if label in first `k` predictions for sample `i` in `x1`, then the value is True else False. + + Examples: + >>> x1 = Tensor(np.array([[1, 8, 5, 2, 7], [4, 9, 1, 3, 5]]), mindspore.float32) + >>> x2 = Tensor(np.array([1, 3]), mindspore.int32) + >>> in_top_k = P.InTopK(3) + >>> result = in_top_k(x1, x2) + [True False] + """ + @prim_attr_register + def __init__(self, k): + """Init InTopK""" + self.init_prim_io_names(inputs=['x1', 'x2', 'k'], outputs=['y']) + validator.check_value_type("k", k, [int], self.name) + + def infer_dtype(self, x1_dtype, x2_dtype): + validator.check_tensor_type_same({"x1": x1_dtype}, (mstype.float32,), self.name) + validator.check_tensor_type_same({"x2": x2_dtype}, (mstype.int32,), self.name) + + return mstype.tensor_type(mstype.bool_) + + def infer_shape(self, x1_shape, x2_shape): + validator.check("x1", len(x1_shape), "", 2, Rel.EQ, self.name) + validator.check("x2", len(x2_shape), "", 1, Rel.EQ, self.name) + validator.check("size of x2", x2_shape[0], "x1's first dimension", x1_shape[0], Rel.EQ, self.name) + return x2_shape diff --git a/mindspore/ops/operations/other_ops.py b/mindspore/ops/operations/other_ops.py index d73f53eb6a..74c6080ab4 100644 --- a/mindspore/ops/operations/other_ops.py +++ b/mindspore/ops/operations/other_ops.py @@ -14,6 +14,7 @@ # ============================================================================ """Other operators.""" +import functools from ..._c_expression import signature_rw as sig_rw from ..._c_expression import signature_kind as sig_kind from ..._c_expression import signature_dtype as
sig_dtype @@ -52,7 +53,7 @@ class Assign(PrimitiveWithInfer): ) @prim_attr_register def __init__(self): - pass + self.init_prim_io_names(inputs=['ref', 'value'], outputs=['output']) def infer_shape(self, variable, value): return variable @@ -227,20 +228,20 @@ class IOU(PrimitiveWithInfer): Inputs: - **anchor_boxes** (Tensor) - Anchor boxes, tensor of shape (N, 4). "N" indicates the number of anchor boxes, - and the value "4" refers to "x0", "x1", "y0", and "y1". + and the value "4" refers to "x0", "x1", "y0", and "y1". Data type must be float16. - **gt_boxes** (Tensor) - Ground truth boxes, tensor of shape (M, 4). "M" indicates the number of ground - truth boxes, and the value "4" refers to "x0", "x1", "y0", and "y1". + truth boxes, and the value "4" refers to "x0", "x1", "y0", and "y1". Data type must be float16. Outputs: - Tensor, the 'iou' values, tensor of shape (M, N). + Tensor, the 'iou' values, tensor of shape (M, N), with data type float16. Raises: KeyError: When `mode` is not 'iou' or 'iof'. Examples: >>> iou = P.IOU() - >>> anchor_boxes = Tensor(np.random.randint(1.0, 5.0, [3, 4]), mindspore.float32) - >>> gt_boxes = Tensor(np.random.randint(1.0, 5.0, [3, 4]), mindspore.float32) + >>> anchor_boxes = Tensor(np.random.randint(1.0, 5.0, [3, 4]), mindspore.float16) + >>> gt_boxes = Tensor(np.random.randint(1.0, 5.0, [3, 4]), mindspore.float16) >>> iou(anchor_boxes, gt_boxes) """ @@ -304,6 +305,46 @@ class MakeRefKey(Primitive): pass +class Partial(Primitive): + """ + Make a partial function instance, used for pynative mode. + + Inputs: + - **args** (Union[FunctionType, Tensor]) - The function and bind arguments. + + Outputs: + FunctionType, partial function binded with arguments. + """ + + @prim_attr_register + def __init__(self): + pass + + def __call__(self, *args): + func = args[0].__call__ + partial_func = functools.partial(func, *args[1:]) + return partial_func + +class Depend(Primitive): + """ + Depend is used for process side-effect operations. 
+ + Inputs: + - **value** (Tensor) - the real value to return for depend operator. + - **expr** (Expression) - the expression to execute with no outputs. + + Outputs: + Tensor, the value passed by last operator. + """ + + @prim_attr_register + def __init__(self): + pass + + def __call__(self, value, expr): + return value + + class CheckBprop(PrimitiveWithInfer): """ Checks whether data type and shape of corresponding element from tuple x and y are the same. @@ -332,6 +373,8 @@ class CheckBprop(PrimitiveWithInfer): def infer_shape(self, xshapes, yshapes): tips = f'Bprop of {self.prim_to_check}' + validator.check_value_type('grads', xshapes, (tuple,), tips) + validator.check_value_type('params', yshapes, (tuple,), tips) if len(xshapes) < len(yshapes): raise TypeError(f"{tips}, the size of output should be {len(yshapes)}," f" but got {len(xshapes)}.") @@ -348,6 +391,8 @@ class CheckBprop(PrimitiveWithInfer): def infer_dtype(self, xdtypes, ydtypes): tips = f'Bprop of {self.prim_to_check}' + validator.check_value_type('grads', xdtypes, (tuple,), tips) + validator.check_value_type('params', ydtypes, (tuple,), tips) if len(xdtypes) < len(ydtypes): raise TypeError(f"{tips}, the size of output should be {len(ydtypes)}," f" but got {len(xdtypes)}.") @@ -366,3 +411,50 @@ class CheckBprop(PrimitiveWithInfer): raise TypeError(f"{tips}, the dtype of {i}th output should be {ydtype}," f" but got {xdtype}.") return xdtypes + + +class ConfusionMatrix(PrimitiveWithInfer): + r""" + Calculate the confusion matrix from labels and predictions. + + Args: + num_classes (int): The num of classes. + dtype (str): Data type of confusion matrix. Default: 'int32'. + + Inputs: + - **labels** (Tensor) - real labels, tensor of 1-D. the dtype must be non-negative Integer. + - **predictions** (Tensor) - the labels from prediction, tensor of 1-D. + the shape same as `labels` and the dtype must be non-negative Integer. + - **weights** (Tensor) - tensor of 1-D. the shape same as `predictions`. 
+ + Outputs: + Tensor, the confusion matrix, with shape (`num_classes`, `num_classes`). + + Examples: + >>> confusion_matrix = P.ConfusionMatrix(4) + >>> labels = Tensor([0, 1, 1, 3], mindspore.int32) + >>> predictions = Tensor([1, 2, 1, 3], mindspore.int32) + >>> confusion_matrix(labels, predictions) + """ + + @prim_attr_register + def __init__(self, num_classes, dtype="int32"): + validator.check_value_type("num_classes", num_classes, [int], self.name) + validator.check_value_type("dtype", dtype, [str], self.name) + + def infer_shape(self, labels, predictions, weights=None): + validator.check('labels dimension', len(labels), '', 1, Rel.EQ, self.name) + validator.check('labels shape', labels, 'predictions shape', predictions, Rel.EQ, self.name) + if weights is not None: + validator.check('labels shape', labels, 'weights shape', weights, Rel.EQ, self.name) + ret = (self.num_classes, self.num_classes) + return ret + + def infer_dtype(self, labels, predictions, weights=None): + validator.check_subclass('labels', labels, mstype.tensor, self.name) + validator.check_subclass('predictions', predictions, mstype.tensor, self.name) + if weights is not None: + validator.check_subclass('weights', weights, mstype.tensor, self.name) + args = {"labels": labels, "predictions": predictions} + validator.check_tensor_type_same(args, (mstype.number_type), self.name) + return labels diff --git a/mindspore/ops/operations/random_ops.py b/mindspore/ops/operations/random_ops.py index 77201c25f9..cde7dd41e3 100644 --- a/mindspore/ops/operations/random_ops.py +++ b/mindspore/ops/operations/random_ops.py @@ -66,6 +66,49 @@ class RandomChoiceWithMask(PrimitiveWithInfer): return (mstype.int32, mstype.bool_) +class Normal(PrimitiveWithInfer): + """ + Generates random samples from a normal(Gaussian) distribution. + + Args: + seed (int): Random seed. Default: 0. + + Inputs: + - **shape** (tuple[int]) - The shape of output tensor. Only constant value is allowed. 
+ - **mean** (Tensor) - The mean of the distribution, with float32 data type. + - **stddev** (Tensor) - The standard deviation of the distribution, with float32 data type. + + Outputs: + Tensor, with the given shape from the specific distribution and float32 data type. + + Examples: + >>> normal = P.Normal() + >>> mean = Tensor(0., mstype.float32) + >>> stddev = Tensor(1., mstype.float32) + >>> out = normal((32, 3, 3), mean, stddev) + """ + + @prim_attr_register + def __init__(self, seed=0): + """Init Normal""" + validator.check_value_type("seed", seed, [int], self.name) + + def __infer__(self, shape, mean, stddev): + shape_value = shape["value"] + if shape_value is None: + raise ValueError(f"For {self.name}, shape must be const.") + validator.check_value_type("shape", shape_value, [tuple], self.name) + for i, shape_i in enumerate(shape_value): + validator.check_integer("shape[%d]" % i, shape_i, 0, Rel.GE, self.name) + + validator.check_tensor_type_same({"mean": mean["dtype"]}, [mstype.float32], self.name) + validator.check_tensor_type_same({"stddev": stddev["dtype"]}, [mstype.float32], self.name) + + out = {"shape": shape_value, + "dtype": mstype.float32, + "value": None} + return out + class RandomCategorical(PrimitiveWithInfer): """ Generates random samples from a given categorical distribution tensor. diff --git a/mindspore/ops/operations/thor_ops.py b/mindspore/ops/operations/thor_ops.py index f84b5d1ffd..d2de0190a6 100644 --- a/mindspore/ops/operations/thor_ops.py +++ b/mindspore/ops/operations/thor_ops.py @@ -13,10 +13,9 @@ # limitations under the License. 
# ============================================================================ """thor_ops""" -from mindspore.ops import prim_attr_register, PrimitiveWithInfer -from mindspore.ops.composite import multitype_ops as C +from ..primitive import prim_attr_register, PrimitiveWithInfer +from ...common import dtype as mstype -import mindspore as ms __all__ = ["CusBatchMatMul", "CusCholeskyTrsm", @@ -58,11 +57,6 @@ class CusBatchMatMul(PrimitiveWithInfer): """init CusBatchMatMul""" self.init_prim_io_names(inputs=['x1', 'x2'], outputs=['y']) from mindspore.ops._op_impl._custom_op.batch_matmul_impl import CusBatchMatMul - def get_bprop(self): - def bprop(x1, x2, out, dout): - return (C.zeros_like(x1), C.zeros_like(x2)) - - return bprop def infer_shape(self, data1_shape, data2_shape): return data1_shape @@ -97,11 +91,6 @@ class CusCholeskyTrsm(PrimitiveWithInfer): self.init_prim_io_names(inputs=['x1'], outputs=['y']) from mindspore.ops._op_impl._custom_op.cholesky_trsm_impl import CusCholeskyTrsm - def get_bprop(self): - def bprop(x, out, dout): - return (C.zeros_like(x),) - return bprop - def infer_shape(self, data1_shape): ll = [] m, _ = data1_shape @@ -138,11 +127,6 @@ class CusFusedAbsMax1(PrimitiveWithInfer): self.init_prim_io_names(inputs=['x1'], outputs=['y']) self.origin_shape = origin_shape from mindspore.ops._op_impl._custom_op.fused_abs_max1_impl import CusFusedAbsMax1 - def get_bprop(self): - def bprop(x, out, dout): - return (C.zeros_like(x),) - - return bprop def infer_shape(self, data1_shape): ll = [] @@ -182,11 +166,6 @@ class CusImg2Col(PrimitiveWithInfer): self.dilates = dilates self.mode = mode from mindspore.ops._op_impl._custom_op.img2col_impl import CusImg2Col - def get_bprop(self): - def bprop(x, out, dout): - return (C.zeros_like(x),) - - return bprop def infer_shape(self, data1_shape): bs, c, h, w = data1_shape @@ -229,17 +208,12 @@ class CusMatMulCubeDenseLeft(PrimitiveWithInfer): """init CusMatMulCubeDenseLeft""" self.init_prim_io_names(inputs=['x1', 
'x2'], outputs=['y']) from mindspore.ops._op_impl._custom_op.matmul_cube_dense_left_impl import CusMatMulCubeDenseLeft - def get_bprop(self): - def bprop(x1, x2, out, dout): - return (C.zeros_like(x1), C.zeros_like(x2)) - - return bprop def infer_shape(self, data1_shape, data2_shape): return data2_shape def infer_dtype(self, data1_dtype, data2_dtype): - return ms.common.dtype.tensor_type(getattr(ms, "float16")) + return mstype.float16 class CusMatMulCubeFraczRightMul(PrimitiveWithInfer): @@ -269,17 +243,12 @@ class CusMatMulCubeFraczRightMul(PrimitiveWithInfer): """init CusMatMulCubeFraczRightMul""" self.init_prim_io_names(inputs=['x1', 'x2', 'x3'], outputs=['y']) from mindspore.ops._op_impl._custom_op.matmul_cube_fracz_right_mul_impl import CusMatMulCubeFraczRightMul - def get_bprop(self): - def bprop(x1, x2, x3, out, dout): - return (C.zeros_like(x1), C.zeros_like(x2), C.zeros_like(x3)) - - return bprop def infer_shape(self, data1_shape, data2_shape, data3_shape): return data1_shape def infer_dtype(self, data1_dtype, data2_dtype, data3_dtype): - return ms.common.dtype.tensor_type(getattr(ms, "float32")) + return mstype.float32 class CusMatMulCube(PrimitiveWithInfer): @@ -315,11 +284,6 @@ class CusMatMulCube(PrimitiveWithInfer): self.transpose_a = transpose_a self.transpose_b = transpose_b from mindspore.ops._op_impl._custom_op.matmul_cube_impl import CusMatMulCube - def get_bprop(self): - def bprop(x1, x2, out, dout): - return (C.zeros_like(x1), C.zeros_like(x2)) - - return bprop def infer_shape(self, data1_shape, data2_shape): # shape = [1, data1_shape[1], data2_shape[2], 16, 16] @@ -337,7 +301,7 @@ class CusMatMulCube(PrimitiveWithInfer): return shape def infer_dtype(self, data1_dtype, data2_dtype): - return ms.common.dtype.tensor_type(getattr(ms, "float32")) + return mstype.float32 class CusMatrixCombine(PrimitiveWithInfer): @@ -362,11 +326,6 @@ class CusMatrixCombine(PrimitiveWithInfer): """init CusMatrixCombine""" self.init_prim_io_names(inputs=['x'], 
outputs=['y']) from mindspore.ops._op_impl._custom_op.matrix_combine_impl import CusMatrixCombine - def get_bprop(self): - def bprop(x, out, dout): - return (C.zeros_like(x),) - - return bprop def infer_shape(self, data_shape): a, b, c = data_shape @@ -446,17 +405,12 @@ class CusMatMulCubeDenseRight(PrimitiveWithInfer): """init CusMatMulCubeDenseRight""" self.init_prim_io_names(inputs=['x1', 'x2', 'x3'], outputs=['y']) from mindspore.ops._op_impl._custom_op.matmul_cube_dense_right_impl import CusMatMulCubeDenseRight - def get_bprop(self): - def bprop(x1, x2, x3, out, dout): - return (C.zeros_like(x1), C.zeros_like(x2), C.zeros_like(x3)) - - return bprop def infer_shape(self, data1_shape, data2_shape, data3_shape): return data1_shape def infer_dtype(self, data1_dtype, data2_dtype, data3_dtype): - return ms.common.dtype.tensor_type(getattr(ms, "float32")) + return mstype.float32 class CusMatMulCubeFraczLeftCast(PrimitiveWithInfer): @@ -486,14 +440,9 @@ class CusMatMulCubeFraczLeftCast(PrimitiveWithInfer): """init CusMatMulCubeFraczLeftCast""" self.init_prim_io_names(inputs=['x1', 'x2'], outputs=['y']) from mindspore.ops._op_impl._custom_op.matmul_cube_fracz_left_cast_impl import CusMatMulCubeFraczLeftCast - def get_bprop(self): - def bprop(x1, x2, out, dout): - return (C.zeros_like(x1), C.zeros_like(x2)) - - return bprop def infer_shape(self, data1_shape, data2_shape): return data2_shape def infer_dtype(self, data1_dtype, data2_dtype): - return ms.common.dtype.tensor_type(getattr(ms, "float16")) + return mstype.float16 diff --git a/mindspore/ops/primitive.py b/mindspore/ops/primitive.py index 95e148204b..7ceb687778 100644 --- a/mindspore/ops/primitive.py +++ b/mindspore/ops/primitive.py @@ -43,11 +43,12 @@ class Primitive(Primitive_): >>> # init a Primitive obj with attr1=1 and attr2=2 >>> add = Add(attr1=1, attr2=2) """ + _repr_ignore_list = ['input_names', 'output_names'] def __init__(self, name): self.name = name self.attrs = {} - self.init_attrs = {} + 
self.init_attrs = {"name": name} Primitive_.__init__(self, name, self) if hasattr(self.__class__, '__mindspore_signature__'): sig = self._fill_signature(self.__class__.__mindspore_signature__) @@ -140,9 +141,24 @@ class Primitive(Primitive_): return self.attrs[item] raise AttributeError(item) + def check_elim(self, *args): + """ + Check whether or not certain inputs should go into backend. Subclass in need should override this method. + + Args: + Same as arguments of current Primitive + + Returns: + A tuple of two elements, first element indicates whether or not we should filter out current arguments; + second element is the output in case where we should filter out the arguments. + """ + return (False, None) + def __call__(self, *args): - output = _run_op(self, self.name, args) - return output + should_elim, output = self.check_elim(*args) + if should_elim: + return output + return _run_op(self, self.name, args) def __getstate__(self): return self.__dict__ @@ -150,6 +166,16 @@ def __setstate__(self, d): self.__dict__.update(d) + def __deepcopy__(self, memo): + return type(self)(**self.init_attrs) + + def __repr__(self): + attr = ', '.join([f'{k}={self.attrs[k]}'for k in self.attrs if not k in Primitive._repr_ignore_list]) + info_str = f'Prim[{self.name}]' + if attr: + info_str += f'<{attr}>' + return info_str + def init_prim_io_names(self, inputs, outputs): """ Initializes inputs and outputs name of Tensor or attributes. @@ -170,8 +196,8 @@ class PrimitiveWithInfer(Primitive): There are four method can be overide to define the infer logic of the primitive: __infer__(), infer_shape(), infer_dtype(), and infer_value(). If __infer__() is defined in primitive, the __infer__() has highest priority - to be called. If __infer__() is not defined, infer_shape() and infer_dtype() can be defined to describle shape - and type infer logic. The infer_value() is used for constant propogation. + to be called.
If __infer__() is not defined, infer_shape() and infer_dtype() can be defined to describe shape + and type infer logic. The infer_value() is used for constant propagation. Args: name (str): Name for current Primitive. @@ -273,6 +299,7 @@ def prim_attr_register(fn): bound_args.apply_defaults() arguments = bound_args.arguments del arguments['self'] + del self.init_attrs['name'] for name in arguments: value = arguments[name] self.add_prim_attr(name, value) @@ -284,7 +311,8 @@ def constexpr(fn=None, get_instance=True, name=None): """ - Makes a PrimitiveWithInfer operator, which infer the value while compiling. + Makes a PrimitiveWithInfer operator, which infer the value while compiling. We can define a function + to compute between constant variable and used in construct. Args: fn (function): A `fn` use as the infer_value of the output operator. @@ -310,6 +338,7 @@ def __init__(self): op_name = name if name else fn.__name__ PrimitiveWithInfer.__init__(self, op_name) + self.const_value = True def infer_value(self, *args): return fn(*args) @@ -324,19 +353,7 @@ @_wrap_func def _run_op(obj, op_name, args): """Single op execution function supported by ge in PyNative mode.""" - op_mask = [0] * len(args) - op_inputs = [] - for i, arg in enumerate(args): - if hasattr(arg, '__parameter__'): - op_inputs.append(arg.default_input) - op_mask[i] = 1 - elif isinstance(arg, tuple): - convert = lambda x: x.default_input if hasattr(x, '__parameter__') else x - args_ = tuple(convert(x) for x in arg) - op_inputs.append(args_) - else: - op_inputs.append(arg) - output = real_run_op(obj, op_name, tuple(op_inputs), tuple(op_mask)) + output = real_run_op(obj, op_name, args) if not output: raise RuntimeError("Pynative run op %s failed!"
% op_name) if len(output) == 1: diff --git a/mindspore/parallel/_auto_parallel_context.py b/mindspore/parallel/_auto_parallel_context.py index 21ef1d59f2..74250f12e5 100644 --- a/mindspore/parallel/_auto_parallel_context.py +++ b/mindspore/parallel/_auto_parallel_context.py @@ -185,13 +185,20 @@ class _AutoParallelContext: self.check_context_handle() return self._context_handle.get_parallel_mode() - def set_strategy_search_mode(self, strategy_search_mode): + def set_strategy_search_mode(self, auto_parallel_search_mode): + """ + Set search mode of strategy. + + Args: + auto_parallel_search_mode (str): The search mode of strategy. + """ self.check_context_handle() - ret = self._context_handle.set_strategy_search_mode(strategy_search_mode) + ret = self._context_handle.set_strategy_search_mode(auto_parallel_search_mode) if ret is False: - raise ValueError("Strategy search mode does not support {}".format(strategy_search_mode)) + raise ValueError("Strategy search mode does not support {}".format(auto_parallel_search_mode)) def get_strategy_search_mode(self): + """Get search mode of strategy.""" self.check_context_handle() return self._context_handle.get_strategy_search_mode() @@ -225,6 +232,21 @@ class _AutoParallelContext: self.check_context_handle() return self._context_handle.get_strategy_ckpt_load_file() + def set_full_batch(self, full_batch): + """ + Set whether load full batch on each device. + + Args: + full_batch (bool): True if load full batch on each device. + """ + self.check_context_handle() + self._context_handle.set_full_batch(full_batch) + + def get_full_batch(self): + """Get whether load full batch on each device.""" + self.check_context_handle() + return self._context_handle.get_full_batch() + def set_strategy_ckpt_save_file(self, strategy_ckpt_save_file): """ Set strategy checkpoint save path. 
@@ -407,9 +429,11 @@ _set_auto_parallel_context_func_map = { "cast_before_mirror": auto_parallel_context().set_cast_before_mirror, "loss_repeated_mean": auto_parallel_context().set_loss_repeated_mean, "parallel_mode": auto_parallel_context().set_parallel_mode, + "auto_parallel_search_mode": auto_parallel_context().set_strategy_search_mode, "parameter_broadcast": auto_parallel_context().set_parameter_broadcast, "strategy_ckpt_load_file": auto_parallel_context().set_strategy_ckpt_load_file, - "strategy_ckpt_save_file": auto_parallel_context().set_strategy_ckpt_save_file} + "strategy_ckpt_save_file": auto_parallel_context().set_strategy_ckpt_save_file, + "full_batch": auto_parallel_context().set_full_batch} _get_auto_parallel_context_func_map = { @@ -419,14 +443,17 @@ _get_auto_parallel_context_func_map = { "cast_before_mirror": auto_parallel_context().get_cast_before_mirror, "loss_repeated_mean": auto_parallel_context().get_loss_repeated_mean, "parallel_mode": auto_parallel_context().get_parallel_mode, + "auto_parallel_search_mode": auto_parallel_context().get_strategy_search_mode, "parameter_broadcast": auto_parallel_context().get_parameter_broadcast, "strategy_ckpt_load_file": auto_parallel_context().get_strategy_ckpt_load_file, - "strategy_ckpt_save_file": auto_parallel_context().get_strategy_ckpt_save_file} + "strategy_ckpt_save_file": auto_parallel_context().get_strategy_ckpt_save_file, + "full_batch": auto_parallel_context().get_full_batch} @args_type_check(device_num=int, global_rank=int, mirror_mean=bool, cast_before_mirror=bool, - loss_repeated_mean=bool, parallel_mode=str, parameter_broadcast=bool, - strategy_ckpt_load_file=str, strategy_ckpt_save_file=str) + loss_repeated_mean=bool, parallel_mode=str, auto_parallel_search_mode=str, + parameter_broadcast=bool, strategy_ckpt_load_file=str, + strategy_ckpt_save_file=str, full_batch=bool) def _set_auto_parallel_context(**kwargs): """ Set auto parallel context. 
@@ -454,11 +481,18 @@ def _set_auto_parallel_context(**kwargs): setting parallel strategies. - auto_parallel: Achieving parallelism automatically. + auto_parallel_search_mode (str): There are two kinds of search modes, "recursive_programming" + and "dynamic_programming". Default: "dynamic_programming". + + - recursive_programming: Recursive programming search mode. + + - dynamic_programming: Dynamic programming search mode. parameter_broadcast (bool): Indicating whether to broadcast parameters before training. "stand_alone", "semi_auto_parallel" and "auto_parallel" do not support parameter broadcast. Default: False. strategy_ckpt_load_file (str): The path to load parallel strategy checkpoint. Default: '' strategy_ckpt_save_file (str): The path to save parallel strategy checkpoint. Default: '' + full_batch (bool): Whether to load the whole batch on each device. Default: False. Raises: ValueError: If input key is not attribute in auto parallel context. diff --git a/mindspore/parallel/_tensor.py b/mindspore/parallel/_tensor.py index 073ad9809a..fca8b88920 100644 --- a/mindspore/parallel/_tensor.py +++ b/mindspore/parallel/_tensor.py @@ -168,21 +168,21 @@ def _chunk_tensor_by_strategy(np_tensor, strategy): raise ValueError("The length of np_tensor does not match the length of strategy!") return _chunk_tensor(np_tensor, strategy, len(strategy)) -def _get_seed(dev_mat, tensor_map): +def _get_slice_index(dev_mat, tensor_map): """ - Get the random seed for current slice. + Get the slice index for current slice. Args: dev_mat (list): The device matrix of devices. tensor_map (list): The split strategy of tensor. Returns: - Integer, the local random seed for this device. + Integer, the slice index for slice on this device. 
""" rank = get_rank() tensor_strategy = _get_tensor_strategy(dev_mat, tensor_map) - tensor_slice_seed = _get_tensor_slice_index(dev_mat, tensor_strategy, tensor_map, rank) - return tensor_slice_seed + tensor_slice_index = _get_tensor_slice_index(dev_mat, tensor_strategy, tensor_map, rank) + return tensor_slice_index def _load_tensor(tensor, dev_mat, tensor_map): """ diff --git a/mindspore/parallel/_utils.py b/mindspore/parallel/_utils.py index 3301c3c970..c5b4d57702 100644 --- a/mindspore/parallel/_utils.py +++ b/mindspore/parallel/_utils.py @@ -20,10 +20,26 @@ from mindspore.parallel._auto_parallel_context import auto_parallel_context def _get_parallel_mode(): + """Get parallel mode.""" return auto_parallel_context().get_parallel_mode() +def _get_full_batch(): + """Get whether to use full_batch.""" + return auto_parallel_context().get_full_batch() + + +def _need_to_full(): + """Check whether to convert input to full shape or tensor.""" + parallel_mode = _get_parallel_mode() + full_batch = _get_full_batch() + need = ((parallel_mode in ("semi_auto_parallel", "auto_parallel")) + and (not full_batch)) + return need + + def _get_mirror_mean(): + """Get if using mirror_mean.""" return auto_parallel_context().get_mirror_mean() diff --git a/mindspore/parallel/mpi/__init__.py b/mindspore/parallel/mpi/__init__.py new file mode 100644 index 0000000000..e30774307c --- /dev/null +++ b/mindspore/parallel/mpi/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ diff --git a/mindspore/parallel/mpi/_mpi_config.py b/mindspore/parallel/mpi/_mpi_config.py new file mode 100644 index 0000000000..e43305fb76 --- /dev/null +++ b/mindspore/parallel/mpi/_mpi_config.py @@ -0,0 +1,111 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +""" +The MPI config, used to configure the MPI environment. +""" +import threading +from mindspore._c_expression import MpiConfig +from mindspore._checkparam import args_type_check + +class _MpiConfig: + """ + _MpiConfig is the config tool for controlling MPI + + Note: + Create a config through instantiating MpiConfig object is not recommended. + should use MpiConfig() to get the config since MpiConfig is singleton. 
+ """ + _instance = None + _instance_lock = threading.Lock() + + def __init__(self): + self._mpiconfig_handle = MpiConfig.get_instance() + + def __new__(cls, *args, **kwargs): + if cls._instance is None: + cls._instance_lock.acquire() + cls._instance = object.__new__(cls) + cls._instance_lock.release() + return cls._instance + + def __getattribute__(self, attr): + value = object.__getattribute__(self, attr) + if attr == "_mpiconfig_handle" and value is None: + raise ValueError("mpiconfig handle is none in MpiConfig!!!") + return value + + @property + def enable_mpi(self): + return self._mpiconfig_handle.get_enable_mpi() + + @enable_mpi.setter + def enable_mpi(self, enable_mpi): + self._mpiconfig_handle.set_enable_mpi(enable_mpi) + +_k_mpi_config = None +def _mpi_config(): + """ + Get the global mpi config, if mpi config is not created, create a new one. + + Returns: + _MpiConfig, the global mpi config. + """ + global _k_mpi_config + if _k_mpi_config is None: + _k_mpi_config = _MpiConfig() + return _k_mpi_config + +@args_type_check(enable_mpi=bool) +def _set_mpi_config(**kwargs): + """ + Sets mpi config for running environment. + + mpi config should be configured before running your program. If there is no configuration, + mpi module will be disabled by default. + + Note: + Attribute name is required for setting attributes. + + Args: + enable_mpi (bool): Whether to enable mpi. Default: False. + + Raises: + ValueError: If input key is not an attribute in mpi config. + + Examples: + >>> mpiconfig.set_mpi_config(enable_mpi=True) + """ + for key, value in kwargs.items(): + if not hasattr(_mpi_config(), key): + raise ValueError("Set mpi config keyword %s is not recognized!" % key) + setattr(_mpi_config(), key, value) + + +def _get_mpi_config(attr_key): + """ + Gets mpi config attribute value according to the input key. + + Args: + attr_key (str): The key of the attribute. + + Returns: + Object, The value of given attribute key.
+ + Raises: + ValueError: If input key is not an attribute in mpi config. + """ + if not hasattr(_mpi_config(), attr_key): + raise ValueError("Get mpi config keyword %s is not recognized!" % attr_key) + return getattr(_mpi_config(), attr_key) diff --git a/mindspore/train/_utils.py b/mindspore/train/_utils.py index 7bc07b126e..85fd6fa189 100644 --- a/mindspore/train/_utils.py +++ b/mindspore/train/_utils.py @@ -14,14 +14,17 @@ # ============================================================================ """Train utility.""" import os +from collections.abc import Iterable + import numpy as np + from mindspore.common.tensor import Tensor -from mindspore.common.dtype import dtype_to_nptype +from mindspore.common.dtype import dtype_to_nptype, pytype_to_dtype from mindspore.common import dtype as mstype from mindspore import log as logger from mindspore.common.api import _executor -from mindspore.common.dtype import pytype_to_dtype +from .lineage_pb2 import DatasetGraph, TrainLineage, EvaluationLineage, UserDefinedInfo def _convert_type(types): """ @@ -64,8 +67,6 @@ def _exec_datagraph(exec_dataset, dataset_size, phase='dataset'): input_indexs, phase=phase) - # engine dataset to write data to tdt queue - exec_dataset.send() return exec_dataset @@ -157,8 +158,8 @@ def _to_full_tensor(elem, device_num, global_rank, scaling_sens=None): data = Tensor(data) if not isinstance(data, Tensor): raise ValueError("elements in tensors must be Tensor") - shape_ = data.shape() - type_ = data.dtype() + shape_ = data.shape + type_ = data.dtype new_shape = () batchsize_per_device = 1 for i, item in enumerate(shape_): @@ -196,3 +197,56 @@ def _to_full_shapes(shapes, device_num): new_shape += (item,) new_shapes.append(new_shape) return new_shapes + + +def _check_to_numpy(plugin, tensor): + """Check the tensor and return a numpy.ndarray.""" + np_value = tensor.asnumpy() + if plugin == 'scalar': + if np_value.size == 1: + return np_value + raise ValueError('The tensor holds more than one value, 
but the scalar plugin expects one value.') + if plugin == 'image': + if np_value.ndim == 4: + return np_value + raise ValueError('The tensor seems not to hold a valid image.') + if plugin in ('tensor', 'histogram'): + if np_value.ndim > 0: + return np_value + raise ValueError('The tensor should not be empty.') + return np_value + + +def _check_lineage_value(plugin, value): + """Check the lineage value.""" + def raises(plugin, prototype): + raise TypeError(f'Plugin {repr(plugin)} expects a {prototype.__name__} value.') + + if plugin == 'dataset_graph' and not isinstance(value, DatasetGraph): + raises(plugin, DatasetGraph) + + if plugin == 'eval_lineage' and not isinstance(value, EvaluationLineage): + raises(plugin, EvaluationLineage) + + if plugin == 'train_lineage' and not isinstance(value, TrainLineage): + raises(plugin, TrainLineage) + + if plugin == 'custom_lineage_data' and not isinstance(value, UserDefinedInfo): + raises(plugin, UserDefinedInfo) + + +def check_value_type(arg_name, arg_value, valid_types): + """Checks whether a value is instance of some types.""" + valid_types = tuple(valid_types) if isinstance(valid_types, Iterable) else (valid_types,) + is_valid = True + + # bool is subclass of int, so for a bool value, we need to extra check + if isinstance(arg_value, int) and isinstance(arg_value, bool) and bool not in valid_types: + is_valid = False + + if not isinstance(arg_value, valid_types): + is_valid = False + + if not is_valid: + raise TypeError(f'For `{arg_name}` the type should be a valid type of {[t.__name__ for t in valid_types]}, ' + f'but got {type(arg_value).__name__}.') diff --git a/mindspore/train/amp.py b/mindspore/train/amp.py index da0626d6e8..a47b16d0e0 100644 --- a/mindspore/train/amp.py +++ b/mindspore/train/amp.py @@ -21,7 +21,6 @@ from .._checkparam import Rel from ..common import dtype as mstype from ..nn.wrap.cell_wrapper import _VirtualDatasetCell from ..ops import functional as F -from ..ops.composite.base import _mp_cast_helper
from ..parallel._utils import _get_parallel_mode from .loss_scale_manager import DynamicLossScaleManager, LossScaleManager from .parallel_utils import ParallelMode @@ -66,7 +65,11 @@ _config_level = { "O2": { "keep_batchnorm_fp32": True, "cast_model_type": mstype.float16, - "loss_scale_manager": DynamicLossScaleManager()}} + "loss_scale_manager": DynamicLossScaleManager()}, + "O3": { + "keep_batchnorm_fp32": False, + "cast_model_type": mstype.float16, + "loss_scale_manager": None}} def _check_kwargs(key_words): @@ -98,7 +101,7 @@ def _add_loss_network(network, loss_fn, cast_model_type): def construct(self, data, label): out = self._backbone(data) - label = _mp_cast_helper(mstype.float32, label) + label = F.mixed_precision_cast(mstype.float32, label) return self._loss_fn(F.cast(out, mstype.float32), label) validator.check_value_type('loss_fn', loss_fn, nn.Cell, None) @@ -118,11 +121,14 @@ def build_train_network(network, optimizer, loss_fn=None, level='O0', **kwargs): loss_fn (Union[None, Cell]): Definition of the loss_fn. If None, the `network` should have the loss inside. Default: None. optimizer (Optimizer): Optimizer to update the Parameter. - level (str): Supports [O0, O2]. Default: "O0". + level (str): Supports [O0, O2, O3]. Default: "O0". - O0: Do not change. - O2: Cast network to float16, keep batchnorm and `loss_fn` (if set) run in float32, using dynamic loss scale. + - O3: Cast network to float16, with additional property 'keep_batchnorm_fp32=False'. + + O2 is recommended on GPU, O3 is recommended on Ascend. cast_model_type (:class:`mindspore.dtype`): Supports `mstype.float16` or `mstype.float32`. If set to `mstype.float16`, use `float16` mode to train. If set, overwrite the level setting. 
@@ -132,7 +138,7 @@ def build_train_network(network, optimizer, loss_fn=None, level='O0', **kwargs): """ validator.check_value_type('network', network, nn.Cell, None) validator.check_value_type('optimizer', optimizer, nn.Optimizer, None) - validator.check('level', level, "", ['O0', 'O2'], Rel.IN, None) + validator.check('level', level, "", ['O0', 'O2', 'O3'], Rel.IN, None) _check_kwargs(kwargs) config = dict(_config_level[level], **kwargs) config = edict(config) diff --git a/mindspore/train/callback/__init__.py b/mindspore/train/callback/__init__.py new file mode 100644 index 0000000000..6ef171cc87 --- /dev/null +++ b/mindspore/train/callback/__init__.py @@ -0,0 +1,31 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""Callback related classes and functions.""" + +from ._callback import Callback +from ._callback import CallbackManager as _CallbackManager +from ._callback import InternalCallbackParam as _InternalCallbackParam +from ._callback import RunContext +from ._callback import checkpoint_cb_for_save_op as _checkpoint_cb_for_save_op +from ._callback import set_cur_net as _set_cur_net +from ._checkpoint import CheckpointConfig +from ._checkpoint import CheckpointManager as _CheckpointManager +from ._checkpoint import ModelCheckpoint +from ._loss_monitor import LossMonitor +from ._time_monitor import TimeMonitor +from ._summary_collector import SummaryCollector + +__all__ = ["Callback", "LossMonitor", "TimeMonitor", "ModelCheckpoint", + "SummaryCollector", "CheckpointConfig", "RunContext"] diff --git a/mindspore/train/callback/_callback.py b/mindspore/train/callback/_callback.py new file mode 100644 index 0000000000..c75e099693 --- /dev/null +++ b/mindspore/train/callback/_callback.py @@ -0,0 +1,269 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""Callback related classes and functions.""" + +from contextlib import ExitStack + +from mindspore import log as logger +from mindspore.train.serialization import _fill_param_into_net +from mindspore.train.summary.summary_record import _cache_summary_tensor_data + +_cur_net = None + +def set_cur_net(net): + """ + Set current net for which we are using to save checkpoint. + + Args: + net (Cell): train network + """ + global _cur_net + _cur_net = net + + +def checkpoint_cb_for_save_op(parameter_list): + """ + The checkpoint callback function for MindSpore. + + Will be executed by checkpoint save op. + + Args: + parameter_list (list): Format is like [{"name",name},{"data",value}] and value type is Tensor. + + Returns: + bool, true: means save checkpoint success. + """ + if _cur_net is None: + logger.warning("_cur_net is None. parameters are not updated.") + return False + + logger.info("update parameters in the net.") + _fill_param_into_net(_cur_net, parameter_list) + set_cur_net(None) + return True + + +def summary_cb_for_save_op(summary_list): + """ + The summary callback function for MindSpore. + + Will be executed by summary op. + + Args: + summary_list (list): Format is like [{"name": tag_name, "data": tensor},...] and value is Scalar/Tensor. + + Returns: + bool, true: means save summary success. + """ + ret = _cache_summary_tensor_data(summary_list) + return ret + + +class Callback: + """ + Abstract base class used to build a callback class. Callbacks are context managers + which will be entered and exited when passing into the Model. + You can leverage this mechanism to init and release resources automatically. + + Callback function will execution some operating to the current step or epoch. 
+ + Examples: + >>> class Print_info(Callback): + >>> def step_end(self, run_context): + >>> cb_params = run_context.original_args() + >>> print(cb_params.cur_epoch_num) + >>> print(cb_params.cur_step_num) + >>> + >>> print_cb = Print_info() + >>> model.train(epoch, dataset, callbacks=print_cb) + """ + + def __enter__(self): + """Return the enter target.""" + return self + + def __exit__(self, *err): + """Release resources here if have any.""" + + def begin(self, run_context): + """ + Called once before the network executing. + + Args: + run_context (RunContext): Include some information of the model. + """ + + def epoch_begin(self, run_context): + """ + Called before each epoch beginning. + + Args: + run_context (RunContext): Include some information of the model. + """ + + def epoch_end(self, run_context): + """ + Called after each epoch finished. + + Args: + run_context (RunContext): Include some information of the model. + """ + + def step_begin(self, run_context): + """ + Called before each epoch beginning. + + Args: + run_context (RunContext): Include some information of the model. + """ + + def step_end(self, run_context): + """ + Called after each step finished. + + Args: + run_context (RunContext): Include some information of the model. + """ + + def end(self, run_context): + """ + Called once after network training. + + Args: + run_context (RunContext): Include some information of the model. + """ + + +class CallbackManager(Callback): + """ + Sequential execution of callback functions. + + Execute Callback functions at certain points. + + Args: + callbacks (Optional[list[Callback], Callback]): None, callback, or callbacks list. 
+ """ + + def __init__(self, callbacks): + self._callbacks, self._stack = [], None + if isinstance(callbacks, Callback): + self._callbacks.append(callbacks) + elif isinstance(callbacks, list): + for cb in callbacks: + if not isinstance(cb, Callback): + raise TypeError("The 'callbacks' contains not-a-Callback item.") + self._callbacks.append(cb) + elif callbacks is not None: + raise TypeError("The 'callbacks' is not a Callback or a list of Callback.") + + def __enter__(self): + if self._stack is None: + callbacks, self._stack = [], ExitStack().__enter__() + for callback in self._callbacks: + target = self._stack.enter_context(callback) + if not isinstance(target, Callback): + logger.warning("Please return 'self' or a Callback as the enter target.") + callbacks.append(callback) + else: + callbacks.append(target) + self._callbacks = callbacks + return self + + def __exit__(self, *err): + return self._stack.__exit__(*err) + + def begin(self, run_context): + """Called once before network training.""" + for cb in self._callbacks: + cb.begin(run_context) + + def epoch_begin(self, run_context): + """Called before each epoch begin.""" + for cb in self._callbacks: + cb.epoch_begin(run_context) + + def epoch_end(self, run_context): + """Called after each epoch finished.""" + for cb in self._callbacks: + cb.epoch_end(run_context) + + def step_begin(self, run_context): + """Called before each epoch begin.""" + for cb in self._callbacks: + cb.step_begin(run_context) + + def step_end(self, run_context): + """Called after each step finished.""" + for cb in self._callbacks: + cb.step_end(run_context) + + def end(self, run_context): + """Called once after network training.""" + for cb in self._callbacks: + cb.end(run_context) + + +class InternalCallbackParam(dict): + """Internal callback object's parameters.""" + + def __getattr__(self, key): + return self[key] + + def __setattr__(self, key, value): + self[key] = value + + +class RunContext: + """ + Provides information about the 
model. + + Run call being made. Provides information about original request to model function. + callback objects can stop the loop by calling request_stop() of run_context. + + Args: + original_args (dict): Holding the related information of model etc. + """ + def __init__(self, original_args): + if not isinstance(original_args, dict): + raise TypeError("The arg of RunContext should be dict type.") + self._original_args = original_args + self._stop_requested = False + + def original_args(self): + """ + Get the _original_args object. + + Returns: + Dict, a object holding the original arguments of model. + """ + return self._original_args + + def request_stop(self): + """ + Sets stop requested during training. + + Callbacks can use this function to request stop of iterations. + model.train() checks whether this is called or not. + """ + self._stop_requested = True + + def get_stop_requested(self): + """ + Returns whether a stop is requested or not. + + Returns: + bool, if true, model.train() stops iterations. + """ + return self._stop_requested diff --git a/mindspore/train/callback.py b/mindspore/train/callback/_checkpoint.py similarity index 59% rename from mindspore/train/callback.py rename to mindspore/train/callback/_checkpoint.py index e691cfd837..d185377c83 100644 --- a/mindspore/train/callback.py +++ b/mindspore/train/callback/_checkpoint.py @@ -12,95 +12,25 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================ -"""Callback related classes and functions.""" +"""Checkpoint related classes and functions.""" import os -import stat import shutil +import stat import time -import numpy as np import mindspore.context as context -from mindspore.train.serialization import _exec_save_checkpoint, _fill_param_into_net, _save_graph -from mindspore.train._utils import _make_directory from mindspore import log as logger -from mindspore._checkparam import check_int_non_negative, check_bool -from mindspore.common.tensor import Tensor -from .summary.summary_record import _cache_summary_tensor_data - - -__all__ = ["Callback", "LossMonitor", "TimeMonitor", "ModelCheckpoint", "SummaryStep", "CheckpointConfig", "RunContext"] +from mindspore._checkparam import check_bool, check_int_non_negative +from mindspore.train._utils import _make_directory +from mindspore.train.serialization import _exec_save_checkpoint, _save_graph +from ._callback import Callback, set_cur_net _cur_dir = os.getcwd() -_cur_net = None _save_dir = _cur_dir -class _CheckpointManager: - """Manage checkpoint files according to train_config of checkpoint.""" - def __init__(self): - self._ckpoint_filelist = [] - - @property - def ckpoint_filelist(self): - """Get all the related checkpoint files managed here.""" - return self._ckpoint_filelist - - @property - def ckpoint_num(self): - """Get the number of the related checkpoint files managed here.""" - return len(self._ckpoint_filelist) - - def update_ckpoint_filelist(self, directory, prefix): - """Update the checkpoint file list.""" - self._ckpoint_filelist = [] - files = os.listdir(directory) - for filename in files: - if os.path.splitext(filename)[-1] == ".ckpt" and filename.startswith(prefix): - mid_name = filename[len(prefix):-5] - flag = True - for char in mid_name: - if char.isalpha(): - flag = False - if flag: - self._ckpoint_filelist.append(directory + '/' + filename) - - def 
remove_ckpoint_file(self, file_name): - """Remove the specified checkpoint file from this checkpoint manager and also from the directory.""" - try: - os.chmod(file_name, stat.S_IWRITE) - os.remove(file_name) - self._ckpoint_filelist.remove(file_name) - except OSError: - logger.warning("OSError, failed to remove the older ckpt file %s.", file_name) - except ValueError: - logger.warning("ValueError, failed to remove the older ckpt file %s.", file_name) - - def remove_oldest_ckpoint_file(self): - """Remove the oldest checkpoint file from this checkpoint manager and also from the directory.""" - ckpoint_files = sorted(self._ckpoint_filelist, key=os.path.getmtime) - self.remove_ckpoint_file(ckpoint_files[0]) - - def keep_one_ckpoint_per_minutes(self, minutes, cur_time): - """Only keep the latest one ckpt file per minutes, remove other files generated in [last_time, cur_time].""" - movs = [] - oldest_file = '' - oldest_time = cur_time - for ck_file in self._ckpoint_filelist: - modify_time = os.path.getmtime(ck_file) - if cur_time - modify_time < 60 * minutes: - movs.append(ck_file) - - if modify_time < oldest_time: - oldest_time = modify_time - oldest_file = ck_file - - for mv_file in movs: - if mv_file == oldest_file: - continue - self.remove_ckpoint_file(mv_file) - def _check_file_name_prefix(file_name_prefix): """ @@ -236,278 +166,6 @@ class CheckpointConfig: return checkpoint_policy -def _set_cur_net(net): - """ - Set current net for which we are using to save checkpoint. - - Args: - net (Cell): train network - """ - global _cur_net - _cur_net = net - - -def _checkpoint_cb_for_save_op(parameter_list): - """ - The checkpoint callback function for MindSpore. - - Will be executed by checkpoint save op. - - Args: - parameter_list (list): Format is like [{"name",name},{"data",value}] and value type is Tensor. - - Returns: - bool, true: means save checkpoint success. - """ - if _cur_net is None: - logger.warning("_cur_net is None. 
parameters are not updated.") - return False - - logger.info("update parameters in the net.") - _fill_param_into_net(_cur_net, parameter_list) - _set_cur_net(None) - return True - - -def _summary_cb_for_save_op(summary_list): - """ - The summary callback function for MindSpore. - - Will be executed by summary op. - - Args: - summary_list (list): Format is like [{"name": tag_name, "data": tensor},...] and value is Scalar/Tensor. - - Returns: - bool, true: means save summary success. - """ - ret = _cache_summary_tensor_data(summary_list) - return ret - - -def _build_callbacks(callbacks): - """ - Contain a list of callback. - - Args: - callbacks (list): Callback functions list, Support None, a single Callback object, or a list. - - Returns: - List, a list of callback functions. - """ - if callbacks: - if isinstance(callbacks, tuple): - raise TypeError("Callbacks cannot be a tuple. Please check it.") - if not isinstance(callbacks, list): - callbacks = [callbacks] - else: - callbacks = [] - - excute_callbacks = [] - for cb in callbacks: - if cb is None or not isinstance(cb, Callback): - raise TypeError("Callback must inheriting base class Callback. Some callback is Wrong. Please check it.") - excute_callbacks.append(cb) - - return _ListCallback(excute_callbacks) - - -class _ListCallback: - """ - Sequential execution of callback functions. - - Execute Callback functions at certain points. - - Args: - callbacks (list): Callback functions list. 
- """ - def __init__(self, callbacks): - super(_ListCallback, self).__init__() - self._callbacks = callbacks - - def begin(self, run_context): - """Called once before network training.""" - for cb in self._callbacks: - cb.begin(run_context) - - def epoch_begin(self, run_context): - """Called before each epoch begin.""" - for cb in self._callbacks: - cb.epoch_begin(run_context) - - def epoch_end(self, run_context): - """Called after each epoch finished.""" - for cb in self._callbacks: - cb.epoch_end(run_context) - - def step_begin(self, run_context): - """Called before each epoch begin.""" - for cb in self._callbacks: - cb.step_begin(run_context) - - def step_end(self, run_context): - """Called after each step finished.""" - for cb in self._callbacks: - cb.step_end(run_context) - - def end(self, run_context): - """Called once after network training.""" - for cb in self._callbacks: - cb.end(run_context) - - -class Callback: - """ - Abstract base class used to build a callback function. - - Callback function will execution some operating to the current step or epoch. - - Examples: - >>> class Print_info(Callback): - >>> def step_end(self, run_context): - >>> cb_params = run_context.original_args() - >>> print(cb_params.cur_epoch_num) - >>> print(cb_params.cur_step_num) - >>> - >>> print_cb = Print_info() - >>> model.train(epoch, dataset, callbacks=print_cb) - """ - def __init__(self): - pass - - def begin(self, run_context): - """ - Called once before the network executing. - - Args: - run_context (RunContext): Include some information of the model. - """ - - def epoch_begin(self, run_context): - """ - Called before each epoch beginning. - - Args: - run_context (RunContext): Include some information of the model. - """ - - def epoch_end(self, run_context): - """ - Called after each epoch finished. - - Args: - run_context (RunContext): Include some information of the model. - """ - - def step_begin(self, run_context): - """ - Called before each epoch beginning. 
- - Args: - run_context (RunContext): Include some information of the model. - """ - - def step_end(self, run_context): - """ - Called after each step finished. - - Args: - run_context (RunContext): Include some information of the model. - """ - - def end(self, run_context): - """ - Called once after network training. - - Args: - run_context (RunContext): Include some information of the model. - """ - - -class SummaryStep(Callback): - """ - The summary callback class. - - Args: - summary (Object): Summary recode object. - flush_step (int): Number of interval steps to execute. Default: 10. - """ - def __init__(self, summary, flush_step=10): - super(SummaryStep, self).__init__() - if not isinstance(flush_step, int) or isinstance(flush_step, bool) or flush_step <= 0: - raise ValueError("`flush_step` should be int and greater than 0") - self._summary = summary - self._flush_step = flush_step - - def step_end(self, run_context): - """ - Save summary. - - Args: - run_context (RunContext): Context of the train running. - """ - cb_params = run_context.original_args() - if cb_params.cur_step_num % self._flush_step == 0: - self._summary.record(cb_params.cur_step_num, cb_params.train_network) - - @property - def summary_file_name(self): - return self._summary.full_file_name - - -class _InternalCallbackParam(dict): - """Internal callback object's parameters.""" - - def __getattr__(self, key): - return self[key] - - def __setattr__(self, key, value): - self[key] = value - - -class RunContext: - """ - Provides information about the model. - - Run call being made. Provides information about original request to model function. - callback objects can stop the loop by calling request_stop() of run_context. - - Args: - original_args (dict): Holding the related information of model etc. 
- """ - def __init__(self, original_args): - if not isinstance(original_args, dict): - raise TypeError("The arg of RunContext should be dict type.") - self._original_args = original_args - self._stop_requested = False - - def original_args(self): - """ - Get the _original_args object. - - Returns: - Dict, a object holding the original arguments of model. - """ - return self._original_args - - def request_stop(self): - """ - Sets stop requested during training. - - Callbacks can use this function to request stop of iterations. - model.train() checks whether this is called or not. - """ - self._stop_requested = True - - def get_stop_requested(self): - """ - Returns whether a stop is requested or not. - - Returns: - bool, if true, model.train() stops iterations. - """ - return self._stop_requested - class ModelCheckpoint(Callback): """ @@ -551,7 +209,7 @@ class ModelCheckpoint(Callback): self._config = config # get existing checkpoint files - self._manager = _CheckpointManager() + self._manager = CheckpointManager() self._prefix = _chg_ckpt_file_name_if_same_exist(self._directory, self._prefix) self._graph_saved = False @@ -631,7 +289,7 @@ class ModelCheckpoint(Callback): self._last_triggered_step = cb_params.cur_step_num if context.get_context("enable_ge"): - _set_cur_net(cb_params.train_network) + set_cur_net(cb_params.train_network) cb_params.train_network.exec_checkpoint_graph() _exec_save_checkpoint(cb_params.train_network, gen_file, self._config.integrated_save) @@ -646,57 +304,66 @@ class ModelCheckpoint(Callback): return self._latest_ckpt_file_name -class LossMonitor(Callback): - """ - Monitor the loss in training. - - If the loss is NAN or INF, it will terminate training. - - Note: - If per_print_times is 0 do not print loss. - - Args: - per_print_times (int): Print loss every times. Default: 1. - - Raises: - ValueError: If print_step is not int or less than zero. 
- """ - def __init__(self, per_print_times=1): - super(LossMonitor, self).__init__() - if not isinstance(per_print_times, int) or per_print_times < 0: - raise ValueError("print_step must be int and >= 0.") - self._per_print_times = per_print_times - - def step_end(self, run_context): - cb_params = run_context.original_args() - loss = cb_params.net_outputs +class CheckpointManager: + """Manage checkpoint files according to train_config of checkpoint.""" + def __init__(self): + self._ckpoint_filelist = [] - if isinstance(loss, (tuple, list)): - if isinstance(loss[0], Tensor) and isinstance(loss[0].asnumpy(), np.ndarray): - loss = loss[0] + @property + def ckpoint_filelist(self): + """Get all the related checkpoint files managed here.""" + return self._ckpoint_filelist - if isinstance(loss, Tensor) and isinstance(loss.asnumpy(), np.ndarray): - loss = np.mean(loss.asnumpy()) + @property + def ckpoint_num(self): + """Get the number of the related checkpoint files managed here.""" + return len(self._ckpoint_filelist) - cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num + 1 + def update_ckpoint_filelist(self, directory, prefix): + """Update the checkpoint file list.""" + self._ckpoint_filelist = [] + files = os.listdir(directory) + for filename in files: + if os.path.splitext(filename)[-1] == ".ckpt" and filename.startswith(prefix): + mid_name = filename[len(prefix):-5] + flag = True + for char in mid_name: + if char.isalpha(): + flag = False + if flag: + self._ckpoint_filelist.append(directory + '/' + filename) - if isinstance(loss, float) and (np.isnan(loss) or np.isinf(loss)): - raise ValueError("epoch: {} step: {}. Invalid loss, terminating training." 
- .format(cb_params.cur_epoch_num, cur_step_in_epoch)) - if self._per_print_times != 0 and cb_params.cur_step_num % self._per_print_times == 0: - print("epoch: %s step: %s, loss is %s" % (cb_params.cur_epoch_num, cur_step_in_epoch, loss), flush=True) + def remove_ckpoint_file(self, file_name): + """Remove the specified checkpoint file from this checkpoint manager and also from the directory.""" + try: + os.chmod(file_name, stat.S_IWRITE) + os.remove(file_name) + self._ckpoint_filelist.remove(file_name) + except OSError: + logger.warning("OSError, failed to remove the older ckpt file %s.", file_name) + except ValueError: + logger.warning("ValueError, failed to remove the older ckpt file %s.", file_name) + def remove_oldest_ckpoint_file(self): + """Remove the oldest checkpoint file from this checkpoint manager and also from the directory.""" + ckpoint_files = sorted(self._ckpoint_filelist, key=os.path.getmtime) + self.remove_ckpoint_file(ckpoint_files[0]) -class TimeMonitor(Callback): - """Time Monitor.""" - def __init__(self, data_size): - super(TimeMonitor, self).__init__() - self.data_size = data_size + def keep_one_ckpoint_per_minutes(self, minutes, cur_time): + """Only keep the latest one ckpt file per minutes, remove other files generated in [last_time, cur_time].""" + movs = [] + oldest_file = '' + oldest_time = cur_time + for ck_file in self._ckpoint_filelist: + modify_time = os.path.getmtime(ck_file) + if cur_time - modify_time < 60 * minutes: + movs.append(ck_file) - def epoch_begin(self, run_context): - self.epoch_time = time.time() + if modify_time < oldest_time: + oldest_time = modify_time + oldest_file = ck_file - def epoch_end(self, run_context): - epoch_mseconds = (time.time() - self.epoch_time) * 1000 - per_step_mseconds = epoch_mseconds / self.data_size - print("epoch time: {0}, per step time: {1}".format(epoch_mseconds, per_step_mseconds), flush=True) + for mv_file in movs: + if mv_file == oldest_file: + continue + self.remove_ckpoint_file(mv_file) 
diff --git a/mindspore/train/callback/_dataset_graph.py b/mindspore/train/callback/_dataset_graph.py new file mode 100644 index 0000000000..e8c8dcb2ba --- /dev/null +++ b/mindspore/train/callback/_dataset_graph.py @@ -0,0 +1,128 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Define dataset graph related operations.""" +import json +from importlib import import_module + +from mindspore.train import lineage_pb2 + + +class DatasetGraph: + """Handle the data graph and packages it into binary data.""" + def package_dataset_graph(self, dataset): + """ + packages dataset graph into binary data + + Args: + dataset (MindData): refer to MindDataset + + Returns: + DatasetGraph, a object of lineage_pb2.DatasetGraph. + """ + dataset_package = import_module('mindspore.dataset') + dataset_dict = dataset_package.serialize(dataset) + json_str = json.dumps(dataset_dict, indent=2) + dataset_dict = json.loads(json_str) + dataset_graph_proto = lineage_pb2.DatasetGraph() + if "children" in dataset_dict: + children = dataset_dict.pop("children") + if children: + self._package_children(children=children, message=dataset_graph_proto) + self._package_current_dataset(operation=dataset_dict, message=dataset_graph_proto) + return dataset_graph_proto + + def _package_children(self, children, message): + """ + Package children in dataset operation. 
+ + Args: + children (list[dict]): Child operations. + message (DatasetGraph): Children proto message. + """ + for child in children: + if child: + child_graph_message = getattr(message, "children").add() + grandson = child.pop("children") + if grandson: + self._package_children(children=grandson, message=child_graph_message) + # package other parameters + self._package_current_dataset(operation=child, message=child_graph_message) + + def _package_current_dataset(self, operation, message): + """ + Package operation parameters in event message. + + Args: + operation (dict): Operation dict. + message (Operation): Operation proto message. + """ + for key, value in operation.items(): + if value and key == "operations": + for operator in value: + self._package_enhancement_operation( + operator, + message.operations.add() + ) + elif value and key == "sampler": + self._package_enhancement_operation( + value, + message.sampler + ) + else: + self._package_parameter(key, value, message.parameter) + + def _package_enhancement_operation(self, operation, message): + """ + Package enhancement operation in MapDataset. + + Args: + operation (dict): Enhancement operation. + message (Operation): Enhancement operation proto message. + """ + for key, value in operation.items(): + if isinstance(value, list): + if all(isinstance(ele, int) for ele in value): + message.size.extend(value) + else: + message.weights.extend(value) + else: + self._package_parameter(key, value, message.operationParam) + + @staticmethod + def _package_parameter(key, value, message): + """ + Package parameters in operation. + + Args: + key (str): Operation name. + value (Union[str, bool, int, float, list, None]): Operation args. + message (OperationParameter): Operation proto message. 
+ """ + if isinstance(value, str): + message.mapStr[key] = value + elif isinstance(value, bool): + message.mapBool[key] = value + elif isinstance(value, int): + message.mapInt[key] = value + elif isinstance(value, float): + message.mapDouble[key] = value + elif isinstance(value, list) and key != "operations": + if value: + replace_value_list = list(map(lambda x: "" if x is None else x, value)) + message.mapStrList[key].strValue.extend(replace_value_list) + elif value is None: + message.mapStr[key] = "None" + else: + raise ValueError(f"Parameter {key} is not supported in event package.") diff --git a/mindspore/train/callback/_loss_monitor.py b/mindspore/train/callback/_loss_monitor.py new file mode 100644 index 0000000000..3c1da218c2 --- /dev/null +++ b/mindspore/train/callback/_loss_monitor.py @@ -0,0 +1,93 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""LossMonitor Callback class.""" + +import time +import numpy as np +from mindspore.common.tensor import Tensor + +from ._callback import Callback + + +class LossMonitor(Callback): + """ + Monitor the loss in training. + + If the loss is NAN or INF, it will terminate training. + + Note: + If per_print_times is 0 do not print loss. + + Args: + per_print_times (int): Print loss every times. Default: 1. + lr_init (numpy array): train learning rate. Default: None. 
+ + Raises: + ValueError: If print_step is not int or less than zero. + + Examples: + >>> LossMonitor(100, lr_init=Tensor([0.05]*100).asnumpy()) + """ + + def __init__(self, per_print_times=1, lr_init=None): + super(LossMonitor, self).__init__() + if not isinstance(per_print_times, int) or per_print_times < 0: + raise ValueError("print_step must be int and >= 0.") + self._per_print_times = per_print_times + self.lr_init = lr_init + + def epoch_begin(self, run_context): + self.losses = [] + self.epoch_time = time.time() + + def epoch_end(self, run_context): + cb_params = run_context.original_args() + epoch_mseconds = (time.time() - self.epoch_time) * 1000 + per_step_mseconds = epoch_mseconds / cb_params.batch_num + print("Epoch time: {:5.3f}, per step time: {:5.3f}, " + "avg loss: {:5.3f}".format(epoch_mseconds, + per_step_mseconds, + np.mean(self.losses))) + print("*" * 60) + + def step_begin(self, run_context): + self.step_time = time.time() + + def step_end(self, run_context): + cb_params = run_context.original_args() + step_mseconds = (time.time() - self.step_time) * 1000 + step_loss = cb_params.net_outputs + + if isinstance(step_loss, (tuple, list)) and isinstance(step_loss[0], Tensor): + step_loss = step_loss[0] + if isinstance(step_loss, Tensor): + step_loss = np.mean(step_loss.asnumpy()) + + self.losses.append(step_loss) + cur_step_in_epoch = int((cb_params.cur_step_num - 1) % cb_params.batch_num) + + if isinstance(step_loss, float) and (np.isnan(step_loss) or np.isinf(step_loss)): + raise ValueError("Epoch: [{:3d}/{:3d}], step: [{:5d}/{:5d}]. 
" + "Invalid loss, terminating training.".format( + cb_params.cur_epoch_num - 1, cb_params.epoch_num, + cur_step_in_epoch, cb_params.batch_num)) + + if self._per_print_times != 0 and cb_params.cur_step_num % self._per_print_times == 0: + print("Epoch: [{:3d}/{:3d}], step: [{:5d}/{:5d}], " + "loss: [{:5.4f}/{:5.4f}], time: [{:5.4f}]".format( + cb_params.cur_epoch_num - 1, cb_params.epoch_num, + cur_step_in_epoch, int(cb_params.batch_num), + step_loss, np.mean(self.losses), + step_mseconds), flush=True) diff --git a/mindspore/train/callback/_summary_collector.py b/mindspore/train/callback/_summary_collector.py new file mode 100644 index 0000000000..e2e4a9cc2d --- /dev/null +++ b/mindspore/train/callback/_summary_collector.py @@ -0,0 +1,786 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""Summary collector callback.""" + +import os +import re +import json + +from importlib import import_module + +import numpy as np + +from mindspore import log as logger +from mindspore.common.tensor import Tensor +from mindspore.common.parameter import Parameter +from mindspore.train.summary.summary_record import SummaryRecord +from mindspore.train.summary.enum import PluginEnum, ModeEnum +from mindspore.train.callback import Callback, ModelCheckpoint +from mindspore.train import lineage_pb2 +from mindspore.train.callback._dataset_graph import DatasetGraph +from mindspore.nn.optim.optimizer import Optimizer +from mindspore.nn.loss.loss import _Loss +from mindspore.train._utils import check_value_type + + +class LineageMetadata: + """Initialize parameters used in model lineage management.""" + train_dataset_path = 'train_dataset_path' + valid_dataset_path = 'valid_dataset_path' + train_network = 'train_network' + loss_function = 'loss_function' + loss = 'loss' + optimizer = 'optimizer' + learning_rate = 'learning_rate' + epoch = 'epoch' + step_num = 'step_num' + parallel_mode = 'parallel_mode' + device_num = 'device_num' + batch_size = 'batch_size' + model_path = 'model_path' + model_ckpt = 'model_ckpt' + model_size = 'model_size' + metrics = 'metrics' + train_dataset_size = 'train_dataset_size' + valid_dataset_size = 'valid_dataset_size' + + +class SummaryCollector(Callback): + """ + SummaryCollector can help you to collect some common information. + + It can help you to collect loss, learning late, computational graph and so on. + SummaryCollector also persists data collected by the summary operator into a summary file. + + Note: + 1. Multiple SummaryCollector instances in callback list are not allowed. + 2. Not all information is collected at the training phase or at the eval phase. + 3. SummaryCollector always record the data collected by the summary operator. 
+ + Args: + summary_dir (str): The collected data will be persisted to this directory. + If the directory does not exist, it will be created automatically. + collect_freq (int): Set the frequency of data collection, it should be greater then zero, + and the unit is `step`. Default: 10. + It is important to note that if the data sink mode is used, the unit will become the `epoch`. + It is not recommended to collect data too frequently, which can affect performance. + collect_specified_data (Union[None, dict]): Perform custom operations on the collected data. Default: None. + By default, if set to None, all data is collected as the default behavior. + If you want to customize the data collected, you can do so with a dictionary. + Examples,you can set {'collect_metric': False} to control not collecting metrics. + The data that supports control is shown below. + + - collect_metric: Whether to collect training metrics, currently only loss is collected. + Optional: True/False. Default: True. + - collect_graph: Whether to collect computational graph, currently only + training computational graph is collected. Optional: True/False. Default: True. + - collect_train_lineage: Whether to collect lineage data for the training phase, + this field will be displayed on the lineage page of Mindinsight. Optional: True/False. Default: True. + - collect_eval_lineage: Whether to collect lineage data for the eval phase, + this field will be displayed on the lineage page of Mindinsight. Optional: True/False. Default: True. + - collect_input_data: Whether to collect dataset for each training. Currently only image data is supported. + Optional: True/False. Default: True. + - collect_dataset_graph: Whether to collect dataset graph for the training phase. + Optional: True/False. Default: True. + - histogram_regular: Collect weight and bias for parameter distribution page display in MindInsight. + This field allows regular strings to control which parameters to collect. 
+ Default: None, it means only the first five parameters are collected. + It is not recommended to collect too many parameters at once, as it can affect performance. + Note that if you collect too many parameters and run out of memory, the training will fail. + keep_default_action (bool): This field affects the collection behavior of the 'collect_specified_data' field. + Optional: True/False, Default: True. + True: means that after specified data is set, non-specified data is collected as the default behavior. + False: means that after specified data is set, only the specified data is collected, + and the others are not collected. + custom_lineage_data (Union[dict, None]): Allows you to customize the data and present it on the MingInsight + lineage page. In the custom data, the key type support str, and the value type support str/int/float. + Default: None, it means there is no custom data. + + Raises: + ValueError: If the parameter value is not expected. + TypeError: If the parameter type is not expected. + RuntimeError: If an error occurs during data collection. 
+ + Examples: + >>> # Simple usage: + >>> summary_collector = SummaryCollector(summary_dir='./summary_dir') + >>> model.train(epoch, dataset, callbacks=summary_collector) + >>> + >>> # Do not collect metric and collect the first layer parameter, others are collected by default + >>> specified={'collect_metric': False, 'histogram_regular': '^conv1.*'} + >>> summary_collector = SummaryCollector(summary_dir='./summary_dir', collect_specified_data=specified) + >>> model.train(epoch, dataset, callbacks=summary_collector) + >>> + >>> # Only collect metric, custom lineage data and record data that collected by the summary operator, + >>> # others are not collected + >>> specified = {'collect_metric':True, 'custom_lineage_data': {'version': 'resnet50_v1'}} + >>> summary_collector = SummaryCollector('./summary_dir', + >>> collect_specified_data=specified, + >>> keep_default_action=False) + >>> model.train(epoch, dataset, callbacks=summary_collector) + """ + + _DEFAULT_SPECIFIED_DATA = { + 'collect_metric': True, + 'collect_graph': True, + 'collect_train_lineage': True, + 'collect_eval_lineage': True, + 'collect_input_data': True, + 'collect_dataset_graph': True, + 'histogram_regular': None + } + + # _OPTIMIZER_FAILED means find optimizer failed, so we will not collect data about optimizer. 
+ _OPTIMIZER_FAILED = 'Failed' + + def __init__(self, summary_dir, collect_freq=10, collect_specified_data=None, + keep_default_action=True, custom_lineage_data=None): + super(SummaryCollector, self).__init__() + + self._summary_dir = self._process_summary_dir(summary_dir) + self._record = None + + self._check_collect_freq(collect_freq) + self._collect_freq = collect_freq + + self._check_action(keep_default_action) + + self._collect_specified_data = self._process_specified_data(collect_specified_data, keep_default_action) + logger.info(f"For `collect_specified_data` the value after processing is: {self._collect_specified_data}.") + + self._check_custom_lineage_data(custom_lineage_data) + self._custom_lineage_data = custom_lineage_data + + self._optimizer = None + self._has_saved_train_network = False + self._has_saved_custom_data = False + self._is_parse_loss_success = True + + def __enter__(self): + self._record = SummaryRecord(log_dir=self._summary_dir) + return self + + def __exit__(self, *err): + self._record.close() + + @staticmethod + def _process_summary_dir(summary_dir): + """Check the summary dir, and create a new directory if it not exists.""" + check_value_type('summary_dir', summary_dir, str) + summary_dir = summary_dir.strip() + if not summary_dir: + raise ValueError('For `summary_dir` the value should be a valid string of path, but got empty string.') + + summary_dir = os.path.realpath(summary_dir) + if not os.path.exists(summary_dir): + os.makedirs(summary_dir, exist_ok=True) + else: + if not os.path.isdir(summary_dir): + raise NotADirectoryError('For `summary_dir` it should be a directory path.') + + return summary_dir + + @staticmethod + def _check_collect_freq(freq): + """Check collect freq type and value.""" + check_value_type('collect_freq', freq, int) + if freq <= 0: + raise ValueError(f'For `collect_freq` the value should be greater than 0, but got `{freq}`.') + + @staticmethod + def _check_custom_lineage_data(custom_lineage_data): + """ + 
Check user custom lineage data. + + Args: + custom_lineage_data (dict): The user custom defined data. + + Raises: + TypeError: If the type of parameters is invalid. + """ + if custom_lineage_data is None: + return + + check_value_type('custom_lineage_data', custom_lineage_data, [dict, type(None)]) + for key, value in custom_lineage_data.items(): + check_value_type(f'custom_lineage_data -> {key}', key, str) + check_value_type(f'the value of custom_lineage_data -> {key}', value, (int, str, float)) + + @staticmethod + def _check_action(action): + """Check action type.""" + check_value_type('keep_default_action', action, bool) + + def _process_specified_data(self, specified_data, action): + """Check specified data type and value.""" + if specified_data is None: + if action: + return self._DEFAULT_SPECIFIED_DATA + return None + + check_value_type('collect_specified_data', specified_data, [dict, type(None)]) + + for param_name in specified_data: + check_value_type(param_name, param_name, [str]) + + unexpected_params = set(specified_data) - set(self._DEFAULT_SPECIFIED_DATA) + if unexpected_params: + raise ValueError(f'For `collect_specified_data` the keys {unexpected_params} are unsupported.') + + if 'histogram_regular' in specified_data: + check_value_type('histogram_regular', specified_data.get('histogram_regular'), (str, type(None))) + + bool_items = set(self._DEFAULT_SPECIFIED_DATA) - {'histogram_regular'} + for item in bool_items: + if item in specified_data: + check_value_type(item, specified_data.get(item), bool) + + if action: + result = dict(self._DEFAULT_SPECIFIED_DATA).update(specified_data) + else: + result = specified_data + return result + + def begin(self, run_context): + cb_params = run_context.original_args() + self._check_callbacks(cb_params) + + if cb_params.mode not in ModeEnum.to_list(): + raise ValueError('Only support `train` (model.train) and `eval` (model.eval) mode, ' + 'but got `{cb_params.mode}` mode.') + + self._record.set_mode(cb_params.mode) 
+ if cb_params.mode == ModeEnum.TRAIN.value: + # Note: if model.init is not executed then the computed graph will not be obtained here + # The purpose of recording the graph here was to collect_freq if it was set to a large size, + # but also want to see the graph as soon after compilation. + self._collect_graphs(cb_params) + + self._collect_dataset_graph(cb_params) + + if self._custom_lineage_data and not self._has_saved_custom_data: + packaged_custom_data = self._package_custom_lineage_data(self._custom_lineage_data) + self._record.add_value('custom_lineage_data', 'custom_lineage_data', packaged_custom_data) + self._has_saved_custom_data = True + + # There's nothing special about setting step to 0 here, just to satisfy the interface call + self._record.record(step=0) + + def step_end(self, run_context): + cb_params = run_context.original_args() + + if cb_params.mode == ModeEnum.TRAIN.value: + if cb_params.cur_step_num % self._collect_freq: + return + + if not self._has_saved_train_network: + self._collect_graphs(cb_params) + + self._collect_input_data(cb_params) + self._collect_metric(cb_params) + self._collect_histogram(cb_params) + + self._record.record(cb_params.cur_step_num) + + def end(self, run_context): + cb_params = run_context.original_args() + if cb_params.mode == ModeEnum.TRAIN.value: + self._collect_train_lineage(cb_params) + else: + self._collect_eval_lineage(cb_params) + + # There's nothing special about setting step to 0 here, just to satisfy the interface call + self._record.record(step=0) + + def _check_callbacks(self, cb_params): + """Check there if there are duplicate instances of SummaryCollector.""" + callbacks = cb_params.list_callback + + is_find = False + for callback in callbacks: + if type(callback).__name__ == self.__class__.__name__: + if not is_find: + is_find = True + continue + raise ValueError(f"There are more than one {self.__class__.__name__} instance in callback list," + f"but expected only one {self.__class__.__name__} 
instance.") + + @staticmethod + def _package_custom_lineage_data(custom_lineage_data): + """ + Package user-defined lineage data into binary data. + + Args: + custom_lineage_data (dict): User custom lineage data. + + Returns: + UserDefinedInfo, a object of lineage_pb2.UserDefinedInfo. + """ + user_defined_info = lineage_pb2.UserDefinedInfo() + for key, value in custom_lineage_data.items(): + if isinstance(value, int): + attr_name = "map_int32" + elif isinstance(value, float): + attr_name = "map_double" + else: + attr_name = "map_str" + + user_info = user_defined_info.user_info.add() + getattr(user_info, attr_name)[key] = value + + return user_defined_info + + def _collect_input_data(self, cb_params): + """Only support to collect image data.""" + if not self._collect_specified_data.get('collect_input_data'): + return + + input_data = getattr(cb_params, 'train_dataset_element', None) + if input_data is None: + self._collect_specified_data['collect_input_data'] = False + logger.info("There is not a `train_dataset_element` in cb_params.") + return + + if isinstance(input_data, (list, tuple)): + input_data = input_data[0] + try: + self._record.add_value(PluginEnum.IMAGE.value, 'input_data/auto', input_data) + except ValueError: + self._collect_specified_data['collect_input_data'] = False + return + + def _collect_dataset_graph(self, cb_params): + """Only collect train dataset graph.""" + if not self._collect_specified_data.get('collect_dataset_graph'): + return + + # After analysis, we think that the validated dataset graph and the training dataset graph + # should be consistent under normal scenarios, so only the training dataset graph is collected. 
+ if cb_params.mode == ModeEnum.TRAIN.value: + train_dataset = cb_params.train_dataset + dataset_graph = DatasetGraph() + graph_bytes = dataset_graph.package_dataset_graph(train_dataset) + self._record.add_value('dataset_graph', 'train_dataset', graph_bytes) + + def _collect_graphs(self, cb_params): + """Collect the graph of train network and eval network.""" + if not self._collect_specified_data.get('collect_graph'): + return + + network = cb_params.train_network if cb_params.mode == ModeEnum.TRAIN.value else cb_params.eval_network + graph_proto = network.get_func_graph_proto() + if graph_proto is None: + return + + self._has_saved_train_network = True + self._record.add_value(PluginEnum.GRAPH.value, 'train_network/auto', graph_proto) + + def _collect_metric(self, cb_params): + """Collect metric, currently only collection Loss is supported.""" + if not self._collect_specified_data.get('collect_metric'): + return + + loss = self._get_loss(cb_params) + if loss is None: + return + self._record.add_value(PluginEnum.SCALAR.value, 'loss/auto', loss) + + def _get_loss(self, cb_params): + """ + Get loss from the network output. + + Args: + cb_params (_InternalCallbackParam): Callback parameters. + + Returns: + Union[Tensor, None], if parse loss success, will return a Tensor value(shape is [1]), else return None. + """ + if not self._is_parse_loss_success: + # If parsing has failed before, avoid repeating it + return None + + output = cb_params.net_outputs + if output is None: + logger.warning("Can not find any output by this network.") + self._is_parse_loss_success = False + return None + + if isinstance(output, (int, float)): + loss = output + elif isinstance(output, (list, tuple)): + # If the output is a list, since the default network returns loss first, + # we assume that the first one is loss. 
+ loss = output[0] + elif isinstance(output, Tensor) and (not output.shape or output.shape == [1]): + loss_numpy = output.asnumpy() + loss = float(np.atleast_1d(loss_numpy)[0]) + else: + logger.warning("The output type could not be identified, so no loss was recorded in SummaryCollector.") + self._is_parse_loss_success = False + return None + + if not isinstance(loss, Tensor): + loss = Tensor(loss) + + return loss + + def _get_optimizer(self, cb_params): + """ + Get optimizer from the cb_params or parse from the network. + + Args: + cb_params (_InternalCallbackParam): Callback parameters. + + Returns: + Union[Optimizer, None], if parse optimizer success, will return a optimizer, else return None. + """ + if self._optimizer == self._OPTIMIZER_FAILED: + return None + + if self._optimizer is not None: + return self._optimizer + + optimizer = cb_params.optimizer + if optimizer is None: + network = cb_params.train_network if cb_params.mode == 'train' else cb_params.eval_work + optimizer = self._parse_optimizer_by_network(network) + + if optimizer is None or not isinstance(optimizer, Optimizer): + logger.warning("Can not find optimizer in network, or the optimizer does not inherit Mindpore's optimizer, " + "so we will not collect data about optimizer in SummaryCollector.") + optimizer = self._OPTIMIZER_FAILED + + return optimizer + + @staticmethod + def _parse_optimizer_by_network(network): + """Parse optimizer from network, if parse success will return a optimizer, else return None.""" + optimizer = None + for _, cell in network.cells_and_names(): + try: + optimizer = getattr(cell, 'optimizer') + except AttributeError: + continue + + if not isinstance(optimizer, Optimizer): + continue + + # Optimizer found successfully + break + + return optimizer + + def _collect_histogram(self, cb_params): + """Collect histogram data, contain the parameter weight and bias.""" + # Note: if there is not a key named `histogram_regular` in `self._collect_specified_data`, + # it means we 
will not collect histogram data. + if 'histogram_regular' not in self._collect_specified_data: + return + + self._optimizer = self._get_optimizer(cb_params) + if self._optimizer is None: + return + + parameters = self._optimizer.parameters + regular = self._collect_specified_data.get('histogram_regular') + if regular is not None: + for parameter in parameters: + if re.match(regular, parameter.name): + self._record.add_value(PluginEnum.HISTOGRAM.value, parameter.name+'/auto', parameter.data) + return + + # Note: If `histogram_regular` in `self._collect_specified_data` and the value is None, + # we will collect the first five parameters. + default_parameter_count = 5 + for parameter in parameters[:default_parameter_count]: + self._record.add_value(PluginEnum.HISTOGRAM.value, parameter.name+'/auto', parameter.data) + + @staticmethod + def _get_learning_rate(optimizer): + """ + parse the learning rate from optimizer. + + Args: + optimizer (Optimizer): A optimizer which inherit the MindSpore Optimizer class. + + Returns: + Union[Tensor, None], if parse learning rate success, will return a Tensor, else return None. 
+ """ + learning_rate = optimizer.learning_rate + if not isinstance(learning_rate, Parameter): + logger.info("The learning rate detected in the optimizer is not a Parameter type, so it is not recorded.") + return None + return learning_rate.data + + def _collect_train_lineage(self, cb_params): + """Collect train lineage data, the detail refer to lineage_pb2.TrainLineage.""" + if not self._collect_specified_data.get('collect_train_lineage'): + return + train_lineage = {} + loss = self._get_loss(cb_params) + if loss: + loss_numpy = loss.asnumpy() + loss = float(np.atleast_1d(loss_numpy)[0]) + train_lineage[LineageMetadata.loss] = loss + else: + train_lineage[LineageMetadata.loss] = None + + optimizer = self._get_optimizer(cb_params) + learning_rate = self._get_learning_rate(optimizer) + + if learning_rate is not None: + train_lineage[LineageMetadata.learning_rate] = list(np.atleast_1d(learning_rate.asnumpy()))[0] + else: + train_lineage[LineageMetadata.learning_rate] = None + train_lineage[LineageMetadata.optimizer] = type(optimizer).__name__ if optimizer else None + train_lineage[LineageMetadata.train_network] = self._get_backbone(cb_params.train_network) + + loss_fn = self._get_loss_fn(cb_params) + train_lineage[LineageMetadata.loss_function] = type(loss_fn).__name__ if loss_fn else None + + train_lineage[LineageMetadata.epoch] = cb_params.epoch_num + train_lineage[LineageMetadata.step_num] = cb_params.cur_step_num + train_lineage[LineageMetadata.parallel_mode] = cb_params.parallel_mode + train_lineage[LineageMetadata.device_num] = cb_params.device_number + train_lineage[LineageMetadata.batch_size] = cb_params.batch_num + + ckpt_file_path = self._get_ckpt_file_path(cb_params) + train_lineage[LineageMetadata.model_path] = json.dumps(dict(ckpt=ckpt_file_path)) + + model_size = os.path.getsize(ckpt_file_path) if ckpt_file_path else 0 + train_lineage[LineageMetadata.model_size] = model_size + + self._parse_dataset(cb_params, train_lineage) + + train_lineage_message = 
self._package_train_lineage_message(train_lineage) + + self._record.add_value(PluginEnum.TRAIN_LINEAGE.value, 'train_lineage', train_lineage_message) + + @staticmethod + def _package_train_lineage_message(train_lineage): + """ + Package train lineage data into binary data. + + Args: + train_lineage (dict): The train lineage dict, refer to the attribute of `_collect_train_lineage` method. + + Returns: + TrainLineage, a object of lineage_pb2.TrainLineage. + """ + lineage_message = lineage_pb2.TrainLineage() + + if train_lineage.get(LineageMetadata.train_network) is not None: + lineage_message.algorithm.network = train_lineage.get(LineageMetadata.train_network) + if train_lineage.get(LineageMetadata.loss) is not None: + lineage_message.algorithm.loss = train_lineage.get(LineageMetadata.loss) + + # Construct train_dataset message. + if train_lineage.get(LineageMetadata.train_dataset_path) is not None: + lineage_message.train_dataset.train_dataset_path = train_lineage.get(LineageMetadata.train_dataset_path) + if train_lineage.get(LineageMetadata.train_dataset_size) is not None: + lineage_message.train_dataset.train_dataset_size = train_lineage.get(LineageMetadata.train_dataset_size) + + # Construct model message + lineage_message.model.path = train_lineage.get(LineageMetadata.model_path) + lineage_message.model.size = train_lineage.get(LineageMetadata.model_size) + + # Construct hyper_parameters message. 
+ if train_lineage.get(LineageMetadata.learning_rate) is not None: + lineage_message.hyper_parameters.learning_rate = train_lineage.get(LineageMetadata.learning_rate) + if train_lineage.get(LineageMetadata.optimizer) is not None: + lineage_message.hyper_parameters.optimizer = train_lineage.get(LineageMetadata.optimizer) + if train_lineage.get(LineageMetadata.loss_function) is not None: + lineage_message.hyper_parameters.loss_function = train_lineage.get(LineageMetadata.loss_function) + if train_lineage.get(LineageMetadata.parallel_mode) is not None: + lineage_message.hyper_parameters.parallel_mode = train_lineage.get(LineageMetadata.parallel_mode) + + lineage_message.hyper_parameters.epoch = train_lineage.get(LineageMetadata.epoch) + lineage_message.hyper_parameters.device_num = train_lineage.get(LineageMetadata.device_num) + lineage_message.hyper_parameters.batch_size = train_lineage.get(LineageMetadata.batch_size) + + return lineage_message + + def _parse_dataset(self, cb_params, lineage_dict): + """ + Analyze Dataset to get the dataset path and dataset size. + + Args: + cb_params (_InternalCallbackParam): Callback parameters. + lineage_dict (dict): The lineage dict, refer to the attribute + of `_collect_train_lineage` method or `_collect_eval_lineage`. + + Returns: + dict, the lineage metadata. 
+ """ + dataset = cb_params.train_dataset if cb_params.mode == ModeEnum.TRAIN.value else cb_params.valid_dataset + + try: + dataset_path = self._get_dataset_path(dataset) + except IndexError: + dataset_path = None + + if dataset_path and os.path.isfile(dataset_path): + dataset_dir = os.path.dirname(dataset_path) + else: + dataset_dir = dataset_path + + batch_num = dataset.get_dataset_size() + batch_size = dataset.get_batch_size() + dataset_size = int(batch_num * batch_size) + + if cb_params.mode == ModeEnum.TRAIN.value: + lineage_dict[LineageMetadata.train_dataset_path] = dataset_dir + lineage_dict[LineageMetadata.train_dataset_size] = dataset_size + else: + lineage_dict[LineageMetadata.valid_dataset_path] = dataset_dir + lineage_dict[LineageMetadata.valid_dataset_size] = dataset_size + + return lineage_dict + + def _get_dataset_path(self, output_dataset): + """ + Get dataset path of MindDataset object. + + Args: + output_dataset (Union[Dataset, ImageFolderDatasetV2, MnistDataset, Cifar10Dataset, Cifar100Dataset, + VOCDataset, CelebADataset, MindDataset, ManifestDataset, TFRecordDataset, TextFileDataset]): + Refer to mindspore.dataset.Dataset. + + Returns: + str, dataset path. + + Raises: + IndexError: it means get dataset path failed. 
+ """ + dataset_package = import_module('mindspore.dataset') + dataset_dir_set = (dataset_package.ImageFolderDatasetV2, dataset_package.MnistDataset, + dataset_package.Cifar10Dataset, dataset_package.Cifar100Dataset, + dataset_package.VOCDataset, dataset_package.CelebADataset) + dataset_file_set = (dataset_package.MindDataset, dataset_package.ManifestDataset) + dataset_files_set = (dataset_package.TFRecordDataset, dataset_package.TextFileDataset) + + if isinstance(output_dataset, dataset_file_set): + return output_dataset.dataset_file + if isinstance(output_dataset, dataset_dir_set): + return output_dataset.dataset_dir + if isinstance(output_dataset, dataset_files_set): + return output_dataset.dataset_files[0] + return self._get_dataset_path(output_dataset.input[0]) + + @staticmethod + def _get_ckpt_file_path(cb_params): + """ + Get checkpoint file path from MindSpore callback list. + + Args: + cb_params (_InternalCallbackParam): Callback parameters. + + Returns: + Union[str, None], if parse success will checkpoint file absolute path, else return None. + """ + callbacks = cb_params.list_callback + ckpt_file_path = None + for callback in callbacks: + if isinstance(callback, ModelCheckpoint): + ckpt_file_path = callback.latest_ckpt_file_name + + if ckpt_file_path: + ckpt_file_path = os.path.realpath(ckpt_file_path) + + return ckpt_file_path + + @staticmethod + def _get_backbone(network): + """ + Get the name of backbone network. + + Args: + network (Cell): The train network. + + Returns: + Union[str, None], If parse success, will return the name of the backbone network, else return None. 
+ """ + backbone_name = None + backbone_key = '_backbone' + + for _, cell in network.cells_and_names(): + if hasattr(cell, backbone_key): + backbone_network = getattr(cell, backbone_key) + backbone_name = type(backbone_network).__name__ + + if backbone_name is None and network is not None: + backbone_name = type(network).__name__ + + return backbone_name + + @staticmethod + def _get_loss_fn(cb_params): + """ + Get loss function by cb_params and analyzing network. + + Args: + cb_params (_InternalCallbackParam): Callback parameters. + + Returns: + Union[Loss_fn, None], a Cell object, if parse failed, will return None. + """ + loss_fn = cb_params.loss_fn + if loss_fn is not None: + return loss_fn + + if cb_params.mode == ModeEnum.TRAIN.value: + network = cb_params.train_network + else: + network = cb_params.eval_network + + for _, cell in network.cells_and_names(): + if isinstance(cell, _Loss): + loss_fn = cell + break + return loss_fn + + def _collect_eval_lineage(self, cb_params): + """Collect eval lineage data, the detail refer to lineage_pb2.EvaluationLineage.""" + if not self._collect_specified_data.get('collect_eval_lineage'): + return + eval_lineage = dict() + + eval_lineage[LineageMetadata.metrics] = json.dumps(cb_params.metrics) + self._parse_dataset(cb_params, eval_lineage) + + eval_lineage_message = self._package_eval_lineage_message(eval_lineage) + self._record.add_value(PluginEnum.EVAL_LINEAGE.value, 'eval_lineage', eval_lineage_message) + + @staticmethod + def _package_eval_lineage_message(eval_lineage): + """ + Package eval lineage data into binary data. + + Args: + eval_lineage (dict): The eval lineage dict, refer to the attribute of `_collect_eval_lineage` method. + + Returns: + EvaluationLineage, a object of lineage_pb2.EvaluationLineage. 
+ """ + lineage_message = lineage_pb2.EvaluationLineage() + + if eval_lineage.get(LineageMetadata.metrics) is not None: + lineage_message.metric = eval_lineage.get(LineageMetadata.metrics) + if eval_lineage.get(LineageMetadata.valid_dataset_path) is not None: + lineage_message.valid_dataset.valid_dataset_path = eval_lineage.get(LineageMetadata.valid_dataset_path) + if eval_lineage.get(LineageMetadata.valid_dataset_size) is not None: + lineage_message.valid_dataset.valid_dataset_size = eval_lineage.get(LineageMetadata.valid_dataset_size) + + return lineage_message diff --git a/mindspore/train/callback/_time_monitor.py b/mindspore/train/callback/_time_monitor.py new file mode 100644 index 0000000000..9fbdf83aa8 --- /dev/null +++ b/mindspore/train/callback/_time_monitor.py @@ -0,0 +1,35 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""TimeMonitor Callback class.""" + +import time + +from ._callback import Callback + + +class TimeMonitor(Callback): + """Time Monitor.""" + + def __init__(self, data_size): + super(TimeMonitor, self).__init__() + self.data_size = data_size + + def epoch_begin(self, run_context): + self.epoch_time = time.time() + + def epoch_end(self, run_context): + epoch_mseconds = (time.time() - self.epoch_time) * 1000 + per_step_mseconds = epoch_mseconds / self.data_size + print("Epoch time: {:5.3f}, per step time: {:5.3f}".format(epoch_mseconds, per_step_mseconds), flush=True) diff --git a/mindspore/train/dataset_helper.py b/mindspore/train/dataset_helper.py index 52797b631c..14797e568b 100644 --- a/mindspore/train/dataset_helper.py +++ b/mindspore/train/dataset_helper.py @@ -13,13 +13,22 @@ # limitations under the License. # ============================================================================ """Dataset help for minddata dataset""" +import math + from mindspore._checkparam import check_bool from .. 
import context -from .parallel_utils import ParallelMode from ._utils import _exec_datagraph, _get_types_and_shapes, _to_tensor, \ _construct_tensor_list, _to_full_shapes, _to_full_tensor from ..nn.wrap import GetNextSingleOp -from ..parallel._utils import _get_device_num, _get_global_rank, _get_parallel_mode +from ..parallel._utils import _get_device_num, _get_global_rank, _need_to_full + + +def _send_data(dataset): + """Engine dataset to write data to tdt queue.""" + if not hasattr(dataset, '__has_sent__'): + exec_dataset = dataset.__TRANSFER_DATASET__ + exec_dataset.send() + dataset.__has_sent__ = True class DatasetHelper: @@ -74,13 +83,19 @@ class DatasetHelper: class _DatasetIter: """Base iter for dataset help""" def __init__(self, dataset): - self.loop_size = 1 + if not hasattr(dataset, '__loop_size__'): + self.loop_size = dataset.get_dataset_size() + else: + self.loop_size = dataset.__loop_size__ + if not hasattr(dataset, '__ME_INITED__'): - if not hasattr(dataset, '__loop_size__'): - self.loop_size = dataset.get_dataset_size() - else: - self.loop_size = dataset.__loop_size__ - dataset.__ME_INITED__ = _exec_datagraph(dataset, self.loop_size).queue_name + dataset.__TRANSFER_DATASET__ = _exec_datagraph(dataset, self.loop_size) + dataset.__ME_INITED__ = dataset.__TRANSFER_DATASET__.queue_name + + if not hasattr(dataset, '__no_send__'): + _send_data(dataset) + else: + _send_data(dataset) self.ind = 0 self.dataset = dataset @@ -104,10 +119,10 @@ class _DatasetIter: loop_count = 1 if hasattr(dataset, '__loop_size__'): loop_size = dataset.__loop_size__ - if dataset.get_dataset_size() % loop_size != 0: + if loop_size <= dataset.get_dataset_size() and dataset.get_dataset_size() % loop_size != 0: raise ValueError(f'Dataset size {dataset.get_dataset_size()} and ' f'loop_size {loop_size} are not matched.') - loop_count = int(dataset.get_dataset_size() / loop_size) + loop_count = math.ceil(dataset.get_dataset_size() / loop_size) return loop_count @@ -116,10 +131,10 @@ 
class _DatasetIterMSLoopSink(_DatasetIter): def __init__(self, dataset): super(_DatasetIterMSLoopSink, self).__init__(dataset) self.loop_count = self.get_loop_count(dataset) - # for self._parallel_mode equal to semi_auto_parallel or auto_parallel, use a complete tensor to - # compile, and slice tensor to run. The batch dimension of tensors for compile is device_number - # times the batch dimension of tensors for run. Now only support LoopSink. - if _get_parallel_mode() in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL): + # for self._parallel_mode equal to semi_auto_parallel or auto_parallel, and not using full_batch, + # use a complete tensor to compile, and slice tensor to run. The batch dimension of tensors for + # compile is device_number times the batch dimension of tensors for run. Now only support LoopSink. + if _need_to_full(): device_num = _get_device_num() self.dataset_shapes = _to_full_shapes(self.dataset_shapes, device_num) @@ -144,10 +159,8 @@ class _DatasetIterGE(_DatasetIter): def __init__(self, dataset): super(_DatasetIterGE, self).__init__(dataset) self.loop_count = self.get_loop_count(dataset) - parallel_mode = _get_parallel_mode() - self.need_to_full = parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL) batch_expand_num = 1 - if self.need_to_full: + if _need_to_full(): batch_expand_num = _get_device_num() tensor_list_run = _construct_tensor_list(self.dataset_types, self.dataset_shapes, batch_expand_num) @@ -168,9 +181,6 @@ class _DatasetIterFeed: self.loop_count = dataset.get_dataset_size() self.ind = 0 - parallel_mode = context.get_auto_parallel_context("parallel_mode") - self.need_to_full = parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL) - def __iter__(self): if self.repeat_ind % self.repeat_count == 0: self.iter = self.dataset.__iter__() @@ -184,6 +194,6 @@ class _DatasetIterFeed: raise StopIteration() self.ind += 1 data = self.iter.__next__() - if self.need_to_full: + if 
_need_to_full(): return _to_full_tensor(data, self.device_num, self.global_rank) return _to_tensor(data) diff --git a/mindspore/train/model.py b/mindspore/train/model.py index d2b5d4f5d8..79bd6bc90b 100755 --- a/mindspore/train/model.py +++ b/mindspore/train/model.py @@ -13,13 +13,15 @@ # limitations under the License. # ============================================================================ """Model.""" +from collections.abc import Iterable + import numpy as np from mindspore import log as logger from ..common.tensor import Tensor from ..nn.metrics import get_metrics from .._checkparam import check_input_data, check_output_data, check_int_positive, check_bool -from .callback import _InternalCallbackParam, RunContext, _build_callbacks +from .callback import _InternalCallbackParam, RunContext, _CallbackManager from .. import context from ..parallel._utils import _get_parallel_mode, _get_device_num, _get_global_rank, \ _get_parameter_broadcast, _device_number_check, _parameter_broadcast_check @@ -54,10 +56,13 @@ class Model: value would be passed to `Loss` metric, predict value and label would be passed to other metric. Default: None. amp_level (str): Option for argument `level` in `mindspore.amp.build_train_network`, level for mixed - precision training. Supports [O0, O2]. Default: "O0". + precision training. Supports [O0, O2, O3]. Default: "O0". - O0: Do not change. - O2: Cast network to float16, keep batchnorm run in float32, using dynamic loss scale. + - O3: Cast network to float16, with additional property 'keep_batchnorm_fp32=False'. + + O2 is recommended on GPU, O3 is recommended on Ascend. loss_scale_manager (Union[None, LossScaleManager]): If None, not scale the loss, or else scale the loss by LossScaleManager. If it is set, overwrite the level setting. It's a eyword argument. 
@@ -111,7 +116,7 @@ class Model: self._build_predict_network() def _process_amp_args(self, kwargs): - if self._amp_level == "O0": + if self._amp_level in ["O0", "O3"]: self._keep_bn_fp32 = False if 'keep_batchnorm_fp32' in kwargs: self._keep_bn_fp32 = kwargs['keep_batchnorm_fp32'] @@ -169,6 +174,8 @@ class Model: self._eval_indexes = [0, 1, 2] if self._parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL): + if self._optimizer: + self._eval_network = _VirtualDatasetCell(self._eval_network) self._eval_network.set_auto_parallel() def _build_predict_network(self): @@ -281,7 +288,7 @@ class Model: if self._parameter_broadcast: self._train_network.set_broadcast_flag() - + train_dataset.__no_send__ = True train_dataset_helper, train_network = self._exec_preprocess(self._train_network, is_train=True, phase='train', @@ -298,6 +305,7 @@ class Model: self._eval_network.set_train(False) self._eval_network.phase = 'eval' + valid_dataset.__no_send__ = True valid_dataset_helper, eval_network = self._exec_preprocess(self._eval_network, is_train=False, phase='eval', @@ -330,8 +338,6 @@ class Model: if self._parameter_broadcast: self._train_network.set_broadcast_flag() - # build callback list - list_callback = _build_callbacks(callbacks) cb_params = _InternalCallbackParam() cb_params.train_network = self._train_network cb_params.epoch_num = epoch @@ -342,17 +348,30 @@ class Model: cb_params.parallel_mode = self._parallel_mode cb_params.device_number = self._device_number cb_params.train_dataset = train_dataset - cb_params.list_callback = list_callback + cb_params.list_callback = self._transform_callbacks(callbacks) + cb_params.train_dataset_element = None - if dataset_sink_mode: - if context.get_context("mode") == context.PYNATIVE_MODE: + # build callback list + with _CallbackManager(callbacks) as list_callback: + if not dataset_sink_mode: + self._train_process(epoch, train_dataset, list_callback, cb_params) + elif context.get_context("mode") == 
context.PYNATIVE_MODE: logger.warning("The pynative mode cannot support dataset sink mode currently." "So the training process will be performed with dataset not sink.") self._train_process(epoch, train_dataset, list_callback, cb_params) else: self._train_dataset_sink_process(epoch, train_dataset, list_callback, cb_params) - else: - self._train_process(epoch, train_dataset, list_callback, cb_params) + + @staticmethod + def _transform_callbacks(callbacks): + """Transform callback to a list.""" + if callbacks is None: + return [] + + if isinstance(callbacks, Iterable): + return list(callbacks) + + return [callbacks] def _train_dataset_sink_process(self, epoch, train_dataset, list_callback=None, cb_params=None): """ @@ -365,7 +384,7 @@ class Model: returned and passed to the network. Otherwise, a tuple (data, label) should be returned, and the data and label are passed to the network and loss function respectively. - list_callback (_ListCallback): Executor of callback list. Default: None. + list_callback (Callback): Executor of callback list. Default: None. cb_params (_InternalCallbackParam): Callback parameters. Default: None. """ dataset_helper, train_network = self._exec_preprocess(self._train_network, @@ -413,7 +432,7 @@ class Model: returned and passed to the network. Otherwise, a tuple (data, label) should be returned, and the data and label are passed to the network and loss function respectively. - list_callback (_ListCallback): Executor of callback list. Default: None. + list_callback (Callback): Executor of callback list. Default: None. cb_params (_InternalCallbackParam): Callback parameters. Default: None. 
""" dataset_helper, _ = self._exec_preprocess(self._train_network, @@ -445,6 +464,7 @@ class Model: scaling_sens = self._get_scaling_sens() next_element = tuple(next_element) + (Tensor(scaling_sens, mstype.float32),) + cb_params.train_dataset_element = next_element outputs = self._train_network(*next_element) cb_params.net_outputs = outputs if self._loss_scale_manager and self._loss_scale_manager.get_drop_overflow_update(): @@ -520,7 +540,7 @@ class Model: Args: valid_dataset (Dataset): Dataset to evaluate the model. - list_callback (ListCallback): Executor of callback list. Default: None. + list_callback (Callback): Executor of callback list. Default: None. cb_params (_InternalCallbackParam): Callback parameters. Default: None. Returns: @@ -559,7 +579,7 @@ class Model: Args: valid_dataset (Dataset): Dataset to evaluate the model. - list_callback (ListCallback): Executor of callback list. Default: None. + list_callback (Callback): Executor of callback list. Default: None. cb_params (_InternalCallbackParam): Callback parameters. Default: None. 
Returns: @@ -618,22 +638,23 @@ class Model: if not self._metric_fns: raise ValueError("metric fn can not be None or empty.") - list_callback = _build_callbacks(callbacks) cb_params = _InternalCallbackParam() cb_params.eval_network = self._eval_network cb_params.valid_dataset = valid_dataset cb_params.batch_num = valid_dataset.get_dataset_size() cb_params.mode = "eval" cb_params.cur_step_num = 0 + cb_params.list_callback = self._transform_callbacks(callbacks) self._eval_network.set_train(mode=False) self._eval_network.phase = 'eval' self._clear_metrics() - if dataset_sink_mode: - return self._eval_dataset_sink_process(valid_dataset, list_callback, cb_params) - return self._eval_process(valid_dataset, list_callback, cb_params) + with _CallbackManager(callbacks) as list_callback: + if dataset_sink_mode: + return self._eval_dataset_sink_process(valid_dataset, list_callback, cb_params) + return self._eval_process(valid_dataset, list_callback, cb_params) def predict(self, *predict_data): """ diff --git a/mindspore/train/quant/__init__.py b/mindspore/train/quant/__init__.py index 531db34b2b..51e8c20ded 100644 --- a/mindspore/train/quant/__init__.py +++ b/mindspore/train/quant/__init__.py @@ -15,10 +15,10 @@ """ quantization. -User can use aware quantization to train a model. Mindspore supports quantization aware training, +User can use quantization aware to train a model. MindSpore supports quantization aware training, which models quantization errors in both the forward and backward passes using fake-quantization ops. Note that the entire computation is carried out in floating point. At the end of quantization -aware training, Mindspore provides conversion functions to convert the trained model into lower precision. +aware training, MindSpore provides conversion functions to convert the trained model into lower precision. 
""" from .quant import convert_quant_network diff --git a/mindspore/train/quant/quant.py b/mindspore/train/quant/quant.py index e2a035bc77..937e54a7e4 100644 --- a/mindspore/train/quant/quant.py +++ b/mindspore/train/quant/quant.py @@ -12,15 +12,27 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ -"""aware quantization.""" +"""quantization aware.""" +import copy import re -from ... import nn -from ... import ops + +import numpy as np +import mindspore.context as context + +from ... import log as logger +from ... import nn, ops from ..._checkparam import ParamValidator as validator from ..._checkparam import Rel -from ...nn.layer import combined +from ...common import Tensor +from ...common import dtype as mstype +from ...common.api import _executor from ...nn.layer import quant +from ...ops import functional as F +from ...ops.operations import _inner_ops as inner +from ...train import serialization +from . import quant_utils + _ACTIVATION_MAP = {nn.ReLU: quant.ReLUQuant, nn.ReLU6: quant.ReLU6Quant, @@ -28,25 +40,21 @@ _ACTIVATION_MAP = {nn.ReLU: quant.ReLUQuant, nn.HSwish: quant.HSwishQuant} -class _AddFakeQuantInputOutput(nn.Cell): +class _AddFakeQuantInput(nn.Cell): """ Add FakeQuant at input and output of the Network. Only support one input and one output case. 
""" def __init__(self, network, quant_delay=0): - super(_AddFakeQuantInputOutput, self).__init__(auto_prefix=False) + super(_AddFakeQuantInput, self).__init__(auto_prefix=False) self.network = network self.fake_quant_input = quant.FakeQuantWithMinMax( min_init=-6, max_init=6, quant_delay=quant_delay, ema=True) self.fake_quant_input.update_parameters_name('fake_quant_input') - self.fake_quant_output = quant.FakeQuantWithMinMax( - min_init=-6, max_init=6, quant_delay=quant_delay, ema=True) - self.fake_quant_output.update_parameters_name('fake_quant_output') def construct(self, data): data = self.fake_quant_input(data) output = self.network(data) - output = self.fake_quant_output(output) return output @@ -55,14 +63,17 @@ class _AddFakeQuantAfterSubCell(nn.Cell): Add FakeQuant after of the sub Cell. """ - def __init__(self, subcell, quant_delay=0, num_bits=8): + def __init__(self, subcell, **kwargs): super(_AddFakeQuantAfterSubCell, self).__init__(auto_prefix=False) self.subcell = subcell self.fake_quant_act = quant.FakeQuantWithMinMax(min_init=-6, max_init=6, - num_bits=num_bits, - quant_delay=quant_delay, - ema=True) + ema=True, + num_bits=kwargs["num_bits"], + quant_delay=kwargs["quant_delay"], + per_channel=kwargs["per_channel"], + symmetric=kwargs["symmetric"], + narrow_range=kwargs["narrow_range"]) def construct(self, *data): output = self.subcell(*data) @@ -76,30 +87,22 @@ class ConvertToQuantNetwork: """ __quant_op_name__ = ["TensorAdd", "Sub", "Mul", "RealDiv"] - def __init__(self, - network, - quant_delay=0, - bn_fold=False, - freeze_bn=0, - weight_bits=8, - act_bits=8, - per_channel=False, - symmetric=False, - narrow_range=False): - self.network = validator.check_isinstance( - 'network', network, (nn.Cell,)) - self.quant_delay = validator.check_integer( - "quant delay", quant_delay, 0, Rel.GE) - self.freeze_bn = validator.check_integer( - "freeze bn", freeze_bn, 0, Rel.GE) - self.weight_bits = validator.check_integer( - "weights bit", weight_bits, 0, Rel.GE) 
- self.act_bits = validator.check_integer( - "activations bit", act_bits, 0, Rel.GE) - self.bn_fold = validator.check_bool("bn fold", bn_fold) - self.per_channel = validator.check_bool("per channel", per_channel) - self.symmetric = validator.check_bool("symmetric", symmetric) - self.narrow_range = validator.check_bool("narrow range", narrow_range) + def __init__(self, **kwargs): + self.network = validator.check_isinstance('network', kwargs["network"], (nn.Cell,)) + self.weight_qdelay = validator.check_integer("quant delay", kwargs["quant_delay"][0], 0, Rel.GE) + self.act_qdelay = validator.check_integer("quant delay", kwargs["quant_delay"][-1], 0, Rel.GE) + self.bn_fold = validator.check_bool("bn fold", kwargs["bn_fold"]) + self.freeze_bn = validator.check_integer("freeze bn", kwargs["freeze_bn"], 0, Rel.GE) + self.weight_bits = validator.check_integer("weights bit", kwargs["num_bits"][0], 0, Rel.GE) + self.act_bits = validator.check_integer("activations bit", kwargs["num_bits"][-1], 0, Rel.GE) + self.weight_channel = validator.check_bool("per channel", kwargs["per_channel"][0]) + self.act_channel = validator.check_bool("per channel", kwargs["per_channel"][-1]) + self.weight_symmetric = validator.check_bool("symmetric", kwargs["symmetric"][0]) + self.act_symmetric = validator.check_bool("symmetric", kwargs["symmetric"][-1]) + self.weight_range = validator.check_bool("narrow range", kwargs["narrow_range"][0]) + self.act_range = validator.check_bool("narrow range", kwargs["narrow_range"][-1]) + self._convert_method_map = {quant.Conv2dBnAct: self._convert_conv, + quant.DenseBnAct: self._convert_dense} def _convert_op_name(self, name): pattern = re.compile(r'([A-Z]{1})') @@ -111,6 +114,7 @@ class ConvertToQuantNetwork: def run(self): self.network.update_cell_prefix() network = self._convert_subcells2quant(self.network) + network = _AddFakeQuantInput(network) return network def _convert_subcells2quant(self, network): @@ -123,15 +127,9 @@ class ConvertToQuantNetwork: 
subcell = cells[name] + if subcell == network: + continue - elif isinstance(subcell, combined.Conv2d): - prefix = subcell.param_prefix - new_subcell = self._convert_conv(subcell) - new_subcell.update_parameters_name(prefix + '.') - network.insert_child_to_cell(name, new_subcell) - change = True - elif isinstance(subcell, combined.Dense): + elif isinstance(subcell, (quant.Conv2dBnAct, quant.DenseBnAct)): prefix = subcell.param_prefix - new_subcell = self._convert_dense(subcell) + new_subcell = self._convert_method_map[type(subcell)](subcell) new_subcell.update_parameters_name(prefix + '.') network.insert_child_to_cell(name, new_subcell) change = True @@ -150,7 +148,12 @@ class ConvertToQuantNetwork: add_list.append((name, attr)) for name, prim_op in add_list: prefix = name - add_quant = _AddFakeQuantAfterSubCell(prim_op) # quant.TensorAddQuant() + add_quant = _AddFakeQuantAfterSubCell(prim_op, + num_bits=self.act_bits, + quant_delay=self.act_qdelay, + per_channel=self.act_channel, + symmetric=self.act_symmetric, + narrow_range=self.act_range) prefix = '.'.join([network.param_prefix, self._convert_op_name(prim_op.name)]) add_quant.update_parameters_name(prefix + '.') del network.__dict__[name] @@ -159,7 +162,7 @@ class ConvertToQuantNetwork: def _convert_conv(self, subcell): """ - convet conv cell to combine cell + convert conv cell to quant cell """ conv_inner = subcell.conv bn_inner = subcell.batchnorm @@ -174,13 +177,13 @@ class ConvertToQuantNetwork: group=conv_inner.group, eps=bn_inner.eps, momentum=bn_inner.momentum, - quant_delay=self.quant_delay, + quant_delay=self.weight_qdelay, freeze_bn=self.freeze_bn, - per_channel=self.per_channel, + per_channel=self.weight_channel, num_bits=self.weight_bits, fake=True, - symmetric=self.symmetric, - narrow_range=self.narrow_range) + symmetric=self.weight_symmetric, + narrow_range=self.weight_range) del subcell.batchnorm subcell.batchnorm = None subcell.has_bn = False @@ -194,16 +197,22 @@ class ConvertToQuantNetwork: 
dilation=conv_inner.dilation, group=conv_inner.group, has_bias=conv_inner.has_bias, - quant_delay=self.quant_delay, - per_channel=self.per_channel, + quant_delay=self.weight_qdelay, + per_channel=self.weight_channel, num_bits=self.weight_bits, - symmetric=self.symmetric, - narrow_range=self.narrow_range) + symmetric=self.weight_symmetric, + narrow_range=self.weight_range) subcell.conv = conv_inner - if subcell.activation is not None: + if subcell.has_act and subcell.activation is not None: subcell.activation = self._convert_activation(subcell.activation) else: - subcell = _AddFakeQuantAfterSubCell(subcell) + subcell.has_act = True + subcell.activation = _AddFakeQuantAfterSubCell(F.identity, + num_bits=self.act_bits, + quant_delay=self.act_qdelay, + per_channel=self.act_channel, + symmetric=self.act_symmetric, + narrow_range=self.act_range) return subcell def _convert_dense(self, subcell): @@ -214,12 +223,22 @@ class ConvertToQuantNetwork: dense_inner = quant.DenseQuant(dense_inner.in_channels, dense_inner.out_channels, has_bias=dense_inner.has_bias, - quant_delay=self.quant_delay, - per_channel=self.per_channel, - num_bits=self.weight_bits) + num_bits=self.weight_bits, + quant_delay=self.weight_qdelay, + per_channel=self.weight_channel, + symmetric=self.weight_symmetric, + narrow_range=self.weight_range) subcell.dense = dense_inner - if subcell.activation is not None: + if subcell.has_act and subcell.activation is not None: subcell.activation = self._convert_activation(subcell.activation) + else: + subcell.has_act = True + subcell.activation = _AddFakeQuantAfterSubCell(F.identity, + num_bits=self.act_bits, + quant_delay=self.act_qdelay, + per_channel=self.act_channel, + symmetric=self.act_symmetric, + narrow_range=self.act_range) return subcell def _convert_activation(self, activation): @@ -227,36 +246,210 @@ class ConvertToQuantNetwork: if act_class not in _ACTIVATION_MAP: raise ValueError( "Unsupported activation in auto Quant: ", act_class) - return 
_ACTIVATION_MAP[act_class](num_bits=self.act_bits, quant_delay=self.quant_delay) + return _ACTIVATION_MAP[act_class](num_bits=self.act_bits, + quant_delay=self.act_qdelay, + per_channel=self.act_channel, + symmetric=self.weight_symmetric, + narrow_range=self.weight_range) + + +class ExportQuantNetworkDeploy: + """ + Convert quantization aware network to deploy network. + + Args: + network (Cell): MindSpore network produced by `convert_quant_network`. + inputs (Tensor): Inputs of the `network`. + + Returns: + Cell, converted network. + """ + __quant_op_name__ = ["TensorAdd", "Sub", "Mul", "RealDiv"] + + def __init__(self, + network, + *inputs): + network = validator.check_isinstance('network', network, (nn.Cell,)) + self.data_type = mstype.int8 + self.network = copy.deepcopy(network) + self.all_paramters = {p.name: p for p in self.network.get_parameters()} + self.get_inputs_table(inputs) + + def get_inputs_table(self, inputs): + """Get the support info for quant export.""" + phase_name = 'export_quant' + graph_id, _ = _executor.compile(self.network, *inputs, phase=phase_name, do_convert=False) + self.quant_info_table = _executor.fetch_info_for_quant_export(graph_id) + + def run(self): + """Start to convert.""" + self.network.update_cell_prefix() + network = self.network + if isinstance(network, _AddFakeQuantInput): + network = network.network + network = self._convert_quant2deploy(network) + return network + + def _get_quant_block(self, cell_core, activation, fake_quant_a_out): + """convet network's quant subcell to deploy subcell""" + # Calculate the scale and zero point + w_minq_name = cell_core.fake_quant_weight.minq.name + np_type = mstype.dtype_to_nptype(self.data_type) + scale_w, zp_w = quant_utils.scale_zp_from_fack_quant_cell(cell_core.fake_quant_weight, np_type) + scale_a_out, _ = quant_utils.scale_zp_from_fack_quant_cell(fake_quant_a_out, np_type) + info = self.quant_info_table.get(w_minq_name, None) + if info: + fack_quant_a_in_op, minq_name = info + maxq 
= self.all_paramters[minq_name[:-4] + "maxq"] + minq = self.all_paramters[minq_name] + scale_a_in, zp_a_in = quant_utils.scale_zp_from_data(fack_quant_a_in_op, maxq, minq, np_type) + else: + logger.warning(f"Do not find `fake_quant` from input with `fack_quant.minq` {w_minq_name}") + return None + + # Build the `Quant` `Dequant` op. + # AscendQuant only support perlayer version. Need check here. + quant_op = inner.AscendQuant(float(scale_a_in), float(zp_a_in)) + sqrt_mode = False + scale_deq = scale_a_out * scale_w + if scale_deq < 2 ** -14: + scale_deq = np.sqrt(scale_deq) + sqrt_mode = True + dequant_op = inner.AscendDequant(sqrt_mode) + + # get op + op_core = cell_core.matmul if isinstance(cell_core, quant.DenseQuant) else cell_core.conv + if isinstance(activation, _AddFakeQuantAfterSubCell): + activation = activation.subcell + elif hasattr(activation, "get_origin"): + activation = activation.get_origin() + + # get the `weight` and `bias` + weight = cell_core.weight.data.asnumpy() + bias = None + if isinstance(cell_core, (quant.DenseQuant, quant.Conv2dQuant)): + if cell_core.has_bias: + bias = cell_core.bias.data.asnumpy() + elif isinstance(cell_core, quant.Conv2dBatchNormQuant): + weight, bias = quant_utils.fold_batchnorm(weight, cell_core) + + # apply the quant + weight = Tensor(quant_utils.weight2int(weight, scale_w, zp_w), self.data_type) + if bias is not None: + bias = Tensor(scale_a_in * scale_w * bias, mstype.int32) + scale_deq = Tensor(scale_deq, mstype.float16) + block = quant.QuantBlock(op_core, weight, quant_op, dequant_op, scale_deq, bias, activation) + return block + + def _convert_quant2deploy(self, network): + """Convet network's all quant subcell to deploy subcell.""" + cells = network.name_cells() + change = False + for name in cells: + subcell = cells[name] + if subcell == network: + continue + cell_core = None + fake_quant_act = None + activation = None + if isinstance(subcell, quant.Conv2dBnAct): + cell_core = subcell.conv + activation = 
subcell.activation + fake_quant_act = activation.fake_quant_act + elif isinstance(subcell, quant.DenseBnAct): + cell_core = subcell.dense + activation = subcell.activation + fake_quant_act = activation.fake_quant_act + if cell_core is not None: + new_subcell = self._get_quant_block(cell_core, activation, fake_quant_act) + if new_subcell: + prefix = subcell.param_prefix + new_subcell.update_parameters_name(prefix + '.') + network.insert_child_to_cell(name, new_subcell) + change = True + elif isinstance(subcell, _AddFakeQuantAfterSubCell): + op = subcell.subcell + if op.name in ConvertToQuantNetwork.__quant_op_name__ and isinstance(op, ops.Primitive): + network.__delattr__(name) + network.__setattr__(name, op) + change = True + else: + self._convert_quant2deploy(subcell) + if isinstance(network, nn.SequentialCell) and change: + network.cell_list = list(network.cells()) + return network + + +def export_geir(network, *inputs, file_name): + """ + Exports MindSpore quant predict model to deploy with GEIR. + + Args: + network (Cell): MindSpore network produced by `convert_quant_network`. + inputs (Tensor): Inputs of the `network`. + file_name (str): File name of model to export. + """ + exporter = ExportQuantNetworkDeploy(network, *inputs) + deploy_net = exporter.run() + serialization.export(deploy_net, *inputs, file_name=file_name, file_format="GEIR") def convert_quant_network(network, - quant_delay=0, bn_fold=False, freeze_bn=0, - weight_bits=8, - act_bits=8, - per_channel=False, - symmetric=False, - narrow_range=False + quant_delay=(0, 0), + num_bits=(8, 8), + per_channel=(False, False), + symmetric=(False, False), + narrow_range=(False, False) ): r""" - Create aware quantizaiton training network. + Create quantization aware training network. Args: network (Cell): Obtain a pipeline through network for saving graph summary. - quant_delay (int): Number of steps after which weights and activations are quantized during eval. Default: 0. 
+ quant_delay (int or tuple): Number of steps after which weights and activations are quantized during + eval. The first element represent weights and second element represent data flow. Default: (0, 0) bn_fold (bool): Flag to used bn fold ops for simulation inference operation. Default: False. - freeze_bn (bool): Number of steps after which BN parameters used total mean and variance. Default: 0. - weight_bits (int): Number of bits to use for quantizing weights. Default: 8. - act_bits (int): Number of bits to use for quantizing activations. Default: 8. - per_channel (bool): Quantization granularity based on layer or on channel. Default: False. - symmetric (bool): Quantization algorithm use symmetric or not. Default: False. - narrow_range (bool): Quantization algorithm use narrow range or not. Default: False. - - returns: - Cell, Network which has change to aware quantization training network. + freeze_bn (int): Number of steps after which BatchNorm OP parameters use total mean and variance. Default: 0. + num_bits (int or tuple): Number of bits to use for quantizing weights and activations. The first + element represent weights and second element represent data flow. Default: (8, 8) + per_channel (int or tuple): Quantization granularity based on layer or on channel. If `True` + then base on per channel otherwise base on per layer. The first element represent weights + and second element represent data flow. Default: (False, False) + symmetric (int or tuple): Quantization algorithm use symmetric or not. If `True` then base on + symmetric otherwise base on asymmetric. The first element represent weights and second + element represent data flow. Default: (False, False) + narrow_range (int or tuple): Quantization algorithm use narrow range or not. If `True` then base + on narrow range otherwise base on off narrow range. The first element represent weights and + second element represent data flow. 
Default: (False, False) + + Returns: + Cell, Network which has change to quantization aware training network cell. """ - net = ConvertToQuantNetwork( - network, quant_delay, bn_fold, freeze_bn, weight_bits, act_bits, per_channel, symmetric, narrow_range) + support_device = ["Ascend", "GPU"] + def convert2list(name, value): + if not isinstance(value, list) and not isinstance(value, tuple): + value = [value] + elif len(value) > 2: + raise ValueError("input `{}` len should less then 2".format(name)) + return value + + quant_delay = convert2list("quant delay", quant_delay) + num_bits = convert2list("num bits", num_bits) + per_channel = convert2list("per channel", per_channel) + symmetric = convert2list("symmetric", symmetric) + narrow_range = convert2list("narrow range", narrow_range) + + if context.get_context('device_target') not in support_device: + raise KeyError("Not support {} backend.".format(context.get_context('device_target'))) + + net = ConvertToQuantNetwork(network=network, + quant_delay=quant_delay, + bn_fold=bn_fold, + freeze_bn=freeze_bn, + num_bits=num_bits, + per_channel=per_channel, + symmetric=symmetric, + narrow_range=narrow_range) return net.run() diff --git a/mindspore/train/quant/quant_utils.py b/mindspore/train/quant/quant_utils.py new file mode 100644 index 0000000000..c9e6ac92e1 --- /dev/null +++ b/mindspore/train/quant/quant_utils.py @@ -0,0 +1,191 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""Quantization utils.""" + +import numpy as np + + +def cal_quantization_params(input_min, + input_max, + data_type, + num_bits=8, + symmetric=False, + narrow_range=False): + r""" + Calculate quantization params for scale and zero point. + + Args: + input_min (numpy.ndarray): The dimension of channel or 1. + input_max (numpy.ndarray): The dimension of channel or 1. + data_type (numpy type) : Can be numpy int8, numpy uint8. + num_bits (int): Quantization number bit, support 4 and 8bit. Default: 8. + symmetric (bool): Quantization algorithm use symmetric or not. Default: False. + narrow_range (bool): Quantization algorithm use narrow range or not. Default: False. + + Returns: + scale (numpy.ndarray): quantization param. + zero point (numpy.ndarray): quantization param. + """ + input_max = np.maximum(0.0, input_max) + input_min = np.minimum(0.0, input_min) + + if input_min.shape != input_max.shape: + raise ValueError("input min shape should equal to input max.") + if len(input_min.shape) > 1: + raise ValueError("input min and max shape should be one dim.") + if input_min > input_max: + raise ValueError("input_min min should less than input max.") + if (input_max == input_min).all(): + # scale = 1.0, zp = 0.0 + return np.ones(input_min.shape), np.zeros(input_min.shape) + + if data_type == np.int8: + quant_min = 0 - 2 ** (num_bits - 1) + quant_max = 2 ** (num_bits - 1) + else: + quant_min = 0 + quant_max = 2 ** num_bits - 1 + if narrow_range: + quant_min = quant_min + 1 + + # calculate scale + if symmetric: + input_max = np.maximum(-input_min, input_max) + input_min = -input_max + scale = (input_max - input_min) / (quant_max - quant_min) + + # calculate zero point + if symmetric: + zp = np.zeros(input_min.shape) + else: + zp_from_min = quant_min - input_min / scale + zp_from_max = quant_max - input_max / scale + zp_from_min_error = np.abs(quant_min) + np.abs(input_min / scale) +
zp_from_max_error = np.abs(quant_max) + np.abs(input_max / scale) + zp_double = zp_from_min if zp_from_min_error < zp_from_max_error else zp_from_max + if zp_double < quant_min: + zp = quant_min + elif zp_double > quant_max: + zp = quant_max + else: + zp = np.floor(zp_double + 0.5) + + return scale, zp + + +def weight2int(data, + scale, + zero_point): + r""" + Calculate int8/uint8 weight from fp32. The formula is defined as: + + .. math:: + int8/uint8 = round(float/scale) + offset + + Args: + data (numpy.ndarray): The dimension of channel or 1. Should be NCHW. + scale (numpy.ndarray): The dimension of channel or 1. + zero_point (numpy.ndarray): The dimension of channel or 1. + + Returns: + weight (numpy.ndarray): The dimension of channel or 1. + """ + if scale.shape != zero_point.shape: + raise ValueError("scale and zero_point should have the same shape.") + if scale.shape[0] > 0: + scale = scale.reshape(1, -1) + zero_point = zero_point.reshape(1, -1) + + return np.round((data/scale) + zero_point) + + +def scale_zp_from_fack_quant_cell(cell, data_type): + r""" + Get calculate quantization params for scale and zero point From `FakeQuantWithMinMax`. + + Args: + cell (Cell): `mindspore.nn.layer.FakeQuantWithMinMax` + data_type (numpy type): Can be `numpy.int8` or `numpy.uint8`. + + Returns: + scale (numpy.ndarray): quantization param. + zero point (numpy.ndarray): quantization param. + """ + minq = cell.minq.data.asnumpy() + maxq = cell.maxq.data.asnumpy() + op = cell.fake_quant + + scale, zp = cal_quantization_params( + minq, maxq, data_type, + num_bits=op.num_bits, + symmetric=op.symmetric, + narrow_range=op.narrow_range) + return scale, zp + + +def scale_zp_from_data(op, minq, maxq, data_type): + r""" + Get calculate quantization params for scale and zero point. + + Calculate from `FakeQuantWithMinMax`'s Parameter or Fake quant primitive.
+ + Args: + op (Primitive): Fake quant primitive `mindspore.ops.operation.FakeQuantPerLayer` or + `mindspore.ops.operation.FakeQuantPerChannel` + minq (Parameter): Parameter `minq` of `mindspore.nn.layer.FakeQuantWithMinMax` + maxq (Parameter): Parameter `maxq` of `mindspore.nn.layer.FakeQuantWithMinMax` + data_type (numpy type): Can be `numpy.int8` or `numpy.uint8`. + + Returns: + scale (numpy.ndarray): quantization param. + zero point (numpy.ndarray): quantization param. + """ + minq = minq.data.asnumpy() + maxq = maxq.data.asnumpy() + + scale, zp = cal_quantization_params( + minq, maxq, data_type, + num_bits=op.num_bits, + symmetric=op.symmetric, + narrow_range=op.narrow_range) + return scale, zp + + +def fold_batchnorm(weight, cell_quant): + r""" + Fold the batchnorm in `Conv2dBatchNormQuant` to weight. + + Calculate from `FakeQuantWithMinMax`'s Parameter or Fake quant primitive. + + Args: + weight (numpy.ndarray): Weight of `cell_quant`. + cell_quant (Cell): Object of `mindspore.nn.layer.Conv2dBatchNormQuant`. + + Returns: + weight (numpy.ndarray): Folded weight. + bias (numpy.ndarray): Folded bias.
+ """ + variance = cell_quant.moving_variance.data.asnumpy() + mean = cell_quant.moving_mean.data.asnumpy() + gamma = cell_quant.gamma.data.asnumpy() + beta = cell_quant.beta.data.asnumpy() + epsilon = cell_quant.eps + sigma = np.sqrt(variance + epsilon) + gamma = gamma.reshape(-1, 1, 1, 1) + sigma = sigma.reshape(-1, 1, 1, 1) + mean = mean.reshape(-1, 1, 1, 1) + weight = weight * gamma / sigma + bias = beta - gamma * mean / sigma + return weight, bias diff --git a/mindspore/train/serialization.py b/mindspore/train/serialization.py index 502f00572f..c39104c6ff 100644 --- a/mindspore/train/serialization.py +++ b/mindspore/train/serialization.py @@ -21,6 +21,7 @@ import mindspore.nn as nn import mindspore.context as context from mindspore import log as logger from mindspore.train.checkpoint_pb2 import Checkpoint +from mindspore.train.print_pb2 import Print from mindspore.common.tensor import Tensor from mindspore.common.initializer import initializer from mindspore.common.parameter import Parameter @@ -30,11 +31,15 @@ from mindspore._checkparam import check_input_data __all__ = ["save_checkpoint", "load_checkpoint", "load_param_into_net", "export"] -tensor_to_ms_type = {"Int8": mstype.int8, "Int16": mstype.int16, "Int32": mstype.int32, "Int64": mstype.int64, - "Float16": mstype.float16, "Float32": mstype.float32, "Float64": mstype.float64} +tensor_to_ms_type = {"Int8": mstype.int8, "Uint8": mstype.uint8, "Int16": mstype.int16, "Uint16": mstype.uint16, + "Int32": mstype.int32, "Uint32": mstype.uint32, "Int64": mstype.int64, "Uint64": mstype.uint64, + "Float16": mstype.float16, "Float32": mstype.float32, "Float64": mstype.float64, + "Bool": mstype.bool_} + +tensor_to_np_type = {"Int8": np.int8, "Uint8": np.uint8, "Int16": np.int16, "Uint16": np.uint16, + "Int32": np.int32, "Uint32": np.uint32, "Int64": np.int64, "Uint64": np.uint64, + "Float16": np.float16, "Float32": np.float32, "Float64": np.float64, "Bool": np.bool_} -tensor_to_np_type = {"Int8": np.int8, "Int16": 
np.int16, "Int32": np.int32, "Int64": np.int64, - "Float16": np.float16, "Float32": np.float32, "Float64": np.float64} def _special_process_par(par, new_par): """ @@ -42,17 +47,17 @@ def _special_process_par(par, new_par): Like (12,2048,1,1)->(12,2048), this case is caused by GE 4 dimensions tensor. """ - par_shape_len = len(par.data.shape()) - new_par_shape_len = len(new_par.data.shape()) + par_shape_len = len(par.data.shape) + new_par_shape_len = len(new_par.data.shape) delta_len = new_par_shape_len - par_shape_len delta_i = 0 for delta_i in range(delta_len): - if new_par.data.shape()[par_shape_len + delta_i] != 1: + if new_par.data.shape[par_shape_len + delta_i] != 1: break if delta_i == delta_len - 1: new_val = new_par.data.asnumpy() - new_val = new_val.reshape(par.data.shape()) - par.set_parameter_data(Tensor(new_val, par.data.dtype())) + new_val = new_val.reshape(par.data.shape) + par.set_parameter_data(Tensor(new_val, par.data.dtype)) return True return False @@ -61,17 +66,17 @@ def _update_param(param, new_param): """Updates param's data from new_param's data.""" if isinstance(param.data, Tensor) and isinstance(new_param.data, Tensor): - if param.data.dtype() != new_param.data.dtype(): + if param.data.dtype != new_param.data.dtype: logger.error("Failed to combine the net and the parameters for param %s.", param.name) msg = ("Net parameters {} type({}) different from parameter_dict's({})" - .format(param.name, param.data.dtype(), new_param.data.dtype())) + .format(param.name, param.data.dtype, new_param.data.dtype)) raise RuntimeError(msg) - if param.data.shape() != new_param.data.shape(): + if param.data.shape != new_param.data.shape: if not _special_process_par(param, new_param): logger.error("Failed to combine the net and the parameters for param %s.", param.name) msg = ("Net parameters {} shape({}) different from parameter_dict's({})" - .format(param.name, param.data.shape(), new_param.data.shape())) + .format(param.name, param.data.shape, 
new_param.data.shape)) raise RuntimeError(msg) return @@ -79,12 +84,12 @@ def _update_param(param, new_param): return if isinstance(param.data, Tensor) and not isinstance(new_param.data, Tensor): - if param.data.shape() != (1,) and param.data.shape() != (): + if param.data.shape != (1,) and param.data.shape != (): logger.error("Failed to combine the net and the parameters for param %s.", param.name) msg = ("Net parameters {} shape({}) is not (1,), inconsitent with parameter_dict's(scalar)." - .format(param.name, param.data.shape())) + .format(param.name, param.data.shape)) raise RuntimeError(msg) - param.set_parameter_data(initializer(new_param.data, param.data.shape(), param.data.dtype())) + param.set_parameter_data(initializer(new_param.data, param.data.shape, param.data.dtype)) elif isinstance(new_param.data, Tensor) and not isinstance(param.data, Tensor): logger.error("Failed to combine the net and the parameters for param %s.", param.name) @@ -120,12 +125,12 @@ def save_checkpoint(parameter_list, ckpoint_file_name): param["data"].init_data() param_data = param["data"].asnumpy().reshape(-1) param_tensor.tensor_content = param_data.tostring() - param_tensor.tensor_type = str(param["data"].dtype()) + param_tensor.tensor_type = str(param["data"].dtype) - if param['data'].shape() == (): + if param['data'].shape == (): param_tensor.dims.append(0) else: - for dim in param['data'].shape(): + for dim in param['data'].shape: param_tensor.dims.append(dim) with open(ckpoint_file_name, "wb") as f: @@ -398,17 +403,18 @@ def export(net, *inputs, file_name, file_format='GEIR'): net (Cell): MindSpore network. inputs (Tensor): Inputs of the `net`. file_name (str): File name of model to export. - file_format (str): MindSpore currently supports 'GEIR', 'ONNX' and 'LITE' format for exported model. + file_format (str): MindSpore currently supports 'GEIR', 'ONNX' 'LITE' and 'BINARY' format for exported model. - GEIR: Graph Engine Intermidiate Representation. 
An intermidiate representation format of Ascend model. - ONNX: Open Neural Network eXchange. An open format built to represent machine learning models. - LITE: Huawei model format for mobile. A lite model only for the MindSpore Lite + - BINARY: Binary format for model. An intermidiate representation format for models. """ logger.info("exporting model file:%s format:%s.", file_name, file_format) check_input_data(*inputs, data_class=Tensor) - supported_formats = ['GEIR', 'ONNX', 'LITE'] + supported_formats = ['GEIR', 'ONNX', 'LITE', 'BINARY'] if file_format not in supported_formats: raise ValueError(f'Illegal file format {file_format}, it must be one of {supported_formats}') # switch network mode to infer when it is training @@ -428,9 +434,77 @@ def export(net, *inputs, file_name, file_format='GEIR'): with open(file_name, 'wb') as f: os.chmod(file_name, stat.S_IWUSR | stat.S_IRUSR) f.write(onnx_stream) + elif file_format == 'BINARY': # file_format is 'BINARY' + phase_name = 'export_binary' + graph_id, _ = _executor.compile(net, *inputs, phase=phase_name, do_convert=False) + onnx_stream = _executor._get_func_graph_proto(graph_id, 'binary_ir') + with open(file_name, 'wb') as f: + os.chmod(file_name, stat.S_IWUSR | stat.S_IRUSR) + f.write(onnx_stream) elif file_format == 'LITE': # file_format is 'LITE' context.set_context(save_ms_model=True, save_ms_model_path=file_name) net(*inputs) # restore network training mode if is_training: net.set_train(mode=True) + + +def parse_print(print_file_name): + """ + Loads Print data from a specified file. + + Args: + print_file_name (str): The file name of save print data. + + Returns: + List, element of list is Tensor. + + Raises: + ValueError: Print file is incorrect. 
+ """ + if not os.path.realpath(print_file_name): + raise ValueError("Please input the correct print file name.") + + if os.path.getsize(print_file_name) == 0: + raise ValueError("The print file may be empty, please make sure enter the correct file name.") + + logger.info("Execute load print process.") + print_list = Print() + + try: + with open(print_file_name, "rb") as f: + pb_content = f.read() + print_list.ParseFromString(pb_content) + except BaseException as e: + logger.error("Failed to read the print file %s, please check the correct of the file.", print_file_name) + raise ValueError(e.__str__()) + + tensor_list = [] + + try: + for print_ in print_list.value: + # String type + if print_.HasField("desc"): + tensor_list.append(print_.desc) + elif print_.HasField("tensor"): + dims = print_.tensor.dims + data_type = print_.tensor.tensor_type + data = print_.tensor.tensor_content + np_type = tensor_to_np_type[data_type] + param_data = np.fromstring(data, np_type) + ms_type = tensor_to_ms_type[data_type] + param_dim = [] + for dim in dims: + param_dim.append(dim) + if param_dim: + param_value = param_data.reshape(param_dim) + tensor_list.append(Tensor(param_value, ms_type)) + # Scale type + else: + tensor_list.append(Tensor(param_data, ms_type)) + + except BaseException as e: + logger.error("Failed to load the print file %s.", print_list) + raise RuntimeError(e.__str__()) + + return tensor_list diff --git a/mindspore/train/summary/_event_writer.py b/mindspore/train/summary/_event_writer.py deleted file mode 100644 index ae347135f6..0000000000 --- a/mindspore/train/summary/_event_writer.py +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright 2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""Writes events to disk in a logdir.""" -import os -import stat -from collections import deque -from multiprocessing import Pool, Process, Queue, cpu_count - -from ..._c_expression import EventWriter_ -from ._summary_adapter import package_summary_event - - -def _pack(result, step): - summary_event = package_summary_event(result, step) - return summary_event.SerializeToString() - - -class EventWriter(Process): - """ - Creates a `EventWriter` and write event to file. - - Args: - filepath (str): Summary event file path and file name. - flush_interval (int): The flush seconds to flush the pending events to disk. Default: 120. - """ - - def __init__(self, filepath: str, flush_interval: int) -> None: - super().__init__() - _ = flush_interval - with open(filepath, 'w'): - os.chmod(filepath, stat.S_IWUSR | stat.S_IRUSR) - self._writer = EventWriter_(filepath) - self._queue = Queue(cpu_count() * 2) - self.start() - - def run(self): - - with Pool() as pool: - deq = deque() - while True: - while deq and deq[0].ready(): - self._writer.Write(deq.popleft().get()) - - if not self._queue.empty(): - action, data = self._queue.get() - if action == 'WRITE': - if not isinstance(data, (str, bytes)): - deq.append(pool.apply_async(_pack, data)) - else: - self._writer.Write(data) - elif action == 'FLUSH': - self._writer.Flush() - elif action == 'END': - break - for res in deq: - self._writer.Write(res.get()) - - self._writer.Shut() - - def write(self, data) -> None: - """ - Write the event to file. 
- - Args: - data (Optional[str, Tuple[list, int]]): The data to write. - """ - self._queue.put(('WRITE', data)) - - def flush(self): - """Flush the writer.""" - self._queue.put(('FLUSH', None)) - - def close(self) -> None: - """Close the writer.""" - self._queue.put(('END', None)) - self.join() diff --git a/mindspore/train/summary/_lineage_adapter.py b/mindspore/train/summary/_lineage_adapter.py new file mode 100644 index 0000000000..d85d16b49d --- /dev/null +++ b/mindspore/train/summary/_lineage_adapter.py @@ -0,0 +1,39 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""Generate the lineage event which conform to proto format.""" +import time + +from ..lineage_pb2 import LineageEvent + + +def serialize_to_lineage_event(name, value): + """Serialize value to lineage event.""" + event = LineageEvent() + event.wall_time = time.time() + content = _get_lineage_content(name, event) + content.ParseFromString(value) + return event.SerializeToString() + + +def _get_lineage_content(name, event): + if name == 'dataset_graph': + return event.dataset_graph + if name == 'eval_lineage': + return event.evaluation_lineage + if name == 'train_lineage': + return event.train_lineage + if name == 'custom_lineage_data': + return event.user_defined_info + raise KeyError(f'No such field in LineageEvent') diff --git a/mindspore/train/summary/_summary_adapter.py b/mindspore/train/summary/_summary_adapter.py index 47ed0a7b90..1ae5bdd2d5 100644 --- a/mindspore/train/summary/_summary_adapter.py +++ b/mindspore/train/summary/_summary_adapter.py @@ -13,7 +13,7 @@ # limitations under the License. # ============================================================================ """Generate the summary event which conform to proto format.""" -import socket +import platform import time import numpy as np @@ -30,7 +30,7 @@ MS_IMAGE_TENSOR_FORMAT = 'NCHW' # Set the Event mark EVENT_FILE_NAME_MARK = ".out.events.summary." 
# Set the init event of version and mark -EVENT_FILE_INIT_VERSION_MARK = "Mindspore.Event:" +EVENT_FILE_INIT_VERSION_MARK = "MindSpore.Event:" EVENT_FILE_INIT_VERSION = 1 F32_MIN, F32_MAX = np.finfo(np.float32).min, np.finfo(np.float32).max @@ -51,7 +51,7 @@ def get_event_file_name(prefix, suffix): _check_str_by_regular(suffix) file_name = "" time_second = str(int(time.time())) - hostname = socket.gethostname() + hostname = platform.node() if prefix is not None: file_name = file_name + prefix @@ -113,7 +113,7 @@ def package_summary_event(data_list, step): data = value["data"] tag = value["name"] - logger.debug("Now process %r summary, tag = %r", summary_type, tag) + logger.debug(f"Now process {summary_type} summary, tag = {tag}") summary_value = summary.value.add() summary_value.tag = tag @@ -130,7 +130,7 @@ def package_summary_event(data_list, step): _fill_histogram_summary(tag, data, summary_value.histogram) else: # The data is invalid ,jump the data - logger.error("Summary type(%r) is error, tag = %r", summary_type, tag) + logger.error(f"Summary type({summary_type}) is error, tag = {tag}") del summary.value[-1] return summary_event @@ -186,17 +186,17 @@ def _fill_scalar_summary(tag: str, np_value, summary): Returns: Summary, return scalar summary content. 
""" - logger.debug("Set(%r) the scalar summary value", tag) + logger.debug(f"Set({tag}) the scalar summary value") if np_value.size == 1: # is scalar summary.scalar_value = np_value.item() return True if np_value.size > 1: - logger.warning("The tensor is not a single scalar, tag = %r, ndim = %r, shape = %r", tag, np_value.ndim, - np_value.shape) + logger.warning( + f"The tensor is not a single scalar, tag = {tag}, ndim = {np_value.ndim}, shape = {np_value.shape}") summary.scalar_value = next(np_value.flat).item() return True - logger.error("There no values inside tensor, tag = %r, size = %r", tag, np_value.size) + logger.error(f"There no values inside tensor, tag = {tag}, size = {np_value.size}") return False @@ -212,7 +212,7 @@ def _fill_tensor_summary(tag: str, np_value, summary_tensor): Retruns: Summary, return tensor summary content. """ - logger.debug("Set(%r) the tensor summary value", tag) + logger.debug(f"Set({tag}) the tensor summary value") # get tensor dtype tensor_dtype = _nptype_to_prototype(np_value) summary_tensor.data_type = DataType.Value(tensor_dtype) @@ -266,7 +266,7 @@ def _fill_histogram_summary(tag: str, np_value: np.ndarray, summary) -> None: np_value (np.ndarray): Summary data. summary (summary_pb2.Summary.Histogram): Summary histogram data. """ - logger.debug("Set(%r) the histogram summary value", tag) + logger.debug(f"Set({tag}) the histogram summary value") # Default bucket for tensor with no valid data. 
ma_value = np.ma.masked_invalid(np_value) total, valid = np_value.size, ma_value.count() @@ -281,7 +281,7 @@ def _fill_histogram_summary(tag: str, np_value: np.ndarray, summary) -> None: summary.count = total summary.nan_count, summary.pos_inf_count, summary.neg_inf_count = invalids if not valid: - logger.warning('There are no valid values in the ndarray(size=%d, shape=%d)', total, np_value.shape) + logger.warning(f'There are no valid values in the ndarray(size={total}, shape={np_value.shape})') # summary.{min, max, sum} are 0s by default, no need to explicitly set else: # BUG: max of a masked array with dtype np.float16 returns inf @@ -290,9 +290,8 @@ def _fill_histogram_summary(tag: str, np_value: np.ndarray, summary) -> None: summary.min = ma_value.min(fill_value=np.PINF) summary.max = ma_value.max(fill_value=np.NINF) if summary.min < F32_MIN or summary.max > F32_MAX: - logger.warning( - 'Values(%r, %r) are too large, ' - 'you may encounter some undefined behaviours hereafter.', summary.min, summary.max) + logger.warning(f'Values({summary.min}, {summary.max}) are too large, ' + f'you may encounter some undefined behaviours hereafter.') else: summary.min = ma_value.min() summary.max = ma_value.max() @@ -327,14 +326,14 @@ def _fill_image_summary(tag: str, np_value, summary_image, input_format='NCHW'): Returns: Summary, return image summary content. 
""" - logger.debug("Set(%r) the image summary value", tag) + logger.debug(f"Set({tag}) the image summary value") if np_value.ndim != 4 or np_value.shape[1] not in (1, 3): - logger.error("The value is not Image, tag = %r, ndim = %r, shape=%r", tag, np_value.ndim, np_value.shape) + logger.error(f"The value is not Image, tag = {tag}, ndim = {np_value.ndim}, shape={np_value.shape}") return False if np_value.ndim != len(input_format): - logger.error("The tensor with dim(%r) can't convert the format(%r) because dim not same", np_value.ndim, - input_format) + logger.error( + f"The tensor with dim({np_value.ndim}) can't convert the format({input_format}) because dim not same") return False # convert the tensor format diff --git a/mindspore/train/summary/_summary_writer.py b/mindspore/train/summary/_summary_writer.py new file mode 100644 index 0000000000..36d020819a --- /dev/null +++ b/mindspore/train/summary/_summary_writer.py @@ -0,0 +1,79 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""Writes events to disk in a logdir.""" +import os +import stat + +from ..._c_expression import EventWriter_ +from ._summary_adapter import package_init_event + + +class BaseWriter: + """BaseWriter to be subclass.""" + + def __init__(self, filepath) -> None: + self._filepath = filepath + self._writer: EventWriter_ = None + + def init_writer(self): + """Write some metadata etc.""" + + @property + def writer(self) -> EventWriter_: + """Get the writer.""" + if self._writer is not None: + return self._writer + + with open(self._filepath, 'w'): + os.chmod(self._filepath, stat.S_IWUSR | stat.S_IRUSR) + self._writer = EventWriter_(self._filepath) + self.init_writer() + return self._writer + + def write(self, plugin, mode, data): + """Write data to file.""" + raise NotImplementedError() + + def flush(self): + """Flush the writer.""" + if self._writer is not None: + self._writer.Flush() + + def close(self): + """Close the writer.""" + if self._writer is not None: + self._writer.Shut() + + +class SummaryWriter(BaseWriter): + """SummaryWriter for write summaries.""" + + def init_writer(self): + """Write some metadata etc.""" + self.writer.Write(package_init_event().SerializeToString()) + + def write(self, plugin, mode, data): + """Write data to file.""" + if plugin in ('summary', 'graph'): + self.writer.Write(data) + + +class LineageWriter(BaseWriter): + """LineageWriter for write lineage.""" + + def write(self, plugin, mode, data): + """Write data to file.""" + if plugin in ('dataset_graph', 'train_lineage', 'eval_lineage', 'custom_lineage_data'): + self.writer.Write(data) diff --git a/mindspore/train/summary/_writer_pool.py b/mindspore/train/summary/_writer_pool.py new file mode 100644 index 0000000000..2d219743de --- /dev/null +++ b/mindspore/train/summary/_writer_pool.py @@ -0,0 +1,114 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the 
"License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Write events to disk in a base directory.""" +import os +from collections import deque +from multiprocessing import Pool, Process, Queue, cpu_count + +from ._lineage_adapter import serialize_to_lineage_event +from ._summary_adapter import package_graph_event, package_summary_event +from ._summary_writer import SummaryWriter, LineageWriter + + +def _pack_data(datadict): + """Pack data according to which plugin.""" + result = [] + summaries, step, mode = [], None, None + for plugin, datalist in datadict.items(): + for data in datalist: + if plugin == 'graph': + result.append([plugin, data.get('mode'), package_graph_event(data.get('value')).SerializeToString()]) + elif plugin in ('train_lineage', 'eval_lineage', 'custom_lineage_data', 'dataset_graph'): + result.append([plugin, data.get('mode'), serialize_to_lineage_event(plugin, data.get('value'))]) + elif plugin in ('scalar', 'tensor', 'histogram', 'image'): + summaries.append({'_type': plugin.title(), 'name': data.get('tag'), 'data': data.get('value')}) + step = data.get('step') + mode = data.get('mode') + if summaries: + result.append(['summary', mode, package_summary_event(summaries, step).SerializeToString()]) + return result + + +class WriterPool(Process): + """ + Use a set of pooled resident processes for writing a list of file. + + Args: + base_dir (str): The base directory to hold all the files. 
+ filelist (str): The mapping from short name to long filename. + """ + + def __init__(self, base_dir, **filedict) -> None: + super().__init__() + self._base_dir, self._filedict = base_dir, filedict + self._queue = Queue(cpu_count() * 2) + self.start() + + def run(self): + writers = self._get_writers() + + with Pool() as pool: + deq = deque() + while True: + while deq and deq[0].ready(): + for plugin, mode, data in deq.popleft().get(): + for writer in writers: + writer.write(plugin, mode, data) + + if not self._queue.empty(): + action, data = self._queue.get() + if action == 'WRITE': + deq.append(pool.apply_async(_pack_data, (data,))) + elif action == 'FLUSH': + for writer in writers: + writer.flush() + elif action == 'END': + break + for result in deq: + for plugin, mode, data in result.get(): + for writer in writers: + writer.write(plugin, mode, data) + + for writer in writers: + writer.close() + + def _get_writers(self): + writers = [] + for plugin, filename in self._filedict.items(): + filepath = os.path.join(self._base_dir, filename) + if plugin == 'summary': + writers.append(SummaryWriter(filepath)) + elif plugin == 'lineage': + writers.append(LineageWriter(filepath)) + return writers + + def write(self, data) -> None: + """ + Write the event to file. + + Args: + name (str): The key of a specified file. + data (Optional[str, Tuple[list, int]]): The data to write. 
+ """ + self._queue.put(('WRITE', data)) + + def flush(self): + """Flush the writer and sync data to disk.""" + self._queue.put(('FLUSH', None)) + + def close(self) -> None: + """Close the writer.""" + self._queue.put(('END', None)) + self.join() diff --git a/mindspore/train/summary/enum.py b/mindspore/train/summary/enum.py new file mode 100644 index 0000000000..84044eab6c --- /dev/null +++ b/mindspore/train/summary/enum.py @@ -0,0 +1,43 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""Summary's enumeration file.""" +from enum import Enum + + +class BaseEnum(Enum): + """The base enum class.""" + + @classmethod + def to_list(cls): + """Converts the enumeration into a list.""" + return [member.value for member in cls.__members__.values()] + + +class PluginEnum(BaseEnum): + """The list of plugins currently supported by the summary.""" + GRAPH = 'graph' + SCALAR = 'scalar' + IMAGE = 'image' + TENSOR = 'tensor' + HISTOGRAM = 'histogram' + TRAIN_LINEAGE = 'train_lineage' + EVAL_LINEAGE = 'eval_lineage' + DATASET_GRAPH = 'dataset_graph' + + +class ModeEnum(BaseEnum): + """The modes currently supported by the summary.""" + TRAIN = 'train' + EVAL = 'eval' diff --git a/mindspore/train/summary/summary_record.py b/mindspore/train/summary/summary_record.py index b2bc872a1f..61c2c8adeb 100644 --- a/mindspore/train/summary/summary_record.py +++ b/mindspore/train/summary/summary_record.py @@ -21,9 +21,9 @@ from mindspore import log as logger from ..._c_expression import Tensor from ..._checkparam import _check_str_by_regular -from .._utils import _make_directory -from ._event_writer import EventWriter -from ._summary_adapter import get_event_file_name, package_graph_event, package_init_event +from .._utils import _make_directory, _check_to_numpy, _check_lineage_value +from ._summary_adapter import get_event_file_name, package_graph_event +from ._writer_pool import WriterPool # for the moment, this lock is for caution's sake, # there are actually no any concurrencies happening. @@ -53,16 +53,20 @@ def _get_summary_tensor_data(): return data +def _dictlist(): + from collections import defaultdict + return defaultdict(list) + + class SummaryRecord: """ - SummaryRecord is used to record the summary value. + SummaryRecord is used to record the summary data and lineage data. Note: - The API will create an event file in a given directory and add summaries and events to it. 
- It writes the event log to a file by executing the record method. In addition, - if the SummaryRecord object is created and the summary operator is used in the network, - even if the record method is not called, the event in the cache will be written to the - file at the end of execution. Make sure to close the SummaryRecord object at the end. + The API will create a summary file and a lineage file lazily in a given directory and writes data to them. + It writes the data to files by executing the record method. In addition to record the data bubbled up from + the network by defining the summary operators, SummaryRecord also supports to record extra data which + can be added by calling add_value. Finally, make sure to close the SummaryRecord object at the end. Args: log_dir (str): The log_dir is a directory location to save the summary. @@ -89,8 +93,12 @@ class SummaryRecord: file_suffix="_MS", network=None): + self._closed, self._mode = False, 'train' + self._data_pool = _dictlist() + _check_str_by_regular(file_prefix) _check_str_by_regular(file_suffix) + self.log_path = _make_directory(log_dir) if not isinstance(queue_max_size, int) or not isinstance(flush_time, int): @@ -113,7 +121,6 @@ class SummaryRecord: self.suffix = file_suffix self.network = network self.has_graph = False - self._closed = False # create the summary writer file self.event_file_name = get_event_file_name(self.prefix, self.suffix) @@ -122,18 +129,12 @@ class SummaryRecord: except Exception as ex: raise RuntimeError(ex) - self._event_writer = None - - def _init_event_writer(self): - """Init event writer and write metadata.""" - event_writer = EventWriter(self.full_file_name, self.flush_time) - event_writer.write(package_init_event().SerializeToString()) - return event_writer + self._event_writer = WriterPool(log_dir, + summary=self.full_file_name, + lineage=get_event_file_name('events', '_lineage')) def __enter__(self): """Enter the context manager.""" - if not self._event_writer: - 
self._event_writer = self._init_event_writer() if self._closed: raise ValueError('SummaryRecord has been closed.') return self @@ -142,6 +143,76 @@ class SummaryRecord: """Exit the context manager.""" self.close() + def set_mode(self, mode): + """ + Set the mode for the recorder to be aware. The mode is set 'train' by default. + + Args: + mode (str): The mode to set, which should be 'train' or 'eval'. + + Raises: + ValueError: When the mode is not recognized. + + Examples: + >>> with SummaryRecord(log_dir="/opt/log", file_prefix="xxx_", file_suffix="_yyy") as summary_record: + >>> summary_record.set_mode('eval') + """ + mode_spec = 'train', 'eval' + if mode not in mode_spec: + raise ValueError(f'{repr(mode)} is not a recognized mode.') + self._mode = mode + + def add_value(self, plugin, name, value): + """ + Add value to be record later on. + + When the plugin is 'tensor', 'scalar', 'image' or 'histogram', + the name should be the tag name, and the value should be a Tensor. + + When the plugin plugin is 'graph', the value should be a GraphProto. + + When the plugin 'dataset_graph', 'train_lineage', 'eval_lineage', + or 'custom_lineage_data', the value should be a proto message. + + + Args: + plugin (str): The plugin for the value. + name (str): The name for the value. + value (Union[Tensor, GraphProto, TrainLineage, EvaluationLineage, DatasetGraph, UserDefinedInfo]): \ + The value to store. + + - GraphProto: The 'value' should be a serialized string this type when the plugin is 'graph'. + - Tensor: The 'value' should be this type when the plugin is 'scalar', 'image', 'tensor' or 'histogram'. + - TrainLineage: The 'value' should be this type when the plugin is 'train_lineage'. + - EvaluationLineage: The 'value' should be this type when the plugin is 'eval_lineage'. + - DatasetGraph: The 'value' should be this type when the plugin is 'dataset_graph'. + - UserDefinedInfo: The 'value' should be this type when the plugin is 'custom_lineage_data'. 
+ + Raises: + ValueError: When the name is not valid. + TypeError: When the value is not a Tensor. + + Examples: + >>> with SummaryRecord(log_dir="/opt/log", file_prefix="xxx_", file_suffix="_yyy") as summary_record: + >>> summary_record.add_value('scalar', 'loss', Tensor(0.1)) + """ + if plugin in ('tensor', 'scalar', 'image', 'histogram'): + if not name or not isinstance(name, str): + raise ValueError(f'{repr(name)} is not a valid tag name.') + if not isinstance(value, Tensor): + raise TypeError(f'Expect the value to be Tensor, but got {type(value).__name__}') + np_value = _check_to_numpy(plugin, value) + self._data_pool[plugin].append(dict(tag=name, mode=self._mode, value=np_value)) + + elif plugin in ('train_lineage', 'eval_lineage', 'dataset_graph', 'custom_lineage_data'): + _check_lineage_value(plugin, value) + self._data_pool[plugin].append(dict(mode=self._mode, value=value.SerializeToString())) + elif plugin == 'graph': + package_graph_event(value) + self._data_pool[plugin].append(dict(mode=self._mode, value=value)) + else: + raise ValueError(f'No such plugin of {repr(plugin)}') + def record(self, step, train_network=None): """ Record the summary. @@ -150,12 +221,12 @@ class SummaryRecord: step (int): Represents training step number. train_network (Cell): The network that called the callback. + Returns: + bool, whether the record process is successful or not. + Examples: >>> with SummaryRecord(log_dir="/opt/log", file_prefix="xxx_", file_suffix="_yyy") as summary_record: >>> summary_record.record(step=2) - - Returns: - bool, whether the record process is successful or not. 
""" logger.info("SummaryRecord step is %r.", step) if self._closed: @@ -164,10 +235,6 @@ class SummaryRecord: if not isinstance(step, int) or isinstance(step, bool): raise ValueError("`step` should be int") # Set the current summary of train step - if not self._event_writer: - self._event_writer = self._init_event_writer() - logger.warning('SummaryRecord should be used as context manager for a with statement.') - if self.network is not None and not self.has_graph: graph_proto = self.network.get_func_graph_proto() if graph_proto is None and train_network is not None: @@ -175,39 +242,48 @@ class SummaryRecord: if graph_proto is None: logger.error("Failed to get proto for graph") else: - self._event_writer.write(package_graph_event(graph_proto).SerializeToString()) + self._event_writer.write({'graph': [{'step': step, 'value': graph_proto}]}) self.has_graph = True if not _summary_tensor_cache: return True - data = _get_summary_tensor_data() - if not data: - logger.error("The step(%r) does not have record data.", step) - return False - if self.queue_max_size > 0 and len(data) > self.queue_max_size: - logger.error("The size of data record is %r, which is greater than queue_max_size %r.", len(data), - self.queue_max_size) - - # process the data - result = self._data_convert(data) - if not result: - logger.error("The step(%r) summary data is invalid.", step) - return False - self._event_writer.write((result, step)) - logger.debug("Send the summary data to scheduler for saving, step = %d", step) + if self._mode == 'train': + self._add_summary_tensor_data() + + self._event_writer.write(self._consume_data_pool(step)) return True + def _add_summary_tensor_data(self): + summary_data = _get_summary_tensor_data() + if not summary_data: + logger.debug(f'No summary data bubbled from the network.') + for name, tensor in summary_data.items(): + tag, plugin = SummaryRecord._parse_from(name) + if (tag, plugin) == (None, None): + logger.warning("The name(%r) is invalid, expected 
'TAG[:TYPE]'.", name) + else: + self.add_value(plugin.lower(), tag, tensor) + + def _consume_data_pool(self, step): + try: + for values in self._data_pool.values(): + for value in values: + value['step'] = step + return self._data_pool + finally: + self._data_pool = _dictlist() + @property def log_dir(self): """ Get the full path of the log file. + Returns: + str, the full path of log file. + Examples: >>> with SummaryRecord(log_dir="/opt/log", file_prefix="xxx_", file_suffix="_yyy") as summary_record: >>> print(summary_record.log_dir) - - Returns: - String, the full path of log file. """ return self.full_file_name @@ -236,46 +312,19 @@ class SummaryRecord: """ if not self._closed and self._event_writer: # event writer flush and close + logger.info('Please wait it may take quite some time to finish writing and closing.') self._event_writer.close() self._closed = True def __del__(self) -> None: self.close() - def _data_convert(self, summary): - """Convert the data.""" - # convert the summary to numpy - result = [] - for name, data in summary.items(): - # confirm the data is valid - summary_tag, summary_type = SummaryRecord._parse_from(name) - if summary_tag is None: - logger.error("The data type is invalid, name = %r, tensor = %r", name, data) - return None - if isinstance(data, Tensor): - result.append({'name': summary_tag, 'data': data.asnumpy(), '_type': summary_type}) - else: - logger.error("The data type is invalid, name = %r, tensor = %r", name, data) - return None - - return result - @staticmethod def _parse_from(name: str = None): - """ - Parse the tag and type from name. - - Args: - name (str): Format: TAG[:TYPE]. - - Returns: - Tuple, (summary_tag, summary_type). 
- """ - if name is None: - logger.error("The name is None") + """Parse the tag and type from name.""" + if not isinstance(name, str): return None, None match = re.match(r'(.+)\[:(.+)\]', name) if match: return match.groups() - logger.error("The name(%r) format is invalid, expected 'TAG[:TYPE]'.", name) return None, None diff --git a/model_zoo/README.md b/model_zoo/README.md new file mode 100644 index 0000000000..24be683b22 --- /dev/null +++ b/model_zoo/README.md @@ -0,0 +1,306 @@ +![](https://www.mindspore.cn/static/img/logo.a3e472c9.png) + + +# Welcome to the Model Zoo for MindSpore + +In order to facilitate developers to enjoy the benefits of MindSpore framework and Huawei chips, we will continue to add typical networks and models . If you have needs for the model zoo, you can file an issue on [gitee](https://gitee.com/mindspore/mindspore/issues) or [MindSpore](https://bbs.huaweicloud.com/forum/forum-1076-1.html), We will consider it in time. + +- SOTA models using the latest MindSpore APIs + +- The best benefits from MindSpore and Huawei chips + +- Officially maintained and supported + + + +# Table of Contents + +- [Models and Implementations](#models-and-implementations) + - [Computer Vision](#computer-vision) + - [Image Classification](#image-classification) + - [GoogleNet](#googlenet) + - [ResNet50[benchmark]](#resnet50) + - [ResNet101](#resnet101) + - [VGG16](#vgg16) + - [AlexNet](#alexnet) + - [LeNet](#lenet) + - [Object Detection and Segmentation](#object-detection-and-segmentation) + - [YoloV3](#yolov3) + - [MobileNetV2](#mobilenetv2) + - [MobileNetV3](#mobilenetv3) + - [SSD](#ssd) + - [Natural Language Processing](#natural-language-processing) + - [BERT](#bert) + - [MASS](#mass) + + +# Announcements +| Date | News | +| ------------ | ------------------------------------------------------------ | +| May 31, 2020 | Support [MindSpore v0.3.0-alpha](https://www.mindspore.cn/news/newschildren?id=215) | + + +# Models and Implementations + +## Computer Vision + 
+### Image Classification + +#### [GoogleNet](#table-of-contents) +| Parameters | GoogleNet | +| -------------------------- | ------------------------------------------------------------ | +| Published Year | 2014 | +| Paper | [Going Deeper with Convolutions](https://arxiv.org/abs/1409.4842) | +| Resource | Ascend 910 | +| Features | • Mixed Precision • Multi-GPU training support with Ascend | +| MindSpore Version | 0.3.0-alpha | +| Dataset | CIFAR-10 | +| Training Parameters | epoch=125, batch_size = 128, lr=0.1 | +| Optimizer | Momentum | +| Loss Function | Softmax Cross Entropy | +| Accuracy | 1pc: 93.4%; 8pcs: 92.17% | +| Speed | 79 ms/Step | +| Loss | 0.0016 | +| Params (M) | 6.8 | +| Checkpoint for Fine tuning | 43.07M (.ckpt file) | +| Model for inference | 21.50M (.onnx file), 21.60M(.geir file) | +| Scripts | https://gitee.com/mindspore/mindspore/tree/master/model_zoo/googlenet | + +#### [ResNet50](#table-of-contents) + +| Parameters | ResNet50 | +| -------------------------- | -------- | +| Published Year | | +| Paper | | +| Resource | | +| Features | | +| MindSpore Version | | +| Dataset | | +| Training Parameters | | +| Optimizer | | +| Loss Function | | +| Accuracy | | +| Speed | | +| Loss | | +| Params (M) | | +| Checkpoint for Fine tuning | | +| Model for inference | | +| Scripts | | + +#### [ResNet101](#table-of-contents) + +| Parameters | ResNet101 | +| -------------------------- | --------- | +| Published Year | | +| Paper | | +| Resource | | +| Features | | +| MindSpore Version | | +| Dataset | | +| Training Parameters | | +| Optimizer | | +| Loss Function | | +| Accuracy | | +| Speed | | +| Loss | | +| Params (M) | | +| Checkpoint for Fine tuning | | +| Model for inference | | +| Scripts | | + +#### [VGG16](#table-of-contents) + +| Parameters | VGG16 | +| -------------------------- | ----- | +| Published Year | | +| Paper | | +| Resource | | +| Features | | +| MindSpore Version | | +| Dataset | | +| Training Parameters | | +| Optimizer | | +| 
Loss Function | | +| Accuracy | | +| Speed | | +| Loss | | +| Params (M) | | +| Checkpoint for Fine tuning | | +| Model for inference | | +| Scripts | | + +#### [AlexNet](#table-of-contents) + +| Parameters | AlexNet | +| -------------------------- | ------- | +| Published Year | | +| Paper | | +| Resource | | +| Features | | +| MindSpore Version | | +| Dataset | | +| Training Parameters | | +| Optimizer | | +| Loss Function | | +| Accuracy | | +| Speed | | +| Loss | | +| Params (M) | | +| Checkpoint for Fine tuning | | +| Model for inference | | +| Scripts | | + +#### [LeNet](#table-of-contents) + +| Parameters | LeNet | +| -------------------------- | ----- | +| Published Year | | +| Paper | | +| Resource | | +| Features | | +| MindSpore Version | | +| Dataset | | +| Training Parameters | | +| Optimizer | | +| Loss Function | | +| Accuracy | | +| Speed | | +| Loss | | +| Params (M) | | +| Checkpoint for Fine tuning | | +| Model for inference | | +| Scripts | | + +### Object Detection and Segmentation + +#### [YoloV3](#table-of-contents) + +| Parameters | YoLoV3 | +| -------------------------------- | ------ | +| Published Year | | +| Paper | | +| Resource | | +| Features | | +| MindSpore Version | | +| Dataset | | +| Training Parameters | | +| Optimizer | | +| Loss Function | | +| Mean Average Precision (mAP@0.5) | | +| Speed | | +| Loss | | +| Params (M) | | +| Checkpoint for Fine tuning | | +| Model for inference | | +| Scripts | | + +#### [MobileNetV2](#table-of-contents) + +| Parameters | MobileNetV2 | +| -------------------------------- | ----------- | +| Published Year | | +| Paper | | +| Resource | | +| Features | | +| MindSpore Version | | +| Dataset | | +| Training Parameters | | +| Optimizer | | +| Loss Function | | +| Mean Average Precision (mAP@0.5) | | +| Speed | | +| Loss | | +| Params (M) | | +| Checkpoint for Fine tuning | | +| Model for inference | | +| Scripts | | + +#### [MobileNetV3](#table-of-contents) + +| Parameters | MobileNetV3 | +| 
-------------------------------- | ----------- | +| Published Year | | +| Paper | | +| Resource | | +| Features | | +| MindSpore Version | | +| Dataset | | +| Training Parameters | | +| Optimizer | | +| Loss Function | | +| Mean Average Precision (mAP@0.5) | | +| Speed | | +| Loss | | +| Params (M) | | +| Checkpoint for Fine tuning | | +| Model for inference | | +| Scripts | | + +#### [SSD](#table-of-contents) + +| Parameters | SSD | +| -------------------------------- | ---- | +| Published Year | | +| Paper | | +| Resource | | +| Features | | +| MindSpore Version | | +| Dataset | | +| Training Parameters | | +| Optimizer | | +| Loss Function | | +| Mean Average Precision (mAP@0.5) | | +| Speed | | +| Loss | | +| Params (M) | | +| Checkpoint for Fine tuning | | +| Model for inference | | +| Scripts | | + +## Natural Language Processing + +#### [BERT](#table-of-contents) + +| Parameters | BERT | +| -------------------------- | ---- | +| Published Year | | +| Paper | | +| Resource | | +| Features | | +| MindSpore Version | | +| Dataset | | +| Training Parameters | | +| Optimizer | | +| Loss Function | | +| GLUE Score | | +| Speed | | +| Loss | | +| Params (M) | | +| Checkpoint for Fine tuning | | +| Model for inference | | +| Scripts | | + +#### [MASS](#table-of-contents) + +| Parameters | MASS | +| -------------------------- | ---- | +| Published Year | | +| Paper | | +| Resource | | +| Features | | +| MindSpore Version | | +| Dataset | | +| Training Parameters | | +| Optimizer | | +| Loss Function | | +| ROUGE Score | | +| Speed | | +| Loss | | +| Params (M) | | +| Checkpoint for Fine tuning | | +| Model for inference | | +| Scripts | | + +#### License + +[Apache License 2.0](https://github.com/mindspore-ai/mindspore/blob/master/LICENSE) diff --git a/model_zoo/Transformer/eval.py b/model_zoo/Transformer/eval.py index 26d00f1c58..5ced75ba33 100644 --- a/model_zoo/Transformer/eval.py +++ b/model_zoo/Transformer/eval.py @@ -78,9 +78,8 @@ def load_weights(model_path): 
weights = {} for msname in ms_ckpt: - infer_name = msname.replace("transformer.transformer.", "") + infer_name = msname if "tfm_decoder" in msname: - infer_name = infer_name.replace(".layers.", ".layer") infer_name = "tfm_decoder.decoder." + infer_name if is_npz: weights[infer_name] = ms_ckpt[msname] diff --git a/model_zoo/Transformer/src/transformer_model.py b/model_zoo/Transformer/src/transformer_model.py index 17b5127dca..409f8965eb 100644 --- a/model_zoo/Transformer/src/transformer_model.py +++ b/model_zoo/Transformer/src/transformer_model.py @@ -20,11 +20,11 @@ import numpy as np import mindspore.common.dtype as mstype import mindspore.nn as nn import mindspore.ops.functional as F -from mindspore.common.initializer import TruncatedNormal, initializer from mindspore.ops import operations as P from mindspore.common.tensor import Tensor from mindspore.common.parameter import Parameter from .beam_search import BeamSearchDecoder, TileBeam +from .weight_init import normal_weight, weight_variable class TransformerConfig: """ @@ -118,9 +118,7 @@ class EmbeddingLookup(nn.Cell): self.vocab_size = vocab_size self.embedding_size = embedding_size self.use_one_hot_embeddings = use_one_hot_embeddings - self.embedding_table = Parameter(initializer - (TruncatedNormal(initializer_range), - [vocab_size, embedding_size]), + self.embedding_table = Parameter(normal_weight([vocab_size, embedding_size], embedding_size), name='embedding_table') self.expand = P.ExpandDims() self.shape_flat = (-1,) @@ -138,8 +136,7 @@ class EmbeddingLookup(nn.Cell): flat_ids = self.reshape(input_ids, self.shape_flat) if self.use_one_hot_embeddings: one_hot_ids = self.one_hot(flat_ids, self.vocab_size, self.on_value, self.off_value) - output_for_reshape = self.array_mul( - one_hot_ids, self.embedding_table) + output_for_reshape = self.array_mul(one_hot_ids, self.embedding_table) else: output_for_reshape = self.gather(self.embedding_table, flat_ids, 0) @@ -329,22 +326,22 @@ class 
MultiheadAttention(nn.Cell): units, activation=query_act, has_bias=False, - weight_init=TruncatedNormal(initializer_range)).to_float(compute_type) + weight_init=weight_variable([units, from_tensor_width])).to_float(compute_type) self.key_layer = nn.Dense(to_tensor_width, units, activation=key_act, has_bias=False, - weight_init=TruncatedNormal(initializer_range)).to_float(compute_type) + weight_init=weight_variable([units, to_tensor_width])).to_float(compute_type) self.value_layer = nn.Dense(to_tensor_width, units, activation=value_act, has_bias=False, - weight_init=TruncatedNormal(initializer_range)).to_float(compute_type) + weight_init=weight_variable([units, to_tensor_width])).to_float(compute_type) self.out_layer = nn.Dense(units, out_tensor_width, activation=out_act, has_bias=False, - weight_init=TruncatedNormal(initializer_range)).to_float(compute_type) + weight_init=weight_variable([out_tensor_width, units])).to_float(compute_type) self.shape_from = (batch_size, from_seq_length, num_attention_heads, size_per_head) self.shape_to = (batch_size, to_seq_length, num_attention_heads, size_per_head) @@ -518,10 +515,10 @@ class FeedForward(nn.Cell): self.conv1 = nn.Dense(in_channels, hidden_size, activation=hidden_act, - weight_init=TruncatedNormal(initializer_range)).to_float(compute_type) + weight_init=weight_variable([hidden_size, in_channels])).to_float(compute_type) self.conv2 = nn.Dense(hidden_size, out_channels, - weight_init=TruncatedNormal(initializer_range)).to_float(compute_type) + weight_init=weight_variable([out_channels, hidden_size])).to_float(compute_type) self.preprocess = LayerPreprocess(in_channels=in_channels) self.postprocess = LayerPostprocess(dropout_prob=hidden_dropout_prob) @@ -784,95 +781,22 @@ class TransformerDecoder(nn.Cell): super(TransformerDecoder, self).__init__() self.num_hidden_layers = num_hidden_layers - # wait to be supported - # layers = [] - # for _ in range(num_hidden_layers): - # layer = DecoderCell(batch_size=batch_size, - # 
hidden_size=hidden_size, - # seq_length=seq_length, - # enc_seq_length=enc_seq_length, - # num_attention_heads=num_attention_heads, - # intermediate_size=intermediate_size, - # attention_probs_dropout_prob=attention_probs_dropout_prob, - # use_one_hot_embeddings=use_one_hot_embeddings, - # initializer_range=initializer_range, - # hidden_dropout_prob=hidden_dropout_prob, - # hidden_act=hidden_act, - # compute_type=compute_type) - # layers.append(layer) - # self.layers = nn.CellList(layers) - self.layer0 = DecoderCell(batch_size=batch_size, - hidden_size=hidden_size, - seq_length=seq_length, - enc_seq_length=enc_seq_length, - num_attention_heads=num_attention_heads, - intermediate_size=intermediate_size, - attention_probs_dropout_prob=attention_probs_dropout_prob, - use_one_hot_embeddings=use_one_hot_embeddings, - initializer_range=initializer_range, - hidden_dropout_prob=hidden_dropout_prob, - hidden_act=hidden_act, - compute_type=compute_type) - self.layer1 = DecoderCell(batch_size=batch_size, - hidden_size=hidden_size, - seq_length=seq_length, - enc_seq_length=enc_seq_length, - num_attention_heads=num_attention_heads, - intermediate_size=intermediate_size, - attention_probs_dropout_prob=attention_probs_dropout_prob, - use_one_hot_embeddings=use_one_hot_embeddings, - initializer_range=initializer_range, - hidden_dropout_prob=hidden_dropout_prob, - hidden_act=hidden_act, - compute_type=compute_type) - self.layer2 = DecoderCell(batch_size=batch_size, - hidden_size=hidden_size, - seq_length=seq_length, - enc_seq_length=enc_seq_length, - num_attention_heads=num_attention_heads, - intermediate_size=intermediate_size, - attention_probs_dropout_prob=attention_probs_dropout_prob, - use_one_hot_embeddings=use_one_hot_embeddings, - initializer_range=initializer_range, - hidden_dropout_prob=hidden_dropout_prob, - hidden_act=hidden_act, - compute_type=compute_type) - self.layer3 = DecoderCell(batch_size=batch_size, - hidden_size=hidden_size, - seq_length=seq_length, - 
enc_seq_length=enc_seq_length, - num_attention_heads=num_attention_heads, - intermediate_size=intermediate_size, - attention_probs_dropout_prob=attention_probs_dropout_prob, - use_one_hot_embeddings=use_one_hot_embeddings, - initializer_range=initializer_range, - hidden_dropout_prob=hidden_dropout_prob, - hidden_act=hidden_act, - compute_type=compute_type) - self.layer4 = DecoderCell(batch_size=batch_size, - hidden_size=hidden_size, - seq_length=seq_length, - enc_seq_length=enc_seq_length, - num_attention_heads=num_attention_heads, - intermediate_size=intermediate_size, - attention_probs_dropout_prob=attention_probs_dropout_prob, - use_one_hot_embeddings=use_one_hot_embeddings, - initializer_range=initializer_range, - hidden_dropout_prob=hidden_dropout_prob, - hidden_act=hidden_act, - compute_type=compute_type) - self.layer5 = DecoderCell(batch_size=batch_size, - hidden_size=hidden_size, - seq_length=seq_length, - enc_seq_length=enc_seq_length, - num_attention_heads=num_attention_heads, - intermediate_size=intermediate_size, - attention_probs_dropout_prob=attention_probs_dropout_prob, - use_one_hot_embeddings=use_one_hot_embeddings, - initializer_range=initializer_range, - hidden_dropout_prob=hidden_dropout_prob, - hidden_act=hidden_act, - compute_type=compute_type) + layers = [] + for _ in range(num_hidden_layers): + layer = DecoderCell(batch_size=batch_size, + hidden_size=hidden_size, + seq_length=seq_length, + enc_seq_length=enc_seq_length, + num_attention_heads=num_attention_heads, + intermediate_size=intermediate_size, + attention_probs_dropout_prob=attention_probs_dropout_prob, + use_one_hot_embeddings=use_one_hot_embeddings, + initializer_range=initializer_range, + hidden_dropout_prob=hidden_dropout_prob, + hidden_act=hidden_act, + compute_type=compute_type) + layers.append(layer) + self.layers = nn.CellList(layers) self.layer_preprocess = LayerPreprocess(in_channels=hidden_size) @@ -883,16 +807,9 @@ class TransformerDecoder(nn.Cell): def construct(self, 
input_tensor, attention_mask, enc_states, enc_attention_mask): prev_output = self.reshape(input_tensor, self.shape) - # wait to be supported - # for layer_module in self.layers: - # layer_output = layer_module(prev_output, attention_mask, enc_states, enc_attention_mask) - # prev_output = layer_output - prev_output = self.layer0(prev_output, attention_mask, enc_states, enc_attention_mask) - prev_output = self.layer1(prev_output, attention_mask, enc_states, enc_attention_mask) - prev_output = self.layer2(prev_output, attention_mask, enc_states, enc_attention_mask) - prev_output = self.layer3(prev_output, attention_mask, enc_states, enc_attention_mask) - prev_output = self.layer4(prev_output, attention_mask, enc_states, enc_attention_mask) - prev_output = self.layer5(prev_output, attention_mask, enc_states, enc_attention_mask) + for layer_module in self.layers: + layer_output = layer_module(prev_output, attention_mask, enc_states, enc_attention_mask) + prev_output = layer_output prev_output = self.layer_preprocess(prev_output) output = self.reshape(prev_output, self.out_shape) @@ -1108,7 +1025,13 @@ class TransformerModel(nn.Cell): embedding_size=self.embedding_size, use_one_hot_embeddings=use_one_hot_embeddings, initializer_range=config.initializer_range) - self.tfm_embedding_postprocessor = EmbeddingPostprocessor( + self.tfm_embedding_postprocessor_for_encoder = EmbeddingPostprocessor( + embedding_size=self.embedding_size, + use_one_hot_embeddings=use_one_hot_embeddings, + initializer_range=0.02, + max_position_embeddings=config.max_position_embeddings, + dropout_prob=config.hidden_dropout_prob) + self.tfm_embedding_postprocessor_for_decoder = EmbeddingPostprocessor( embedding_size=self.embedding_size, use_one_hot_embeddings=use_one_hot_embeddings, initializer_range=0.02, @@ -1171,7 +1094,7 @@ class TransformerModel(nn.Cell): hidden_act=config.hidden_act, compute_type=config.compute_type, embedding_lookup=self.tfm_embedding_lookup, - 
embedding_processor=self.tfm_embedding_postprocessor, + embedding_processor=self.tfm_embedding_postprocessor_for_decoder, projection=self.projection) self.tfm_decoder = BeamSearchDecoder( batch_size=config.batch_size, @@ -1195,15 +1118,14 @@ class TransformerModel(nn.Cell): ones = np.ones(shape=(self.seq_length, self.seq_length)) self.future_mask = Tensor(np.tril(ones), dtype=mstype.float32) else: - self.tile_beam = TileBeam( - beam_width=config.beam_width) + self.tile_beam = TileBeam(beam_width=config.beam_width) ones = np.ones(shape=(config.batch_size, config.max_decode_length)) self.encdec_mask = Tensor(ones, dtype=mstype.float32) def construct(self, source_ids, source_mask, target_ids=None, target_mask=None): # process source sentence src_word_embeddings, embedding_tables = self.tfm_embedding_lookup(source_ids) - src_embedding_output = self.tfm_embedding_postprocessor(src_word_embeddings) + src_embedding_output = self.tfm_embedding_postprocessor_for_encoder(src_word_embeddings) # attention mask [batch_size, seq_length, seq_length] enc_attention_mask = self._create_attention_mask_from_input_mask(source_mask) # transformer encoder @@ -1213,7 +1135,7 @@ class TransformerModel(nn.Cell): if self.is_training: # process target sentence tgt_word_embeddings, _ = self.tfm_embedding_lookup(target_ids) - tgt_embedding_output = self.tfm_embedding_postprocessor(tgt_word_embeddings) + tgt_embedding_output = self.tfm_embedding_postprocessor_for_decoder(tgt_word_embeddings) # attention mask [batch_size, seq_length, seq_length] tgt_attention_mask = self._create_attention_mask_from_input_mask(target_mask) tgt_attention_mask = self.multiply(tgt_attention_mask, self.expand(self.future_mask, 0)) @@ -1223,15 +1145,14 @@ class TransformerModel(nn.Cell): encoder_output, enc_attention_mask) # calculate logits and log_probs log_probs = self.projection(decoder_output, embedding_tables) - return log_probs - - beam_encoder_output = self.tile_beam(encoder_output) + ret = log_probs + else: + 
beam_encoder_output = self.tile_beam(encoder_output) - enc_attention_mask = self.multiply( - enc_attention_mask[::, 0:1:1, ::], - self.expand(self.encdec_mask, -1)) + enc_attention_mask = self.multiply(enc_attention_mask[::, 0:1:1, ::], self.expand(self.encdec_mask, -1)) - beam_enc_attention_mask = self.tile_beam(enc_attention_mask) - beam_enc_attention_mask = self.cast_compute_type(beam_enc_attention_mask) - predicted_ids = self.tfm_decoder(beam_encoder_output, beam_enc_attention_mask) - return predicted_ids + beam_enc_attention_mask = self.tile_beam(enc_attention_mask) + beam_enc_attention_mask = self.cast_compute_type(beam_enc_attention_mask) + predicted_ids = self.tfm_decoder(beam_encoder_output, beam_enc_attention_mask) + ret = predicted_ids + return ret diff --git a/model_zoo/Transformer/train.py b/model_zoo/Transformer/train.py index 37165a6c20..23c0eb78fd 100644 --- a/model_zoo/Transformer/train.py +++ b/model_zoo/Transformer/train.py @@ -16,9 +16,10 @@ import time import argparse +import random +import numpy as np import mindspore.common.dtype as mstype -from mindspore.common.parameter import Parameter from mindspore.common.tensor import Tensor from mindspore.nn.optim import Adam from mindspore.train.model import Model @@ -26,6 +27,7 @@ from mindspore.train.loss_scale_manager import DynamicLossScaleManager from mindspore.train.callback import CheckpointConfig, ModelCheckpoint from mindspore.train.callback import Callback, TimeMonitor from mindspore.train.serialization import load_checkpoint, load_param_into_net +import mindspore.dataset.engine as de import mindspore.communication.management as D from mindspore.train.parallel_utils import ParallelMode from mindspore import context @@ -34,9 +36,12 @@ from src.transformer_for_train import TransformerTrainOneStepCell, TransformerNe TransformerTrainOneStepWithLossScaleCell from src.config import cfg, transformer_net_cfg from src.dataset import create_transformer_dataset -from src.weight_init import 
weight_variable, one_weight, zero_weight, normal_weight from src.lr_schedule import create_dynamic_lr +random_seed = 1 +random.seed(random_seed) +np.random.seed(random_seed) +de.config.set_seed(random_seed) def get_ms_timestamp(): t = time.time() @@ -108,7 +113,7 @@ def run_transformer_train(): parser = argparse_init() args, _ = parser.parse_known_args() context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=args.device_id) - context.set_context(save_graphs=True, reserve_class_name_in_scope=False, enable_auto_mixed_precision=False) + context.set_context(reserve_class_name_in_scope=False, enable_auto_mixed_precision=False) if args.distribute == "true": device_num = args.device_num @@ -129,29 +134,15 @@ def run_transformer_train(): if args.checkpoint_path: parameter_dict = load_checkpoint(args.checkpoint_path) - else: - parameter_dict = {} - params = netwithloss.trainable_params() - for param in params: - name = param.name - value = param.default_input - if isinstance(value, Tensor): - if name.endswith(".gamma"): - parameter_dict[name] = Parameter(one_weight(value.asnumpy().shape), name=name) - elif name.endswith(".beta") or name.endswith(".bias"): - parameter_dict[name] = Parameter(zero_weight(value.asnumpy().shape), name=name) - elif "embedding" in name: - parameter_dict[name] = Parameter(normal_weight(value.asnumpy().shape, - transformer_net_cfg.hidden_size), name=name) - else: - parameter_dict[name] = Parameter(weight_variable(value.asnumpy().shape), name=name) - load_param_into_net(netwithloss, parameter_dict) + load_param_into_net(netwithloss, parameter_dict) lr = Tensor(create_dynamic_lr(schedule="constant*rsqrt_hidden*linear_warmup*rsqrt_decay", training_steps=dataset.get_dataset_size()*args.epoch_size, learning_rate=cfg.lr_schedule.learning_rate, warmup_steps=cfg.lr_schedule.warmup_steps, - hidden_size=transformer_net_cfg.hidden_size), mstype.float32) + hidden_size=transformer_net_cfg.hidden_size, + 
start_decay_step=cfg.lr_schedule.start_decay_step, + min_lr=cfg.lr_schedule.min_lr), mstype.float32) optimizer = Adam(netwithloss.trainable_params(), lr) callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack()] diff --git a/model_zoo/__init__.py b/model_zoo/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/example/alexnet_cifar10/README.md b/model_zoo/alexnet/README.md similarity index 69% rename from example/alexnet_cifar10/README.md rename to model_zoo/alexnet/README.md index e6649e4055..1059e22aae 100644 --- a/example/alexnet_cifar10/README.md +++ b/model_zoo/alexnet/README.md @@ -2,7 +2,7 @@ ## Description -Training AlexNet with CIFAR-10 dataset in MindSpore. +Training AlexNet with dataset in MindSpore. This is the simple tutorial for training AlexNet in MindSpore. @@ -10,19 +10,19 @@ This is the simple tutorial for training AlexNet in MindSpore. - Install [MindSpore](https://www.mindspore.cn/install/en). -- Download the CIFAR-10 dataset, the directory structure is as follows: +- Download the dataset, the directory structure is as follows: ``` -├─cifar-10-batches-bin +├─10-batches-bin │ -└─cifar-10-verify-bin +└─10-verify-bin ``` ## Running the example ```python # train AlexNet, hyperparameter setting in config.py -python train.py --data_path cifar-10-batches-bin +python train.py --data_path 10-batches-bin ``` You will get the loss value of each step as following: @@ -38,8 +38,8 @@ epoch: 1 step: 1538, loss is 1.0221305 Then, evaluate AlexNet according to network model ```python -# evaluate AlexNet, 1 epoch training accuracy is up to 51.1%; 10 epoch training accuracy is up to 81.2% -python eval.py --data_path cifar-10-verify-bin --mode test --ckpt_path checkpoint_alexnet-1_1562.ckpt +# evaluate AlexNet +python eval.py --data_path 10-verify-bin --ckpt_path checkpoint_alexnet-1_1562.ckpt ``` ## Note diff --git a/example/alexnet_cifar10/eval.py b/model_zoo/alexnet/eval.py similarity index 84% rename from 
example/alexnet_cifar10/eval.py rename to model_zoo/alexnet/eval.py index 2efc6d15f6..4190451632 100644 --- a/example/alexnet_cifar10/eval.py +++ b/model_zoo/alexnet/eval.py @@ -19,11 +19,11 @@ python eval.py --data_path /YourDataPath --ckpt_path Your.ckpt """ import argparse -from config import alexnet_cfg as cfg -from dataset import create_dataset +from src.config import alexnet_cfg as cfg +from src.dataset import create_dataset_mnist +from src.alexnet import AlexNet import mindspore.nn as nn from mindspore import context -from mindspore.model_zoo.alexnet import AlexNet from mindspore.train.serialization import load_checkpoint, load_param_into_net from mindspore.train import Model from mindspore.nn.metrics import Accuracy @@ -36,7 +36,7 @@ if __name__ == "__main__": parser.add_argument('--data_path', type=str, default="./", help='path where the dataset is saved') parser.add_argument('--ckpt_path', type=str, default="./ckpt", help='if is test, must provide\ path where the trained ckpt file') - parser.add_argument('--dataset_sink_mode', type=bool, default=False, help='dataset_sink_mode is False or True') + parser.add_argument('--dataset_sink_mode', type=bool, default=True, help='dataset_sink_mode is False or True') args = parser.parse_args() context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) @@ -45,14 +45,13 @@ if __name__ == "__main__": loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction="mean") repeat_size = cfg.epoch_size opt = nn.Momentum(network.trainable_params(), cfg.learning_rate, cfg.momentum) - model = Model(network, loss, opt, metrics={"Accuracy": Accuracy()}) # test + model = Model(network, loss, opt, metrics={"Accuracy": Accuracy()}) print("============== Starting Testing ==============") param_dict = load_checkpoint(args.ckpt_path) load_param_into_net(network, param_dict) - ds_eval = create_dataset(args.data_path, - cfg.batch_size, - 1, - "test") + ds_eval = create_dataset_mnist(args.data_path, + 
cfg.batch_size, + status="test") acc = model.eval(ds_eval, dataset_sink_mode=args.dataset_sink_mode) - print("============== Accuracy:{} ==============".format(acc)) + print("============== {} ==============".format(acc)) diff --git a/model_zoo/alexnet/src/__init__.py b/model_zoo/alexnet/src/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/mindspore/model_zoo/alexnet.py b/model_zoo/alexnet/src/alexnet.py similarity index 95% rename from mindspore/model_zoo/alexnet.py rename to model_zoo/alexnet/src/alexnet.py index 7ad1c8e37b..c528ae39e9 100644 --- a/mindspore/model_zoo/alexnet.py +++ b/model_zoo/alexnet/src/alexnet.py @@ -36,10 +36,9 @@ class AlexNet(nn.Cell): """ Alexnet """ - def __init__(self, num_classes=10): + def __init__(self, num_classes=10, channel=3): super(AlexNet, self).__init__() - self.batch_size = 32 - self.conv1 = conv(3, 96, 11, stride=4) + self.conv1 = conv(channel, 96, 11, stride=4) self.conv2 = conv(96, 256, 5, pad_mode="same") self.conv3 = conv(256, 384, 3, pad_mode="same") self.conv4 = conv(384, 384, 3, pad_mode="same") diff --git a/example/alexnet_cifar10/config.py b/model_zoo/alexnet/src/config.py similarity index 100% rename from example/alexnet_cifar10/config.py rename to model_zoo/alexnet/src/config.py diff --git a/example/alexnet_cifar10/dataset.py b/model_zoo/alexnet/src/dataset.py similarity index 94% rename from example/alexnet_cifar10/dataset.py rename to model_zoo/alexnet/src/dataset.py index d62ed2852d..6e9f310bed 100644 --- a/example/alexnet_cifar10/dataset.py +++ b/model_zoo/alexnet/src/dataset.py @@ -16,14 +16,14 @@ Produce the dataset """ -from config import alexnet_cfg as cfg import mindspore.dataset as ds import mindspore.dataset.transforms.c_transforms as C import mindspore.dataset.transforms.vision.c_transforms as CV from mindspore.common import dtype as mstype +from .config import alexnet_cfg as cfg -def create_dataset(data_path, batch_size=32, repeat_size=1, status="train"): +def 
create_dataset_mnist(data_path, batch_size=32, repeat_size=1, status="train"): """ create dataset for train or test """ diff --git a/example/alexnet_cifar10/generator_lr.py b/model_zoo/alexnet/src/generator_lr.py similarity index 100% rename from example/alexnet_cifar10/generator_lr.py rename to model_zoo/alexnet/src/generator_lr.py diff --git a/example/alexnet_cifar10/train.py b/model_zoo/alexnet/train.py similarity index 85% rename from example/alexnet_cifar10/train.py rename to model_zoo/alexnet/train.py index 0a288ea1db..184290c26c 100644 --- a/example/alexnet_cifar10/train.py +++ b/model_zoo/alexnet/train.py @@ -19,15 +19,15 @@ python train.py --data_path /YourDataPath """ import argparse -from config import alexnet_cfg as cfg -from dataset import create_dataset -from generator_lr import get_lr +from src.config import alexnet_cfg as cfg +from src.dataset import create_dataset_mnist +from src.generator_lr import get_lr +from src.alexnet import AlexNet import mindspore.nn as nn from mindspore import context from mindspore import Tensor from mindspore.train import Model from mindspore.nn.metrics import Accuracy -from mindspore.model_zoo.alexnet import AlexNet from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor @@ -38,24 +38,22 @@ if __name__ == "__main__": parser.add_argument('--data_path', type=str, default="./", help='path where the dataset is saved') parser.add_argument('--ckpt_path', type=str, default="./ckpt", help='if is test, must provide\ path where the trained ckpt file') - parser.add_argument('--dataset_sink_mode', type=bool, default=False, help='dataset_sink_mode is False or True') + parser.add_argument('--dataset_sink_mode', type=bool, default=True, help='dataset_sink_mode is False or True') args = parser.parse_args() context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) + ds_train = create_dataset_mnist(args.data_path, cfg.batch_size, cfg.epoch_size) network = 
AlexNet(cfg.num_classes) loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction="mean") - lr = Tensor(get_lr(0, cfg.learning_rate, cfg.epoch_size, cfg.save_checkpoint_steps)) + lr = Tensor(get_lr(0, cfg.learning_rate, cfg.epoch_size, ds_train.get_dataset_size())) opt = nn.Momentum(network.trainable_params(), lr, cfg.momentum) - model = Model(network, loss, opt, metrics={"Accuracy": Accuracy()}) # test - - print("============== Starting Training ==============") - ds_train = create_dataset(args.data_path, - cfg.batch_size, - cfg.epoch_size) + model = Model(network, loss, opt, metrics={"Accuracy": Accuracy()}) time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps, keep_checkpoint_max=cfg.keep_checkpoint_max) ckpoint_cb = ModelCheckpoint(prefix="checkpoint_alexnet", directory=args.ckpt_path, config=config_ck) + + print("============== Starting Training ==============") model.train(cfg.epoch_size, ds_train, callbacks=[time_cb, ckpoint_cb, LossMonitor()], dataset_sink_mode=args.dataset_sink_mode) diff --git a/model_zoo/bert/evaluation.py b/model_zoo/bert/evaluation.py index c58bf836fd..4877b60cef 100644 --- a/model_zoo/bert/evaluation.py +++ b/model_zoo/bert/evaluation.py @@ -18,9 +18,11 @@ Bert evaluation script. 
""" import os +import argparse import numpy as np import mindspore.common.dtype as mstype from mindspore import context +from mindspore import log as logger from mindspore.common.tensor import Tensor import mindspore.dataset as de import mindspore.dataset.transforms.c_transforms as C @@ -105,8 +107,17 @@ def bert_predict(Evaluation): ''' prediction function ''' - devid = int(os.getenv('DEVICE_ID')) - context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=devid) + target = args_opt.device_target + if target == "Ascend": + devid = int(os.getenv('DEVICE_ID')) + context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=devid) + elif target == "GPU": + context.set_context(mode=context.GRAPH_MODE, device_target="GPU") + if bert_net_cfg.compute_type != mstype.float32: + logger.warning('GPU only support fp32 temporarily, run with fp32.') + bert_net_cfg.compute_type = mstype.float32 + else: + raise Exception("Target error, GPU or Ascend is supported.") dataset = get_dataset(bert_net_cfg.batch_size, 1) if cfg.use_crf: net_for_pretraining = Evaluation(bert_net_cfg, False, num_labels=len(tag_to_index), use_crf=True, @@ -141,12 +152,15 @@ def test_eval(): if cfg.task == "NER": print("Precision {:.6f} ".format(callback.TP / (callback.TP + callback.FP))) print("Recall {:.6f} ".format(callback.TP / (callback.TP + callback.FN))) - print("F1 {:.6f} ".format(2*callback.TP / (2*callback.TP + callback.FP + callback.FP))) + print("F1 {:.6f} ".format(2*callback.TP / (2*callback.TP + callback.FP + callback.FN))) else: print("acc_num {} , total_num {}, accuracy {:.6f}".format(callback.acc_num, callback.total_num, callback.acc_num / callback.total_num)) print("==============================================================") +parser = argparse.ArgumentParser(description='Bert eval') +parser.add_argument('--device_target', type=str, default='Ascend', help='Device target') +args_opt = parser.parse_args() if __name__ == "__main__": num_labels = 
cfg.num_labels test_eval() diff --git a/model_zoo/bert/finetune.py b/model_zoo/bert/finetune.py index 646f7cc73b..df16e3c91d 100644 --- a/model_zoo/bert/finetune.py +++ b/model_zoo/bert/finetune.py @@ -18,10 +18,12 @@ Bert finetune script. ''' import os +import argparse from src.utils import BertFinetuneCell, BertCLS, BertNER, BertSquad, BertSquadCell from src.finetune_config import cfg, bert_net_cfg, tag_to_index import mindspore.common.dtype as mstype from mindspore import context +from mindspore import log as logger import mindspore.dataset as de import mindspore.dataset.transforms.c_transforms as C from mindspore.nn.wrap.loss_scale import DynamicLossScaleUpdateCell @@ -98,8 +100,17 @@ def test_train(): ''' finetune function ''' - devid = int(os.getenv('DEVICE_ID')) - context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=devid) + target = args_opt.device_target + if target == "Ascend": + devid = int(os.getenv('DEVICE_ID')) + context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=devid) + elif target == "GPU": + context.set_context(mode=context.GRAPH_MODE, device_target="GPU") + if bert_net_cfg.compute_type != mstype.float32: + logger.warning('GPU only support fp32 temporarily, run with fp32.') + bert_net_cfg.compute_type = mstype.float32 + else: + raise Exception("Target error, GPU or Ascend is supported.") #BertCLSTrain for classification #BertNERTrain for sequence labeling if cfg.task == 'NER': @@ -151,5 +162,9 @@ def test_train(): model = Model(netwithgrads) model.train(cfg.epoch_num, dataset, callbacks=[LossCallBack(), ckpoint_cb]) + +parser = argparse.ArgumentParser(description='Bert finetune') +parser.add_argument('--device_target', type=str, default='Ascend', help='Device target') +args_opt = parser.parse_args() if __name__ == "__main__": test_train() diff --git a/model_zoo/bert/pretrain_eval.py b/model_zoo/bert/pretrain_eval.py new file mode 100644 index 0000000000..5089d88459 --- /dev/null +++ 
b/model_zoo/bert/pretrain_eval.py @@ -0,0 +1,158 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +""" +Bert evaluation script. +""" + +import os +from src import BertModel, GetMaskedLMOutput +from src.evaluation_config import cfg, bert_net_cfg +import mindspore.common.dtype as mstype +from mindspore import context +from mindspore.common.tensor import Tensor +import mindspore.dataset as de +import mindspore.dataset.transforms.c_transforms as C +from mindspore.train.model import Model +from mindspore.train.serialization import load_checkpoint, load_param_into_net +import mindspore.nn as nn +from mindspore.nn.metrics import Metric +from mindspore.ops import operations as P +from mindspore.common.parameter import Parameter + +class myMetric(Metric): + ''' + Self-defined Metric as a callback. 
+ ''' + def __init__(self): + super(myMetric, self).__init__() + self.clear() + + def clear(self): + self.total_num = 0 + self.acc_num = 0 + + def update(self, *inputs): + total_num = self._convert_data(inputs[0]) + acc_num = self._convert_data(inputs[1]) + self.total_num = total_num + self.acc_num = acc_num + + def eval(self): + return self.acc_num/self.total_num + + +class GetLogProbs(nn.Cell): + ''' + Get MaskedLM prediction scores + ''' + def __init__(self, config): + super(GetLogProbs, self).__init__() + self.bert = BertModel(config, False) + self.cls1 = GetMaskedLMOutput(config) + + def construct(self, input_ids, input_mask, token_type_id, masked_pos): + sequence_output, _, embedding_table = self.bert(input_ids, token_type_id, input_mask) + prediction_scores = self.cls1(sequence_output, embedding_table, masked_pos) + return prediction_scores + + +class BertPretrainEva(nn.Cell): + ''' + Evaluate MaskedLM prediction scores + ''' + def __init__(self, config): + super(BertPretrainEva, self).__init__() + self.bert = GetLogProbs(config) + self.argmax = P.Argmax(axis=-1, output_type=mstype.int32) + self.equal = P.Equal() + self.mean = P.ReduceMean() + self.sum = P.ReduceSum() + self.total = Parameter(Tensor([0], mstype.float32), name='total') + self.acc = Parameter(Tensor([0], mstype.float32), name='acc') + self.reshape = P.Reshape() + self.shape = P.Shape() + self.cast = P.Cast() + + + def construct(self, input_ids, input_mask, token_type_id, masked_pos, masked_ids, masked_weights, nsp_label): + bs, _ = self.shape(input_ids) + probs = self.bert(input_ids, input_mask, token_type_id, masked_pos) + index = self.argmax(probs) + index = self.reshape(index, (bs, -1)) + eval_acc = self.equal(index, masked_ids) + eval_acc1 = self.cast(eval_acc, mstype.float32) + real_acc = eval_acc1 * masked_weights + acc = self.sum(real_acc) + total = self.sum(masked_weights) + self.total += total + self.acc += acc + return acc, self.total, self.acc + + +def 
get_enwiki_512_dataset(batch_size=1, repeat_count=1, distribute_file=''): + ''' + Get enwiki seq_length=512 dataset + ''' + ds = de.TFRecordDataset([cfg.data_file], cfg.schema_file, columns_list=["input_ids", "input_mask", "segment_ids", + "masked_lm_positions", "masked_lm_ids", + "masked_lm_weights", + "next_sentence_labels"]) + type_cast_op = C.TypeCast(mstype.int32) + ds = ds.map(input_columns="segment_ids", operations=type_cast_op) + ds = ds.map(input_columns="input_mask", operations=type_cast_op) + ds = ds.map(input_columns="input_ids", operations=type_cast_op) + ds = ds.map(input_columns="masked_lm_ids", operations=type_cast_op) + ds = ds.map(input_columns="masked_lm_positions", operations=type_cast_op) + ds = ds.map(input_columns="next_sentence_labels", operations=type_cast_op) + ds = ds.repeat(repeat_count) + + # apply batch operations + ds = ds.batch(batch_size, drop_remainder=True) + return ds + + +def bert_predict(): + ''' + Predict function + ''' + devid = int(os.getenv('DEVICE_ID')) + context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=devid) + dataset = get_enwiki_512_dataset(bert_net_cfg.batch_size, 1) + net_for_pretraining = BertPretrainEva(bert_net_cfg) + net_for_pretraining.set_train(False) + param_dict = load_checkpoint(cfg.finetune_ckpt) + load_param_into_net(net_for_pretraining, param_dict) + model = Model(net_for_pretraining) + return model, dataset, net_for_pretraining + + +def MLM_eval(): + ''' + Evaluate function + ''' + _, dataset, net_for_pretraining = bert_predict() + net = Model(net_for_pretraining, eval_network=net_for_pretraining, eval_indexes=[0, 1, 2], + metrics={'name': myMetric()}) + res = net.eval(dataset, dataset_sink_mode=False) + print("==============================================================") + for _, v in res.items(): + print("Accuracy is: ") + print(v) + print("==============================================================") + + +if __name__ == "__main__": + MLM_eval() diff --git 
a/model_zoo/bert/run_pretrain.py b/model_zoo/bert/run_pretrain.py index 1a267b93ff..65768946c1 100644 --- a/model_zoo/bert/run_pretrain.py +++ b/model_zoo/bert/run_pretrain.py @@ -19,7 +19,9 @@ python run_pretrain.py import os import argparse +import numpy import mindspore.communication.management as D +import mindspore.common.dtype as mstype from mindspore import context from mindspore.train.model import Model from mindspore.train.parallel_utils import ParallelMode @@ -27,6 +29,7 @@ from mindspore.nn.wrap.loss_scale import DynamicLossScaleUpdateCell from mindspore.train.callback import Callback, ModelCheckpoint, CheckpointConfig, TimeMonitor from mindspore.train.serialization import load_checkpoint, load_param_into_net from mindspore.nn.optim import Lamb, Momentum, AdamWeightDecayDynamicLR +from mindspore import log as logger from src import BertNetworkWithLoss, BertTrainOneStepCell, BertTrainOneStepWithLossScaleCell from src.dataset import create_bert_dataset from src.config import cfg, bert_net_cfg @@ -54,6 +57,8 @@ class LossCallBack(Callback): def run_pretrain(): """pre-train bert_clue""" parser = argparse.ArgumentParser(description='bert pre_training') + parser.add_argument('--device_target', type=str, default='Ascend', choices=['Ascend', 'GPU'], + help='device where the code will be implemented. 
(Default: Ascend)') parser.add_argument("--distribute", type=str, default="false", help="Run distribute, default is false.") parser.add_argument("--epoch_size", type=int, default="1", help="Epoch size, default is 1.") parser.add_argument("--device_id", type=int, default=0, help="Device id, default is 0.") @@ -63,41 +68,64 @@ def run_pretrain(): parser.add_argument("--do_shuffle", type=str, default="true", help="Enable shuffle for dataset, default is true.") parser.add_argument("--enable_data_sink", type=str, default="true", help="Enable data sink, default is true.") parser.add_argument("--data_sink_steps", type=int, default="1", help="Sink steps for each epoch, default is 1.") - parser.add_argument("--checkpoint_path", type=str, default="", help="Checkpoint file path") + parser.add_argument("--save_checkpoint_path", type=str, default="", help="Save checkpoint path") + parser.add_argument("--load_checkpoint_path", type=str, default="", help="Load checkpoint file path") parser.add_argument("--save_checkpoint_steps", type=int, default=1000, help="Save checkpoint steps, " "default is 1000.") + parser.add_argument("--train_steps", type=int, default=-1, help="Training Steps, default is -1, " + "meaning run all steps according to epoch number.") parser.add_argument("--save_checkpoint_num", type=int, default=1, help="Save checkpoint numbers, default is 1.") parser.add_argument("--data_dir", type=str, default="", help="Data path, it is better to use absolute path") parser.add_argument("--schema_dir", type=str, default="", help="Schema path, it is better to use absolute path") args_opt = parser.parse_args() - context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=args_opt.device_id) + context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target, device_id=args_opt.device_id) context.set_context(reserve_class_name_in_scope=False) - + context.set_context(variable_memory_max_size="30GB") + ckpt_save_dir = 
args_opt.save_checkpoint_path if args_opt.distribute == "true": - device_num = args_opt.device_num + if args_opt.device_target == 'Ascend': + D.init('hccl') + device_num = args_opt.device_num + rank = args_opt.device_id % device_num + else: + D.init('nccl') + device_num = D.get_group_size() + rank = D.get_rank() + ckpt_save_dir = args_opt.save_checkpoint_path + 'ckpt_' + str(rank) + '/' + context.reset_auto_parallel_context() context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, mirror_mean=True, device_num=device_num) from mindspore.parallel._auto_parallel_context import auto_parallel_context if bert_net_cfg.num_hidden_layers == 12: - auto_parallel_context().set_all_reduce_fusion_split_indices([28, 55, 82, 109, 136, 163, 190, 205]) + if bert_net_cfg.use_relative_positions: + auto_parallel_context().set_all_reduce_fusion_split_indices([29, 58, 87, 116, 145, 174, 203, 217]) + else: + auto_parallel_context().set_all_reduce_fusion_split_indices([28, 55, 82, 109, 136, 163, 190, 205]) elif bert_net_cfg.num_hidden_layers == 24: - auto_parallel_context().set_all_reduce_fusion_split_indices([38, 93, 148, 203, 258, 313, 368, 397]) - D.init() - rank = args_opt.device_id % device_num + if bert_net_cfg.use_relative_positions: + auto_parallel_context().set_all_reduce_fusion_split_indices([30, 90, 150, 210, 270, 330, 390, 421]) + else: + auto_parallel_context().set_all_reduce_fusion_split_indices([38, 93, 148, 203, 258, 313, 368, 397]) else: rank = 0 device_num = 1 + if args_opt.device_target == 'GPU' and bert_net_cfg.compute_type != mstype.float32: + logger.warning('Gpu only support fp32 temporarily, run with fp32.') + bert_net_cfg.compute_type = mstype.float32 + + ds, new_repeat_count = create_bert_dataset(args_opt.epoch_size, device_num, rank, args_opt.do_shuffle, args_opt.enable_data_sink, args_opt.data_sink_steps, args_opt.data_dir, args_opt.schema_dir) - + if args_opt.train_steps > 0: + new_repeat_count = min(new_repeat_count, args_opt.train_steps // 
args_opt.data_sink_steps) netwithloss = BertNetworkWithLoss(bert_net_cfg, True) if cfg.optimizer == 'Lamb': - optimizer = Lamb(netwithloss.trainable_params(), decay_steps=ds.get_dataset_size() * ds.get_repeat_count(), + optimizer = Lamb(netwithloss.trainable_params(), decay_steps=ds.get_dataset_size() * new_repeat_count, start_learning_rate=cfg.Lamb.start_learning_rate, end_learning_rate=cfg.Lamb.end_learning_rate, power=cfg.Lamb.power, warmup_steps=cfg.Lamb.warmup_steps, weight_decay=cfg.Lamb.weight_decay, eps=cfg.Lamb.eps) @@ -106,7 +134,7 @@ def run_pretrain(): momentum=cfg.Momentum.momentum) elif cfg.optimizer == 'AdamWeightDecayDynamicLR': optimizer = AdamWeightDecayDynamicLR(netwithloss.trainable_params(), - decay_steps=ds.get_dataset_size() * ds.get_repeat_count(), + decay_steps=ds.get_dataset_size() * new_repeat_count, learning_rate=cfg.AdamWeightDecayDynamicLR.learning_rate, end_learning_rate=cfg.AdamWeightDecayDynamicLR.end_learning_rate, power=cfg.AdamWeightDecayDynamicLR.power, @@ -120,11 +148,11 @@ def run_pretrain(): if args_opt.enable_save_ckpt == "true": config_ck = CheckpointConfig(save_checkpoint_steps=args_opt.save_checkpoint_steps, keep_checkpoint_max=args_opt.save_checkpoint_num) - ckpoint_cb = ModelCheckpoint(prefix='checkpoint_bert', config=config_ck) + ckpoint_cb = ModelCheckpoint(prefix='checkpoint_bert', directory=ckpt_save_dir, config=config_ck) callback.append(ckpoint_cb) - if args_opt.checkpoint_path: - param_dict = load_checkpoint(args_opt.checkpoint_path) + if args_opt.load_checkpoint_path: + param_dict = load_checkpoint(args_opt.load_checkpoint_path) load_param_into_net(netwithloss, param_dict) if args_opt.enable_lossscale == "true": @@ -139,4 +167,5 @@ def run_pretrain(): model = Model(netwithgrads) model.train(new_repeat_count, ds, callbacks=callback, dataset_sink_mode=(args_opt.enable_data_sink == "true")) if __name__ == '__main__': + numpy.random.seed(0) run_pretrain() diff --git 
a/model_zoo/bert/scripts/run_distribute_pretrain.sh b/model_zoo/bert/scripts/run_distribute_pretrain.sh index 1d77ff8119..5a9f8735aa 100644 --- a/model_zoo/bert/scripts/run_distribute_pretrain.sh +++ b/model_zoo/bert/scripts/run_distribute_pretrain.sh @@ -64,7 +64,7 @@ do --do_shuffle="true" \ --enable_data_sink="true" \ --data_sink_steps=100 \ - --checkpoint_path="" \ + --load_checkpoint_path="" \ --save_checkpoint_steps=10000 \ --save_checkpoint_num=1 \ --data_dir=$DATA_DIR \ diff --git a/model_zoo/bert/scripts/run_distribute_pretrain_for_gpu.sh b/model_zoo/bert/scripts/run_distribute_pretrain_for_gpu.sh new file mode 100644 index 0000000000..8deff766b9 --- /dev/null +++ b/model_zoo/bert/scripts/run_distribute_pretrain_for_gpu.sh @@ -0,0 +1,44 @@ +#!/bin/bash +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +echo "==============================================================================================================" +echo "Please run the scipt as: " +echo "bash run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR" +echo "for example: bash run_distribute_pretrain.sh 8 40 /path/zh-wiki/ /path/Schema.json" +echo "It is better to use absolute path." 
+echo "==============================================================================================================" + +RANK_SIZE=$1 +EPOCH_SIZE=$2 +DATA_DIR=$3 +SCHEMA_DIR=$4 + +mpirun --allow-run-as-root -n $RANK_SIZE \ + python run_pretrain.py \ + --device_target="GPU" \ + --distribute="true" \ + --epoch_size=$EPOCH_SIZE \ + --enable_save_ckpt="true" \ + --enable_lossscale="false" \ + --do_shuffle="true" \ + --enable_data_sink="true" \ + --data_sink_steps=1 \ + --load_checkpoint_path="" \ + --save_checkpoint_steps=10000 \ + --save_checkpoint_num=1 \ + --data_dir=$DATA_DIR \ + --schema_dir=$SCHEMA_DIR > log.txt 2>&1 & + diff --git a/model_zoo/bert/scripts/run_standalone_pretrain.sh b/model_zoo/bert/scripts/run_standalone_pretrain.sh index 438dda58c3..3cd9545f7f 100644 --- a/model_zoo/bert/scripts/run_standalone_pretrain.sh +++ b/model_zoo/bert/scripts/run_standalone_pretrain.sh @@ -37,8 +37,8 @@ python run_pretrain.py \ --enable_lossscale="true" \ --do_shuffle="true" \ --enable_data_sink="true" \ - --data_sink_steps=100 \ - --checkpoint_path="" \ + --data_sink_steps=1 \ + --load_checkpoint_path="" \ --save_checkpoint_steps=10000 \ --save_checkpoint_num=1 \ --data_dir=$DATA_DIR \ diff --git a/model_zoo/bert/scripts/run_standalone_pretrain_for_gpu.sh b/model_zoo/bert/scripts/run_standalone_pretrain_for_gpu.sh new file mode 100644 index 0000000000..1e9f1ec3e7 --- /dev/null +++ b/model_zoo/bert/scripts/run_standalone_pretrain_for_gpu.sh @@ -0,0 +1,48 @@ +#!/bin/bash +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +echo "==============================================================================================================" +echo "Please run the scipt as: " +echo "bash run_standalone_pretrain.sh DEVICE_ID EPOCH_SIZE DATA_DIR SCHEMA_DIR" +echo "for example: bash run_standalone_pretrain.sh 0 40 /path/zh-wiki/ /path/Schema.json" +echo "==============================================================================================================" + +DEVICE_ID=$1 +EPOCH_SIZE=$2 +DATA_DIR=$3 +SCHEMA_DIR=$4 + +export CUDA_VISIBLE_DEVICES=$DEVICE_ID + +mkdir -p ms_log +CUR_DIR=`pwd` +export GLOG_log_dir=${CUR_DIR}/ms_log +export GLOG_logtostderr=0 +python run_pretrain.py \ + --device_target="GPU" \ + --distribute="false" \ + --epoch_size=$EPOCH_SIZE \ + --enable_save_ckpt="true" \ + --enable_lossscale="false" \ + --do_shuffle="true" \ + --enable_data_sink="true" \ + --data_sink_steps=1 \ + --load_checkpoint_path="" \ + --save_checkpoint_path="" \ + --save_checkpoint_steps=10000 \ + --save_checkpoint_num=1 \ + --data_dir=$DATA_DIR \ + --schema_dir=$SCHEMA_DIR > log.txt 2>&1 & diff --git a/model_zoo/bert/src/bert_for_pre_training.py b/model_zoo/bert/src/bert_for_pre_training.py index 600512b4a7..5e014f02ba 100644 --- a/model_zoo/bert/src/bert_for_pre_training.py +++ b/model_zoo/bert/src/bert_for_pre_training.py @@ -27,12 +27,12 @@ from mindspore.nn.wrap.grad_reducer import DistributedGradReducer from mindspore.train.parallel_utils import ParallelMode from mindspore.communication.management import get_group_size from mindspore import context +from mindspore.ops import _selected_ops from .bert_model import BertModel GRADIENT_CLIP_TYPE = 1 GRADIENT_CLIP_VALUE = 1.0 -_nn_clip_by_norm = nn.ClipByNorm() clip_grad = C.MultitypeFuncGraph("clip_grad") @@ -57,7 +57,7 @@ def _clip_grad(clip_type, 
clip_value, grad): new_grad = C.clip_by_value(grad, F.cast(F.tuple_to_array((-clip_value,)), dt), F.cast(F.tuple_to_array((clip_value,)), dt)) else: - new_grad = _nn_clip_by_norm(grad, F.cast(F.tuple_to_array((clip_value,)), dt)) + new_grad = nn.ClipByNorm()(grad, F.cast(F.tuple_to_array((clip_value,)), dt)) return new_grad @@ -131,7 +131,7 @@ class GetNextSentenceOutput(nn.Cell): """ def __init__(self, config): super(GetNextSentenceOutput, self).__init__() - self.log_softmax = P.LogSoftmax() + self.log_softmax = _selected_ops.LogSoftmax() self.weight_init = TruncatedNormal(config.initializer_range) self.dense = nn.Dense(config.hidden_size, 2, weight_init=self.weight_init, has_bias=True).to_float(config.compute_type) diff --git a/model_zoo/bert/src/bert_model.py b/model_zoo/bert/src/bert_model.py index 310d330daa..5cd90ab84b 100644 --- a/model_zoo/bert/src/bert_model.py +++ b/model_zoo/bert/src/bert_model.py @@ -261,7 +261,7 @@ class BertOutput(nn.Cell): def construct(self, hidden_status, input_tensor): output = self.dense(hidden_status) output = self.dropout(output) - output = self.add(output, input_tensor) + output = self.add(input_tensor, output) output = self.layernorm(output) return output @@ -832,8 +832,7 @@ class CreateAttentionMaskFromInputMask(nn.Cell): if not self.input_mask_from_dataset: input_mask = self.input_mask - input_mask = self.cast(self.reshape(input_mask, self.shape), mstype.float32) - attention_mask = self.batch_matmul(self.broadcast_ones, input_mask) + attention_mask = self.cast(self.reshape(input_mask, self.shape), mstype.float32) return attention_mask diff --git a/model_zoo/bert/src/cluener_evaluation.py b/model_zoo/bert/src/cluener_evaluation.py index c2c6770a4a..09de6bf0b3 100644 --- a/model_zoo/bert/src/cluener_evaluation.py +++ b/model_zoo/bert/src/cluener_evaluation.py @@ -19,8 +19,8 @@ import json import numpy as np import mindspore.common.dtype as mstype from mindspore.common.tensor import Tensor -import tokenization -from 
sample_process import label_generation, process_one_example_p +from . import tokenization +from .sample_process import label_generation, process_one_example_p from .evaluation_config import cfg from .CRF import postprocess diff --git a/model_zoo/bert/src/config.py b/model_zoo/bert/src/config.py index d1062b78ee..812f0c2f18 100644 --- a/model_zoo/bert/src/config.py +++ b/model_zoo/bert/src/config.py @@ -56,7 +56,7 @@ if cfg.bert_network == 'base': bert_net_cfg = BertConfig( batch_size=32, seq_length=128, - vocab_size=21136, + vocab_size=21128, hidden_size=768, num_hidden_layers=12, num_attention_heads=12, @@ -77,7 +77,7 @@ if cfg.bert_network == 'nezha': bert_net_cfg = BertConfig( batch_size=32, seq_length=128, - vocab_size=21136, + vocab_size=21128, hidden_size=1024, num_hidden_layers=24, num_attention_heads=16, @@ -98,7 +98,7 @@ if cfg.bert_network == 'large': bert_net_cfg = BertConfig( batch_size=16, seq_length=512, - vocab_size=30528, + vocab_size=30522, hidden_size=1024, num_hidden_layers=24, num_attention_heads=16, diff --git a/model_zoo/bert/src/dataset.py b/model_zoo/bert/src/dataset.py index 1828fac454..7985ca8559 100644 --- a/model_zoo/bert/src/dataset.py +++ b/model_zoo/bert/src/dataset.py @@ -39,6 +39,7 @@ def create_bert_dataset(epoch_size=1, device_num=1, rank=0, do_shuffle="true", e shuffle=(do_shuffle == "true"), num_shards=device_num, shard_id=rank, shard_equal_rows=True) ori_dataset_size = ds.get_dataset_size() + print('origin dataset size: ', ori_dataset_size) new_size = ori_dataset_size if enable_data_sink == "true": new_size = data_sink_steps * bert_net_cfg.batch_size @@ -53,7 +54,7 @@ def create_bert_dataset(epoch_size=1, device_num=1, rank=0, do_shuffle="true", e ds = ds.map(input_columns="input_ids", operations=type_cast_op) # apply batch operations ds = ds.batch(bert_net_cfg.batch_size, drop_remainder=True) - ds = ds.repeat(new_repeat_count) + ds = ds.repeat(max(new_repeat_count, repeat_count)) logger.info("data size: 
{}".format(ds.get_dataset_size())) logger.info("repeatcount: {}".format(ds.get_repeat_count())) return ds, new_repeat_count diff --git a/model_zoo/bert/src/fused_layer_norm.py b/model_zoo/bert/src/fused_layer_norm.py index ee3160b036..5dbe9999ad 100644 --- a/model_zoo/bert/src/fused_layer_norm.py +++ b/model_zoo/bert/src/fused_layer_norm.py @@ -73,7 +73,7 @@ class FusedLayerNorm(Cell): Examples: >>> x = Tensor(np.ones([20, 5, 10, 10]), mindspore.float32) - >>> shape1 = x.shape()[1:] + >>> shape1 = x.shape[1:] >>> m = nn.LayerNorm(shape1, begin_norm_axis=1, begin_params_axis=1) >>> m(x) """ diff --git a/model_zoo/bert/src/utils.py b/model_zoo/bert/src/utils.py index 50925708fc..9b5383877b 100644 --- a/model_zoo/bert/src/utils.py +++ b/model_zoo/bert/src/utils.py @@ -42,6 +42,13 @@ reciprocal = P.Reciprocal() def tensor_grad_scale(scale, grad): return grad * reciprocal(scale) +_grad_overflow = C.MultitypeFuncGraph("_grad_overflow") +grad_overflow = P.FloatStatus() + +@_grad_overflow.register("Tensor") +def _tensor_grad_overflow(grad): + return grad_overflow(grad) + class BertFinetuneCell(nn.Cell): """ Especifically defined for finetuning where only four inputs tensor are needed. 
@@ -67,9 +74,16 @@ class BertFinetuneCell(nn.Cell): self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree) self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE) self.cast = P.Cast() - self.alloc_status = P.NPUAllocFloatStatus() - self.get_status = P.NPUGetFloatStatus() - self.clear_before_grad = P.NPUClearFloatStatus() + self.gpu_target = False + if context.get_context("device_target") == "GPU": + self.gpu_target = True + self.float_status = P.FloatStatus() + self.addn = P.AddN() + self.reshape = P.Reshape() + else: + self.alloc_status = P.NPUAllocFloatStatus() + self.get_status = P.NPUGetFloatStatus() + self.clear_before_grad = P.NPUClearFloatStatus() self.reduce_sum = P.ReduceSum(keep_dims=False) self.depend_parameter_use = P.ControlDepend(depend_mode=1) self.base = Tensor(1, mstype.float32) @@ -90,7 +104,7 @@ class BertFinetuneCell(nn.Cell): weights = self.weights - init = self.alloc_status() + init = False loss = self.network(input_ids, input_mask, token_type_id, @@ -99,28 +113,36 @@ class BertFinetuneCell(nn.Cell): scaling_sens = self.loss_scale else: scaling_sens = sens + + if not self.gpu_target: + init = self.alloc_status() + clear_before_grad = self.clear_before_grad(init) + F.control_depend(loss, init) + self.depend_parameter_use(clear_before_grad, scaling_sens) grads = self.grad(self.network, weights)(input_ids, input_mask, token_type_id, label_ids, self.cast(scaling_sens, mstype.float32)) - clear_before_grad = self.clear_before_grad(init) - F.control_depend(loss, init) - self.depend_parameter_use(clear_before_grad, scaling_sens) grads = self.hyper_map(F.partial(grad_scale, scaling_sens), grads) grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads) if self.reducer_flag: grads = self.grad_reducer(grads) - flag = self.get_status(init) - flag_sum = self.reduce_sum(init, (0,)) + if not self.gpu_target: + flag = self.get_status(init) + flag_sum = self.reduce_sum(init, (0,)) + 
F.control_depend(grads, flag) + F.control_depend(flag, flag_sum) + else: + flag_sum = self.hyper_map(F.partial(_grad_overflow), grads) + flag_sum = self.addn(flag_sum) + flag_sum = self.reshape(flag_sum, (())) if self.is_distributed: flag_reduce = self.allreduce(flag_sum) cond = self.less_equal(self.base, flag_reduce) else: cond = self.less_equal(self.base, flag_sum) - F.control_depend(grads, flag) - F.control_depend(flag, flag_sum) overflow = cond if sens is None: overflow = self.loss_scaling_manager(self.loss_scale, cond) diff --git a/model_zoo/deeplabv3/README.md b/model_zoo/deeplabv3/README.md index b178a83e6d..c8df3dab8d 100644 --- a/model_zoo/deeplabv3/README.md +++ b/model_zoo/deeplabv3/README.md @@ -16,17 +16,17 @@ This is an example of training DeepLabv3 with PASCAL VOC 2012 dataset in MindSpo - Set options in config.py. - Run `run_standalone_train.sh` for non-distributed training. ``` bash - sh scripts/run_standalone_train.sh DEVICE_ID EPOCH_SIZE DATA_DIR + sh scripts/run_standalone_train.sh DEVICE_ID DATA_PATH ``` - Run `run_distribute_train.sh` for distributed training. ``` bash - sh scripts/run_distribute_train.sh DEVICE_NUM EPOCH_SIZE DATA_DIR MINDSPORE_HCCL_CONFIG_PATH + sh scripts/run_distribute_train.sh MINDSPORE_HCCL_CONFIG_PATH DATA_PATH ``` ### Evaluation Set options in evaluation_config.py. Make sure the 'data_file' and 'finetune_ckpt' are set to your own path. - Run run_eval.sh for evaluation. ``` bash - sh scripts/run_eval.sh DEVICE_ID DATA_DIR + sh scripts/run_eval.sh DEVICE_ID DATA_PATH PRETRAINED_CKPT_PATH ``` ## Options and Parameters @@ -49,6 +49,11 @@ config.py: decoder_output_stride The ratio of input to output spatial resolution when employing decoder to refine segmentation results, default is None. image_pyramid Input scales for multi-scale feature extraction, default is None. + epoch_size Epoch size, default is 6. + batch_size batch size of input dataset: N, default is 2. + enable_save_ckpt Enable save checkpoint, default is true. 
+ save_checkpoint_steps Save checkpoint steps, default is 1000. + save_checkpoint_num Save checkpoint numbers, default is 1. ``` @@ -56,11 +61,6 @@ config.py: ``` Parameters for dataset and network: distribute Run distribute, default is false. - epoch_size Epoch size, default is 6. - batch_size batch size of input dataset: N, default is 2. data_url Train/Evaluation data url, required. checkpoint_url Checkpoint path, default is None. - enable_save_ckpt Enable save checkpoint, default is true. - save_checkpoint_steps Save checkpoint steps, default is 1000. - save_checkpoint_num Save checkpoint numbers, default is 1. ``` \ No newline at end of file diff --git a/model_zoo/deeplabv3/evaluation.py b/model_zoo/deeplabv3/eval.py similarity index 85% rename from model_zoo/deeplabv3/evaluation.py rename to model_zoo/deeplabv3/eval.py index e54b2d717b..7e43571982 100644 --- a/model_zoo/deeplabv3/evaluation.py +++ b/model_zoo/deeplabv3/eval.py @@ -25,9 +25,7 @@ from src.config import config parser = argparse.ArgumentParser(description="Deeplabv3 evaluation") -parser.add_argument('--epoch_size', type=int, default=2, help='Epoch size.') parser.add_argument("--device_id", type=int, default=0, help="Device id, default is 0.") -parser.add_argument('--batch_size', type=int, default=2, help='Batch size.') parser.add_argument('--data_url', required=True, default=None, help='Evaluation data url') parser.add_argument('--checkpoint_url', default=None, help='Checkpoint path') @@ -39,8 +37,8 @@ print(args_opt) if __name__ == "__main__": args_opt.crop_size = config.crop_size args_opt.base_size = config.crop_size - eval_dataset = create_dataset(args_opt, args_opt.data_url, args_opt.epoch_size, args_opt.batch_size, usage="eval") - net = deeplabv3_resnet50(config.seg_num_classes, [args_opt.batch_size, 3, args_opt.crop_size, args_opt.crop_size], + eval_dataset = create_dataset(args_opt, args_opt.data_url, config.epoch_size, config.batch_size, usage="eval") + net = 
deeplabv3_resnet50(config.seg_num_classes, [config.batch_size, 3, args_opt.crop_size, args_opt.crop_size], infer_scale_sizes=config.eval_scales, atrous_rates=config.atrous_rates, decoder_output_stride=config.decoder_output_stride, output_stride=config.output_stride, fine_tune_batch_norm=config.fine_tune_batch_norm, image_pyramid=config.image_pyramid) diff --git a/model_zoo/deeplabv3/scripts/run_distribute_train.sh b/model_zoo/deeplabv3/scripts/run_distribute_train.sh index 514b0229af..4dcd8d9768 100644 --- a/model_zoo/deeplabv3/scripts/run_distribute_train.sh +++ b/model_zoo/deeplabv3/scripts/run_distribute_train.sh @@ -16,17 +16,21 @@ echo "==============================================================================================================" echo "Please run the scipt as: " -echo "bash run_distribute_train.sh DEVICE_NUM EPOCH_SIZE DATA_DIR MINDSPORE_HCCL_CONFIG_PATH" -echo "for example: bash run_distribute_train.sh 8 40 /path/zh-wiki/ /path/hccl.json" +echo "bash run_distribute_train.sh MINDSPORE_HCCL_CONFIG_PATH DATA_PATH" +echo "for example: bash run_distribute_train.sh MINDSPORE_HCCL_CONFIG_PATH DATA_PATH [PRETRAINED_CKPT_PATH](option)" echo "It is better to use absolute path." 
echo "==============================================================================================================" -EPOCH_SIZE=$2 -DATA_DIR=$3 +DATA_DIR=$2 -export MINDSPORE_HCCL_CONFIG_PATH=$4 -export RANK_TABLE_FILE=$4 -export RANK_SIZE=$1 +export MINDSPORE_HCCL_CONFIG_PATH=$1 +export RANK_TABLE_FILE=$1 +export RANK_SIZE=8 +PATH_CHECKPOINT="" +if [ $# == 3 ] +then + PATH_CHECKPOINT=$3 +fi cores=`cat /proc/cpuinfo|grep "processor" |wc -l` echo "the number of logical core" $cores avg_core_per_rank=`expr $cores \/ $RANK_SIZE` @@ -55,12 +59,8 @@ do env > env.log taskset -c $cmdopt python ../train.py \ --distribute="true" \ - --epoch_size=$EPOCH_SIZE \ --device_id=$DEVICE_ID \ - --enable_save_ckpt="true" \ - --checkpoint_url="" \ - --save_checkpoint_steps=10000 \ - --save_checkpoint_num=1 \ + --checkpoint_url=$PATH_CHECKPOINT \ --data_url=$DATA_DIR > log.txt 2>&1 & cd ../ done \ No newline at end of file diff --git a/model_zoo/deeplabv3/scripts/run_eval.sh b/model_zoo/deeplabv3/scripts/run_eval.sh index 2470138c33..735dce4cbe 100644 --- a/model_zoo/deeplabv3/scripts/run_eval.sh +++ b/model_zoo/deeplabv3/scripts/run_eval.sh @@ -15,18 +15,20 @@ # ============================================================================ echo "==============================================================================================================" echo "Please run the scipt as: " -echo "bash run_eval.sh DEVICE_ID DATA_DIR" -echo "for example: bash run_eval.sh /path/zh-wiki/ " +echo "bash run_eval.sh DEVICE_ID DATA_PATH PRETRAINED_CKPT_PATH" +echo "for example: bash run_eval.sh DEVICE_ID DATA_PATH PRETRAINED_CKPT_PATH" echo "==============================================================================================================" DEVICE_ID=$1 DATA_DIR=$2 +PATH_CHECKPOINT=$3 + mkdir -p ms_log CUR_DIR=`pwd` export GLOG_log_dir=${CUR_DIR}/ms_log export GLOG_logtostderr=0 -python evaluation.py \ +python eval.py \ --device_id=$DEVICE_ID \ - --checkpoint_url="" \ + 
--checkpoint_url=$PATH_CHECKPOINT \ --data_url=$DATA_DIR > log.txt 2>&1 & \ No newline at end of file diff --git a/model_zoo/deeplabv3/scripts/run_standalone_train.sh b/model_zoo/deeplabv3/scripts/run_standalone_train.sh index 1b84f9d583..6f5e8dbe52 100644 --- a/model_zoo/deeplabv3/scripts/run_standalone_train.sh +++ b/model_zoo/deeplabv3/scripts/run_standalone_train.sh @@ -15,13 +15,17 @@ # ============================================================================ echo "==============================================================================================================" echo "Please run the scipt as: " -echo "bash run_standalone_pretrain.sh DEVICE_ID EPOCH_SIZE DATA_DIR" -echo "for example: bash run_standalone_train.sh 0 40 /path/zh-wiki/ " +echo "bash run_standalone_pretrain.sh DEVICE_ID DATA_PATH" +echo "for example: bash run_standalone_train.sh DEVICE_ID DATA_PATH [PRETRAINED_CKPT_PATH](option)" echo "==============================================================================================================" DEVICE_ID=$1 -EPOCH_SIZE=$2 -DATA_DIR=$3 +DATA_DIR=$2 +PATH_CHECKPOINT="" +if [ $# == 3 ] +then + PATH_CHECKPOINT=$3 +fi mkdir -p ms_log CUR_DIR=`pwd` @@ -29,10 +33,6 @@ export GLOG_log_dir=${CUR_DIR}/ms_log export GLOG_logtostderr=0 python train.py \ --distribute="false" \ - --epoch_size=$EPOCH_SIZE \ --device_id=$DEVICE_ID \ - --enable_save_ckpt="true" \ - --checkpoint_url="" \ - --save_checkpoint_steps=10000 \ - --save_checkpoint_num=1 \ + --checkpoint_url=$PATH_CHECKPOINT \ --data_url=$DATA_DIR > log.txt 2>&1 & \ No newline at end of file diff --git a/model_zoo/deeplabv3/src/config.py b/model_zoo/deeplabv3/src/config.py index c3b73e1097..6b5519e46c 100644 --- a/model_zoo/deeplabv3/src/config.py +++ b/model_zoo/deeplabv3/src/config.py @@ -29,5 +29,10 @@ config = ed({ "fine_tune_batch_norm": False, "ignore_label": 255, "decoder_output_stride": None, - "seg_num_classes": 21 + "seg_num_classes": 21, + "epoch_size": 6, + "batch_size": 2, + 
"enable_save_ckpt": True, + "save_checkpoint_steps": 10000, + "save_checkpoint_num": 1 }) diff --git a/model_zoo/deeplabv3/src/md_dataset.py b/model_zoo/deeplabv3/src/md_dataset.py index 37b57d1033..e136da23e1 100644 --- a/model_zoo/deeplabv3/src/md_dataset.py +++ b/model_zoo/deeplabv3/src/md_dataset.py @@ -16,6 +16,7 @@ from PIL import Image import mindspore.dataset as de import mindspore.dataset.transforms.vision.c_transforms as C +import numpy as np from .ei_dataset import HwVocRawDataset from .utils import custom_transforms as tr @@ -52,8 +53,8 @@ class DataTransform: rhf_tr = tr.RandomHorizontalFlip() image, label = rhf_tr(image, label) - nor_tr = tr.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)) - image, label = nor_tr(image, label) + image = np.array(image).astype(np.float32) + label = np.array(label).astype(np.float32) return image, label @@ -71,13 +72,13 @@ class DataTransform: fsc_tr = tr.FixScaleCrop(crop_size=self.args.crop_size) image, label = fsc_tr(image, label) - nor_tr = tr.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)) - image, label = nor_tr(image, label) + image = np.array(image).astype(np.float32) + label = np.array(label).astype(np.float32) return image, label -def create_dataset(args, data_url, epoch_num=1, batch_size=1, usage="train"): +def create_dataset(args, data_url, epoch_num=1, batch_size=1, usage="train", shuffle=True): """ Create Dataset for DeepLabV3. 
@@ -106,7 +107,7 @@ def create_dataset(args, data_url, epoch_num=1, batch_size=1, usage="train"): # 1464 samples / batch_size 8 = 183 batches # epoch_num is num of steps # 3658 steps / 183 = 20 epochs - if usage == "train": + if usage == "train" and shuffle: dataset = dataset.shuffle(1464) dataset = dataset.batch(batch_size, drop_remainder=(usage == "train")) dataset = dataset.repeat(count=epoch_num) diff --git a/model_zoo/deeplabv3/src/utils/custom_transforms.py b/model_zoo/deeplabv3/src/utils/custom_transforms.py index 3473f7eef5..75c78e1240 100644 --- a/model_zoo/deeplabv3/src/utils/custom_transforms.py +++ b/model_zoo/deeplabv3/src/utils/custom_transforms.py @@ -33,6 +33,7 @@ class Normalize: def __call__(self, img, mask): img = np.array(img).astype(np.float32) mask = np.array(mask).astype(np.float32) + img = ((img - self.mean) / self.std).astype(np.float32) return img, mask diff --git a/model_zoo/deeplabv3/train.py b/model_zoo/deeplabv3/train.py index 2135b0abf5..d096613977 100644 --- a/model_zoo/deeplabv3/train.py +++ b/model_zoo/deeplabv3/train.py @@ -27,14 +27,10 @@ from src.config import config parser = argparse.ArgumentParser(description="Deeplabv3 training") parser.add_argument("--distribute", type=str, default="false", help="Run distribute, default is false.") -parser.add_argument('--epoch_size', type=int, default=6, help='Epoch size.') -parser.add_argument('--batch_size', type=int, default=2, help='Batch size.') parser.add_argument('--data_url', required=True, default=None, help='Train data url') parser.add_argument("--device_id", type=int, default=0, help="Device id, default is 0.") parser.add_argument('--checkpoint_url', default=None, help='Checkpoint path') -parser.add_argument("--enable_save_ckpt", type=str, default="true", help="Enable save checkpoint, default is true.") -parser.add_argument("--save_checkpoint_steps", type=int, default=1000, help="Save checkpoint steps, default is 1000.") -parser.add_argument("--save_checkpoint_num", type=int, 
default=1, help="Save checkpoint numbers, default is 1.") + args_opt = parser.parse_args() print(args_opt) context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=args_opt.device_id) @@ -70,16 +66,16 @@ if __name__ == "__main__": init() args_opt.base_size = config.crop_size args_opt.crop_size = config.crop_size - train_dataset = create_dataset(args_opt, args_opt.data_url, args_opt.epoch_size, args_opt.batch_size, usage="train") + train_dataset = create_dataset(args_opt, args_opt.data_url, config.epoch_size, config.batch_size, usage="train") dataset_size = train_dataset.get_dataset_size() time_cb = TimeMonitor(data_size=dataset_size) callback = [time_cb, LossCallBack()] - if args_opt.enable_save_ckpt == "true": - config_ck = CheckpointConfig(save_checkpoint_steps=args_opt.save_checkpoint_steps, - keep_checkpoint_max=args_opt.save_checkpoint_num) + if config.enable_save_ckpt: + config_ck = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_steps, + keep_checkpoint_max=config.save_checkpoint_num) ckpoint_cb = ModelCheckpoint(prefix='checkpoint_deeplabv3', config=config_ck) callback.append(ckpoint_cb) - net = deeplabv3_resnet50(config.seg_num_classes, [args_opt.batch_size, 3, args_opt.crop_size, args_opt.crop_size], + net = deeplabv3_resnet50(config.seg_num_classes, [config.batch_size, 3, args_opt.crop_size, args_opt.crop_size], infer_scale_sizes=config.eval_scales, atrous_rates=config.atrous_rates, decoder_output_stride=config.decoder_output_stride, output_stride=config.output_stride, fine_tune_batch_norm=config.fine_tune_batch_norm, image_pyramid=config.image_pyramid) @@ -88,5 +84,5 @@ if __name__ == "__main__": loss = OhemLoss(config.seg_num_classes, config.ignore_label) opt = Momentum(filter(lambda x: 'beta' not in x.name and 'gamma' not in x.name and 'depth' not in x.name and 'bias' not in x.name, net.trainable_params()), learning_rate=config.learning_rate, momentum=config.momentum, weight_decay=config.weight_decay) model = 
Model(net, loss, opt) - model.train(args_opt.epoch_size, train_dataset, callback) + model.train(config.epoch_size, train_dataset, callback) \ No newline at end of file diff --git a/model_zoo/faster_rcnn/src/dataset.py b/model_zoo/faster_rcnn/src/dataset.py index e384534f77..d64de09391 100644 --- a/model_zoo/faster_rcnn/src/dataset.py +++ b/model_zoo/faster_rcnn/src/dataset.py @@ -23,6 +23,8 @@ from numpy import random import mmcv import mindspore.dataset as de import mindspore.dataset.transforms.vision.c_transforms as C +import mindspore.dataset.transforms.c_transforms as CC +import mindspore.common.dtype as mstype from mindspore.mindrecord import FileWriter from src.config import config @@ -229,6 +231,21 @@ def flip_column(img, img_shape, gt_bboxes, gt_label, gt_num): return (img_data, img_shape, flipped, gt_label, gt_num) +def flipped_generation(img, img_shape, gt_bboxes, gt_label, gt_num): + """flipped generation""" + img_data = img + flipped = gt_bboxes.copy() + _, w, _ = img_data.shape + + flipped[..., 0::4] = w - gt_bboxes[..., 2::4] - 1 + flipped[..., 2::4] = w - gt_bboxes[..., 0::4] - 1 + + return (img_data, img_shape, flipped, gt_label, gt_num) + +def image_bgr_rgb(img, img_shape, gt_bboxes, gt_label, gt_num): + img_data = img[:, :, ::-1] + return (img_data, img_shape, gt_bboxes, gt_label, gt_num) + def transpose_column(img, img_shape, gt_bboxes, gt_label, gt_num): """transpose operation for image""" img_data = img.transpose(2, 0, 1).copy() @@ -264,9 +281,10 @@ def preprocess_fn(image, box, is_training): input_data = rescale_column(*input_data) else: input_data = resize_column_test(*input_data) - input_data = imnormalize_column(*input_data) - output_data = transpose_column(*input_data) + input_data = image_bgr_rgb(*input_data) + + output_data = input_data return output_data def _data_aug(image, box, is_training): @@ -289,24 +307,24 @@ def preprocess_fn(image, box, is_training): if not is_training: return _infer_data(image_bgr, image_shape, gt_box_new, 
gt_label_new, gt_iscrowd_new_revert) - flip = (np.random.rand() < config.flip_ratio) - photo = (np.random.rand() < config.photo_ratio) - expand = (np.random.rand() < config.expand_ratio) input_data = image_bgr, image_shape, gt_box_new, gt_label_new, gt_iscrowd_new_revert + expand = (np.random.rand() < config.expand_ratio) if expand: input_data = expand_column(*input_data) + if config.keep_ratio: input_data = rescale_column(*input_data) else: input_data = resize_column(*input_data) + + photo = (np.random.rand() < config.photo_ratio) if photo: input_data = photo_crop_column(*input_data) - input_data = imnormalize_column(*input_data) - if flip: - input_data = flip_column(*input_data) - output_data = transpose_column(*input_data) + input_data = image_bgr_rgb(*input_data) + + output_data = input_data return output_data return _data_aug(image, box, is_training) @@ -423,19 +441,46 @@ def create_fasterrcnn_dataset(mindrecord_file, batch_size=2, repeat_num=12, devi ds = ds.map(input_columns=["image"], operations=decode) compose_map_func = (lambda image, annotation: preprocess_fn(image, annotation, is_training)) + hwc_to_chw = C.HWC2CHW() + normalize_op = C.Normalize((123.675, 116.28, 103.53), (58.395, 57.12, 57.375)) + horizontally_op = C.RandomHorizontalFlip(1) + type_cast0 = CC.TypeCast(mstype.float32) + type_cast1 = CC.TypeCast(mstype.float16) + type_cast2 = CC.TypeCast(mstype.int32) + type_cast3 = CC.TypeCast(mstype.bool_) + if is_training: ds = ds.map(input_columns=["image", "annotation"], output_columns=["image", "image_shape", "box", "label", "valid_num"], columns_order=["image", "image_shape", "box", "label", "valid_num"], - operations=compose_map_func, python_multiprocessing=True, num_parallel_workers=num_parallel_workers) - ds = ds.batch(batch_size, drop_remainder=True) - ds = ds.repeat(repeat_num) + operations=compose_map_func, num_parallel_workers=4) + + ds = ds.map(input_columns=["image"], operations=[normalize_op, type_cast0], + 
num_parallel_workers=num_parallel_workers) + + flip = (np.random.rand() < config.flip_ratio) + if flip: + ds = ds.map(input_columns=["image"], operations=[horizontally_op], + num_parallel_workers=num_parallel_workers) + ds = ds.map(input_columns=["image", "image_shape", "box", "label", "valid_num"], + operations=flipped_generation, num_parallel_workers=4) else: ds = ds.map(input_columns=["image", "annotation"], output_columns=["image", "image_shape", "box", "label", "valid_num"], columns_order=["image", "image_shape", "box", "label", "valid_num"], operations=compose_map_func, num_parallel_workers=num_parallel_workers) - ds = ds.batch(batch_size, drop_remainder=True) - ds = ds.repeat(repeat_num) + + ds = ds.map(input_columns=["image"], operations=[normalize_op, type_cast0], + num_parallel_workers=num_parallel_workers) + + # transpose_column from python to c + ds = ds.map(input_columns=["image"], operations=[hwc_to_chw, type_cast1]) + ds = ds.map(input_columns=["image_shape"], operations=[type_cast1]) + ds = ds.map(input_columns=["box"], operations=[type_cast1]) + ds = ds.map(input_columns=["label"], operations=[type_cast2]) + ds = ds.map(input_columns=["valid_num"], operations=[type_cast3]) + ds = ds.batch(batch_size, drop_remainder=True) + ds = ds.repeat(repeat_num) + return ds diff --git a/model_zoo/gat/README.md b/model_zoo/gat/README.md new file mode 100644 index 0000000000..7c30e08851 --- /dev/null +++ b/model_zoo/gat/README.md @@ -0,0 +1,166 @@ + + +- [Graph Attention Networks Description](#graph-attention-networks-description) +- [Model architecture](#model-architecture) +- [Dataset](#dataset) + - [Data Preparation](#data-preparation) +- [Features](#features) + - [Mixed Precision](#mixed-precision) +- [Environment Requirements](#environment-requirements) +- [Structure](#structure) + - [Parameter configuration](#parameter-configuration) +- [Running the example](#running-the-example) + - [Usage](#usage) + - [Result](#result) +- [Description of random 
situation](#description-of-random-situation) +- [Others](#others) + +# Graph Attention Networks Description + +Graph Attention Networks (GAT) was proposed in 2017 by Petar Veličković et al. By leveraging masked self-attentional layers to address shortcomings of prior graph-based methods, GAT achieved or matched state-of-the-art performance on both transductive datasets like Cora and inductive datasets like PPI. This is an example of training GAT with Cora dataset in MindSpore. + +[Paper](https://arxiv.org/abs/1710.10903): Veličković, P., Cucurull, G., Casanova, A., Romero, A., Lio, P., & Bengio, Y. (2017). Graph attention networks. arXiv preprint arXiv:1710.10903. + +# Model architecture + +An illustration of multi-head attention (with K = 3 heads) by node 1 on its neighborhood can be found below: + +![](https://camo.githubusercontent.com/4fe1a90e67d17a2330d7cfcddc930d5f7501750c/68747470733a2f2f7777772e64726f70626f782e636f6d2f732f71327a703170366b37396a6a6431352f6761745f6c617965722e706e673f7261773d31) + +Note that according to whether this attention layer is the output layer of the network or not, the node update function can be concatenate or average. + +# Dataset +Statistics of dataset used are summarized as below: + +| | Cora | Citeseer | +| ------------------ | -------------: | -------------: | +| Task | Transductive | Transductive | +| # Nodes | 2708 (1 graph) | 3327 (1 graph) | +| # Edges | 5429 | 4732 | +| # Features/Node | 1433 | 3703 | +| # Classes | 7 | 6 | +| # Training Nodes | 140 | 120 | +| # Validation Nodes | 500 | 500 | +| # Test Nodes | 1000 | 1000 | + +## Data Preparation +Download the dataset Cora or Citeseer provided by /kimiyoung/planetoid from GitHub. + +> Place the dataset to any path you want, the folder should include files as follows (we use Cora dataset as an example): + +``` +.
+└─data + ├─ind.cora.allx + ├─ind.cora.ally + ├─ind.cora.graph + ├─ind.cora.test.index + ├─ind.cora.tx + ├─ind.cora.ty + ├─ind.cora.x + └─ind.cora.y +``` + +> Generate dataset in mindrecord format for cora or citeseer. +>> Usage +```buildoutcfg +cd ./scripts +# SRC_PATH is the dataset file path you downloaded, DATASET_NAME is cora or citeseer +sh run_process_data.sh [SRC_PATH] [DATASET_NAME] +``` + +>> Launch +``` +#Generate dataset in mindrecord format for cora +sh run_process_data.sh ./data cora +#Generate dataset in mindrecord format for citeseer +sh run_process_data.sh ./data citeseer +``` + +# Features + +## Mixed Precision + +To utilize the strong computation power of Ascend chip, and accelerate the training process, the mixed training method is used. MindSpore is able to cope with FP32 inputs and FP16 operators. In the GAT example, the model is set to FP16 mode except for the loss calculation part. + +# Environment Requirements + +- Hardware (Ascend) +- Install [MindSpore](https://www.mindspore.cn/install/en). + +# Structure + +```shell +. +└─gat + ├─README.md + ├─scripts + | ├─run_process_data.sh # Generate dataset in mindrecord format + | └─run_train.sh # Launch training + | + ├─src + | ├─config.py # Training configurations + | ├─dataset.py # Data preprocessing + | ├─gat.py # GAT model + | └─utils.py # Utils for training gat + | + └─train.py # Train net +``` + +## Parameter configuration + +Parameters for training can be set in config.py. + +``` +"learning_rate": 0.005, # Learning rate +"num_epochs": 200, # Epoch sizes for training +"hid_units": [8], # Hidden units for attention head at each layer +"n_heads": [8, 1], # Num heads for each layer +"early_stopping": 100, # Early stop patience +"l2_coeff": 0.0005 # l2 coefficient +"attn_dropout": 0.6 # Attention dropout ratio +"feature_dropout":0.6 # Feature dropout ratio +``` + +# Running the example +## Usage +After Dataset is correctly generated.
+```
+# run train with cora dataset, DATASET_NAME is cora
+sh run_train.sh [DATASET_NAME]
+```
+
+## Result
+
+Training result will be stored in the scripts path, whose folder name begins with "train". You can find the result like the following in the log.
+
+
+```
+Epoch:0, train loss=1.98498 train acc=0.17143 | val loss=1.97946 val acc=0.27200
+Epoch:1, train loss=1.98345 train acc=0.15000 | val loss=1.97233 val acc=0.32600
+Epoch:2, train loss=1.96968 train acc=0.21429 | val loss=1.96747 val acc=0.37400
+Epoch:3, train loss=1.97061 train acc=0.20714 | val loss=1.96410 val acc=0.47600
+Epoch:4, train loss=1.96864 train acc=0.13571 | val loss=1.96066 val acc=0.59600
+...
+Epoch:195, train loss=1.45111 train_acc=0.56429 | val_loss=1.44325 val_acc=0.81200
+Epoch:196, train loss=1.52476 train_acc=0.52143 | val_loss=1.43871 val_acc=0.81200
+Epoch:197, train loss=1.35807 train_acc=0.62857 | val_loss=1.43364 val_acc=0.81400
+Epoch:198, train loss=1.47566 train_acc=0.51429 | val_loss=1.42948 val_acc=0.81000
+Epoch:199, train loss=1.56411 train_acc=0.55000 | val_loss=1.42632 val_acc=0.80600
+Test loss=1.5366285, test acc=0.84199995
+...
+```
+
+Results on the Cora dataset are shown in the table below:
+
+|                                      | MindSpore + Ascend910 | Tensorflow + V100 |
+| ------------------------------------ | --------------------: | ----------------: |
+| Accuracy                             | 0.830933271           | 0.828649968       |
+| Training Cost(200 epochs)            | 27.62298311s          | 36.711862s        |
+| End to End Training Cost(200 epochs) | 39.074s               | 50.894s           |
+
+# Description of random situation
+GAT model contains lots of dropout operations; if you want to disable dropout, set the attn_dropout and feature_dropout to 0 in src/config.py. Note that this operation will cause the accuracy drop to approximately 80%.
+
+# Others
+GAT model is verified on Ascend environment, not on CPU or GPU.
\ No newline at end of file
diff --git a/model_zoo/gat/scripts/run_process_data.sh b/model_zoo/gat/scripts/run_process_data.sh
new file mode 100755
index 0000000000..4501f3c67f
--- /dev/null
+++ b/model_zoo/gat/scripts/run_process_data.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+if [ $# != 2 ]
+then
+    echo "Usage: sh run_process_data.sh [SRC_PATH] [DATASET_NAME]"
+exit 1
+fi
+
+get_real_path(){
+    if [ "${1:0:1}" == "/" ]; then
+        echo "$1"
+    else
+        echo "$(realpath -m $PWD/$1)"
+    fi
+}
+SRC_PATH=$(get_real_path $1)
+echo $SRC_PATH
+
+DATASET_NAME=$2
+echo $DATASET_NAME
+
+if [ !
-d data_mr ]; then + mkdir data_mr +else + echo data_mr exist +fi +MINDRECORD_PATH=`pwd`/data_mr + +rm -f $MINDRECORD_PATH/* + +cd ../../../example/graph_to_mindrecord || exit + +python writer.py --mindrecord_script $DATASET_NAME \ +--mindrecord_file "$MINDRECORD_PATH/$DATASET_NAME" \ +--mindrecord_partitions 1 \ +--mindrecord_header_size_by_bit 18 \ +--mindrecord_page_size_by_bit 20 \ +--graph_api_args "$SRC_PATH" + +cd - || exit diff --git a/model_zoo/gat/scripts/run_train.sh b/model_zoo/gat/scripts/run_train.sh new file mode 100644 index 0000000000..3e9213712d --- /dev/null +++ b/model_zoo/gat/scripts/run_train.sh @@ -0,0 +1,54 @@ +#!/bin/bash +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +if [ $# != 1 ] +then + echo "Usage: sh run_train.sh [DATASET_NAME]" +exit 1 +fi + +DATASET_NAME=$1 +echo $DATASET_NAME + +ulimit -u unlimited +export DEVICE_NUM=1 +export RANK_SIZE=$DEVICE_NUM +export DEVICE_ID=0 +export RANK_ID=0 + +if [ -d "train" ]; +then + rm -rf ./train +fi +mkdir ./train +cp ../*.py ./train +cp *.sh ./train +cp -r ../src ./train +cd ./train || exit +env > env.log +echo "start training for device $DEVICE_ID" + + +if [ $DATASET_NAME == cora ] +then + python train.py --data_dir=../data_mr/$DATASET_NAME &> log & +fi + +if [ $DATASET_NAME == citeseer ] +then + python train.py --data_dir=../data_mr/$DATASET_NAME --train_nodes_num=120 &> log & +fi +cd .. diff --git a/model_zoo/gat/src/__init__.py b/model_zoo/gat/src/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/model_zoo/gat/src/config.py b/model_zoo/gat/src/config.py new file mode 100644 index 0000000000..8e22ab5a78 --- /dev/null +++ b/model_zoo/gat/src/config.py @@ -0,0 +1,26 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""Train configs for training gat""" + + +class GatConfig(): + lr = 0.005 + num_epochs = 200 + hid_units = [8] + n_heads = [8, 1] + early_stopping = 100 + l2_coeff = 0.0005 + attn_dropout = 0.6 + feature_dropout = 0.6 diff --git a/model_zoo/gat/src/dataset.py b/model_zoo/gat/src/dataset.py new file mode 100644 index 0000000000..0d0b544514 --- /dev/null +++ b/model_zoo/gat/src/dataset.py @@ -0,0 +1,87 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""Preprocess data obtained for training""" +import numpy as np +import mindspore.dataset as ds + + +def adj_to_bias(adj): + """Add self loop to adj and make sure only one hop neighbors are engaged in computing""" + num_graphs = adj.shape[0] + adj_temp = np.empty(adj.shape) + for i in range(num_graphs): + adj_temp[i] = adj[i] + np.eye(adj.shape[1]) + return -1e9 * (1.0 - adj_temp) + + +def get_biases_features_labels(data_dir): + """Get biases, features, labels from Dataset""" + g = ds.GraphData(data_dir) + nodes = g.get_all_nodes(0) + nodes_list = nodes.tolist() + row_tensor = g.get_node_feature(nodes_list, [1, 2]) + features = row_tensor[0] + features = features[np.newaxis] + + labels = row_tensor[1] + + nodes_num = labels.shape[0] + class_num = labels.max() + 1 + labels_onehot = np.eye(nodes_num, class_num)[labels].astype(np.float32) + + neighbor = g.get_all_neighbors(nodes_list, 0) + node_map = {node_id: index for index, node_id in enumerate(nodes_list)} + adj = np.zeros([nodes_num, nodes_num], dtype=np.float32) + for index, value in np.ndenumerate(neighbor): + if value >= 0 and index[1] > 0: + adj[node_map[neighbor[index[0], 0]], node_map[value]] = 1 + adj = adj[np.newaxis] + biases = adj_to_bias(adj) + + return biases, features, labels_onehot + + +def get_mask(total, begin, end): + """Generate mask according to begin and end position""" + mask = np.zeros([total]).astype(np.float32) + mask[begin:end] = 1 + return np.array(mask, dtype=np.bool) + + +def load_and_process(data_dir, train_node_num, eval_node_num, test_node_num): + """Load cora dataset and preprocessing""" + biases, feature, label = get_biases_features_labels(data_dir) + # split training, validation and testing set + nodes_num = label.shape[0] + train_mask = get_mask(nodes_num, 0, train_node_num) + eval_mask = get_mask(nodes_num, train_node_num, train_node_num + eval_node_num) + test_mask = get_mask(nodes_num, nodes_num 
- test_node_num, nodes_num) + + y_train = np.zeros(label.shape) + y_val = np.zeros(label.shape) + y_test = np.zeros(label.shape) + + y_train[train_mask, :] = label[train_mask, :] + y_val[eval_mask, :] = label[eval_mask, :] + y_test[test_mask, :] = label[test_mask, :] + + y_train = y_train[np.newaxis] + y_val = y_val[np.newaxis] + y_test = y_test[np.newaxis] + train_mask = train_mask[np.newaxis] + eval_mask = eval_mask[np.newaxis] + test_mask = test_mask[np.newaxis] + + return feature, biases, y_train, train_mask, y_val, eval_mask, y_test, test_mask diff --git a/model_zoo/gat/src/gat.py b/model_zoo/gat/src/gat.py new file mode 100644 index 0000000000..3cb3cc1106 --- /dev/null +++ b/model_zoo/gat/src/gat.py @@ -0,0 +1,496 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Aggregator.""" +import mindspore.nn as nn +from mindspore.ops import operations as P +from mindspore.ops import functional as F +from mindspore._extends import cell_attr_register +from mindspore import Tensor, Parameter +from mindspore.common.initializer import initializer +from mindspore._checkparam import check_int_positive, check_bool +from mindspore.nn.layer.activation import get_activation + + +class GNNFeatureTransform(nn.Cell): + r""" + The GNN featuren transform layer for input. + + Applies linear transformation for the input feature. 
This layer implements the operation as: + + .. math:: + \text{outputs} = \text{inputs} * \text{kernel} + \text{bias}, + + where :math:`\text{activation}` is the activation function passed as the activation + argument (if passed in),:math:`\text{activation}` is a weight matrix with the same + data type as the inputs created by the layer, and :math:`\text{bias}` is a bias vector + with the same data type as the inputs created by the layer (only if has_bias is True). + + Args: + in_channels (int): The number of channels in the input space. + out_channels (int): The number of channels in the output space. + weight_init (Union[Tensor, str, Initializer, numbers.Number]): The trainable weight_init parameter. The dtype + is same as input x. The values of str refer to the function `initializer`. Default: 'normal'. + bias_init (Union[Tensor, str, Initializer, numbers.Number]): The trainable bias_init parameter. The dtype is + same as input x. The values of str refer to the function `initializer`. Default: 'zeros'. + has_bias (bool): Specifies whether the layer uses a bias vector. Default: True. + + Raises: + ValueError: If weight_init or bias_init shape is incorrect. + + Inputs: + - **input_x** (Tensor) - The first tensor to be multiplied. The shape of the tensor is :math:`(*B, N, C)`, + where :math:`*B` represents the batch size which can be multidimensional, :math:`N` and :math:`C` are the + size of the last two dimensions. If `transpose_a` is True, its shape should be :math:`(*B, C, N)`. + + Outputs: + Tensor, the shape of the output tensor is :math:`(*B, N, M)`. 
+ + Examples: + >>> net = nn.Dense(3, 4) + >>> input = Tensor(np.random.randint(0, 255, [2, 3]), mindspore.float32) + >>> net(input) + [[ 2.5246444 2.2738023 0.5711005 -3.9399147 ] + [ 1.0739875 4.0155234 0.94188046 -5.459526 ]] + """ + @cell_attr_register + def __init__(self, + in_channels, + out_channels, + weight_init='normal', + bias_init='zeros', + has_bias=True): + super(GNNFeatureTransform, self).__init__() + self.in_channels = check_int_positive(in_channels) + self.out_channels = check_int_positive(out_channels) + self.has_bias = check_bool(has_bias) + + if isinstance(weight_init, Tensor): + if weight_init.dim() != 2 or weight_init.shape()[0] != out_channels or \ + weight_init.shape()[1] != in_channels: + raise ValueError("weight_init shape error") + + self.weight = Parameter(initializer(weight_init, [out_channels, in_channels]), name="weight") + + if self.has_bias: + if isinstance(bias_init, Tensor): + if bias_init.dim() != 1 or bias_init.shape()[0] != out_channels: + raise ValueError("bias_init shape error") + + self.bias = Parameter(initializer(bias_init, [out_channels]), name="bias") + + self.matmul = P.MatMul(transpose_b=True) + self.bias_add = P.BiasAdd() + + def construct(self, x): + tensor_shape = F.shape(x) + input_feature = F.reshape(x, (tensor_shape[0] * tensor_shape[1], tensor_shape[2])) + output = self.matmul(input_feature, self.weight) + if self.has_bias: + output = self.bias_add(output, self.bias) + output = F.reshape(output, (tensor_shape[0], tensor_shape[1], self.out_channels)) + return output + + def extend_repr(self): + str_info = 'in_channels={}, out_channels={}, weight={}, has_bias={}' \ + .format(self.in_channels, self.out_channels, self.weight, self.has_bias) + if self.has_bias: + str_info = str_info + ', bias={}'.format(self.bias) + + return str_info + + +class _BaseAggregator(nn.Cell): + """ + Base Aggregator of GNN + + Args: + feature_in_dim (int): Node or edge input feature dim. 
+ feature_out_dim (int): Node or edge outpout feature dim. + use_fc (bool): Specifies whether a linear transformation before message is aggregated. Default: True + weight_init (Union[Tensor, str, Initializer, numbers.Number]): The trainable weight_init parameter. The dtype + is same as input x. The values of str refer to the function `initializer`. Default: 'normal'. + bias_init (Union[Tensor, str, Initializer, numbers.Number]): The trainable bias_init parameter. The dtype is + same as input x. The values of str refer to the function `initializer`. Default: 'zeros'. + has_bias (bool): Specifies whether the layer uses a bias vector. Default: True. + dropout_ratio (float): The keep rate of dropout layer, greater than 0 and less equal than 1. Default: None. + activation (str): Regularizer function applied to the output of the layer, eg. 'relu'. Default: None. + + Examples: + >>> class MyAggregator(_BaseAggregator): + >>> def __init__(self): + >>> super(MyAggregator, self).__init__(self, feature_in_dim, feature_out_dim) + >>> self.reduce_mean = P.ReduceSum() + >>> + >>> def construct(self, x): + >>> return self.reduce_mean(x, 1) + """ + def __init__(self, + feature_in_dim, + feature_out_dim, + use_fc=True, + weight_init="normal", + bias_init="zeros", + has_bias=True, + dropout_ratio=None, + activation=None): + super(_BaseAggregator, self).__init__() + self.in_dim = feature_in_dim + self.out_dim = feature_out_dim + self.use_fc = use_fc + if self.use_fc: + self.weight_init = weight_init + self.bias_init = bias_init + self.has_bias = has_bias + self.fc = GNNFeatureTransform(self.in_dim, + self.out_dim, + weight_init=self.weight_init, + bias_init=self.bias_init, + has_bias=self.has_bias) + self.dropout_ratio = dropout_ratio + if self.dropout_ratio is not None: + self.dropout = nn.Dropout(keep_prob=self.dropout_ratio) + self.dropout_flag = self.dropout_ratio is not None + self.activation = get_activation(activation) + self.activation_flag = self.activation is not None + + 
def construct(self, **kward): + """Must be overridden by all subclasses.""" + raise NotImplementedError + + +class MeanAggregator(_BaseAggregator): + """ + Mean Aggregator of GNN + + Args: + feature_in_dim (int): Node or edge input feature dim. + feature_out_dim (int): Node or edge outpout feature dim. + use_fc (bool): Specifies whether a linear transformation before message is aggregated. Default: True + weight_init (Union[Tensor, str, Initializer, numbers.Number]): The trainable weight_init parameter. The dtype + is same as input x. The values of str refer to the function `initializer`. Default: 'normal'. + bias_init (Union[Tensor, str, Initializer, numbers.Number]): The trainable bias_init parameter. The dtype is + same as input x. The values of str refer to the function `initializer`. Default: 'zeros'. + has_bias (bool): Specifies whether the layer uses a bias vector. Default: True. + dropout_ratio (float): The keep rate of dropout layer, greater than 0 and less equal than 1. Default: None. + activation (str): Regularizer function applied to the output of the layer, eg. 'relu'. Default: None. 
+ + Examples: + >>> net = MeanAggregator(32, 64, activation="relu", dropout=0.5) + >>> input_data = Tensor(np.array(np.random.rand(32, 3, 32), dtypy=np.float32)) + >>> output = net(input_data) + """ + def __init__(self, + feature_in_dim, + feature_out_dim, + use_fc=True, + weight_init="normal", + bias_init="zeros", + has_bias=True, + dropout_ratio=None, + activation=None): + super(MeanAggregator, self).__init__( + feature_in_dim, + feature_out_dim, + use_fc, + weight_init, + bias_init, + has_bias, + dropout_ratio, + activation) + self.reduce_mean = P.ReduceMean(keep_dims=False) + + def construct(self, input_feature): + if self.use_fc: + input_feature = self.fc(input_feature) + if self.dropout_flag: + input_feature = self.dropout(input_feature) + if self.activation_flag: + input_feature = self.activation(input_feature) + output_feature = self.reduce_mean(input_feature, 1) + return output_feature + + +class AttentionHead(nn.Cell): + """ + Attention Head for Graph Attention Networks. + + Args: + in_channel (int): The number of input channel, input feature dim. + out_channel (int): The number of output channel, output feature dim. + in_drop_ratio (float): Input feature dropout ratio, default 0.0. + coef_drop_ratio (float): Coefficient dropout ratio, default 0.0. + residual (bool): Whether to use residual connection, default False. + coef_activation (Cell): The attention coefficient activation function, + default nn.LeakyReLU(). + activation (Cell): The output activation function, default nn.ELU(). + + Inputs: + - **input_feature** (Tensor) - Tensor of shape : (batch_size, num_nodes, feature_dim). + - **bias_mat** (Tensor) - Tensor of shape : (batch_size, num_nodes, num_nodes). 
+ + Examples: + >>> head = AttentionHead(1433, + 8, + in_drop_ratio=0.6, + coef_drop_ratio=0.6, + residual=False) + >>> input_data = Tensor(np.array(np.random.rand(1, 2708, 1433), dtypy=np.float32)) + >>> output = net(input_data) + """ + + def __init__(self, + in_channel, + out_channel, + in_drop_ratio=0.0, + coef_drop_ratio=0.0, + residual=False, + coef_activation=nn.LeakyReLU(), + activation=nn.ELU()): + super(AttentionHead, self).__init__() + self.in_channel = check_int_positive(in_channel) + self.out_channel = check_int_positive(out_channel) + self.in_drop_ratio = in_drop_ratio + self.in_drop = nn.Dropout(keep_prob=1 - in_drop_ratio) + self.in_drop_2 = nn.Dropout(keep_prob=1 - in_drop_ratio) + self.feature_transform = GNNFeatureTransform( + in_channels=self.in_channel, + out_channels=self.out_channel, + has_bias=False, + weight_init='XavierUniform') + + self.f_1_transform = GNNFeatureTransform( + in_channels=self.out_channel, + out_channels=1, + weight_init='XavierUniform') + self.f_2_transform = GNNFeatureTransform( + in_channels=self.out_channel, + out_channels=1, + weight_init='XavierUniform') + self.softmax = nn.Softmax() + + self.coef_drop = nn.Dropout(keep_prob=1 - coef_drop_ratio) + self.matmul = P.MatMul() + self.bias_add = P.BiasAdd() + self.bias = Parameter(initializer('zeros', self.out_channel), name='bias') + self.residual = check_bool(residual) + if self.residual: + if in_channel != out_channel: + self.residual_transform_flag = True + self.residual_transform = GNNFeatureTransform( + in_channels=self.in_channel, + out_channels=self.out_channel) + else: + self.residual_transform = None + self.coef_activation = coef_activation + self.activation = activation + + def construct(self, input_feature, bias_mat, training=True): + if training is True: + input_feature = self.in_drop(input_feature) + + feature = self.feature_transform(input_feature) + # self attention + f_1 = self.f_1_transform(feature) + f_2 = self.f_2_transform(feature) + logits = f_1 + 
P.Transpose()(f_2, (0, 2, 1)) + logits = self.coef_activation(logits) + bias_mat + coefs = self.softmax(logits) + if training is True: + coefs = self.coef_drop(coefs) + feature = self.in_drop_2(feature) + + coefs = P.Squeeze(0)(coefs) + feature = P.Squeeze(0)(feature) + + ret = self.matmul(coefs, feature) + ret = self.bias_add(ret, self.bias) + ret = P.ExpandDims()(ret, 0) + # residual connection + if self.residual: + if self.residual_transform_flag: + res = self.residual_transform(input_feature) + ret = ret + res + else: + ret = ret + input_feature + # activation + if self.activation is not None: + ret = self.activation(ret) + return ret + + +class AttentionAggregator(nn.Cell): + """ + Attention Head for Graph Attention Networks,can be regarded as one + GAT layer. + + Args: + in_channel (int): Input channel. + out_channel (int): Output channel. + num_heads (int): Number of attention heads for this layer, default 1. + in_drop_ratio (float): Input feature dropout ratio, default 0.0. + coef_drop_ratio (float): Coefficient dropout ratio, default 0.0. + activation (Cell): The output activation function, default nn.ELU(). + residual (bool): Whether to use residual connection, default False. + output_transform (str['concat', 'sum']): output transform for a layer, + default 'concat' + + Inputs: + - **input_feature** (Tensor) - Tensor of shape : (batch_size, num_nodes, feature_dim). + - **bias_mat** (Tensor) - Tensor of shape : (batch_size, num_nodes, num_nodes). 
+ + Examples: + >>> input_data = Tensor(np.array(np.random.rand(1, 2708, 1433), dtype=np.float32)) + >>> biases = Tensor(np.array(np.random.rand(1, 2708, 2708), dtype=np.float32)) + >>> net = AttentionAggregator(1433, + 8, + 8) + >>> net(input_data, biases) + """ + def __init__(self, + in_channels, + out_channels, + num_heads=1, + in_drop=0.0, + coef_drop=0.0, + activation=nn.ELU(), + residual=False, + output_transform='concat'): + super(AttentionAggregator, self).__init__() + self.num_heads = num_heads + self.attns = [] + for _ in range(num_heads): + self.attns.append(AttentionHead(in_channels, + out_channels, + in_drop_ratio=in_drop, + coef_drop_ratio=coef_drop, + activation=activation, + residual=residual)) + self.attns = nn.layer.CellList(self.attns) + if output_transform == 'concat': + self.out_trans = P.Concat(-1) + elif output_transform == 'sum': + self.out_trans = P.AddN() + else: + raise ValueError("output_transform must be either 'concat' or 'sum'") + + def construct(self, input_data, bias_mat, training=True): + res = () + for i in range(self.num_heads): + res += (self.attns[i](input_data, bias_mat, training),) + return self.out_trans(res) + + +class GAT(nn.Cell): + """ + Graph Attention Network + + Args: + ftr_dims (int): Initial feature dimensions. + num_class (int): Num of class to identify. + num_nodes (int): Num of nodes in this graph. + hidden_units (list[int]): Num of hidden units at each layer. + num_heads (list[int]): Num of heads at each layer. + attn_drop (float): Drop out ratio of attention coefficient, + default 0.0. + ftr_drop (float): Drop out ratio of feature, default 0.0. + activation (Cell): Activation Function for output layer, default + nn.Elu(). + residual (bool): Whether to use residual connection between + intermediate layers, default False. 
+ + Examples: + >>> ft_sizes = 1433 + >>> num_class = 7 + >>> num_nodes = 2708 + >>> hid_units = [8] + >>> n_heads = [8, 1] + >>> activation = nn.ELU() + >>> residual = False + >>> input_data = np.array(np.random.rand(1, 2708, 1433)) + >>> biases = np.array(np.random.rand(1, 2708, 2708)) + >>> net = GAT(ft_sizes, + num_class, + num_nodes, + hidden_units=hid_units, + num_heads=n_heads, + attn_drop=0.6, + ftr_drop=0.6, + activation=activation, + residual=residual) + >>> output = net(input_data, biases) + """ + + def __init__(self, + features, + biases, + ftr_dims, + num_class, + num_nodes, + hidden_units, + num_heads, + attn_drop=0.0, + ftr_drop=0.0, + activation=nn.ELU(), + residual=False): + super(GAT, self).__init__() + self.features = Tensor(features) + self.biases = Tensor(biases) + self.ftr_dims = check_int_positive(ftr_dims) + self.num_class = check_int_positive(num_class) + self.num_nodes = check_int_positive(num_nodes) + self.hidden_units = hidden_units + self.num_heads = num_heads + self.attn_drop = attn_drop + self.ftr_drop = ftr_drop + self.activation = activation + self.residual = check_bool(residual) + self.layers = [] + # first layer + self.layers.append(AttentionAggregator( + self.ftr_dims, + self.hidden_units[0], + self.num_heads[0], + self.ftr_drop, + self.attn_drop, + self.activation, + residual=False)) + # intermediate layer + for i in range(1, len(self.hidden_units)): + self.layers.append(AttentionAggregator( + self.hidden_units[i-1]*self.num_heads[i-1], + self.hidden_units[i], + self.num_heads[i], + self.ftr_drop, + self.attn_drop, + self.activation, + residual=self.residual)) + # output layer + self.layers.append(AttentionAggregator( + self.hidden_units[-1]*self.num_heads[-2], + self.num_class, + self.num_heads[-1], + self.ftr_drop, + self.attn_drop, + activation=None, + residual=False, + output_transform='sum')) + self.layers = nn.layer.CellList(self.layers) + + def construct(self, training=True): + input_data = self.features + bias_mat = 
self.biases + for cell in self.layers: + input_data = cell(input_data, bias_mat, training) + return input_data/self.num_heads[-1] diff --git a/model_zoo/gat/src/utils.py b/model_zoo/gat/src/utils.py new file mode 100644 index 0000000000..03305ca3d3 --- /dev/null +++ b/model_zoo/gat/src/utils.py @@ -0,0 +1,178 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Utils for training gat""" +from mindspore import nn +from mindspore.common.parameter import ParameterTuple +from mindspore import Tensor +from mindspore.common import dtype as mstype +from mindspore.ops import composite as C +from mindspore.ops import functional as F +from mindspore.ops import operations as P + + +class MaskedSoftMaxLoss(nn.Cell): + """Calculate masked softmax loss with l2 loss""" + def __init__(self, num_class, label, mask, l2_coeff, params): + super(MaskedSoftMaxLoss, self).__init__() + self.num_class = num_class + self.label = label + self.mask = mask + self.softmax = P.SoftmaxCrossEntropyWithLogits() + self.reduce_mean = P.ReduceMean() + self.cast = P.Cast() + self.l2_coeff = l2_coeff + self.params = ParameterTuple(list(param for param in params if param.name[-4:] != 'bias')) + self.reduce_sum = P.ReduceSum() + self.num_params = len(self.params) + + def construct(self, logits): + # calc l2 loss + l2_loss = 0 + for i in range(self.num_params): + l2_loss = l2_loss + 
self.l2_coeff * P.L2Loss()(self.params[i]) + + logits = P.Reshape()(logits, (-1, self.num_class)) + label = P.Reshape()(self.label, (-1, self.num_class)) + mask = P.Reshape()(self.mask, (-1,)) + + logits = self.cast(logits, mstype.float32) + loss = self.softmax(logits, label)[0] + mask /= self.reduce_mean(mask) + loss *= mask + loss = self.reduce_mean(loss) + l2_loss = P.Cast()(l2_loss, mstype.float32) + return loss+l2_loss + + +class MaskedAccuracy(nn.Cell): + """Calculate accuracy with mask""" + def __init__(self, num_class, label, mask): + super(MaskedAccuracy, self).__init__() + self.argmax = P.Argmax(axis=1) + self.cast = P.Cast() + self.reduce_mean = P.ReduceMean() + self.equal = P.Equal() + self.num_class = num_class + self.label = Tensor(label, dtype=mstype.float32) + self.mask = Tensor(mask, dtype=mstype.float32) + + def construct(self, logits): + logits = P.Reshape()(logits, (-1, self.num_class)) + labels = P.Reshape()(self.label, (-1, self.num_class)) + mask = P.Reshape()(self.mask, (-1,)) + + labels = self.cast(labels, mstype.float32) + + correct_prediction = self.equal(self.argmax(logits), self.argmax(labels)) + accuracy_all = self.cast(correct_prediction, mstype.float32) + mask = self.cast(mask, mstype.float32) + mask /= self.reduce_mean(mask) + accuracy_all *= mask + return self.reduce_mean(accuracy_all) + + +class LossAccuracyWrapper(nn.Cell): + """ + Warp GAT model with loss calculation and accuracy calculation, loss is calculated with l2 loss. + + Args: + network (Cell): GAT network with logits calculation as output. + num_class (int): num of class for classification. + label (numpy.ndarray): Train Dataset label. + mask (numpy.ndarray): Train Dataset mask. + l2_coeff (float): l2 loss discount rate. 
+ """ + def __init__(self, network, num_class, label, mask, l2_coeff): + super(LossAccuracyWrapper, self).__init__() + self.network = network + label = Tensor(label, dtype=mstype.float32) + mask = Tensor(mask, dtype=mstype.float32) + self.loss_func = MaskedSoftMaxLoss(num_class, label, mask, l2_coeff, self.network.trainable_params()) + self.acc_func = MaskedAccuracy(num_class, label, mask) + + def construct(self): + logits = self.network(training=False) + loss = self.loss_func(logits) + accuracy = self.acc_func(logits) + return loss, accuracy + + +class LossNetWrapper(nn.Cell): + """Wrap GAT model with loss calculation""" + def __init__(self, network, num_class, label, mask, l2_coeff): + super(LossNetWrapper, self).__init__() + self.network = network + label = Tensor(label, dtype=mstype.float32) + mask = Tensor(mask, dtype=mstype.float32) + params = list(param for param in self.network.trainable_params() if param.name[-4:] != 'bias') + self.loss_func = MaskedSoftMaxLoss(num_class, label, mask, l2_coeff, params) + + def construct(self): + logits = self.network() + loss = self.loss_func(logits) + return loss + + +class TrainOneStepCell(nn.Cell): + """ + For network training. Warp the loss net with optimizer. + + Args: + network (Cell): GAT network with loss calculation as the output. + optimizer (Cell): Optimizer for minimize the loss. + sens (Float): Backpropagation input number, default 1.0. 
+ """ + def __init__(self, network, optimizer, sens=1.0): + super(TrainOneStepCell, self).__init__(auto_prefix=True) + self.network = network + self.network.add_flags(defer_inline=True) + self.weights = ParameterTuple(network.trainable_params()) + self.optimizer = optimizer + self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True) + self.sens = sens + + def construct(self): + weights = self.weights + loss = self.network() + sens = P.Fill()(P.DType()(loss), P.Shape()(loss), self.sens) + grads = self.grad(self.network, weights)(sens) + return F.depend(loss, self.optimizer(grads)) + + +class TrainGAT(nn.Cell): + """ + Warp GAT model with everything needed for training, include loss, optimizer ,etc. + + Args: + network (Cell): GAT network. + num_class (int): num of class for classification. + label (numpy.ndarray): Train Dataset label. + mask (numpy.ndarray): Train Dataset mask. + learning_rate (float): Learning rate. + l2_coeff (float): l2 loss discount rate. + """ + def __init__(self, network, num_class, label, mask, learning_rate, l2_coeff): + super(TrainGAT, self).__init__(auto_prefix=False) + self.network = network + loss_net = LossNetWrapper(network, num_class, label, mask, l2_coeff) + optimizer = nn.Adam(loss_net.trainable_params(), + learning_rate=learning_rate) + self.loss_train_net = TrainOneStepCell(loss_net, optimizer) + self.accuracy_func = MaskedAccuracy(num_class, label, mask) + + def construct(self): + loss = self.loss_train_net() + accuracy = self.accuracy_func(self.network()) + return loss, accuracy diff --git a/model_zoo/gat/train.py b/model_zoo/gat/train.py new file mode 100644 index 0000000000..af1808b995 --- /dev/null +++ b/model_zoo/gat/train.py @@ -0,0 +1,131 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Test train gat""" +import argparse +import os + +import numpy as np +import mindspore.context as context +from mindspore.train.serialization import _exec_save_checkpoint, load_checkpoint + +from src.config import GatConfig +from src.dataset import load_and_process +from src.gat import GAT +from src.utils import LossAccuracyWrapper, TrainGAT + + +def train(): + """Train GAT model.""" + parser = argparse.ArgumentParser() + parser.add_argument('--data_dir', type=str, default='./data/cora/cora_mr', help='Data dir') + parser.add_argument('--train_nodes_num', type=int, default=140, help='Nodes numbers for training') + parser.add_argument('--eval_nodes_num', type=int, default=500, help='Nodes numbers for evaluation') + parser.add_argument('--test_nodes_num', type=int, default=1000, help='Nodes numbers for test') + args = parser.parse_args() + if not os.path.exists("ckpts"): + os.mkdir("ckpts") + context.set_context(mode=context.GRAPH_MODE, + device_target="Ascend", + save_graphs=False) + # train parameters + hid_units = GatConfig.hid_units + n_heads = GatConfig.n_heads + early_stopping = GatConfig.early_stopping + lr = GatConfig.lr + l2_coeff = GatConfig.l2_coeff + num_epochs = GatConfig.num_epochs + feature, biases, y_train, train_mask, y_val, eval_mask, y_test, test_mask = load_and_process(args.data_dir, + args.train_nodes_num, + args.eval_nodes_num, + args.test_nodes_num) + feature_size = feature.shape[2] + num_nodes = feature.shape[1] + num_class = y_train.shape[2] + + gat_net 
= GAT(feature, + biases, + feature_size, + num_class, + num_nodes, + hid_units, + n_heads, + attn_drop=GatConfig.attn_dropout, + ftr_drop=GatConfig.feature_dropout) + gat_net.add_flags_recursive(fp16=True) + + eval_net = LossAccuracyWrapper(gat_net, + num_class, + y_val, + eval_mask, + l2_coeff) + + train_net = TrainGAT(gat_net, + num_class, + y_train, + train_mask, + lr, + l2_coeff) + + train_net.set_train(True) + val_acc_max = 0.0 + val_loss_min = np.inf + for _epoch in range(num_epochs): + train_result = train_net() + train_loss = train_result[0].asnumpy() + train_acc = train_result[1].asnumpy() + + eval_result = eval_net() + eval_loss = eval_result[0].asnumpy() + eval_acc = eval_result[1].asnumpy() + + print("Epoch:{}, train loss={:.5f}, train acc={:.5f} | val loss={:.5f}, val acc={:.5f}".format( + _epoch, train_loss, train_acc, eval_loss, eval_acc)) + if eval_acc >= val_acc_max or eval_loss < val_loss_min: + if eval_acc >= val_acc_max and eval_loss < val_loss_min: + val_acc_model = eval_acc + val_loss_model = eval_loss + _exec_save_checkpoint(train_net.network, "ckpts/gat.ckpt") + val_acc_max = np.max((val_acc_max, eval_acc)) + val_loss_min = np.min((val_loss_min, eval_loss)) + curr_step = 0 + else: + curr_step += 1 + if curr_step == early_stopping: + print("Early Stop Triggered!, Min loss: {}, Max accuracy: {}".format(val_loss_min, val_acc_max)) + print("Early stop model validation loss: {}, accuracy{}".format(val_loss_model, val_acc_model)) + break + gat_net_test = GAT(feature, + biases, + feature_size, + num_class, + num_nodes, + hid_units, + n_heads, + attn_drop=0.0, + ftr_drop=0.0) + load_checkpoint("ckpts/gat.ckpt", net=gat_net_test) + gat_net_test.add_flags_recursive(fp16=True) + + test_net = LossAccuracyWrapper(gat_net_test, + num_class, + y_test, + test_mask, + l2_coeff) + test_result = test_net() + print("Test loss={}, test acc={}".format(test_result[0], test_result[1])) + + +if __name__ == "__main__": + train() diff --git a/model_zoo/gcn/README.md 
b/model_zoo/gcn/README.md new file mode 100644 index 0000000000..310c307474 --- /dev/null +++ b/model_zoo/gcn/README.md @@ -0,0 +1,115 @@ +# GCN Example + +## Description + +This is an example of training GCN with Cora and Citeseer dataset in MindSpore. + +## Requirements + +- Install [MindSpore](https://www.mindspore.cn/install/en). + +- Download the dataset Cora or Citeseer provided by kimiyoung/planetoid on GitHub. + +> Place the dataset to any path you want, the folder should include files as follows (we use Cora dataset as an example): + +``` +. +└─data + ├─ind.cora.allx + ├─ind.cora.ally + ├─ind.cora.graph + ├─ind.cora.test.index + ├─ind.cora.tx + ├─ind.cora.ty + ├─ind.cora.x + └─ind.cora.y +``` + +> Generate dataset in mindrecord format for cora or citeseer. +>> Usage +```buildoutcfg +cd ./scripts +# SRC_PATH is the dataset file path you downloaded, DATASET_NAME is cora or citeseer +sh run_process_data.sh [SRC_PATH] [DATASET_NAME] +``` + +>> Launch +``` +#Generate dataset in mindrecord format for cora +sh run_process_data.sh ./data cora +#Generate dataset in mindrecord format for citeseer +sh run_process_data.sh ./data citeseer +``` + +## Structure + +```shell +. +└─gcn + ├─README.md + ├─scripts + | ├─run_process_data.sh # Generate dataset in mindrecord format + | └─run_train.sh # Launch training + | + ├─src + | ├─config.py # Parameter configuration + | ├─dataset.py # Data preprocessing + | ├─gcn.py # GCN backbone + | └─metrics.py # Loss and accuracy + | + └─train.py # Train net +``` + +## Parameter configuration + +Parameters for training can be set in config.py.
+ +``` +"learning_rate": 0.01, # Learning rate +"epochs": 200, # Epoch sizes for training +"hidden1": 16, # Hidden size for the first graph convolution layer +"dropout": 0.5, # Dropout ratio for the first graph convolution layer +"weight_decay": 5e-4, # Weight decay for the parameter of the first graph convolution layer +"early_stopping": 10, # Tolerance for early stopping +``` + +## Running the example + +### Train + +#### Usage + +``` +# run train with cora or citeseer dataset, DATASET_NAME is cora or citeseer +sh run_train.sh [DATASET_NAME] +``` + +#### Launch + +```bash +sh run_train.sh cora +``` + +#### Result + +Training result will be stored in the scripts path, whose folder name begins with "train". You can find the result like the followings in log. + + +``` +Epoch: 0001 train_loss= 1.95373 train_acc= 0.09286 val_loss= 1.95075 val_acc= 0.20200 time= 7.25737 +Epoch: 0002 train_loss= 1.94812 train_acc= 0.32857 val_loss= 1.94717 val_acc= 0.34000 time= 0.00438 +Epoch: 0003 train_loss= 1.94249 train_acc= 0.47857 val_loss= 1.94337 val_acc= 0.43000 time= 0.00428 +Epoch: 0004 train_loss= 1.93550 train_acc= 0.55000 val_loss= 1.93957 val_acc= 0.46400 time= 0.00421 +Epoch: 0005 train_loss= 1.92617 train_acc= 0.67143 val_loss= 1.93558 val_acc= 0.45400 time= 0.00430 +... +Epoch: 0196 train_loss= 0.60326 train_acc= 0.97857 val_loss= 1.05155 val_acc= 0.78200 time= 0.00418 +Epoch: 0197 train_loss= 0.60377 train_acc= 0.97143 val_loss= 1.04940 val_acc= 0.78000 time= 0.00418 +Epoch: 0198 train_loss= 0.60680 train_acc= 0.95000 val_loss= 1.04847 val_acc= 0.78000 time= 0.00414 +Epoch: 0199 train_loss= 0.61920 train_acc= 0.96429 val_loss= 1.04797 val_acc= 0.78400 time= 0.00413 +Epoch: 0200 train_loss= 0.57948 train_acc= 0.96429 val_loss= 1.04753 val_acc= 0.78600 time= 0.00415 +Optimization Finished! +Test set results: cost= 1.00983 accuracy= 0.81300 time= 0.39083 +... 
+``` + + diff --git a/model_zoo/gcn/scripts/run_process_data.sh b/model_zoo/gcn/scripts/run_process_data.sh new file mode 100755 index 0000000000..d51d915943 --- /dev/null +++ b/model_zoo/gcn/scripts/run_process_data.sh @@ -0,0 +1,55 @@ +#!/bin/bash +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +if [ $# != 2 ] +then + echo "Usage: sh run_train.sh [SRC_PATH] [DATASET_NAME]" +exit 1 +fi + +get_real_path(){ + if [ "${1:0:1}" == "/" ]; then + echo "$1" + else + echo "$(realpath -m $PWD/$1)" + fi +} +SRC_PATH=$(get_real_path $1) +echo $SRC_PATH + +DATASET_NAME=$2 +echo $DATASET_NAME + +if [ ! 
-d data_mr ]; then + mkdir data_mr +else + echo data_mr exist +fi +MINDRECORD_PATH=`pwd`/data_mr + +rm -f $MINDRECORD_PATH/$DATASET_NAME +rm -f $MINDRECORD_PATH/$DATASET_NAME.db + +cd ../../../example/graph_to_mindrecord || exit + +python writer.py --mindrecord_script $DATASET_NAME \ +--mindrecord_file "$MINDRECORD_PATH/$DATASET_NAME" \ +--mindrecord_partitions 1 \ +--mindrecord_header_size_by_bit 18 \ +--mindrecord_page_size_by_bit 20 \ +--graph_api_args "$SRC_PATH" + +cd - || exit diff --git a/model_zoo/gcn/scripts/run_train.sh b/model_zoo/gcn/scripts/run_train.sh new file mode 100755 index 0000000000..46dee49b0d --- /dev/null +++ b/model_zoo/gcn/scripts/run_train.sh @@ -0,0 +1,55 @@ +#!/bin/bash +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +if [ $# != 1 ] +then + echo "Usage: sh run_train.sh [DATASET_NAME]" +exit 1 +fi + +DATASET_NAME=$1 +echo $DATASET_NAME + +ulimit -u unlimited +export DEVICE_NUM=1 +export RANK_SIZE=$DEVICE_NUM +export DEVICE_ID=0 +export RANK_ID=0 + +if [ -d "train" ]; +then + rm -rf ./train +fi +mkdir ./train +cp ../*.py ./train +cp *.sh ./train +cp -r ../src ./train +cd ./train || exit +env > env.log +echo "start training for device $DEVICE_ID" + + +if [ $DATASET_NAME == cora ] +then + python train.py --data_dir=../data_mr/$DATASET_NAME --train_nodes_num=140 &> log & +fi + +if [ $DATASET_NAME == citeseer ] +then + python train.py --data_dir=../data_mr/$DATASET_NAME --train_nodes_num=120 &> log & +fi +cd .. + diff --git a/model_zoo/gcn/src/config.py b/model_zoo/gcn/src/config.py new file mode 100644 index 0000000000..83974d706c --- /dev/null +++ b/model_zoo/gcn/src/config.py @@ -0,0 +1,26 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +""" +network config setting, will be used in train.py +""" + + +class ConfigGCN(): + learning_rate = 0.01 + epochs = 200 + hidden1 = 16 + dropout = 0.5 + weight_decay = 5e-4 + early_stopping = 10 diff --git a/model_zoo/gcn/src/dataset.py b/model_zoo/gcn/src/dataset.py new file mode 100644 index 0000000000..7962f6f550 --- /dev/null +++ b/model_zoo/gcn/src/dataset.py @@ -0,0 +1,65 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +""" +create adjacency matrix, node features, labels, and mask for training. +""" +import numpy as np +import scipy.sparse as sp +import mindspore.dataset as ds + + +def normalize_adj(adj): + """Symmetrically normalize adjacency matrix.""" + rowsum = np.array(adj.sum(1)) + d_inv_sqrt = np.power(rowsum, -0.5).flatten() + d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0. 
+ d_mat_inv_sqrt = sp.diags(d_inv_sqrt) + return adj.dot(d_mat_inv_sqrt).transpose().dot(d_mat_inv_sqrt).tocoo() + + +def get_adj_features_labels(data_dir): + """Get adjacency matrix, node features and labels from dataset.""" + g = ds.GraphData(data_dir) + nodes = g.get_all_nodes(0) + nodes_list = nodes.tolist() + row_tensor = g.get_node_feature(nodes_list, [1, 2]) + features = row_tensor[0] + labels = row_tensor[1] + + nodes_num = labels.shape[0] + class_num = labels.max() + 1 + labels_onehot = np.eye(nodes_num, class_num)[labels].astype(np.float32) + + neighbor = g.get_all_neighbors(nodes_list, 0) + node_map = {node_id: index for index, node_id in enumerate(nodes_list)} + adj = np.zeros([nodes_num, nodes_num], dtype=np.float32) + for index, value in np.ndenumerate(neighbor): + # The first column of neighbor is node_id, second column to last column are neighbors of the first column. + # So we only care index[1] > 1. + # If the node does not have that many neighbors, -1 is padded. So if value < 0, we will not deal with it. + if value >= 0 and index[1] > 0: + adj[node_map[neighbor[index[0], 0]], node_map[value]] = 1 + adj = sp.coo_matrix(adj) + adj = adj + adj.T.multiply(adj.T > adj) + sp.eye(nodes_num) + nor_adj = normalize_adj(adj) + nor_adj = np.array(nor_adj.todense()) + return nor_adj, features, labels_onehot, labels + + +def get_mask(total, begin, end): + """Generate mask.""" + mask = np.zeros([total]).astype(np.float32) + mask[begin:end] = 1 + return mask diff --git a/model_zoo/gcn/src/gcn.py b/model_zoo/gcn/src/gcn.py new file mode 100644 index 0000000000..74199490b6 --- /dev/null +++ b/model_zoo/gcn/src/gcn.py @@ -0,0 +1,220 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""GCN.""" +import numpy as np +from mindspore import nn +from mindspore.common.parameter import ParameterTuple +from mindspore.ops import composite as C +from mindspore.ops import functional as F +from mindspore.ops import operations as P +from mindspore import Tensor +from mindspore.nn.layer.activation import get_activation +from model_zoo.gcn.src.metrics import Loss, Accuracy + + +def glorot(shape): + init_range = np.sqrt(6.0/(shape[0]+shape[1])) + initial = np.random.uniform(-init_range, init_range, shape).astype(np.float32) + return Tensor(initial) + + +class GraphConvolution(nn.Cell): + """ + GCN graph convolution layer. + + Args: + feature_in_dim (int): The input feature dimension. + feature_out_dim (int): The output feature dimension. + dropout_ratio (float): Dropout ratio for the dropout layer. Default: None. + activation (str): Activation function applied to the output of the layer, eg. 'relu'. Default: None. + + Inputs: + - **adj** (Tensor) - Tensor of shape :math:`(N, N)`. + - **input_feature** (Tensor) - Tensor of shape :math:`(N, C)`. + + Outputs: + Tensor, output tensor. 
+ """ + + def __init__(self, + feature_in_dim, + feature_out_dim, + dropout_ratio=None, + activation=None): + super(GraphConvolution, self).__init__() + self.in_dim = feature_in_dim + self.out_dim = feature_out_dim + self.weight_init = glorot([self.out_dim, self.in_dim]) + self.fc = nn.Dense(self.in_dim, + self.out_dim, + weight_init=self.weight_init, + has_bias=False) + self.dropout_ratio = dropout_ratio + if self.dropout_ratio is not None: + self.dropout = nn.Dropout(keep_prob=1-self.dropout_ratio) + self.dropout_flag = self.dropout_ratio is not None + self.activation = get_activation(activation) + self.activation_flag = self.activation is not None + self.matmul = P.MatMul() + + def construct(self, adj, input_feature): + dropout = input_feature + if self.dropout_flag: + dropout = self.dropout(dropout) + + fc = self.fc(dropout) + output_feature = self.matmul(adj, fc) + + if self.activation_flag: + output_feature = self.activation(output_feature) + return output_feature + + + class GCN(nn.Cell): + """ + GCN architecture. + + Args: + config (ConfigGCN): Configuration for GCN. + adj (numpy.ndarray): Normalized adjacency matrix of the graph, shape (N, N). + feature (numpy.ndarray): Node feature matrix, shape (N, input_dim). + output_dim (int): The number of output channels, equal to classes num. + """ + + def __init__(self, config, adj, feature, output_dim): + super(GCN, self).__init__() + self.adj = Tensor(adj) + self.feature = Tensor(feature) + input_dim = feature.shape[1] + self.layer0 = GraphConvolution(input_dim, config.hidden1, activation="relu", dropout_ratio=config.dropout) + self.layer1 = GraphConvolution(config.hidden1, output_dim, dropout_ratio=None) + + def construct(self): + output0 = self.layer0(self.adj, self.feature) + output1 = self.layer1(self.adj, output0) + return output1 + + + class LossAccuracyWrapper(nn.Cell): + """ + Wraps the GCN model with loss and accuracy cell. + + Args: + network (Cell): GCN network. + label (numpy.ndarray): Dataset labels.
+ mask (numpy.ndarray): Mask for training, evaluation or test. + weight_decay (float): Weight decay parameter for weight of the first convolution layer. + """ + + def __init__(self, network, label, mask, weight_decay): + super(LossAccuracyWrapper, self).__init__() + self.network = network + self.loss = Loss(label, mask, weight_decay, network.trainable_params()[0]) + self.accuracy = Accuracy(label, mask) + + def construct(self): + preds = self.network() + loss = self.loss(preds) + accuracy = self.accuracy(preds) + return loss, accuracy + + + class LossWrapper(nn.Cell): + """ + Wraps the GCN model with loss. + + Args: + network (Cell): GCN network. + label (numpy.ndarray): Dataset labels. + mask (numpy.ndarray): Mask for training. + weight_decay (float): Weight decay parameter for weight of the first convolution layer. + """ + + def __init__(self, network, label, mask, weight_decay): + super(LossWrapper, self).__init__() + self.network = network + self.loss = Loss(label, mask, weight_decay, network.trainable_params()[0]) + + def construct(self): + preds = self.network() + loss = self.loss(preds) + return loss + + + class TrainOneStepCell(nn.Cell): + r""" + Network training package class. + + Wraps the network with an optimizer. The resulting Cell can be trained without inputs. + Backward graph will be created in the construct function to do parameter updating. Different + parallel modes are available to run the training. + + Args: + network (Cell): The training network. + optimizer (Cell): Optimizer for updating the weights. + sens (Number): The scaling number to be filled as the input of backpropagation. Default value is 1.0. + + Outputs: + Tensor, a scalar Tensor with shape :math:`()`.
+ + Examples: + >>> net = Net() + >>> loss_fn = nn.SoftmaxCrossEntropyWithLogits() + >>> optim = nn.Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9) + >>> loss_net = nn.WithLossCell(net, loss_fn) + >>> train_net = nn.TrainOneStepCell(loss_net, optim) + """ + + def __init__(self, network, optimizer, sens=1.0): + super(TrainOneStepCell, self).__init__(auto_prefix=False) + self.network = network + self.network.add_flags(defer_inline=True) + self.weights = ParameterTuple(network.trainable_params()) + self.optimizer = optimizer + self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True) + self.sens = sens + + def construct(self): + weights = self.weights + loss = self.network() + sens = P.Fill()(P.DType()(loss), P.Shape()(loss), self.sens) + grads = self.grad(self.network, weights)(sens) + return F.depend(loss, self.optimizer(grads)) + + +class TrainNetWrapper(nn.Cell): + """ + Wraps the GCN model with optimizer. + + Args: + network (Cell): GCN network. + label (numpy.ndarray): Dataset labels. + mask (numpy.ndarray): Mask for training, evaluation or test. + config (ConfigGCN): Configuration for GCN. 
+ """ + + def __init__(self, network, label, mask, config): + super(TrainNetWrapper, self).__init__(auto_prefix=True) + self.network = network + loss_net = LossWrapper(network, label, mask, config.weight_decay) + optimizer = nn.Adam(loss_net.trainable_params(), + learning_rate=config.learning_rate) + self.loss_train_net = TrainOneStepCell(loss_net, optimizer) + self.accuracy = Accuracy(label, mask) + + def construct(self): + loss = self.loss_train_net() + accuracy = self.accuracy(self.network()) + return loss, accuracy diff --git a/model_zoo/gcn/src/metrics.py b/model_zoo/gcn/src/metrics.py new file mode 100644 index 0000000000..5930956776 --- /dev/null +++ b/model_zoo/gcn/src/metrics.py @@ -0,0 +1,70 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""Loss and accuracy.""" +from mindspore import nn +from mindspore import Tensor +from mindspore.common import dtype as mstype +from mindspore.ops import operations as P + + +class Loss(nn.Cell): + """Softmax cross-entropy loss with masking.""" + def __init__(self, label, mask, weight_decay, param): + super(Loss, self).__init__() + self.label = Tensor(label) + self.mask = Tensor(mask) + self.loss = P.SoftmaxCrossEntropyWithLogits() + self.one = Tensor(1.0, mstype.float32) + self.zero = Tensor(0.0, mstype.float32) + self.mean = P.ReduceMean() + self.cast = P.Cast() + self.l2_loss = P.L2Loss() + self.reduce_sum = P.ReduceSum() + self.weight_decay = weight_decay + self.param = param + + def construct(self, preds): + param = self.l2_loss(self.param) + loss = self.weight_decay * param + preds = self.cast(preds, mstype.float32) + loss = loss + self.loss(preds, self.label)[0] + mask = self.cast(self.mask, mstype.float32) + mask_reduce = self.mean(mask) + mask = mask / mask_reduce + loss = loss * mask + loss = self.mean(loss) + return loss + + +class Accuracy(nn.Cell): + """Accuracy with masking.""" + def __init__(self, label, mask): + super(Accuracy, self).__init__() + self.label = Tensor(label) + self.mask = Tensor(mask) + self.equal = P.Equal() + self.argmax = P.Argmax() + self.cast = P.Cast() + self.mean = P.ReduceMean() + + def construct(self, preds): + preds = self.cast(preds, mstype.float32) + correct_prediction = self.equal(self.argmax(preds), self.argmax(self.label)) + accuracy_all = self.cast(correct_prediction, mstype.float32) + mask = self.cast(self.mask, mstype.float32) + mask_reduce = self.mean(mask) + mask = mask / mask_reduce + accuracy_all *= mask + return self.mean(accuracy_all) diff --git a/model_zoo/gcn/t-SNE_visualization_on_Cora.gif b/model_zoo/gcn/t-SNE_visualization_on_Cora.gif new file mode 100644 index 0000000000..ae5aada9eb Binary files /dev/null and 
b/model_zoo/gcn/t-SNE_visualization_on_Cora.gif differ diff --git a/model_zoo/gcn/train.py b/model_zoo/gcn/train.py new file mode 100644 index 0000000000..220d2ecd6b --- /dev/null +++ b/model_zoo/gcn/train.py @@ -0,0 +1,127 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +""" +GCN training script. +""" + +import time +import argparse + +import numpy as np +from matplotlib import pyplot as plt +from matplotlib import animation +from sklearn import manifold +from mindspore import context + +from model_zoo.gcn.src.gcn import GCN, LossAccuracyWrapper, TrainNetWrapper +from model_zoo.gcn.src.config import ConfigGCN +from model_zoo.gcn.src.dataset import get_adj_features_labels, get_mask + + +def t_SNE(out_feature, dim): + t_sne = manifold.TSNE(n_components=dim, init='pca', random_state=0) + return t_sne.fit_transform(out_feature) + + +def update_graph(i, data, scat, plot): + scat.set_offsets(data[i]) + plt.title('t-SNE visualization of Epoch:{0}'.format(i)) + return scat, plot + + +def train(): + """Train model.""" + parser = argparse.ArgumentParser(description='GCN') + parser.add_argument('--data_dir', type=str, default='./data/cora/cora_mr', help='Dataset directory') + parser.add_argument('--seed', type=int, default=123, help='Random seed') + parser.add_argument('--train_nodes_num', type=int, default=140, help='Nodes numbers for training') + 
parser.add_argument('--eval_nodes_num', type=int, default=500, help='Nodes numbers for evaluation') + parser.add_argument('--test_nodes_num', type=int, default=1000, help='Nodes numbers for test') + parser.add_argument('--save_TSNE', type=bool, default=False, help='Whether to save t-SNE graph') + args_opt = parser.parse_args() + + np.random.seed(args_opt.seed) + context.set_context(mode=context.GRAPH_MODE, + device_target="Ascend", save_graphs=False) + config = ConfigGCN() + adj, feature, label_onehot, label = get_adj_features_labels(args_opt.data_dir) + + nodes_num = label_onehot.shape[0] + train_mask = get_mask(nodes_num, 0, args_opt.train_nodes_num) + eval_mask = get_mask(nodes_num, args_opt.train_nodes_num, args_opt.train_nodes_num + args_opt.eval_nodes_num) + test_mask = get_mask(nodes_num, nodes_num - args_opt.test_nodes_num, nodes_num) + + class_num = label_onehot.shape[1] + gcn_net = GCN(config, adj, feature, class_num) + gcn_net.add_flags_recursive(fp16=True) + + eval_net = LossAccuracyWrapper(gcn_net, label_onehot, eval_mask, config.weight_decay) + test_net = LossAccuracyWrapper(gcn_net, label_onehot, test_mask, config.weight_decay) + train_net = TrainNetWrapper(gcn_net, label_onehot, train_mask, config) + + loss_list = [] + + if args_opt.save_TSNE: + out_feature = gcn_net() + tsne_result = t_SNE(out_feature.asnumpy(), 2) + graph_data = [] + graph_data.append(tsne_result) + fig = plt.figure() + scat = plt.scatter(tsne_result[:, 0], tsne_result[:, 1], s=2, c=label, cmap='rainbow') + plt.title('t-SNE visualization of Epoch:0', fontsize='large', fontweight='bold', verticalalignment='center') + + for epoch in range(config.epochs): + t = time.time() + + train_net.set_train() + train_result = train_net() + train_loss = train_result[0].asnumpy() + train_accuracy = train_result[1].asnumpy() + + eval_net.set_train(False) + eval_result = eval_net() + eval_loss = eval_result[0].asnumpy() + eval_accuracy = eval_result[1].asnumpy() + + loss_list.append(eval_loss) + 
print("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(train_loss), + "train_acc=", "{:.5f}".format(train_accuracy), "val_loss=", "{:.5f}".format(eval_loss), + "val_acc=", "{:.5f}".format(eval_accuracy), "time=", "{:.5f}".format(time.time() - t)) + + if args_opt.save_TSNE: + out_feature = gcn_net() + tsne_result = t_SNE(out_feature.asnumpy(), 2) + graph_data.append(tsne_result) + + if epoch > config.early_stopping and loss_list[-1] > np.mean(loss_list[-(config.early_stopping+1):-1]): + print("Early stopping...") + break + + t_test = time.time() + test_net.set_train(False) + test_result = test_net() + test_loss = test_result[0].asnumpy() + test_accuracy = test_result[1].asnumpy() + print("Test set results:", "loss=", "{:.5f}".format(test_loss), + "accuracy=", "{:.5f}".format(test_accuracy), "time=", "{:.5f}".format(time.time() - t_test)) + + if args_opt.save_TSNE: + ani = animation.FuncAnimation(fig, update_graph, frames=range(config.epochs + 1), fargs=(graph_data, scat, plt)) + ani.save('t-SNE_visualization.gif', writer='imagemagick') + + +if __name__ == '__main__': + train() diff --git a/model_zoo/googlenet/README.md b/model_zoo/googlenet/README.md new file mode 100644 index 0000000000..92cdd8af43 --- /dev/null +++ b/model_zoo/googlenet/README.md @@ -0,0 +1,324 @@ +# Contents + +- [GoogleNet Description](#googlenet-description) +- [Model Architecture](#model-architecture) +- [Dataset](#dataset) +- [Features](#features) + - [Mixed Precision](#mixed-precision) +- [Environment Requirements](#environment-requirements) +- [Quick Start](#quick-start) +- [Script Description](#script-description) + - [Script and Sample Code](#script-and-sample-code) + - [Script Parameters](#script-parameters) + - [Training Process](#training-process) + - [Training](#training) + - [Distributed Training](#distributed-training) + - [Evaluation Process](#evaluation-process) + - [Evaluation](#evaluation) +- [Model Description](#model-description) + - [Performance](#performance) + 
- [Evaluation Performance](#evaluation-performance) + - [Inference Performance](#evaluation-performance) + - [How to use](#how-to-use) + - [Inference](#inference) + - [Continue Training on the Pretrained Model](#continue-training-on-the-pretrained-model) + - [Transfer Learning](#transfer-learning) +- [Description of Random Situation](#description-of-random-situation) +- [ModelZoo Homepage](#modelzoo-homepage) + + +# [GoogleNet Description](#contents) + +GoogleNet, a 22 layers deep network, was proposed in 2014 and won the first place in the ImageNet Large-Scale Visual Recognition Challenge 2014 (ILSVRC14). GoogleNet, also called Inception v1, has significant improvement over ZFNet (The winner in 2013) and AlexNet (The winner in 2012), and has relatively lower error rate compared to VGGNet. Typically deeper deep learning network means larger number of parameters, which makes it more prone to overfitting. Furthermore, the increased network size leads to increased use of computational resources. To tackle these issues, GoogleNet adopts 1*1 convolution middle of the network to reduce dimension, and thus further reduce the computation. Global average pooling is used at the end of the network, instead of using fully connected layers. Another technique, called inception module, is to have different sizes of convolutions for the same input and stacking all the outputs. + +[Paper](https://arxiv.org/abs/1409.4842): Christian Szegedy, Wei Liu, Yangqing Jia, Pierre Sermanet, Scott Reed, Dragomir Anguelov, Dumitru Erhan, Vincent Vanhoucke, Andrew Rabinovich. "Going deeper with convolutions." *Proceedings of the IEEE conference on computer vision and pattern recognition*. 2015. + + +# [Model Architecture](#contents) + +The overall network architecture of GoogleNet is shown below: + +![](https://miro.medium.com/max/3780/1*ZFPOSAted10TPd3hBQU8iQ.png) + +Specifically, the GoogleNet contains numerous inception modules, which are connected together to go deeper. 
In general, an inception module with dimensionality reduction consists of **1×1 conv**, **3×3 conv**, **5×5 conv**, and **3×3 max pooling**, which are done altogether for the previous input, and stack together again at output. + +![](https://miro.medium.com/max/1108/1*sezFsYW1MyM9YOMa1q909A.png) + + + +# [Dataset](#contents) + +Dataset used: [CIFAR-10]() + +- Dataset size:175M,60,000 32*32 colorful images in 10 classes + - Train:146M,50,000 images + - Test:29.3M,10,000 images +- Data format:binary files + - Note:Data will be processed in dataset.py + + + +# [Features](#contents) + +## Mixed Precision + +The [mixed precision](https://www.mindspore.cn/tutorial/zh-CN/master/advanced_use/mixed_precision.html) training method accelerates the deep learning neural network training process by using both the single-precision and half-precision data formats, and maintains the network precision achieved by the single-precision training at the same time. Mixed precision training can accelerate the computation process, reduce memory usage, and enable a larger model or batch size to be trained on specific hardware. +For FP16 operators, if the input data type is FP32, the backend of MindSpore will automatically handle it with reduced precision. Users could check the reduced-precision operators by enabling INFO log and then searching ‘reduce precision’. + + + +# [Environment Requirements](#contents) + +- Hardware(Ascend/GPU) + - Prepare hardware environment with Ascend or GPU processor. If you want to try Ascend , please send the [application form](https://obs-9be7.obs.cn-east-2.myhuaweicloud.com/file/other/Ascend%20Model%20Zoo%E4%BD%93%E9%AA%8C%E8%B5%84%E6%BA%90%E7%94%B3%E8%AF%B7%E8%A1%A8.docx) to ascend@huawei.com. Once approved, you can get the resources. 
+- Framework + - [MindSpore](http://10.90.67.50/mindspore/archive/20200506/OpenSource/me_vm_x86/) +- For more information, please check the resources below: + - [MindSpore tutorials](https://www.mindspore.cn/tutorial/zh-CN/master/index.html) + - [MindSpore API](https://www.mindspore.cn/api/zh-CN/master/index.html) + + + +# [Quick Start](#contents) + +After installing MindSpore via the official website, you can start training and evaluation as follows: + +```python +# run training example +python train.py > train.log 2>&1 & + +# run distributed training example +sh scripts/run_train.sh rank_table.json + +# run evaluation example +python eval.py > eval.log 2>&1 & OR sh run_eval.sh +``` + + + +# [Script Description](#contents) + +## [Script and Sample Code](#contents) + +``` +├── model_zoo + ├── README.md // descriptions about all the models + ├── googlenet + ├── README.md // descriptions about googlenet + ├── scripts + │ ├──run_train.sh // shell script for distributed + │ ├──run_eval.sh // shell script for evaluation + ├── src + │ ├──dataset.py // creating dataset + │ ├──googlenet.py // googlenet architecture + │ ├──config.py // parameter configuration + ├── train.py // training script + ├── eval.py // evaluation script + ├── export.py // export checkpoint files into geir/onnx +``` + +## [Script Parameters](#contents) + +```python +Major parameters in train.py and config.py are: + +--data_path: The absolute full path to the train and evaluation datasets. +--epoch_size: Total training epochs. +--batch_size: Training batch size. +--lr_init: Initial learning rate. +--num_classes: The number of classes in the training set. +--weight_decay: Weight decay value. +--image_height: Image height used as input to the model. +--image_width: Image width used as input the model. +--pre_trained: Whether training from scratch or training based on the + pre-trained model.Optional values are True, False. +--device_target: Device where the code will be implemented. 
Optional values + are "Ascend", "GPU". +--device_id: Device ID used to train or evaluate the dataset. Ignore it + when you use run_train.sh for distributed training. +--checkpoint_path: The absolute full path to the checkpoint file saved + after training. +--onnx_filename: File name of the onnx model used in export.py. +--geir_filename: File name of the geir model used in export.py. +``` + + +## [Training Process](#contents) + +### Training + +``` +python train.py > train.log 2>&1 & +``` + +The python command above will run in the background, you can view the results through the file `train.log`. + +After training, you'll get some checkpoint files under the script folder by default. The loss value will be achieved as follows: + +``` +# grep "loss is " train.log +epoch: 1 step: 390, loss is 1.4842823 +epcoh: 2 step: 390, loss is 1.0897788 +... +``` + +The model checkpoint will be saved in the current directory. + +### Distributed Training + +``` +sh scripts/run_train.sh rank_table.json +``` + +The above shell script will run distribute training in the background. You can view the results through the file `train_parallel[X]/log`. The loss value will be achieved as follows: + +``` +# grep "result: " train_parallel*/log +train_parallel0/log:epoch: 1 step: 48, loss is 1.4302931 +train_parallel0/log:epcoh: 2 step: 48, loss is 1.4023874 +... +train_parallel1/log:epoch: 1 step: 48, loss is 1.3458025 +train_parallel1/log:epcoh: 2 step: 48, loss is 1.3729336 +... +... +``` + + +## [Evaluation Process](#contents) + +### Evaluation + +Before running the command below, please check the checkpoint path used for evaluation. Please set the checkpoint path to be the absolute full path, e.g., "username/googlenet/train_googlenet_cifar10-125_390.ckpt". + +``` +python eval.py > eval.log 2>&1 & +OR +sh scripts/run_eval.sh +``` + +The above python command will run in the background. You can view the results through the file "eval.log". 
The accuracy of the test dataset will be as follows: + +``` +# grep "accuracy: " eval.log +accuracy: {'acc': 0.934} +``` + +Note that for evaluation after distributed training, please set the checkpoint_path to be the last saved checkpoint file such as "username/googlenet/train_parallel0/train_googlenet_cifar10-125_48.ckpt". The accuracy of the test dataset will be as follows: + +``` +# grep "accuracy: " dist.eval.log +accuracy: {'acc': 0.9217} +``` + + +# [Model Description](#contents) +## [Performance](#contents) + +### Evaluation Performance + +| Parameters | GoogleNet | +| -------------------------- | ----------------------------------------------------------- | +| Model Version | Inception V1 | +| Resource | Ascend 910 ;CPU 2.60GHz,56cores;Memory,314G | +| uploaded Date | 06/09/2020 (month/day/year) | +| MindSpore Version | 0.3.0-alpha | +| Dataset | CIFAR-10 | +| Training Parameters | epoch=125, steps=390, batch_size = 128, lr=0.1 | +| Optimizer | SGD | +| Loss Function | Softmax Cross Entropy | +| outputs | probability | +| Loss | 0.0016 | +| Speed | 1pc: 79 ms/step; 8pcs: 82 ms/step | +| Total time | 1pc: 63.85 mins; 8pcs: 11.28 mins | +| Parameters (M) | 6.8 | +| Checkpoint for Fine tuning | 43.07M (.ckpt file) | +| Model for inference | 21.50M (.onnx file), 21.60M(.geir file) | +| Scripts | https://gitee.com/mindspore/mindspore/tree/master/model_zoo/googlenet | + + +### Inference Performance + +| Parameters | GoogleNet | +| ------------------- | --------------------------- | +| Model Version | Inception V1 | +| Resource | Ascend 910 | +| Uploaded Date | 06/09/2020 (month/day/year) | +| MindSpore Version | 0.3.0-alpha | +| Dataset | CIFAR-10, 10,000 images | +| batch_size | 128 | +| outputs | probability | +| Accuracy | 1pc: 93.4%; 8pcs: 92.17% | +| Model for inference | 21.50M (.onnx file) | + +## [How to use](#contents) +### Inference + +If you need to use the trained model to perform inference on multiple hardware platforms, such as GPU, Ascend 910 or 
Ascend 310, you can refer to this [Link](https://www.mindspore.cn/tutorial/zh-CN/master/advanced_use/network_migration.html). Following the steps below, this is a simple example: + +``` +# Load unseen dataset for inference +dataset = dataset.create_dataset(cfg.data_path, 1, False) + +# Define model +net = GoogleNet(num_classes=cfg.num_classes) +opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), 0.01, + cfg.momentum, weight_decay=cfg.weight_decay) +loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean', + is_grad=False) +model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'}) + +# Load pre-trained model +param_dict = load_checkpoint(cfg.checkpoint_path) +load_param_into_net(net, param_dict) +net.set_train(False) + +# Make predictions on the unseen dataset +acc = model.eval(dataset) +print("accuracy: ", acc) +``` + +### Continue Training on the Pretrained Model + +``` +# Load dataset +dataset = create_dataset(cfg.data_path, cfg.epoch_size) +batch_num = dataset.get_dataset_size() + +# Define model +net = GoogleNet(num_classes=cfg.num_classes) +# Continue training if set pre_trained to be True +if cfg.pre_trained: + param_dict = load_checkpoint(cfg.checkpoint_path) + load_param_into_net(net, param_dict) +lr = lr_steps(0, lr_max=cfg.lr_init, total_epochs=cfg.epoch_size, + steps_per_epoch=batch_num) +opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), + Tensor(lr), cfg.momentum, weight_decay=cfg.weight_decay) +loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean', is_grad=False) +model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'}, + amp_level="O2", keep_batchnorm_fp32=False, loss_scale_manager=None) + +# Set callbacks +config_ck = CheckpointConfig(save_checkpoint_steps=batch_num * 5, + keep_checkpoint_max=cfg.keep_checkpoint_max) +time_cb = TimeMonitor(data_size=batch_num) +ckpoint_cb = ModelCheckpoint(prefix="train_googlenet_cifar10", directory="./", + config=config_ck) 
+loss_cb = LossMonitor() + +# Start training +model.train(cfg.epoch_size, dataset, callbacks=[time_cb, ckpoint_cb, loss_cb]) +print("train success") +``` + +### Transfer Learning +To be added. + + +# [Description of Random Situation](#contents) + +In dataset.py, we set the seed inside “create_dataset" function. We also use random seed in train.py. + + +# [ModelZoo Homepage](#contents) + Please check the official [homepage](https://gitee.com/mindspore/mindspore/tree/master/model_zoo). diff --git a/example/googlenet_cifar10/eval.py b/model_zoo/googlenet/eval.py similarity index 53% rename from example/googlenet_cifar10/eval.py rename to model_zoo/googlenet/eval.py index cc09539aa7..fc469879e7 100644 --- a/example/googlenet_cifar10/eval.py +++ b/model_zoo/googlenet/eval.py @@ -14,42 +14,32 @@ # ============================================================================ """ ##############test googlenet example on cifar10################# -python eval.py --data_path=$DATA_HOME --device_id=$DEVICE_ID +python eval.py """ -import argparse - import mindspore.nn as nn from mindspore import context -from mindspore.model_zoo.googlenet import GooGLeNet from mindspore.nn.optim.momentum import Momentum from mindspore.train.model import Model from mindspore.train.serialization import load_checkpoint, load_param_into_net -import dataset -from config import cifar_cfg as cfg +from src.config import cifar_cfg as cfg +from src.dataset import create_dataset +from src.googlenet import GoogleNet if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Cifar10 classification') - parser.add_argument('--device_target', type=str, default='Ascend', choices=['Ascend', 'GPU'], - help='device where the code will be implemented. 
(Default: Ascend)') - parser.add_argument('--data_path', type=str, default='./cifar', help='path where the dataset is saved') - parser.add_argument('--checkpoint_path', type=str, default=None, help='checkpoint file path.') - parser.add_argument('--device_id', type=int, default=None, help='device id of GPU or Ascend. (Default: None)') - args_opt = parser.parse_args() - - context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target) - context.set_context(device_id=args_opt.device_id) + context.set_context(mode=context.GRAPH_MODE, device_target=cfg.device_target) + context.set_context(device_id=cfg.device_id) - net = GooGLeNet(num_classes=cfg.num_classes) + net = GoogleNet(num_classes=cfg.num_classes) opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), 0.01, cfg.momentum, weight_decay=cfg.weight_decay) loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean', is_grad=False) model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'}) - param_dict = load_checkpoint(args_opt.checkpoint_path) + param_dict = load_checkpoint(cfg.checkpoint_path) load_param_into_net(net, param_dict) net.set_train(False) - dataset = dataset.create_dataset(args_opt.data_path, 1, False) - res = model.eval(dataset) - print("result: ", res) + dataset = create_dataset(cfg.data_path, 1, False) + acc = model.eval(dataset) + print("accuracy: ", acc) diff --git a/model_zoo/googlenet/export.py b/model_zoo/googlenet/export.py new file mode 100644 index 0000000000..d1a6de9b8d --- /dev/null +++ b/model_zoo/googlenet/export.py @@ -0,0 +1,36 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +""" +##############export checkpoint file into geir and onnx models################# +python export.py +""" +import numpy as np + +import mindspore as ms +from mindspore import Tensor +from mindspore.train.serialization import load_checkpoint, load_param_into_net, export + +from src.config import cifar_cfg as cfg +from src.googlenet import GoogleNet + + +if __name__ == '__main__': + net = GoogleNet(num_classes=cfg.num_classes) + param_dict = load_checkpoint(cfg.checkpoint_path) + load_param_into_net(net, param_dict) + + input_arr = Tensor(np.random.uniform(0.0, 1.0, size=[1, 3, 224, 224]), ms.float32) + export(net, input_arr, file_name=cfg.onnx_filename, file_format="ONNX") + export(net, input_arr, file_name=cfg.geir_filename, file_format="GEIR") diff --git a/model_zoo/googlenet/scripts/run_eval.sh b/model_zoo/googlenet/scripts/run_eval.sh new file mode 100644 index 0000000000..4aad02a4af --- /dev/null +++ b/model_zoo/googlenet/scripts/run_eval.sh @@ -0,0 +1,23 @@ +#!/bin/bash +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +ulimit -u unlimited + +BASEPATH=$(cd "`dirname $0`" || exit; pwd) +export PYTHONPATH=${BASEPATH}:$PYTHONPATH +export DEVICE_ID=0 + +python ${BASEPATH}/../eval.py > ./eval.log 2>&1 & diff --git a/example/googlenet_cifar10/run_distribute_train.sh b/model_zoo/googlenet/scripts/run_train.sh old mode 100755 new mode 100644 similarity index 73% rename from example/googlenet_cifar10/run_distribute_train.sh rename to model_zoo/googlenet/scripts/run_train.sh index c9b8dfc48f..c21c2f04b6 --- a/example/googlenet_cifar10/run_distribute_train.sh +++ b/model_zoo/googlenet/scripts/run_train.sh @@ -14,28 +14,24 @@ # limitations under the License. # ============================================================================ -if [ $# != 2 ] -then - echo "Usage: sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATA_PATH]" +if [ $# != 1 ] +then + echo "Usage: sh run_train.sh [MINDSPORE_HCCL_CONFIG_PATH]" exit 1 fi if [ ! -f $1 ] -then +then echo "error: MINDSPORE_HCCL_CONFIG_PATH=$1 is not a file" exit 1 -fi - -if [ ! 
-d $2 ] -then - echo "error: DATA_PATH=$2 is not a directory" -exit 1 -fi +fi ulimit -u unlimited export DEVICE_NUM=8 export RANK_SIZE=8 -export MINDSPORE_HCCL_CONFIG_PATH=$1 +MINDSPORE_HCCL_CONFIG_PATH=$(realpath $1) +export MINDSPORE_HCCL_CONFIG_PATH +echo "MINDSPORE_HCCL_CONFIG_PATH=${MINDSPORE_HCCL_CONFIG_PATH}" for((i=0; i<${DEVICE_NUM}; i++)) do @@ -43,11 +39,11 @@ do export RANK_ID=$i rm -rf ./train_parallel$i mkdir ./train_parallel$i - cp *.py ./train_parallel$i - cp *.sh ./train_parallel$i - cd ./train_parallel$i || exit + cp -r ./src ./train_parallel$i + cp ./train.py ./train_parallel$i echo "start training for rank $RANK_ID, device $DEVICE_ID" + cd ./train_parallel$i ||exit env > env.log - python train.py --data_path=$2 --device_id=$i &> log & + python train.py --device_id=$i > log 2>&1 & cd .. done diff --git a/example/googlenet_cifar10/config.py b/model_zoo/googlenet/src/config.py similarity index 78% rename from example/googlenet_cifar10/config.py rename to model_zoo/googlenet/src/config.py index 4b134f68da..5f803ad325 100644 --- a/example/googlenet_cifar10/config.py +++ b/model_zoo/googlenet/src/config.py @@ -18,6 +18,7 @@ network config setting, will be used in main.py from easydict import EasyDict as edict cifar_cfg = edict({ + 'pre_trained': False, 'num_classes': 10, 'lr_init': 0.1, 'batch_size': 128, @@ -27,5 +28,11 @@ cifar_cfg = edict({ 'buffer_size': 10, 'image_height': 224, 'image_width': 224, - 'keep_checkpoint_max': 10 + 'data_path': './cifar10', + 'device_target': 'Ascend', + 'device_id': 4, + 'keep_checkpoint_max': 10, + 'checkpoint_path': './train_googlenet_cifar10-125_390.ckpt', + 'onnx_filename': 'googlenet.onnx', + 'geir_filename': 'googlenet.geir' }) diff --git a/example/googlenet_cifar10/dataset.py b/model_zoo/googlenet/src/dataset.py similarity index 98% rename from example/googlenet_cifar10/dataset.py rename to model_zoo/googlenet/src/dataset.py index e7b6abfb56..a1cbc2cdab 100644 --- a/example/googlenet_cifar10/dataset.py +++ 
b/model_zoo/googlenet/src/dataset.py @@ -21,7 +21,7 @@ import mindspore.common.dtype as mstype import mindspore.dataset as ds import mindspore.dataset.transforms.c_transforms as C import mindspore.dataset.transforms.vision.c_transforms as vision -from config import cifar_cfg as cfg +from src.config import cifar_cfg as cfg def create_dataset(data_home, repeat_num=1, training=True): diff --git a/mindspore/model_zoo/googlenet.py b/model_zoo/googlenet/src/googlenet.py similarity index 97% rename from mindspore/model_zoo/googlenet.py rename to model_zoo/googlenet/src/googlenet.py index 4a572828de..701b3aeb5a 100644 --- a/mindspore/model_zoo/googlenet.py +++ b/model_zoo/googlenet/src/googlenet.py @@ -40,8 +40,7 @@ class Conv2dBlock(nn.Cell): def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=0, pad_mode="same"): super(Conv2dBlock, self).__init__() self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, - padding=padding, pad_mode=pad_mode, weight_init=weight_variable(), - bias_init=False) + padding=padding, pad_mode=pad_mode, weight_init=weight_variable()) self.bn = nn.BatchNorm2d(out_channels, eps=0.001) self.relu = nn.ReLU() @@ -78,13 +77,13 @@ class Inception(nn.Cell): return self.concat((branch1, branch2, branch3, branch4)) -class GooGLeNet(nn.Cell): +class GoogleNet(nn.Cell): """ Googlenet architecture """ def __init__(self, num_classes): - super(GooGLeNet, self).__init__() + super(GoogleNet, self).__init__() self.conv1 = Conv2dBlock(3, 64, kernel_size=7, stride=2, padding=0) self.maxpool1 = P.MaxPoolWithArgmax(ksize=3, strides=2, padding="same") diff --git a/example/googlenet_cifar10/train.py b/model_zoo/googlenet/train.py similarity index 82% rename from example/googlenet_cifar10/train.py rename to model_zoo/googlenet/train.py index bee0297bb3..0129176510 100644 --- a/example/googlenet_cifar10/train.py +++ b/model_zoo/googlenet/train.py @@ -14,7 +14,7 @@ # 
============================================================================ """ #################train googlent example on cifar10######################## -python train.py --data_path=$DATA_HOME --device_id=$DEVICE_ID +python train.py """ import argparse import os @@ -26,14 +26,14 @@ import mindspore.nn as nn from mindspore import Tensor from mindspore import context from mindspore.communication.management import init -from mindspore.model_zoo.googlenet import GooGLeNet from mindspore.nn.optim.momentum import Momentum from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor from mindspore.train.model import Model, ParallelMode +from mindspore.train.serialization import load_checkpoint, load_param_into_net - -from dataset import create_dataset -from config import cifar_cfg as cfg +from src.config import cifar_cfg as cfg +from src.dataset import create_dataset +from src.googlenet import GoogleNet random.seed(1) np.random.seed(1) @@ -62,14 +62,14 @@ def lr_steps(global_step, lr_max=None, total_epochs=None, steps_per_epoch=None): if __name__ == '__main__': parser = argparse.ArgumentParser(description='Cifar10 classification') - parser.add_argument('--device_target', type=str, default='Ascend', choices=['Ascend', 'GPU'], - help='device where the code will be implemented. (Default: Ascend)') - parser.add_argument('--data_path', type=str, default='./cifar', help='path where the dataset is saved') parser.add_argument('--device_id', type=int, default=None, help='device id of GPU or Ascend. 
(Default: None)') args_opt = parser.parse_args() - context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target) - context.set_context(device_id=args_opt.device_id) + context.set_context(mode=context.GRAPH_MODE, device_target=cfg.device_target) + if args_opt.device_id is not None: + context.set_context(device_id=args_opt.device_id) + else: + context.set_context(device_id=cfg.device_id) device_num = int(os.environ.get("DEVICE_NUM", 1)) if device_num > 1: @@ -78,10 +78,14 @@ if __name__ == '__main__': mirror_mean=True) init() - dataset = create_dataset(args_opt.data_path, cfg.epoch_size) + dataset = create_dataset(cfg.data_path, cfg.epoch_size) batch_num = dataset.get_dataset_size() - net = GooGLeNet(num_classes=cfg.num_classes) + net = GoogleNet(num_classes=cfg.num_classes) + # Continue training if set pre_trained to be True + if cfg.pre_trained: + param_dict = load_checkpoint(cfg.checkpoint_path) + load_param_into_net(net, param_dict) lr = lr_steps(0, lr_max=cfg.lr_init, total_epochs=cfg.epoch_size, steps_per_epoch=batch_num) opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), Tensor(lr), cfg.momentum, weight_decay=cfg.weight_decay) diff --git a/example/lenet_mnist/README.md b/model_zoo/lenet/README.md similarity index 78% rename from example/lenet_mnist/README.md rename to model_zoo/lenet/README.md index 88c8769e05..579c9894b2 100644 --- a/example/lenet_mnist/README.md +++ b/model_zoo/lenet/README.md @@ -2,7 +2,7 @@ ## Description -Training LeNet with MNIST dataset in MindSpore. +Training LeNet with dataset in MindSpore. This is the simple and basic tutorial for constructing a network in MindSpore. @@ -10,10 +10,10 @@ This is the simple and basic tutorial for constructing a network in MindSpore. - Install [MindSpore](https://www.mindspore.cn/install/en). 
-- Download the MNIST dataset, the directory structure is as follows: +- Download the dataset, the directory structure is as follows: ``` -└─MNIST_Data +└─Data ├─test │ t10k-images.idx3-ubyte │ t10k-labels.idx1-ubyte @@ -27,7 +27,7 @@ This is the simple and basic tutorial for constructing a network in MindSpore. ```python # train LeNet, hyperparameter setting in config.py -python train.py --data_path MNIST_Data +python train.py --data_path Data ``` You will get the loss value of each step as following: @@ -43,8 +43,8 @@ epoch: 1 step: 1741, loss is 0.05018193 Then, evaluate LeNet according to network model ```python -# evaluate LeNet, after 1 epoch training, the accuracy is up to 96.5% -python eval.py --data_path MNIST_Data --mode test --ckpt_path checkpoint_lenet-1_1875.ckpt +# evaluate LeNet +python eval.py --data_path Data --ckpt_path checkpoint_lenet-1_1875.ckpt ``` ## Note diff --git a/example/lenet_mnist/eval.py b/model_zoo/lenet/eval.py similarity index 89% rename from example/lenet_mnist/eval.py rename to model_zoo/lenet/eval.py index 8317785a66..a9842f4426 100644 --- a/example/lenet_mnist/eval.py +++ b/model_zoo/lenet/eval.py @@ -20,10 +20,10 @@ python eval.py --data_path /YourDataPath --ckpt_path Your.ckpt import os import argparse -from dataset import create_dataset -from config import mnist_cfg as cfg +from src.dataset import create_dataset +from src.config import mnist_cfg as cfg +from src.lenet import LeNet5 import mindspore.nn as nn -from mindspore.model_zoo.lenet import LeNet5 from mindspore import context from mindspore.train.serialization import load_checkpoint, load_param_into_net from mindspore.train.callback import ModelCheckpoint, CheckpointConfig @@ -32,10 +32,10 @@ from mindspore.nn.metrics import Accuracy if __name__ == "__main__": - parser = argparse.ArgumentParser(description='MindSpore MNIST Example') + parser = argparse.ArgumentParser(description='MindSpore Lenet Example') parser.add_argument('--device_target', type=str, 
default="Ascend", choices=['Ascend', 'GPU', 'CPU'], help='device where the code will be implemented (default: Ascend)') - parser.add_argument('--data_path', type=str, default="./MNIST_Data", + parser.add_argument('--data_path', type=str, default="./Data", help='path where the dataset is saved') parser.add_argument('--ckpt_path', type=str, default="", help='if mode is test, must provide\ path where the trained ckpt file') @@ -61,4 +61,4 @@ if __name__ == "__main__": cfg.batch_size, 1) acc = model.eval(ds_eval, dataset_sink_mode=args.dataset_sink_mode) - print("============== Accuracy:{} ==============".format(acc)) + print("============== {} ==============".format(acc)) diff --git a/model_zoo/lenet/src/__init__.py b/model_zoo/lenet/src/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/example/lenet_mnist/config.py b/model_zoo/lenet/src/config.py similarity index 100% rename from example/lenet_mnist/config.py rename to model_zoo/lenet/src/config.py diff --git a/example/lenet_mnist/dataset.py b/model_zoo/lenet/src/dataset.py similarity index 100% rename from example/lenet_mnist/dataset.py rename to model_zoo/lenet/src/dataset.py diff --git a/mindspore/model_zoo/lenet.py b/model_zoo/lenet/src/lenet.py similarity index 95% rename from mindspore/model_zoo/lenet.py rename to model_zoo/lenet/src/lenet.py index 6e39c439bf..3864315dba 100644 --- a/mindspore/model_zoo/lenet.py +++ b/model_zoo/lenet/src/lenet.py @@ -50,11 +50,10 @@ class LeNet5(nn.Cell): >>> LeNet(num_class=10) """ - def __init__(self, num_class=10): + def __init__(self, num_class=10, channel=1): super(LeNet5, self).__init__() self.num_class = num_class - self.batch_size = 32 - self.conv1 = conv(1, 6, 5) + self.conv1 = conv(channel, 6, 5) self.conv2 = conv(6, 16, 5) self.fc1 = fc_with_initialize(16 * 5 * 5, 120) self.fc2 = fc_with_initialize(120, 84) diff --git a/example/lenet_mnist/train.py b/model_zoo/lenet/train.py similarity index 86% rename from example/lenet_mnist/train.py rename 
to model_zoo/lenet/train.py index 3186f5fca7..740b6e8ca3 100644 --- a/example/lenet_mnist/train.py +++ b/model_zoo/lenet/train.py @@ -20,10 +20,10 @@ python train.py --data_path /YourDataPath import os import argparse -from config import mnist_cfg as cfg -from dataset import create_dataset +from src.config import mnist_cfg as cfg +from src.dataset import create_dataset +from src.lenet import LeNet5 import mindspore.nn as nn -from mindspore.model_zoo.lenet import LeNet5 from mindspore import context from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor from mindspore.train import Model @@ -31,15 +31,18 @@ from mindspore.nn.metrics import Accuracy if __name__ == "__main__": - parser = argparse.ArgumentParser(description='MindSpore MNIST Example') + parser = argparse.ArgumentParser(description='MindSpore Lenet Example') parser.add_argument('--device_target', type=str, default="Ascend", choices=['Ascend', 'GPU', 'CPU'], help='device where the code will be implemented (default: Ascend)') - parser.add_argument('--data_path', type=str, default="./MNIST_Data", + parser.add_argument('--data_path', type=str, default="./Data", help='path where the dataset is saved') - parser.add_argument('--dataset_sink_mode', type=bool, default=False, help='dataset_sink_mode is False or True') + parser.add_argument('--dataset_sink_mode', type=bool, default=True, help='dataset_sink_mode is False or True') args = parser.parse_args() + if args.device_target == "CPU": + args.dataset_sink_mode = False + context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) ds_train = create_dataset(os.path.join(args.data_path, "train"), cfg.batch_size, diff --git a/model_zoo/lenet_quant/README.md b/model_zoo/lenet_quant/README.md new file mode 100644 index 0000000000..26cdcc3ecd --- /dev/null +++ b/model_zoo/lenet_quant/README.md @@ -0,0 +1,248 @@ +# LeNet Quantization Aware Training + +## Description + +Training LeNet with MNIST dataset in 
MindSpore with quantization aware training. + +This is the simple and basic tutorial for constructing a network in MindSpore with quantization aware. + +In this tutorial, you will: + +1. Train a MindSpore fusion model for MNIST from scratch using `nn.Conv2dBnAct` and `nn.DenseBnAct`. +2. Fine tune the fusion model by applying the quantization aware training auto network converter API `convert_quant_network`, after the network convergence then export a quantization aware model checkpoint file. +3. Use the quantization aware model to create an actually quantized model for the Ascend inference backend. +4. See the persistence of accuracy in inference backend and a 4x smaller model. To see the latency benefits on mobile, try out the Ascend inference backend examples. + + +## Train fusion model + +### Install + +Install MindSpore base on the ascend device and GPU device from [MindSpore](https://www.mindspore.cn/install/en). + + +```python +pip uninstall -y mindspore-ascend +pip uninstall -y mindspore-gpu +pip install mindspore-ascend.whl +``` + +Then you will get the following display + + +```bash +>>> Found existing installation: mindspore-ascend +>>> Uninstalling mindspore-ascend: +>>> Successfully uninstalled mindspore-ascend. +``` + +### Prepare Dataset + +Download the MNIST dataset, the directory structure is as follows: + +``` +└─MNIST_Data + ├─test + │ t10k-images.idx3-ubyte + │ t10k-labels.idx1-ubyte + └─train + train-images.idx3-ubyte + train-labels.idx1-ubyte +``` + +### Define fusion model + +Define a MindSpore fusion model using `nn.Conv2dBnAct` and `nn.DenseBnAct`. 
+ +```Python +class LeNet5(nn.Cell): + """ + Define Lenet fusion model + """ + + def __init__(self, num_class=10, channel=1): + super(LeNet5, self).__init__() + self.num_class = num_class + + # change `nn.Conv2d` to `nn.Conv2dBnAct` + self.conv1 = nn.Conv2dBnAct(channel, 6, 5, activation='relu') + self.conv2 = nn.Conv2dBnAct(6, 16, 5, activation='relu') + # change `nn.Dense` to `nn.DenseBnAct` + self.fc1 = nn.DenseBnAct(16 * 5 * 5, 120, activation='relu') + self.fc2 = nn.DenseBnAct(120, 84, activation='relu') + self.fc3 = nn.DenseBnAct(84, self.num_class) + + self.max_pool2d = nn.MaxPool2d(kernel_size=2, stride=2) + self.flatten = nn.Flatten() + + def construct(self, x): + x = self.conv1(x) + x = self.max_pool2d(x) + x = self.conv2(x) + x = self.max_pool2d(x) + x = self.flatten(x) + x = self.fc1(x) + x = self.fc2(x) + x = self.fc3(x) + return x +``` + +Get the MNIST from scratch dataset. + +```Python +ds_train = create_dataset(os.path.join(args.data_path, "train"), + cfg.batch_size, cfg.epoch_size) +step_size = ds_train.get_dataset_size() +``` + +### Train model + +Load the Lenet fusion network, training network using loss `nn.SoftmaxCrossEntropyWithLogits` with optimization `nn.Momentum`. + +```Python +# Define the network +network = LeNet5Fusion(cfg.num_classes) +# Define the loss +net_loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction="mean") +# Define optimization +net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) + +# Define model using loss and optimization. +time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) +config_ck = CheckpointConfig(save_checkpoint_steps=cfg.epoch_size * step_size, + keep_checkpoint_max=cfg.keep_checkpoint_max) +ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", config=config_ck) +model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()}) +``` + +Now we can start training. 
+ +```Python +model.train(cfg['epoch_size'], ds_train, + callbacks=[time_cb, ckpoint_cb, LossMonitor()], + dataset_sink_mode=args.dataset_sink_mode) +``` + +After training, we will get the loss value of each step as follows: + +```bash +>>> Epoch: [ 1/ 10] step: [ 1/ 900], loss: [2.3040/2.5234], time: [1.300234] +>>> ... +>>> Epoch: [ 10/ 10] step: [887/ 900], loss: [0.0113/0.0223], time: [1.300234] +>>> Epoch: [ 10/ 10] step: [888/ 900], loss: [0.0334/0.0223], time: [1.300234] +>>> Epoch: [ 10/ 10] step: [889/ 900], loss: [0.0233/0.0223], time: [1.300234] +``` + +Also, you can just run this command instead. + +```python +python train.py --data_path MNIST_Data --device_target Ascend +``` + +### Evaluate fusion model + +After training completes, we can get the fusion model checkpoint file, such as `checkpoint_lenet.ckpt`. Meanwhile, we can evaluate this fusion model. + +```python +python eval.py --data_path MNIST_Data --device_target Ascend --ckpt_path checkpoint_lenet.ckpt +``` + +The top-1 accuracy will be displayed on the shell. + +```bash +>>> Accuracy: 98.53. +``` + +## Train quantization aware model + +### Define quantization aware model + +You will apply quantization aware training to the whole model, and layers of "fake quant op" are inserted into the whole model. All layers are now prepared with "fake quant op". + +Note that the resulting model is quantization aware but not quantized (e.g. the weights are float32 instead of int8). + +```python +# define fusion network +network = LeNet5Fusion(cfg.num_classes) + +# load quantization aware network checkpoint +param_dict = load_checkpoint(args.ckpt_path) +load_param_into_net(network, param_dict) + +# convert fusion network to quantization aware network +network = quant.convert_quant_network(network) +``` + +### Load checkpoint + +After converting to the quantization aware network, we can load the checkpoint file.
+ +```python +config_ck = CheckpointConfig(save_checkpoint_steps=cfg.epoch_size * step_size, + keep_checkpoint_max=cfg.keep_checkpoint_max) +ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", config=config_ck) +model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()}) +``` + +### Train quantization aware model + +Also, you can just run this command instead. + +```python +python train_quant.py --data_path MNIST_Data --device_target Ascend --ckpt_path checkpoint_lenet.ckpt +``` + +After training, we will get the loss value of each step as follows: + +```bash +>>> Epoch: [ 1/ 10] step: [ 1/ 900], loss: [2.3040/2.5234], time: [1.300234] +>>> ... +>>> Epoch: [ 10/ 10] step: [887/ 900], loss: [0.0113/0.0223], time: [1.300234] +>>> Epoch: [ 10/ 10] step: [888/ 900], loss: [0.0334/0.0223], time: [1.300234] +>>> Epoch: [ 10/ 10] step: [889/ 900], loss: [0.0233/0.0223], time: [1.300234] +``` + +### Evaluate quantization aware model + +The evaluation procedure for the quantization aware model is different from the normal one. Because the checkpoint was created by the quantization aware model, we need to load the fusion model checkpoint before converting the fusion model to the quantization aware model. + +```python +# define fusion network +network = LeNet5Fusion(cfg.num_classes) + +# load quantization aware network checkpoint +param_dict = load_checkpoint(args.ckpt_path) +load_param_into_net(network, param_dict) + +# convert fusion network to quantization aware network +network = quant.convert_quant_network(network) +``` + +Also, you can just run this command instead. + +```python +python eval_quant.py --data_path MNIST_Data --device_target Ascend --ckpt_path checkpoint_lenet.ckpt +``` + +The top-1 accuracy will be displayed on the shell. + +```bash +>>> Accuracy: 98.54.
+``` + +## Note + +Here are some optional parameters: + +```bash +--device_target {Ascend,GPU,CPU} + device where the code will be implemented (default: Ascend) +--data_path DATA_PATH + path where the dataset is saved +--dataset_sink_mode DATASET_SINK_MODE + dataset_sink_mode is False or True +``` + +You can run ```python train.py -h``` or ```python eval.py -h``` to get more information. + +We encourage you to try this new capability, which can be particularly important for deployment in resource-constrained environments. \ No newline at end of file diff --git a/model_zoo/lenet_quant/eval.py b/model_zoo/lenet_quant/eval.py new file mode 100644 index 0000000000..c1e3a5fd8c --- /dev/null +++ b/model_zoo/lenet_quant/eval.py @@ -0,0 +1,64 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +""" +######################## eval lenet example ######################## +eval lenet according to model file: +python eval.py --data_path /YourDataPath --ckpt_path Your.ckpt +""" + +import os +import argparse +import mindspore.nn as nn +from mindspore import context +from mindspore.train.serialization import load_checkpoint, load_param_into_net +from mindspore.train.callback import ModelCheckpoint, CheckpointConfig +from mindspore.train import Model +from mindspore.nn.metrics import Accuracy +from src.dataset import create_dataset +from src.config import mnist_cfg as cfg +from src.lenet_fusion import LeNet5 as LeNet5Fusion + +parser = argparse.ArgumentParser(description='MindSpore MNIST Example') +parser.add_argument('--device_target', type=str, default="Ascend", + choices=['Ascend', 'GPU', 'CPU'], + help='device where the code will be implemented (default: Ascend)') +parser.add_argument('--data_path', type=str, default="./MNIST_Data", + help='path where the dataset is saved') +parser.add_argument('--ckpt_path', type=str, default="", + help='if mode is test, must provide path where the trained ckpt file') +parser.add_argument('--dataset_sink_mode', type=bool, default=True, + help='dataset_sink_mode is False or True') +args = parser.parse_args() + +if __name__ == "__main__": + context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) + ds_eval = create_dataset(os.path.join(args.data_path, "test"), cfg.batch_size, 1) + step_size = ds_eval.get_dataset_size() + + network = LeNet5Fusion(cfg.num_classes) + net_loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction="mean") + repeat_size = cfg.epoch_size + net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) + config_ck = CheckpointConfig(save_checkpoint_steps=cfg.epoch_size * step_size, + keep_checkpoint_max=cfg.keep_checkpoint_max) + ckpoint_cb = 
ModelCheckpoint(prefix="checkpoint_lenet", config=config_ck) + model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()}) + + param_dict = load_checkpoint(args.ckpt_path) + load_param_into_net(network, param_dict) + + print("============== Starting Testing ==============") + acc = model.eval(ds_eval, dataset_sink_mode=args.dataset_sink_mode) + print("============== {} ==============".format(acc)) diff --git a/model_zoo/lenet_quant/eval_quant.py b/model_zoo/lenet_quant/eval_quant.py new file mode 100644 index 0000000000..492f6d36b2 --- /dev/null +++ b/model_zoo/lenet_quant/eval_quant.py @@ -0,0 +1,69 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +""" +######################## eval lenet example ######################## +eval lenet according to model file: +python eval.py --data_path /YourDataPath --ckpt_path Your.ckpt +""" + +import os +import argparse +import mindspore.nn as nn +from mindspore import context +from mindspore.train.serialization import load_checkpoint, load_param_into_net +from mindspore.train.callback import ModelCheckpoint, CheckpointConfig +from mindspore.train import Model +from mindspore.nn.metrics import Accuracy +from mindspore.train.quant import quant +from src.dataset import create_dataset +from src.config import mnist_cfg as cfg +from src.lenet_fusion import LeNet5 as LeNet5Fusion + +parser = argparse.ArgumentParser(description='MindSpore MNIST Example') +parser.add_argument('--device_target', type=str, default="Ascend", + choices=['Ascend', 'GPU', 'CPU'], + help='device where the code will be implemented (default: Ascend)') +parser.add_argument('--data_path', type=str, default="./MNIST_Data", + help='path where the dataset is saved') +parser.add_argument('--ckpt_path', type=str, default="", + help='if mode is test, must provide path where the trained ckpt file') +parser.add_argument('--dataset_sink_mode', type=bool, default=True, + help='dataset_sink_mode is False or True') +args = parser.parse_args() + +if __name__ == "__main__": + context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) + ds_eval = create_dataset(os.path.join(args.data_path, "test"), cfg.batch_size, 1) + step_size = ds_eval.get_dataset_size() + + # define funsion network + network = LeNet5Fusion(cfg.num_classes) + # convert funsion netwrok to quantization aware network + network = quant.convert_quant_network(network, quant_delay=0, bn_fold=False, freeze_bn=10000) + + net_loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction="mean") + net_opt = nn.Momentum(network.trainable_params(), 
cfg.lr, cfg.momentum) + config_ck = CheckpointConfig(save_checkpoint_steps=cfg.epoch_size * step_size, + keep_checkpoint_max=cfg.keep_checkpoint_max) + ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", config=config_ck) + model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()}) + + # load quantization aware network checkpoint + param_dict = load_checkpoint(args.ckpt_path) + load_param_into_net(network, param_dict) + + print("============== Starting Testing ==============") + acc = model.eval(ds_eval, dataset_sink_mode=args.dataset_sink_mode) + print("============== {} ==============".format(acc)) diff --git a/model_zoo/lenet_quant/src/config.py b/model_zoo/lenet_quant/src/config.py new file mode 100644 index 0000000000..ab4b2e4084 --- /dev/null +++ b/model_zoo/lenet_quant/src/config.py @@ -0,0 +1,31 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +""" +network config setting, will be used in train.py +""" + +from easydict import EasyDict as edict + +mnist_cfg = edict({ + 'num_classes': 10, + 'lr': 0.01, + 'momentum': 0.9, + 'epoch_size': 10, + 'batch_size': 64, + 'buffer_size': 1000, + 'image_height': 32, + 'image_width': 32, + 'keep_checkpoint_max': 10, +}) diff --git a/model_zoo/lenet_quant/src/dataset.py b/model_zoo/lenet_quant/src/dataset.py new file mode 100644 index 0000000000..cef6973483 --- /dev/null +++ b/model_zoo/lenet_quant/src/dataset.py @@ -0,0 +1,60 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +""" +Produce the dataset +""" + +import mindspore.dataset as ds +import mindspore.dataset.transforms.vision.c_transforms as CV +import mindspore.dataset.transforms.c_transforms as C +from mindspore.dataset.transforms.vision import Inter +from mindspore.common import dtype as mstype + + +def create_dataset(data_path, batch_size=32, repeat_size=1, + num_parallel_workers=1): + """ + create dataset for train or test + """ + # define dataset + mnist_ds = ds.MnistDataset(data_path) + + resize_height, resize_width = 32, 32 + rescale = 1.0 / 255.0 + shift = 0.0 + rescale_nml = 1 / 0.3081 + shift_nml = -1 * 0.1307 / 0.3081 + + # define map operations + resize_op = CV.Resize((resize_height, resize_width), interpolation=Inter.LINEAR) # Bilinear mode + rescale_nml_op = CV.Rescale(rescale_nml, shift_nml) + rescale_op = CV.Rescale(rescale, shift) + hwc2chw_op = CV.HWC2CHW() + type_cast_op = C.TypeCast(mstype.int32) + + # apply map operations on images + mnist_ds = mnist_ds.map(input_columns="label", operations=type_cast_op, num_parallel_workers=num_parallel_workers) + mnist_ds = mnist_ds.map(input_columns="image", operations=resize_op, num_parallel_workers=num_parallel_workers) + mnist_ds = mnist_ds.map(input_columns="image", operations=rescale_op, num_parallel_workers=num_parallel_workers) + mnist_ds = mnist_ds.map(input_columns="image", operations=rescale_nml_op, num_parallel_workers=num_parallel_workers) + mnist_ds = mnist_ds.map(input_columns="image", operations=hwc2chw_op, num_parallel_workers=num_parallel_workers) + + # apply DatasetOps + buffer_size = 10000 + mnist_ds = mnist_ds.shuffle(buffer_size=buffer_size) # 10000 as in LeNet train script + mnist_ds = mnist_ds.batch(batch_size, drop_remainder=True) + mnist_ds = mnist_ds.repeat(repeat_size) + + return mnist_ds diff --git a/model_zoo/lenet_quant/src/lenet.py b/model_zoo/lenet_quant/src/lenet.py new file mode 100644 index 
0000000000..026f1e8df5 --- /dev/null +++ b/model_zoo/lenet_quant/src/lenet.py @@ -0,0 +1,60 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""LeNet.""" +import mindspore.nn as nn + + +class LeNet5(nn.Cell): + """ + Lenet network + + Args: + num_class (int): Num classes. Default: 10. + + Returns: + Tensor, output tensor + Examples: + >>> LeNet(num_class=10) + + """ + + def __init__(self, num_class=10, channel=1): + super(LeNet5, self).__init__() + self.num_class = num_class + + self.conv1 = nn.Conv2d(channel, 6, 5) + self.conv2 = nn.Conv2d(6, 16, 5) + self.fc1 = nn.Dense(16 * 5 * 5, 120) + self.fc2 = nn.Dense(120, 84) + self.fc3 = nn.Dense(84, self.num_class) + + self.relu = nn.ReLU() + self.max_pool2d = nn.MaxPool2d(kernel_size=2, stride=2) + self.flatten = nn.Flatten() + + def construct(self, x): + x = self.conv1(x) + x = self.relu(x) + x = self.max_pool2d(x) + x = self.conv2(x) + x = self.relu(x) + x = self.max_pool2d(x) + x = self.flatten(x) + x = self.fc1(x) + x = self.relu(x) + x = self.fc2(x) + x = self.relu(x) + x = self.fc3(x) + return x diff --git a/model_zoo/lenet_quant/src/lenet_fusion.py b/model_zoo/lenet_quant/src/lenet_fusion.py new file mode 100644 index 0000000000..809276a482 --- /dev/null +++ b/model_zoo/lenet_quant/src/lenet_fusion.py @@ -0,0 +1,57 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the 
Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""LeNet.""" +import mindspore.nn as nn + + +class LeNet5(nn.Cell): + """ + Lenet network + + Args: + num_class (int): Num classes. Default: 10. + + Returns: + Tensor, output tensor + Examples: + >>> LeNet(num_class=10) + + """ + + def __init__(self, num_class=10, channel=1): + super(LeNet5, self).__init__() + self.num_class = num_class + + # change `nn.Conv2d` to `nn.Conv2dBnAct` + self.conv1 = nn.Conv2dBnAct(channel, 6, 5, activation='relu') + self.conv2 = nn.Conv2dBnAct(6, 16, 5, activation='relu') + # change `nn.Dense` to `nn.DenseBnAct` + self.fc1 = nn.DenseBnAct(16 * 5 * 5, 120, activation='relu') + self.fc2 = nn.DenseBnAct(120, 84, activation='relu') + self.fc3 = nn.DenseBnAct(84, self.num_class) + + self.max_pool2d = nn.MaxPool2d(kernel_size=2, stride=2) + self.flatten = nn.Flatten() + + def construct(self, x): + x = self.conv1(x) + x = self.max_pool2d(x) + x = self.conv2(x) + x = self.max_pool2d(x) + x = self.flatten(x) + x = self.fc1(x) + x = self.fc2(x) + x = self.fc3(x) + return x diff --git a/model_zoo/lenet_quant/train.py b/model_zoo/lenet_quant/train.py new file mode 100644 index 0000000000..6e7a46fb38 --- /dev/null +++ b/model_zoo/lenet_quant/train.py @@ -0,0 +1,61 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +""" +######################## train lenet example ######################## +train lenet and get network model files(.ckpt) : +python train.py --data_path /YourDataPath +""" + +import os +import argparse +import mindspore.nn as nn +from mindspore import context +from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor +from mindspore.train import Model +from mindspore.nn.metrics import Accuracy +from src.dataset import create_dataset +from src.config import mnist_cfg as cfg +from src.lenet_fusion import LeNet5 as LeNet5Fusion + +parser = argparse.ArgumentParser(description='MindSpore MNIST Example') +parser.add_argument('--device_target', type=str, default="Ascend", + choices=['Ascend', 'GPU', 'CPU'], + help='device where the code will be implemented (default: Ascend)') +parser.add_argument('--data_path', type=str, default="./MNIST_Data", + help='path where the dataset is saved') +parser.add_argument('--ckpt_path', type=str, default="", + help='if mode is test, must provide path where the trained ckpt file') +parser.add_argument('--dataset_sink_mode', type=bool, default=True, + help='dataset_sink_mode is False or True') +args = parser.parse_args() + +if __name__ == "__main__": + context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) + ds_train = create_dataset(os.path.join(args.data_path, "train"), cfg.batch_size, cfg.epoch_size) + step_size = ds_train.get_dataset_size() + + network = LeNet5Fusion(cfg.num_classes) + 
net_loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction="mean") + net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) + time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) + config_ck = CheckpointConfig(save_checkpoint_steps=cfg.epoch_size * step_size, + keep_checkpoint_max=cfg.keep_checkpoint_max) + ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", config=config_ck) + model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()}) + + print("============== Starting Training ==============") + model.train(cfg['epoch_size'], ds_train, callbacks=[time_cb, ckpoint_cb, LossMonitor()], + dataset_sink_mode=args.dataset_sink_mode) + print("============== End Training ==============") diff --git a/model_zoo/lenet_quant/train_quant.py b/model_zoo/lenet_quant/train_quant.py new file mode 100644 index 0000000000..04f595f322 --- /dev/null +++ b/model_zoo/lenet_quant/train_quant.py @@ -0,0 +1,70 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +""" +######################## train lenet example ######################## +train lenet and get network model files(.ckpt) : +python train.py --data_path /YourDataPath +""" + +import os +import argparse +import mindspore.nn as nn +from mindspore import context +from mindspore.train.serialization import load_checkpoint, load_param_into_net +from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor +from mindspore.train import Model +from mindspore.nn.metrics import Accuracy +from mindspore.train.quant import quant +from src.dataset import create_dataset +from src.config import mnist_cfg as cfg +from src.lenet_fusion import LeNet5 as LeNet5Fusion + +parser = argparse.ArgumentParser(description='MindSpore MNIST Example') +parser.add_argument('--device_target', type=str, default="Ascend", + choices=['Ascend', 'GPU', 'CPU'], + help='device where the code will be implemented (default: Ascend)') +parser.add_argument('--data_path', type=str, default="./MNIST_Data", + help='path where the dataset is saved') +parser.add_argument('--ckpt_path', type=str, default="", + help='if mode is test, must provide path where the trained ckpt file') +parser.add_argument('--dataset_sink_mode', type=bool, default=True, + help='dataset_sink_mode is False or True') +args = parser.parse_args() + +if __name__ == "__main__": + context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) + ds_train = create_dataset(os.path.join(args.data_path, "train"), cfg.batch_size, cfg.epoch_size) + step_size = ds_train.get_dataset_size() + + # define funsion network + network = LeNet5Fusion(cfg.num_classes) + # load quantization aware network checkpoint + param_dict = load_checkpoint(args.ckpt_path) + load_param_into_net(network, param_dict) + # convert funsion netwrok to quantization aware network + network = quant.convert_quant_network(network, quant_delay=0, 
bn_fold=False, freeze_bn=10000) + + net_loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction="mean") + net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) + time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) + config_ck = CheckpointConfig(save_checkpoint_steps=cfg.epoch_size * step_size, + keep_checkpoint_max=cfg.keep_checkpoint_max) + ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", config=config_ck) + model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()}) + + print("============== Starting Training ==============") + model.train(cfg['epoch_size'], ds_train, callbacks=[time_cb, ckpoint_cb, LossMonitor()], + dataset_sink_mode=args.dataset_sink_mode) + print("============== End Training ==============") diff --git a/example/lstm_aclImdb/README.md b/model_zoo/lstm/README.md similarity index 94% rename from example/lstm_aclImdb/README.md rename to model_zoo/lstm/README.md index 95ac30f3dc..00a3334968 100644 --- a/example/lstm_aclImdb/README.md +++ b/model_zoo/lstm/README.md @@ -72,7 +72,8 @@ result: {'acc': 0.83} ``` usage: train.py [--preprocess {true,false}] [--aclimdb_path ACLIMDB_PATH] [--glove_path GLOVE_PATH] [--preprocess_path PREPROCESS_PATH] - [--ckpt_path CKPT_PATH] [--device_target {GPU,CPU}] + [--ckpt_path CKPT_PATH] [--pre_trained PRE_TRAINED] + [--device_target {GPU,CPU}] parameters/options: --preprocess whether to preprocess data. @@ -80,6 +81,7 @@ parameters/options: --glove_path path where the GloVe is stored. --preprocess_path path where the pre-process data is stored. --ckpt_path the path to save the checkpoint file. + --pre_trained the pretrained checkpoint file path. --device_target the target device to run, support "GPU", "CPU". 
``` diff --git a/example/lstm_aclImdb/eval.py b/model_zoo/lstm/eval.py similarity index 93% rename from example/lstm_aclImdb/eval.py rename to model_zoo/lstm/eval.py index e76d40ac67..a9b81199c1 100644 --- a/example/lstm_aclImdb/eval.py +++ b/model_zoo/lstm/eval.py @@ -21,8 +21,8 @@ import os import numpy as np -from config import lstm_cfg as cfg -from dataset import create_dataset, convert_to_mindrecord +from src.config import lstm_cfg as cfg +from src.dataset import lstm_create_dataset, convert_to_mindrecord from mindspore import Tensor, nn, Model, context from mindspore.model_zoo.lstm import SentimentNet from mindspore.nn import Accuracy @@ -71,11 +71,11 @@ if __name__ == '__main__': model = Model(network, loss, opt, {'acc': Accuracy()}) print("============== Starting Testing ==============") - ds_eval = create_dataset(args.preprocess_path, cfg.batch_size, training=False) + ds_eval = lstm_create_dataset(args.preprocess_path, cfg.batch_size, training=False) param_dict = load_checkpoint(args.ckpt_path) load_param_into_net(network, param_dict) if args.device_target == "CPU": acc = model.eval(ds_eval, dataset_sink_mode=False) else: acc = model.eval(ds_eval) - print("============== Accuracy:{} ==============".format(acc)) + print("============== {} ==============".format(acc)) diff --git a/model_zoo/lstm/src/__init__.py b/model_zoo/lstm/src/__init__.py new file mode 100644 index 0000000000..301ef9dcb7 --- /dev/null +++ b/model_zoo/lstm/src/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the License); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# httpwww.apache.orglicensesLICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ diff --git a/example/lstm_aclImdb/config.py b/model_zoo/lstm/src/config.py similarity index 100% rename from example/lstm_aclImdb/config.py rename to model_zoo/lstm/src/config.py diff --git a/example/lstm_aclImdb/dataset.py b/model_zoo/lstm/src/dataset.py similarity index 96% rename from example/lstm_aclImdb/dataset.py rename to model_zoo/lstm/src/dataset.py index 24797198e0..03d4276dfd 100644 --- a/example/lstm_aclImdb/dataset.py +++ b/model_zoo/lstm/src/dataset.py @@ -19,12 +19,12 @@ import os import numpy as np -from imdb import ImdbParser import mindspore.dataset as ds from mindspore.mindrecord import FileWriter +from .imdb import ImdbParser -def create_dataset(data_home, batch_size, repeat_num=1, training=True): +def lstm_create_dataset(data_home, batch_size, repeat_num=1, training=True): """Data operations.""" ds.config.set_seed(1) data_dir = os.path.join(data_home, "aclImdb_train.mindrecord0") diff --git a/example/lstm_aclImdb/imdb.py b/model_zoo/lstm/src/imdb.py similarity index 100% rename from example/lstm_aclImdb/imdb.py rename to model_zoo/lstm/src/imdb.py index 66d04f1281..9888b4c36f 100644 --- a/example/lstm_aclImdb/imdb.py +++ b/model_zoo/lstm/src/imdb.py @@ -18,8 +18,8 @@ imdb dataset parser. import os from itertools import chain -import gensim import numpy as np +import gensim class ImdbParser(): diff --git a/mindspore/model_zoo/lstm.py b/model_zoo/lstm/src/lstm.py similarity index 63% rename from mindspore/model_zoo/lstm.py rename to model_zoo/lstm/src/lstm.py index 7368bbf8e5..f014eef8df 100644 --- a/mindspore/model_zoo/lstm.py +++ b/model_zoo/lstm/src/lstm.py @@ -13,43 +13,12 @@ # limitations under the License. 
# ============================================================================ """LSTM.""" -import math import numpy as np -from mindspore import Parameter, Tensor, nn, context, ParameterTuple -from mindspore.common.initializer import initializer +from mindspore import Tensor, nn, context from mindspore.ops import operations as P - -def init_lstm_weight( - input_size, - hidden_size, - num_layers, - bidirectional, - has_bias=True): - """Initialize lstm weight.""" - num_directions = 1 - if bidirectional: - num_directions = 2 - - weight_size = 0 - gate_size = 4 * hidden_size - for layer in range(num_layers): - for _ in range(num_directions): - input_layer_size = input_size if layer == 0 else hidden_size * num_directions - weight_size += gate_size * input_layer_size - weight_size += gate_size * hidden_size - if has_bias: - weight_size += 2 * gate_size - - stdv = 1 / math.sqrt(hidden_size) - w_np = np.random.uniform(-stdv, stdv, (weight_size, 1, 1)).astype(np.float32) - w = Parameter(initializer(Tensor(w_np), [weight_size, 1, 1]), name='weight') - - return w - - # Initialize short-term memory (h) and long-term memory (c) to 0 def lstm_default_state(batch_size, hidden_size, num_layers, bidirectional): """init default input.""" @@ -60,19 +29,15 @@ def lstm_default_state(batch_size, hidden_size, num_layers, bidirectional): if context.get_context("device_target") == "CPU": h_list = [] c_list = [] - for i in range(num_layers): - hi = Parameter(initializer( - Tensor(np.zeros((num_directions, batch_size, hidden_size)).astype(np.float32)), - [num_directions, batch_size, hidden_size] - ), name='h' + str(i)) + i = 0 + while i < num_layers: + hi = Tensor(np.zeros((num_directions, batch_size, hidden_size)).astype(np.float32)) h_list.append(hi) - ci = Parameter(initializer( - Tensor(np.zeros((num_directions, batch_size, hidden_size)).astype(np.float32)), - [num_directions, batch_size, hidden_size] - ), name='c' + str(i)) + ci = Tensor(np.zeros((num_directions, batch_size, 
hidden_size)).astype(np.float32)) c_list.append(ci) - h = ParameterTuple(tuple(h_list)) - c = ParameterTuple(tuple(c_list)) + i = i + 1 + h = tuple(h_list) + c = tuple(c_list) return h, c h = Tensor( @@ -108,12 +73,7 @@ class SentimentNet(nn.Cell): has_bias=True, bidirectional=bidirectional, dropout=0.0) - w_init = init_lstm_weight( - embed_size, - num_hiddens, - num_layers, - bidirectional) - self.encoder.weight = w_init + self.h, self.c = lstm_default_state(batch_size, num_hiddens, num_layers, bidirectional) self.concat = P.Concat(1) @@ -128,6 +88,6 @@ class SentimentNet(nn.Cell): embeddings = self.trans(embeddings, self.perm) output, _ = self.encoder(embeddings, (self.h, self.c)) # states[i] size(64,200) -> encoding.size(64,400) - encoding = self.concat((output[0], output[1])) + encoding = self.concat((output[0], output[-1])) outputs = self.decoder(encoding) return outputs diff --git a/example/lstm_aclImdb/train.py b/model_zoo/lstm/train.py similarity index 87% rename from example/lstm_aclImdb/train.py rename to model_zoo/lstm/train.py index 08bea7c63d..732655f1de 100644 --- a/example/lstm_aclImdb/train.py +++ b/model_zoo/lstm/train.py @@ -21,13 +21,14 @@ import os import numpy as np -from config import lstm_cfg as cfg -from dataset import convert_to_mindrecord -from dataset import create_dataset +from src.config import lstm_cfg as cfg +from src.dataset import convert_to_mindrecord +from src.dataset import lstm_create_dataset from mindspore import Tensor, nn, Model, context from mindspore.model_zoo.lstm import SentimentNet from mindspore.nn import Accuracy from mindspore.train.callback import LossMonitor, CheckpointConfig, ModelCheckpoint, TimeMonitor +from mindspore.train.serialization import load_param_into_net, load_checkpoint if __name__ == '__main__': parser = argparse.ArgumentParser(description='MindSpore LSTM Example') @@ -41,6 +42,8 @@ if __name__ == '__main__': help='path where the pre-process data is stored.') parser.add_argument('--ckpt_path', 
type=str, default="./", help='the path to save the checkpoint file.') + parser.add_argument('--pre_trained', type=str, default=None, + help='the pretrained checkpoint file path.') parser.add_argument('--device_target', type=str, default="GPU", choices=['GPU', 'CPU'], help='the target device to run, support "GPU", "CPU". Default: "GPU".') args = parser.parse_args() @@ -63,6 +66,9 @@ if __name__ == '__main__': num_classes=cfg.num_classes, weight=Tensor(embedding_table), batch_size=cfg.batch_size) + # pre_trained + if args.pre_trained: + load_param_into_net(network, load_checkpoint(args.pre_trained)) loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True) opt = nn.Momentum(network.trainable_params(), cfg.learning_rate, cfg.momentum) @@ -71,7 +77,7 @@ if __name__ == '__main__': model = Model(network, loss, opt, {'acc': Accuracy()}) print("============== Starting Training ==============") - ds_train = create_dataset(args.preprocess_path, cfg.batch_size, cfg.num_epochs) + ds_train = lstm_create_dataset(args.preprocess_path, cfg.batch_size, cfg.num_epochs) config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps, keep_checkpoint_max=cfg.keep_checkpoint_max) ckpoint_cb = ModelCheckpoint(prefix="lstm", directory=args.ckpt_path, config=config_ck) diff --git a/model_zoo/mass/README.md b/model_zoo/mass/README.md new file mode 100644 index 0000000000..d6b1c29186 --- /dev/null +++ b/model_zoo/mass/README.md @@ -0,0 +1,592 @@ +![](https://www.mindspore.cn/static/img/logo.a3e472c9.png) + + + +- [MASS: Masked Sequence to Sequence Pre-training for Language Generation Description](#googlenet-description) +- [Model architecture](#model-architecture) +- [Dataset](#dataset) +- [Features](#features) +- [Script description](#script-description) + - [Data Preparation](#Data-Preparation) + - [Tokenization](#Tokenization) + - [Byte Pair Encoding](#Byte-Pair-Encoding) + - [Build Vocabulary](#Build-Vocabulary) + - [Generate Dataset](#Generate-Dataset) + - [News 
Crawl Corpus](#News-Crawl-Corpus) + - [Gigaword Corpus](#Gigaword-Corpus) + - [Cornell Movie Dialog Corpus](#Cornell-Movie-Dialog-Corpus) + - [Configuration](#Configuration) + - [Training & Evaluation process](#Training-&-Evaluation-process) + - [Weights average](#Weights-average) + - [Learning rate scheduler](#Learning-rate-scheduler) +- [Model description](#model-description) + - [Performance](#performance) + - [Results](#results) + - [Training Performance](#training-performance) + - [Inference Performance](#inference-performance) +- [Environment Requirements](#environment-requirements) + - [Platform](#Platform) + - [Requirements](#Requirements) +- [Get started](#get-started) + - [Pre-training](#Pre-training) + - [Fine-tuning](#Fine-tuning) + - [Inference](#Inference) +- [Description of random situation](#description-of-random-situation) +- [others](#others) +- [ModelZoo Homepage](#modelzoo-homepage) + + + + +# MASS: Masked Sequence to Sequence Pre-training for Language Generation Description + +[MASS: Masked Sequence to Sequence Pre-training for Language Generation](https://www.microsoft.com/en-us/research/uploads/prod/2019/06/MASS-paper-updated-002.pdf) was released by MicroSoft in June 2019. + +BERT(Devlin et al., 2018) have achieved SOTA in natural language understanding area by pre-training the encoder part of Transformer(Vaswani et al., 2017) with masked rich-resource text. Likewise, GPT(Raddford et al., 2018) pre-trains the decoder part of Transformer with masked(encoder inputs are masked) rich-resource text. Both of them build a robust language model by pre-training with masked rich-resource text. + +Inspired by BERT, GPT and other language models, MicroSoft addressed [MASS: Masked Sequence to Sequence Pre-training for Language Generation](https://www.microsoft.com/en-us/research/uploads/prod/2019/06/MASS-paper-updated-002.pdf) which combines BERT's and GPT's idea. MASS has an important parameter k, which controls the masked fragment length. 
BERT and GPT are special cases when k equals 1 and the sentence length. + +[Introducing MASS – A pre-training method that outperforms BERT and GPT in sequence to sequence language generation tasks](https://www.microsoft.com/en-us/research/blog/introducing-mass-a-pre-training-method-that-outperforms-bert-and-gpt-in-sequence-to-sequence-language-generation-tasks/) + +[Paper](https://www.microsoft.com/en-us/research/uploads/prod/2019/06/MASS-paper-updated-002.pdf): Song, Kaitao, Xu Tan, Tao Qin, Jianfeng Lu and Tie-Yan Liu. “MASS: Masked Sequence to Sequence Pre-training for Language Generation.” ICML (2019). + + +# Model architecture + +The overall network architecture of MASS is shown below, which is Transformer(Vaswani et al., 2017): + +MASS consists of a 6-layer encoder and a 6-layer decoder with 1024 embedding/hidden size, and 4096 intermediate size between feed forward network which has two full connection layers. + +![Transformer architecture](https://cdn.analyticsvidhya.com/wp-content/uploads/2019/06/Screenshot-from-2019-06-17-19-53-10.png) + + +# Dataset + +Dataset used: +- monolingual English data from News Crawl dataset(WMT 2019) for pre-training. +- Gigaword Corpus(Graff et al., 2003) for Text Summarization. +- Cornell movie dialog corpus(DanescuNiculescu-Mizil & Lee, 2011). + +Details about those datasets can be found in [MASS: Masked Sequence to Sequence Pre-training for Language Generation](https://www.microsoft.com/en-us/research/uploads/prod/2019/06/MASS-paper-updated-002.pdf). + + +# Features + +MASS is designed to jointly pre-train the encoder and decoder to complete the task of language generation. +First of all, through a sequence to sequence framework, MASS only predicts the masked tokens, which forces the encoder to understand the meaning of the unmasked tokens, and encourages the decoder to extract useful information from the encoder. 
+Secondly, by predicting the continuous token of the decoder, the decoder can build better language modeling ability than only predicting discrete token. +Third, by further shielding the input token of the decoder which is not shielded in the encoder, the decoder is encouraged to extract more useful information from the encoder side, rather than using the rich information in the previous token. + + +# Script description + +MASS script and code structure are as follow: + +```text +├── mass + ├── README.md // Introduction of MASS model. + ├── config + │ ├──config.py // Configuration instance definition. + │ ├──config.json // Configuration file. + ├── src + │ ├──dataset + │ ├──bi_data_loader.py // Dataset loader for fine-tune or inferring. + │ ├──mono_data_loader.py // Dataset loader for pre-training. + │ ├──language_model + │ ├──noise_channel_language_model.p // Noisy channel language model for dataset generation. + │ ├──mass_language_model.py // MASS language model according to MASS paper. + │ ├──loose_masked_language_model.py // MASS language model according to MASS released code. + │ ├──masked_language_model.py // Masked language model according to MASS paper. + │ ├──transformer + │ ├──create_attn_mask.py // Generate mask matrix to remove padding positions. + │ ├──transformer.py // Transformer model architecture. + │ ├──encoder.py // Transformer encoder component. + │ ├──decoder.py // Transformer decoder component. + │ ├──self_attention.py // Self-Attention block component. + │ ├──multi_head_attention.py // Multi-Head Self-Attention component. + │ ├──embedding.py // Embedding component. + │ ├──positional_embedding.py // Positional embedding component. + │ ├──feed_forward_network.py // Feed forward network. + │ ├──residual_conn.py // Residual block. + │ ├──beam_search.py // Beam search decoder for inferring. + │ ├──transformer_for_infer.py // Use Transformer to infer. + │ ├──transformer_for_train.py // Use Transformer to train. 
+ │ ├──utils + │ ├──byte_pair_encoding.py // Apply BPE with subword-nmt. + │ ├──dictionary.py // Dictionary. + │ ├──loss_moniter.py // Callback of monitering loss during training step. + │ ├──lr_scheduler.py // Learning rate scheduler. + │ ├──ppl_score.py // Perplexity score based on N-gram. + │ ├──rouge_score.py // Calculate ROUGE score. + │ ├──load_weights.py // Load weights from a checkpoint or NPZ file. + │ ├──initializer.py // Parameters initializer. + ├── vocab + │ ├──all.bpe.codes // BPE codes table(this file should be generated by user). + │ ├──all_en.dict.bin // Learned vocabulary file(this file should be generated by user). + ├── scripts + │ ├──run.sh // Train & evaluate model script. + │ ├──learn_subword.sh // Learn BPE codes. + │ ├──stop_training.sh // Stop training. + ├── requirements.txt // Requirements of third party package. + ├── train.py // Train API entry. + ├── eval.py // Infer API entry. + ├── tokenize_corpus.py // Corpus tokenization. + ├── apply_bpe_encoding.py // Applying bpe encoding. + ├── weights_average.py // Average multi model checkpoints to NPZ format. + ├── news_crawl.py // Create News Crawl dataset for pre-training. + ├── gigaword.py // Create Gigaword Corpus. + ├── cornell_dialog.py // Create Cornell Movie Dialog dataset for conversation response. + +``` + + +## Data Preparation + +The data preparation of a natural language processing task contains data cleaning, tokenization, encoding and vocabulary generation steps. + +In our experiments, using [Byte Pair Encoding(BPE)](https://arxiv.org/abs/1508.07909) could reduce size of vocabulary, and relieve the OOV influence effectively. + +Vocabulary could be created using `src/utils/dictionary.py` with text dictionary which is learnt from BPE. +For more detail about BPE, please refer to [Subword-nmt lib](https://www.cnpython.com/pypi/subword-nmt) or [paper](https://arxiv.org/abs/1508.07909). 
+ +In our experiments, the vocabulary was learned based on 1.9M sentences from the News Crawl Dataset, and the size of the vocabulary is 45755. + +Here, we have a brief introduction of the data preparation scripts. + + +### Tokenization +`tokenize_corpus.py` can be used to tokenize corpora whose text files are in `.txt` format. + +Major parameters in `tokenize_corpus.py`: + +```bash +--corpus_folder: Corpus folder path, if multi-folders are provided, use ',' split folders. +--output_folder: Output folder path. +--tokenizer: Tokenizer to be used, nltk or jieba, if nltk is not installed fully, use jieba instead. +--pool_size: Processes pool size. +``` + +Sample code: +```bash +python tokenize_corpus.py --corpus_folder /{path}/corpus --output_folder /{path}/tokenized_corpus --tokenizer {nltk|jieba} --pool_size 16 +``` + + +### Byte Pair Encoding +After tokenization, BPE is applied to the tokenized corpus with the provided `all.bpe.codes`. + +The apply BPE script can be found in `apply_bpe_encoding.py`. + +Major parameters in `apply_bpe_encoding.py`: + +```bash +--codes: BPE codes file. +--src_folder: Corpus folders. +--output_folder: Output files folder. +--prefix: Prefix of text file in `src_folder`. +--vocab_path: Generated vocabulary output path. +--threshold: Filter out words that frequency is lower than threshold. +--processes: Size of process pool (to accelerate). Default: 2. +``` + +Sample code: +```bash +python apply_bpe_encoding.py --codes /{path}/all.bpe.codes \ + --src_folder /{path}/tokenized_corpus \ + --output_folder /{path}/tokenized_corpus/bpe \ + --prefix tokenized \ + --vocab_path /{path}/vocab_en.dict.bin \ + --processes 32 +``` + + +### Build Vocabulary +Suppose you want to create a new vocabulary, there are two options: +1. Learn BPE codes from scratch, and create vocabulary with multi vocabulary files from `subword-nmt`. +2. Create from an existing vocabulary file whose lines are in the format of `word frequency`. +3. 
*Optional*, Create a small vocabulary based on `vocab/all_en.dict.bin` with method of `shink` from `src/utils/dictionary.py`. +4. Persistent vocabulary to `vocab` folder with method `persistence()`. + +Major interface of `src/utils/dictionary.py` are as follow: + +1. `shrink(self, threshold=50)`: Shrink the size of vocabulary by filter out words frequency is lower than threshold. It returns a new vocabulary. +2. `load_from_text(cls, filepaths: List[str])`: Load existed text vocabulary which lines in the format of `word frequency`. +3. `load_from_persisted_dict(cls, filepath)`: Load from a persisted binary vocabulary which was saved by calling `persistence()` method. +4. `persistence(self, path)`: Save vocabulary object to binary file. + +Sample code: +```python +from src.utils import Dictionary + +vocabulary = Dictionary.load_from_persisted_dict("vocab/all_en.dict.bin") +tokens = [1, 2, 3, 4, 5] +# Convert ids to symbols. +print([vocabulary[t] for t in tokens]) + +sentence = ["Hello", "world"] +# Convert symbols to ids. +print([vocabulary.index[s] for s in sentence]) +``` + +For more detail, please refer to the source file. + + +### Generate Dataset +As mentioned above, three corpus are used in MASS mode, dataset generation scripts for them are provided. + +#### News Crawl Corpus +Script can be found in `news_crawl.py`. + +Major parameters in `news_crawl.py`: + +```bash +Note that please provide `--existed_vocab` or `--dict_folder` at least one. +A new vocabulary would be created in `output_folder` when pass `--dict_folder`. + +--src_folder: Corpus folders. +--existed_vocab: Optional, persisted vocabulary file. +--mask_ratio: Ratio of mask. +--output_folder: Output dataset files folder path. +--max_len: Maximum sentence length. If a sentence longer than `max_len`, then drop it. +--suffix: Optional, suffix of generated dataset files. +--processes: Optional, size of process pool (to accelerate). Default: 2. 
+``` + +Sample code: + +```bash +python news_crawl.py --src_folder /{path}/news_crawl \ + --existed_vocab /{path}/mass/vocab/all_en.dict.bin \ + --mask_ratio 0.5 \ + --output_folder /{path}/news_crawl_dataset \ + --max_len 32 \ + --processes 32 +``` + + +#### Gigaword Corpus +Script can be found in `gigaword.py`. + +Major parameters in `gigaword.py`: + +```bash +--train_src: Train source file path. +--train_ref: Train reference file path. +--test_src: Test source file path. +--test_ref: Test reference file path. +--existed_vocab: Persisted vocabulary file. +--output_folder: Output dataset files folder path. +--noise_prob: Optional, add noise prob. Default: 0. +--max_len: Optional, maximum sentence length. If a sentence longer than `max_len`, then drop it. Default: 64. +--format: Optional, dataset format, "mindrecord" or "tfrecord". Default: "tfrecord". +``` + +Sample code: + +```bash +python gigaword.py --train_src /{path}/gigaword/train_src.txt \ + --train_ref /{path}/gigaword/train_ref.txt \ + --test_src /{path}/gigaword/test_src.txt \ + --test_ref /{path}/gigaword/test_ref.txt \ + --existed_vocab /{path}/mass/vocab/all_en.dict.bin \ + --noise_prob 0.1 \ + --output_folder /{path}/gigaword_dataset \ + --max_len 64 +``` + + +#### Cornell Movie Dialog Corpus +Script can be found in `cornell_dialog.py`. + +Major parameters in `cornell_dialog.py`: + +```bash +--src_folder: Corpus folders. +--existed_vocab: Persisted vocabulary file. +--train_prefix: Train source and target file prefix. Default: train. +--test_prefix: Test source and target file prefix. Default: test. +--output_folder: Output dataset files folder path. +--max_len: Maximum sentence length. If a sentence longer than `max_len`, then drop it. +--valid_prefix: Optional, Valid source and target file prefix. Default: valid. 
+``` + +Sample code: + +```bash +python cornell_dialog.py --src_folder /{path}/cornell_dialog \ + --existed_vocab /{path}/mass/vocab/all_en.dict.bin \ + --train_prefix train \ + --test_prefix test \ + --noise_prob 0.1 \ + --output_folder /{path}/cornell_dialog_dataset \ + --max_len 64 +``` + + +## Configuration +Json file under the path `config/` is the template configuration file. +Almost all of the options and arguments needed could be assigned conveniently, including the training platform, configurations of dataset and model, arguments of optimizer etc. Optional features such as loss scale and checkpoint are also available by setting the options correspondingly. +For more detailed information about the attributes, refer to the file `config/config.py`. + +## Training & Evaluation process +For training a model, the shell script `run.sh` is all you need. In this scripts, the environment variable is set and the training script `train.py` under `mass` is executed. +You may start a task training with single device or multiple devices by assigning the options and run the command in bash: +```bash +sh run.sh [--options] +``` + +The usage is shown as bellow: +```text +Usage: run.sh [-h, --help] [-t, --task ] [-n, --device_num ] + [-i, --device_id ] [-j, --hccl_json ] + [-c, --config ] [-o, --output ] + [-v, --vocab ] + +options: + -h, --help show usage + -t, --task select task: CHAR, 't' for train and 'i' for inference". + -n, --device_num device number used for training: N, default is 1. + -i, --device_id device id used for training with single device: N, 0<=N<=7, default is 0. + -j, --hccl_json rank table file used for training with multiple devices: FILE. + -c, --config configuration file as shown in the path 'mass/config': FILE. + -o, --output assign output file of inference: FILE. + -v, --vocab set the vocabulary" +``` +Notes: Be sure to assign the hccl_json file while running a distributed-training. 
+ +The following command shows an example for training with 2 devices. +```bash +sh run.sh --task t --device_num 2 --hccl_json /{path}/rank_table.json --config /{path}/config.json +``` +P.S. Discontinuous device ids are not supported in `run.sh` at present; the device id in `rank_table.json` must start from 0. + + +If a single device is used, the command would be like this: +```bash +sh run.sh --task t --device_num 1 --device_id 0 --config /{path}/config.json +``` + + +## Weights average + +```python +python weights_average.py --input_files your_checkpoint_list --output_file model.npz +``` + +The `input_files` argument is a list of your checkpoint files. To use model.npz as the weights, add its path in config.json at "existed_ckpt". +```json +{ + ... + "checkpoint_options": { + "existed_ckpt": "/xxx/xxx/model.npz", + "save_ckpt_steps": 1000, + ... + }, + ... +} +``` + + +## Learning rate scheduler + +Two learning rate schedulers are provided in our model: + +1. [Polynomial decay scheduler](https://towardsdatascience.com/learning-rate-schedules-and-adaptive-learning-rate-methods-for-deep-learning-2c8f433990d1). +2. [Inverse square root scheduler](https://ece.uwaterloo.ca/~dwharder/aads/Algorithms/Inverse_square_root/). + +The LR scheduler can be configured in `config/config.json`. + +For the Polynomial decay scheduler, the config could be like: +```json +{ + ... + "learn_rate_config": { + "optimizer": "adam", + "lr": 1e-4, + "lr_scheduler": "poly", + "poly_lr_scheduler_power": 0.5, + "decay_steps": 10000, + "warmup_steps": 2000, + "min_lr": 1e-6 + }, + ... +} +``` + +For the Inverse square root scheduler, the config could be like: +```json +{ + ... + "learn_rate_config": { + "optimizer": "adam", + "lr": 1e-4, + "lr_scheduler": "isr", + "decay_start_step": 12000, + "warmup_steps": 2000, + "min_lr": 1e-6 + }, + ... +} +``` + +More details about the LR scheduler can be found in `src/utils/lr_scheduler.py`. 
+ + +# Model description + +The MASS network is implemented by Transformer, which has multi-encoder layers and multi-decoder layers. +For pre-training, we use the Adam optimizer and loss-scale to get the pre-trained model. +During fine-tuning, we fine-tune this pre-trained model with different datasets according to different tasks. +During testing, we use the fine-tuned model to predict the result, and adopt a beam search algorithm to +get the most probable prediction results. + + +![MASS framework](https://www.microsoft.com/en-us/research/uploads/prod/2019/06/MASS-Fig-2.png) + + +## Performance + +### Results + +#### Fine-Tuning on Text Summarization +The comparisons between MASS and two other pre-training methods in terms of ROUGE score on the text summarization task +with 3.8M training data are as follows: + +| Method | RG-1(F) | RG-2(F) | RG-L(F) | +|:---------------|:--------------|:-------------|:-------------| +| MASS | Ongoing | Ongoing | Ongoing | + +#### Fine-Tuning on Conversational Response Generation +The comparisons between MASS and other baseline methods in terms of PPL on Cornell Movie Dialog corpus are as follows: + +| Method | Data = 10K | Data = 110K | +|--------------------|------------------|-----------------| +| MASS | Ongoing | Ongoing | + +#### Training Performance + +| Parameters | Masked Sequence to Sequence Pre-training for Language Generation | +|:---------------------------|:--------------------------------------------------------------------------| +| Model Version | v1 | +| Resource | Ascend 910, cpu 2.60GHz, 56cores;memory, 314G | +| uploaded Date | 05/24/2020 | +| MindSpore Version | 0.2.0 | +| Dataset | News Crawl 2007-2017 English monolingual corpus, Gigaword corpus, Cornell Movie Dialog corpus | +| Training Parameters | Epoch=50, steps=XXX, batch_size=192, lr=1e-4 | +| Optimizer | Adam | +| Loss Function | Label smoothed cross-entropy criterion | +| outputs | Sentence and probability | +| Loss | Lower than 2 | +| Accuracy | For 
conversation response, ppl=23.52, for text summarization, RG-1=29.79. | +| Speed | 611.45 sentences/s | +| Total time | --/-- | +| Params (M) | 44.6M | +| Checkpoint for Fine tuning | ---Mb, --, [A link]() | +| Model for inference | ---Mb, --, [A link]() | +| Scripts | [A link]() | + + +#### Inference Performance + +| Parameters | Masked Sequence to Sequence Pre-training for Language Generation | +|:---------------------------|:-----------------------------------------------------------| +| Model Version | V1 | +| Resource | Huawei 910 | +| uploaded Date | 05/24/2020 | +| MindSpore Version | 0.2.0 | +| Dataset | Gigaword corpus, Cornell Movie Dialog corpus | +| batch_size | --- | +| outputs | Sentence and probability | +| Accuracy | ppl=23.52 for conversation response, RG-1=29.79 for text summarization. | +| Speed | ---- sentences/s | +| Total time | --/-- | +| Model for inference | ---Mb, --, [A link]() | + + +# Environment Requirements + +## Platform + +- Hardware(Ascend) + - Prepare hardware environment with Ascend processor. If you want to try Ascend, please send the [application form](https://obs-9be7.obs.cn-east-2.myhuaweicloud.com/file/other/Ascend%20Model%20Zoo%E4%BD%93%E9%AA%8C%E8%B5%84%E6%BA%90%E7%94%B3%E8%AF%B7%E8%A1%A8.docx) to ascend@huawei.com. Once approved, you could get the resources for trial. +- Framework + - [MindSpore](http://10.90.67.50/mindspore/archive/20200506/OpenSource/me_vm_x86/) +- For more information, please check the resources below: + - [MindSpore tutorials](https://www.mindspore.cn/tutorial/zh-CN/master/index.html) + - [MindSpore API](https://www.mindspore.cn/api/zh-CN/master/index.html) + +## Requirements + +```txt +nltk +numpy +subword-nmt +rouge +``` + +https://www.mindspore.cn/tutorial/zh-CN/master/advanced_use/network_migration.html + + +# Get started +MASS pre-trains a sequence to sequence model by predicting the masked fragments in an input sequence. 
After this, downstream tasks including text summarization and conversation response are candidates for fine-tuning the model and for inference. +Here we provide a practical example to demonstrate the basic usage of MASS for pre-training, fine-tuning a model, and the inference process. The overall process is as follows: +1. Download and process the dataset. +2. Modify the `config.json` to configure the network. +3. Run a task for pre-training and fine-tuning. +4. Perform inference and validation. + +## Pre-training +For pre-training a model, configure the options in `config.json` first: +- Assign the `pre_train_dataset` under `dataset_config` node to the dataset path. +- Choose the optimizer ('momentum/adam/lamb' is available). +- Assign the 'ckpt_prefix' and 'ckpt_path' under `checkpoint_path` to save the model files. +- Set other arguments including dataset configurations and network configurations. +- If you have a trained model already, assign the `existed_ckpt` to the checkpoint file. + +Run the shell script `run.sh` as follows: + +```bash +sh run.sh -t t -n 1 -i 1 -c /mass/config/config.json +``` +Get the log and output files under the path `./run_mass_*/`, and the model file under the path assigned in the `config/config.json` file. + +## Fine-tuning +For fine-tuning a model, configure the options in `config.json` first: +- Assign the `fine_tune_dataset` under `dataset_config` node to the dataset path. +- Assign the `existed_ckpt` under `checkpoint_path` node to the existing model file generated by pre-training. +- Choose the optimizer ('momentum/adam/lamb' is available). +- Assign the `ckpt_prefix` and `ckpt_path` under `checkpoint_path` node to save the model files. +- Set other arguments including dataset configurations and network configurations. 
+ +Run the shell script `run.sh` as followed: +```bash +sh run.sh -t t -n 1 -i 1 -c config/config.json +``` +Get the log and output files under the path `./run_mass_*/`, and the model file under the path assigned in the `config/config.json` file. + +## Inference +If you need to use the trained model to perform inference on multiple hardware platforms, such as GPU, Ascend 910 or Ascend 310, you can refer to this [Link](https://www.mindspore.cn/tutorial/zh-CN/master/advanced_use/network_migration.html). +For inference, config the options in `config.json` firstly: +- Assign the `test_dataset` under `dataset_config` node to the dataset path. +- Assign the `existed_ckpt` under `checkpoint_path` node to the model file produced by fine-tuning. +- Choose the optimizer('momentum/adam/lamb' is available). +- Assign the `ckpt_prefix` and `ckpt_path` under `checkpoint_path` node to save the model files. +- Set other arguments including dataset configurations and network configurations. + +Run the shell script `run.sh` as followed: + +```bash +sh run.sh -t i -n 1 -i 1 -c config/config.json -o {outputfile} +``` + +# Description of random situation + +MASS model contains dropout operations, if you want to disable dropout, please set related dropout_rate to 0 in `config/config.json`. + + +# others +The model has been validated on Ascend environment, not validated on CPU and GPU. 
+ + +# ModelZoo Homepage + [Link](https://gitee.com/mindspore/mindspore/tree/master/mindspore/model_zoo) diff --git a/model_zoo/mass/__init__.py b/model_zoo/mass/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/model_zoo/mass/apply_bpe_encoding.py b/model_zoo/mass/apply_bpe_encoding.py new file mode 100644 index 0000000000..24341a62ac --- /dev/null +++ b/model_zoo/mass/apply_bpe_encoding.py @@ -0,0 +1,84 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""Apply bpe script.""" +import os +import argparse +from multiprocessing import Pool, cpu_count + +from src.utils import Dictionary +from src.utils import bpe_encode + +parser = argparse.ArgumentParser(description='Apply BPE.') +parser.add_argument("--codes", type=str, default="", required=True, + help="bpe codes path.") +parser.add_argument("--src_folder", type=str, default="", required=True, + help="raw corpus folder.") +parser.add_argument("--output_folder", type=str, default="", required=True, + help="encoded corpus output path.") +parser.add_argument("--prefix", type=str, default="", required=False, + help="Prefix of text file.") +parser.add_argument("--vocab_path", type=str, default="", required=True, + help="Generated vocabulary output path.") +parser.add_argument("--threshold", type=int, default=None, required=False, + help="Filter out words that frequency is lower than threshold.") +parser.add_argument("--processes", type=int, default=2, required=False, + help="Number of processes to use.") + +if __name__ == '__main__': + args, _ = parser.parse_known_args() + + if not (args.codes and args.src_folder and args.output_folder): + raise ValueError("Please enter required params.") + + source_folder = args.src_folder + output_folder = args.output_folder + codes = args.codes + + if not os.path.exists(codes): + raise FileNotFoundError("`--codes` is not existed.") + if not os.path.exists(source_folder) or not os.path.isdir(source_folder): + raise ValueError("`--src_folder` must be a dir and existed.") + if not os.path.exists(output_folder) or not os.path.isdir(output_folder): + raise ValueError("`--output_folder` must be a dir and existed.") + if not isinstance(args.prefix, str) or len(args.prefix) > 128: + raise ValueError("`--prefix` must be a str and len <= 128.") + if not isinstance(args.processes, int): + raise TypeError("`--processes` must be an integer.") + + available_dict = [] 
+ args_groups = [] + for file in os.listdir(source_folder): + if args.prefix and not file.startswith(args.prefix): + continue + if file.endswith(".txt"): + output_path = os.path.join(output_folder, file.replace(".txt", "_bpe.txt")) + dict_path = os.path.join(output_folder, file.replace(".txt", ".dict")) + available_dict.append(dict_path) + args_groups.append((codes, os.path.join(source_folder, file), + output_path, dict_path)) + + kernel_size = 1 if args.processes <= 0 else args.processes + kernel_size = min(kernel_size, cpu_count()) + pool = Pool(kernel_size) + for arg in args_groups: + pool.apply_async(bpe_encode, args=arg) + pool.close() + pool.join() + + vocab = Dictionary.load_from_text(available_dict) + if args.threshold is not None: + vocab = vocab.shrink(args.threshold) + vocab.persistence(args.vocab_path) + print(f" | Vocabulary Size: {len(vocab)}") diff --git a/model_zoo/mass/config/__init__.py b/model_zoo/mass/config/__init__.py new file mode 100644 index 0000000000..d5c6589ee7 --- /dev/null +++ b/model_zoo/mass/config/__init__.py @@ -0,0 +1,20 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""MASS model configuration.""" +from .config import TransformerConfig + +__all__ = [ + "TransformerConfig" +] diff --git a/model_zoo/mass/config/config.json b/model_zoo/mass/config/config.json new file mode 100644 index 0000000000..081fb4a72c --- /dev/null +++ b/model_zoo/mass/config/config.json @@ -0,0 +1,54 @@ +{ + "dataset_config": { + "epochs": 20, + "batch_size": 192, + "pre_train_dataset": "", + "fine_tune_dataset": "", + "test_dataset": "", + "valid_dataset": "", + "dataset_sink_mode": false, + "dataset_sink_step": 100 + }, + "model_config": { + "random_seed": 100, + "save_graphs": false, + "seq_length": 64, + "vocab_size": 45744, + "hidden_size": 1024, + "num_hidden_layers": 6, + "num_attention_heads": 8, + "intermediate_size": 4096, + "hidden_act": "relu", + "hidden_dropout_prob": 0.2, + "attention_dropout_prob": 0.2, + "max_position_embeddings": 64, + "initializer_range": 0.02, + "label_smoothing": 0.1, + "beam_width": 4, + "length_penalty_weight": 1.0, + "max_decode_length": 64, + "input_mask_from_dataset": true + }, + "loss_scale_config": { + "init_loss_scale": 65536, + "loss_scale_factor": 2, + "scale_window": 200 + }, + "learn_rate_config": { + "optimizer": "adam", + "lr": 1e-4, + "lr_scheduler": "poly", + "poly_lr_scheduler_power": 0.5, + "decay_steps": 10000, + "decay_start_step": 12000, + "warmup_steps": 4000, + "min_lr": 1e-6 + }, + "checkpoint_options": { + "existed_ckpt": "", + "save_ckpt_steps": 2500, + "keep_ckpt_max": 50, + "ckpt_prefix": "ckpt", + "ckpt_path": "checkpoints" + } +} diff --git a/model_zoo/mass/config/config.py b/model_zoo/mass/config/config.py new file mode 100644 index 0000000000..985f3aa318 --- /dev/null +++ b/model_zoo/mass/config/config.py @@ -0,0 +1,232 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Configuration class for Transformer.""" +import os +import json +import copy +from typing import List + +import mindspore.common.dtype as mstype + + +def _is_dataset_file(file: str): + return "tfrecord" in file.lower() or "mindrecord" in file.lower() + + +def _get_files_from_dir(folder: str): + _files = [] + for file in os.listdir(folder): + if _is_dataset_file(file): + _files.append(os.path.join(folder, file)) + return _files + + +def get_source_list(folder: str) -> List: + """ + Get file list from a folder. + + Returns: + list, file list. + """ + _list = [] + if not folder: + return _list + + if os.path.isdir(folder): + _list = _get_files_from_dir(folder) + else: + if _is_dataset_file(folder): + _list.append(folder) + return _list + + +PARAM_NODES = {"dataset_config", + "model_config", + "loss_scale_config", + "learn_rate_config", + "checkpoint_options"} + + +class TransformerConfig: + """ + Configuration for `Transformer`. + + Args: + random_seed (int): Random seed. + batch_size (int): Batch size of input dataset. + epochs (int): Epoch number. + dataset_sink_mode (bool): Whether enable dataset sink mode. + dataset_sink_step (int): Dataset sink step. + lr_scheduler (str): Whether use lr_scheduler, only support "ISR" now. + lr (float): Initial learning rate. + min_lr (float): Minimum learning rate. + decay_start_step (int): Step to decay. + warmup_steps (int): Warm up steps. + dataset_schema (str): Path of dataset schema file. 
+ pre_train_dataset (str): Path of pre-training dataset file or folder. + fine_tune_dataset (str): Path of fine-tune dataset file or folder. + test_dataset (str): Path of test dataset file or folder. + valid_dataset (str): Path of validation dataset file or folder. + ckpt_path (str): Checkpoints save path. + save_ckpt_steps (int): Interval of saving ckpt. + ckpt_prefix (str): Prefix of ckpt file. + keep_ckpt_max (int): Max ckpt files number. + seq_length (int): Length of input sequence. Default: 64. + vocab_size (int): The shape of each embedding vector. Default: 46192. + hidden_size (int): Size of embedding, attention, dim. Default: 512. + num_hidden_layers (int): Encoder, Decoder layers. + num_attention_heads (int): Number of hidden layers in the Transformer encoder/decoder + cell. Default: 6. + intermediate_size (int): Size of intermediate layer in the Transformer + encoder/decoder cell. Default: 4096. + hidden_act (str): Activation function used in the Transformer encoder/decoder + cell. Default: "relu". + init_loss_scale (int): Initialized loss scale. + loss_scale_factor (int): Loss scale factor. + scale_window (int): Window size of loss scale. + beam_width (int): Beam width for beam search in inferring. Default: 4. + length_penalty_weight (float): Penalty for sentence length. Default: 1.0. + label_smoothing (float): Label smoothing setting. Default: 0.1. + input_mask_from_dataset (bool): Specifies whether to use the input mask that loaded from + dataset. Default: True. + save_graphs (bool): Whether to save graphs, please set to True if mindinsight + is wanted. + dtype (mstype): Data type of the input. Default: mstype.float32. + max_decode_length (int): Max decode length for inferring. Default: 64. + hidden_dropout_prob (float): The dropout probability for hidden outputs. Default: 0.1. + attention_dropout_prob (float): The dropout probability for + Multi-head Self-Attention. Default: 0.1. 
+ max_position_embeddings (int): Maximum length of sequences used in this + model. Default: 512. + initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. + """ + + def __init__(self, + random_seed=74, + batch_size=64, epochs=1, + dataset_sink_mode=True, dataset_sink_step=1, + lr_scheduler="", optimizer="adam", + lr=1e-4, min_lr=1e-6, + decay_steps=10000, poly_lr_scheduler_power=1, + decay_start_step=-1, warmup_steps=2000, + pre_train_dataset: str = None, + fine_tune_dataset: str = None, + test_dataset: str = None, + valid_dataset: str = None, + ckpt_path: str = None, + save_ckpt_steps=2000, + ckpt_prefix="CKPT", + existed_ckpt="", + keep_ckpt_max=20, + seq_length=128, + vocab_size=46192, + hidden_size=512, + num_hidden_layers=6, + num_attention_heads=8, + intermediate_size=4096, + hidden_act="relu", + hidden_dropout_prob=0.1, + attention_dropout_prob=0.1, + max_position_embeddings=64, + initializer_range=0.02, + init_loss_scale=2 ** 10, + loss_scale_factor=2, scale_window=2000, + beam_width=5, + length_penalty_weight=1.0, + label_smoothing=0.1, + input_mask_from_dataset=True, + save_graphs=False, + dtype=mstype.float32, + max_decode_length=64): + + self.save_graphs = save_graphs + self.random_seed = random_seed + self.pre_train_dataset = get_source_list(pre_train_dataset) # type: List[str] + self.fine_tune_dataset = get_source_list(fine_tune_dataset) # type: List[str] + self.valid_dataset = get_source_list(valid_dataset) # type: List[str] + self.test_dataset = get_source_list(test_dataset) # type: List[str] + + if not isinstance(epochs, int) and epochs < 0: + raise ValueError("`epoch` must be type of int.") + + self.epochs = epochs + self.dataset_sink_mode = dataset_sink_mode + self.dataset_sink_step = dataset_sink_step + + self.ckpt_path = ckpt_path + self.keep_ckpt_max = keep_ckpt_max + self.save_ckpt_steps = save_ckpt_steps + self.ckpt_prefix = ckpt_prefix + self.existed_ckpt = existed_ckpt + + self.batch_size = batch_size + 
self.seq_length = seq_length + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_dropout_prob = attention_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.label_smoothing = label_smoothing + + self.beam_width = beam_width + self.length_penalty_weight = length_penalty_weight + self.max_decode_length = max_decode_length + self.input_mask_from_dataset = input_mask_from_dataset + self.compute_type = mstype.float16 + self.dtype = dtype + + self.scale_window = scale_window + self.loss_scale_factor = loss_scale_factor + self.init_loss_scale = init_loss_scale + + self.optimizer = optimizer + self.lr = lr + self.lr_scheduler = lr_scheduler + self.min_lr = min_lr + self.poly_lr_scheduler_power = poly_lr_scheduler_power + self.decay_steps = decay_steps + self.decay_start_step = decay_start_step + self.warmup_steps = warmup_steps + + self.train_url = "" + + @classmethod + def from_dict(cls, json_object: dict): + """Constructs a `TransformerConfig` from a Python dictionary of parameters.""" + _params = {} + for node in PARAM_NODES: + for key in json_object[node]: + _params[key] = json_object[node][key] + return cls(**_params) + + @classmethod + def from_json_file(cls, json_file): + """Constructs a `TransformerConfig` from a json file of parameters.""" + with open(json_file, "r") as reader: + return cls.from_dict(json.load(reader)) + + def to_dict(self): + """Serializes this instance to a Python dictionary.""" + output = copy.deepcopy(self.__dict__) + return output + + def to_json_string(self): + """Serializes this instance to a JSON string.""" + return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" diff --git a/model_zoo/mass/cornell_dialog.py 
b/model_zoo/mass/cornell_dialog.py new file mode 100644 index 0000000000..e2e9e9155f --- /dev/null +++ b/model_zoo/mass/cornell_dialog.py @@ -0,0 +1,110 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Generate Cornell Movie Dialog dataset.""" +import os +import argparse +from src.dataset import BiLingualDataLoader +from src.language_model import NoiseChannelLanguageModel +from src.utils import Dictionary + +parser = argparse.ArgumentParser(description='Generate Cornell Movie Dialog dataset file.') +parser.add_argument("--src_folder", type=str, default="", required=True, + help="Raw corpus folder.") +parser.add_argument("--existed_vocab", type=str, default="", required=True, + help="Existed vocabulary.") +parser.add_argument("--train_prefix", type=str, default="train", required=False, + help="Prefix of train file.") +parser.add_argument("--test_prefix", type=str, default="test", required=False, + help="Prefix of test file.") +parser.add_argument("--valid_prefix", type=str, default=None, required=False, + help="Prefix of valid file.") +parser.add_argument("--noise_prob", type=float, default=0., required=False, + help="Add noise prob.") +parser.add_argument("--max_len", type=int, default=32, required=False, + help="Max length of sentence.") +parser.add_argument("--output_folder", type=str, default="", required=True, + help="Dataset output 
path.") + +if __name__ == '__main__': + args, _ = parser.parse_known_args() + + dicts = [] + train_src_file = "" + train_tgt_file = "" + test_src_file = "" + test_tgt_file = "" + valid_src_file = "" + valid_tgt_file = "" + for file in os.listdir(args.src_folder): + if file.startswith(args.train_prefix) and "src" in file and file.endswith(".txt"): + train_src_file = os.path.join(args.src_folder, file) + elif file.startswith(args.train_prefix) and "tgt" in file and file.endswith(".txt"): + train_tgt_file = os.path.join(args.src_folder, file) + elif file.startswith(args.test_prefix) and "src" in file and file.endswith(".txt"): + test_src_file = os.path.join(args.src_folder, file) + elif file.startswith(args.test_prefix) and "tgt" in file and file.endswith(".txt"): + test_tgt_file = os.path.join(args.src_folder, file) + elif args.valid_prefix and file.startswith(args.valid_prefix) and "src" in file and file.endswith(".txt"): + valid_src_file = os.path.join(args.src_folder, file) + elif args.valid_prefix and file.startswith(args.valid_prefix) and "tgt" in file and file.endswith(".txt"): + valid_tgt_file = os.path.join(args.src_folder, file) + else: + continue + + vocab = Dictionary.load_from_persisted_dict(args.existed_vocab) + + if train_src_file and train_tgt_file: + BiLingualDataLoader( + src_filepath=train_src_file, + tgt_filepath=train_tgt_file, + src_dict=vocab, tgt_dict=vocab, + src_lang="en", tgt_lang="en", + language_model=NoiseChannelLanguageModel(add_noise_prob=args.noise_prob), + max_sen_len=args.max_len + ).write_to_tfrecord( + path=os.path.join( + args.output_folder, "train_cornell_dialog.tfrecord" + ) + ) + + if test_src_file and test_tgt_file: + BiLingualDataLoader( + src_filepath=test_src_file, + tgt_filepath=test_tgt_file, + src_dict=vocab, tgt_dict=vocab, + src_lang="en", tgt_lang="en", + language_model=NoiseChannelLanguageModel(add_noise_prob=0.), + max_sen_len=args.max_len + ).write_to_tfrecord( + path=os.path.join( + args.output_folder, 
"test_cornell_dialog.tfrecord" + ) + ) + + if args.valid_prefix: + BiLingualDataLoader( + src_filepath=os.path.join(args.src_folder, valid_src_file), + tgt_filepath=os.path.join(args.src_folder, valid_tgt_file), + src_dict=vocab, tgt_dict=vocab, + src_lang="en", tgt_lang="en", + language_model=NoiseChannelLanguageModel(add_noise_prob=0.), + max_sen_len=args.max_len + ).write_to_tfrecord( + path=os.path.join( + args.output_folder, "valid_cornell_dialog.tfrecord" + ) + ) + + print(f" | Vocabulary size: {vocab.size}.") diff --git a/model_zoo/mass/eval.py b/model_zoo/mass/eval.py new file mode 100644 index 0000000000..4da63a7333 --- /dev/null +++ b/model_zoo/mass/eval.py @@ -0,0 +1,75 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""Evaluation api.""" +import argparse +import pickle +import numpy as np + +from mindspore.common import dtype as mstype + +from config import TransformerConfig +from src.transformer import infer +from src.utils import ngram_ppl +from src.utils import Dictionary +from src.utils import rouge + +parser = argparse.ArgumentParser(description='Evaluation MASS.') +parser.add_argument("--config", type=str, required=True, + help="Model config json file path.") +parser.add_argument("--vocab", type=str, required=True, + help="Vocabulary to use.") +parser.add_argument("--output", type=str, required=True, + help="Result file path.") + + +def get_config(config): + config = TransformerConfig.from_json_file(config) + config.compute_type = mstype.float16 + config.dtype = mstype.float32 + return config + + +if __name__ == '__main__': + args, _ = parser.parse_known_args() + vocab = Dictionary.load_from_persisted_dict(args.vocab) + _config = get_config(args.config) + result = infer(_config) + with open(args.output, "wb") as f: + pickle.dump(result, f, 1) + + ppl_score = 0. 
+ preds = [] + tgts = [] + _count = 0 + for sample in result: + sentence_prob = np.array(sample['prediction_prob'], dtype=np.float32) + sentence_prob = sentence_prob[:, 1:] + _ppl = [] + for path in sentence_prob: + _ppl.append(ngram_ppl(path, log_softmax=True)) + ppl = np.min(_ppl) + preds.append(' '.join([vocab[t] for t in sample['prediction']])) + tgts.append(' '.join([vocab[t] for t in sample['target']])) + print(f" | source: {' '.join([vocab[t] for t in sample['source']])}") + print(f" | target: {tgts[-1]}") + print(f" | prediction: {preds[-1]}") + print(f" | ppl: {ppl}.") + if np.isinf(ppl): + continue + ppl_score += ppl + _count += 1 + + print(f" | PPL={ppl_score / _count}.") + rouge(preds, tgts) diff --git a/model_zoo/mass/gigaword.py b/model_zoo/mass/gigaword.py new file mode 100644 index 0000000000..f473ddd5ce --- /dev/null +++ b/model_zoo/mass/gigaword.py @@ -0,0 +1,84 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""Generate Gigaword dataset.""" +import os +import argparse + +from src.dataset import BiLingualDataLoader +from src.language_model import NoiseChannelLanguageModel +from src.utils import Dictionary + +parser = argparse.ArgumentParser(description='Create Gigaword fine-tune Dataset.') +parser.add_argument("--train_src", type=str, default="", required=False, + help="train dataset source file path.") +parser.add_argument("--train_ref", type=str, default="", required=False, + help="train dataset reference file path.") +parser.add_argument("--test_src", type=str, default="", required=False, + help="test dataset source file path.") +parser.add_argument("--test_ref", type=str, default="", required=False, + help="test dataset reference file path.") +parser.add_argument("--noise_prob", type=float, default=0., required=False, + help="add noise prob.") +parser.add_argument("--existed_vocab", type=str, default="", required=False, + help="existed vocab path.") +parser.add_argument("--max_len", type=int, default=64, required=False, + help="max length of sentences.") +parser.add_argument("--output_folder", type=str, default="", required=True, + help="dataset output path.") +parser.add_argument("--format", type=str, default="tfrecord", required=False, + help="dataset format.") + +if __name__ == '__main__': + args, _ = parser.parse_known_args() + + vocab = Dictionary.load_from_persisted_dict(args.existed_vocab) + + if args.train_src and args.train_ref: + train = BiLingualDataLoader( + src_filepath=args.train_src, + tgt_filepath=args.train_ref, + src_dict=vocab, tgt_dict=vocab, + src_lang="en", tgt_lang="en", + language_model=NoiseChannelLanguageModel(add_noise_prob=args.noise_prob), + max_sen_len=args.max_len + ) + if "tf" in args.format.lower(): + train.write_to_tfrecord( + path=os.path.join(args.output_folder, "gigaword_train_dataset.tfrecord") + ) + else: + train.write_to_mindrecord( + 
path=os.path.join(args.output_folder, "gigaword_train_dataset.mindrecord") + ) + + if args.test_src and args.test_ref: + test = BiLingualDataLoader( + src_filepath=args.test_src, + tgt_filepath=args.test_ref, + src_dict=vocab, tgt_dict=vocab, + src_lang="en", tgt_lang="en", + language_model=NoiseChannelLanguageModel(add_noise_prob=0), + max_sen_len=args.max_len + ) + if "tf" in args.format.lower(): + test.write_to_tfrecord( + path=os.path.join(args.output_folder, "gigaword_test_dataset.tfrecord") + ) + else: + test.write_to_mindrecord( + path=os.path.join(args.output_folder, "gigaword_test_dataset.mindrecord") + ) + + print(f" | Vocabulary size: {vocab.size}.") diff --git a/model_zoo/mass/news_crawl.py b/model_zoo/mass/news_crawl.py new file mode 100644 index 0000000000..4481846cca --- /dev/null +++ b/model_zoo/mass/news_crawl.py @@ -0,0 +1,58 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""Generate News Crawl corpus dataset.""" +import argparse + +from src.utils import Dictionary +from src.utils.preprocess import create_pre_training_dataset + +parser = argparse.ArgumentParser(description='Create News Crawl Pre-Training Dataset.') +parser.add_argument("--src_folder", type=str, default="", required=True, + help="Raw corpus folder.") +parser.add_argument("--existed_vocab", type=str, default="", required=True, + help="Existed vocab path.") +parser.add_argument("--mask_ratio", type=float, default=0.4, required=True, + help="Mask ratio.") +parser.add_argument("--output_folder", type=str, default="", required=True, + help="Dataset output path.") +parser.add_argument("--max_len", type=int, default=32, required=False, + help="Max length of sentences.") +parser.add_argument("--suffix", type=str, default="", required=False, + help="Add suffix to output file.") +parser.add_argument("--processes", type=int, default=2, required=False, + help="Size of processes pool.") + +if __name__ == '__main__': + args, _ = parser.parse_known_args() + if not (args.src_folder and args.output_folder): + raise ValueError("Please enter required params.") + + if not args.existed_vocab: + raise ValueError("`--existed_vocab` is required.") + + vocab = Dictionary.load_from_persisted_dict(args.existed_vocab) + + create_pre_training_dataset( + folder_path=args.src_folder, + output_folder_path=args.output_folder, + vocabulary=vocab, + prefix="news.20", suffix=args.suffix, + mask_ratio=args.mask_ratio, + min_sen_len=10, + max_sen_len=args.max_len, + dataset_type="tfrecord", + cores=args.processes + ) + print(f" | Vocabulary size: {vocab.size}.") diff --git a/model_zoo/mass/requirements.txt b/model_zoo/mass/requirements.txt new file mode 100644 index 0000000000..f70e569a82 --- /dev/null +++ b/model_zoo/mass/requirements.txt @@ -0,0 +1,5 @@ +nltk +jieba +numpy +subword-nmt +files2rouge diff --git 
a/model_zoo/mass/scripts/__init__.py b/model_zoo/mass/scripts/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/model_zoo/mass/scripts/learn_subword.sh b/model_zoo/mass/scripts/learn_subword.sh new file mode 100644 index 0000000000..05dd516880 --- /dev/null +++ b/model_zoo/mass/scripts/learn_subword.sh @@ -0,0 +1,20 @@ +#!/bin/bash +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +src_folder_path=$1 # source text folder path. + +cd $src_folder_path || exit +cat *.txt | subword-nmt learn-bpe -s 46000 -o all.bpe.codes diff --git a/model_zoo/mass/scripts/run.sh b/model_zoo/mass/scripts/run.sh new file mode 100644 index 0000000000..fc9606fcbd --- /dev/null +++ b/model_zoo/mass/scripts/run.sh @@ -0,0 +1,169 @@ +#!/usr/bin/env bash +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +export DEVICE_ID=0 +export RANK_ID=0 +export RANK_SIZE=1 + +options=`getopt -u -o ht:n:i:j:c:o:v: -l help,task:,device_num:,device_id:,hccl_json:,config:,output:,vocab -- "$@"` +eval set -- "$options" +echo $options + +echo_help() +{ + echo "Usage:" + echo "bash train.sh [-h] [-t t|i] [-n N] [-i N] [-j FILE] [-c FILE] [-o FILE] [-v FILE]" + echo "options:" + echo " -h --help show usage" + echo " -t --task select task, 't' for training and 'i' for inference" + echo " -n --device_num training with N devices" + echo " -i --device_id training with device i" + echo " -j --hccl_json set the rank table file" + echo " -c --config set the configuration file" + echo " -o --output set the output file of inference" + echo " -v --vocab set the vocabulary" +} + +set_hccl_json() +{ + while [ -n "$1" ] + do + if [[ "$1" == "-j" || "$1" == "--hccl_json" ]] + then + export MINDSPORE_HCCL_CONFIG_PATH=$2 #/data/wsc/hccl_2p_01.json + export RANK_TABLE_FILE=$2 #/data/wsc/hccl_2p_01.json + break + fi + shift + done +} +set_device_id() +{ + while [ -n "$1" ] + do + if [[ "$1" == "-i" || "$1" == "--device_id" ]] + then + if [[ $2 -ge 0 && $2 -le 7 ]] + then + export DEVICE_ID=$2 + fi + break + fi + shift + done +} + +while [ -n "$1" ] +do + case "$1" in + -h|--help) + echo_help + shift + ;; + -t|--task) + echo "task:" + if [ "$2" == "t" ] + then + task=train + elif [ "$2" == "i" ] + then + task=infer + fi + shift 2 + ;; + -n|--device_num) + echo "device_num" + if [ $2 -eq 1 ] + then + set_device_id $options + elif [ $2 -gt 1 ] + then + export HCCL_FLAG=1 + export DEPLOY_MODE=0 + + export RANK_SIZE=$2 + set_hccl_json $options + fi + shift 2 + ;; + -i|--device_id) + echo "set device id" + export DEVICE_ID=$2 + shift 2 + ;; + -c|--config) + echo "config"; + configurations=$2 + shift 2 + ;; + -o|--output) + echo "output"; + output=$2 + shift 2 + ;; + -v|--vocab) + echo "vocab"; + vocab=$2 + shift 2 + ;; + --) + 
shift + break + ;; + *) + shift + ;; +esac +done + +for((i=0; i < $RANK_SIZE; i++)) +do + if [ $RANK_SIZE -gt 1 ] + then + echo $RANK_SIZE + export RANK_ID=$i + export DEVICE_ID=$[i] + fi + echo "Working on device $i" + + file_path=$(cd "$(dirname $0)" || exit; pwd) + cd $file_path || exit + cd ../ || exit + + rm -rf ./run_mass_$DEVICE_ID + mkdir ./run_mass_$DEVICE_ID + + cp train.py ./run_mass_$DEVICE_ID + cp eval.py ./run_mass_$DEVICE_ID + cp $configurations ./run_mass_$DEVICE_ID + + if [ $vocab ] + then + cp $vocab ./run_mass_$DEVICE_ID + fi + + cd ./run_mass_$DEVICE_ID || exit + env > log.log + echo $task + if [ "$task" == "train" ] + then + python train.py --config ${configurations##*/} >>log.log 2>&1 & + elif [ "$task" == "infer" ] + then + python eval.py --config ${configurations##*/} --output ${output} --vocab ${vocab##*/} >>log_infer.log 2>&1 & + fi + cd ../ +done diff --git a/model_zoo/mass/src/__init__.py b/model_zoo/mass/src/__init__.py new file mode 100644 index 0000000000..7e943365a0 --- /dev/null +++ b/model_zoo/mass/src/__init__.py @@ -0,0 +1,44 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""Source of mass model.""" +from .dataset import load_dataset +from .dataset import bi_data_loader +from .dataset import mono_data_loader +from .transformer import TransformerDecoder +from .transformer import TransformerEncoder +from .transformer import Transformer +from .transformer import TransformerNetworkWithLoss +from .transformer import LabelSmoothedCrossEntropyCriterion +from .transformer import TransformerTrainOneStepWithLossScaleCell +from .transformer import TransformerTraining +from .transformer import infer +from .language_model import LooseMaskedLanguageModel +from .language_model import MaskedLanguageModel +from .language_model import NoiseChannelLanguageModel + +__all__ = [ + "load_dataset", + "bi_data_loader", + "mono_data_loader", + "Transformer", + "infer", + "TransformerTraining", + "TransformerNetworkWithLoss", + "TransformerTrainOneStepWithLossScaleCell", + "LabelSmoothedCrossEntropyCriterion", + "LooseMaskedLanguageModel", + "MaskedLanguageModel", + "NoiseChannelLanguageModel" +] diff --git a/model_zoo/mass/src/dataset/__init__.py b/model_zoo/mass/src/dataset/__init__.py new file mode 100644 index 0000000000..b93504d922 --- /dev/null +++ b/model_zoo/mass/src/dataset/__init__.py @@ -0,0 +1,24 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""Dataset module.""" +from .bi_data_loader import BiLingualDataLoader +from .mono_data_loader import MonoLingualDataLoader +from .load_dataset import load_dataset + +__all__ = [ + "load_dataset", + "BiLingualDataLoader", + "MonoLingualDataLoader" +] diff --git a/model_zoo/mass/src/dataset/base.py b/model_zoo/mass/src/dataset/base.py new file mode 100644 index 0000000000..79f1281513 --- /dev/null +++ b/model_zoo/mass/src/dataset/base.py @@ -0,0 +1,102 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Base class of data loader.""" +import os +import collections +import numpy as np + +from mindspore.mindrecord import FileWriter +from .schema import SCHEMA + + +class DataLoader: + """Data loader for dataset.""" + _SCHEMA = SCHEMA + + def __init__(self, max_sen_len=66): + self._examples = [] + self._max_sentence_len = max_sen_len + + def _load(self): + raise NotImplementedError + + def padding(self, sen, padding_idx, dtype=np.int64): + """Padding to sentence.""" + if sen.shape[0] > self._max_sentence_len: + return None + new_sen = np.array([padding_idx] * self._max_sentence_len, + dtype=dtype) + new_sen[:sen.shape[0]] = sen[:] + return new_sen + + def write_to_mindrecord(self, path, shard_num=1, desc=""): + """ + Write mindrecord file. 
+ + Args: + path (str): File path. + shard_num (int): Shard num. + desc (str): Description. + """ + if not os.path.isabs(path): + path = os.path.abspath(path) + + writer = FileWriter(file_name=path, shard_num=shard_num) + writer.add_schema(self._SCHEMA, desc) + if not self._examples: + self._load() + + writer.write_raw_data(self._examples) + writer.commit() + print(f"| Wrote to {path}.") + + def write_to_tfrecord(self, path, shard_num=1): + """ + Write to tfrecord. + + Args: + path (str): Output file path. + shard_num (int): Shard num. + """ + import tensorflow as tf + if not os.path.isabs(path): + path = os.path.abspath(path) + output_files = [] + for i in range(shard_num): + output_file = path + "-%03d-of-%03d" % (i + 1, shard_num) + output_files.append(output_file) + # create writers + writers = [] + for output_file in output_files: + writers.append(tf.io.TFRecordWriter(output_file)) + + if not self._examples: + self._load() + + # create feature + features = collections.OrderedDict() + for example in self._examples: + for key in example: + features[key] = tf.train.Feature(int64_list=tf.train.Int64List(value=example[key].tolist())) + tf_example = tf.train.Example(features=tf.train.Features(feature=features)) + for writer in writers: + writer.write(tf_example.SerializeToString()) + for writer in writers: + writer.close() + for p in output_files: + print(f" | Write to {p}.") + + def _add_example(self, example): + self._examples.append(example) diff --git a/model_zoo/mass/src/dataset/bi_data_loader.py b/model_zoo/mass/src/dataset/bi_data_loader.py new file mode 100644 index 0000000000..e2532662d9 --- /dev/null +++ b/model_zoo/mass/src/dataset/bi_data_loader.py @@ -0,0 +1,142 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Bilingual data loader.""" +import numpy as np + +from src.utils import Dictionary +from .base import DataLoader +from ..language_model.base import LanguageModel +from ..language_model.noise_channel_language_model import NoiseChannelLanguageModel + + +class BiLingualDataLoader(DataLoader): + """Loader for bilingual data.""" + + def __init__(self, src_filepath: str, tgt_filepath: str, + src_dict: Dictionary, tgt_dict: Dictionary, + src_lang: str, tgt_lang: str, + language_model: LanguageModel = NoiseChannelLanguageModel(add_noise_prob=0), + max_sen_len=66, + merge_dict=True): + super(BiLingualDataLoader, self).__init__(max_sen_len) + self._src_filepath = src_filepath + self._tgt_filepath = tgt_filepath + self._src_dict = src_dict + self._tgt_dict = tgt_dict + self.src_lang = src_lang + self.tgt_lang = tgt_lang + self._lm = language_model + self.max_sen_len = max_sen_len + self.share_dict = merge_dict + self._merge_dict() + + def _merge_dict(self): + if self.share_dict: + merged_dict = self._src_dict.merge_dict(self._tgt_dict, + new_dict=True) + self._src_dict = merged_dict + self._tgt_dict = merged_dict + + @property + def src_dict(self): + return self._src_dict + + @property + def tgt_dict(self): + return self._tgt_dict + + def _load(self): + _min_len = 9999999999 + _max_len = 0 + unk_count = 0 + tokens_count = 0 + count = 0 + with open(self._src_filepath, "r") as _src_file: + print(f" | Processing corpus {self._src_filepath}.") + print(f" | Processing corpus 
{self._tgt_filepath}.") + with open(self._tgt_filepath, "r") as _tgt_file: + _min, _max = 9999999, -1 + for _, _pair in enumerate(zip(_src_file, _tgt_file)): + src_tokens = [ + self._src_dict.index(t) + for t in _pair[0].strip().split(" ") if t + ] + tgt_tokens = [ + self._tgt_dict.index(t) + for t in _pair[1].strip().split(" ") if t + ] + src_tokens.append(self._src_dict.eos_index) + tgt_tokens.append(self._tgt_dict.eos_index) + opt = self._lm.emit( + sentence=np.array(src_tokens, dtype=np.int64), + target=np.array(tgt_tokens, dtype=np.int64), + mask_symbol_idx=self._src_dict.mask_index, + bos_symbol_idx=self._tgt_dict.bos_index + ) + src_len = opt["sentence_length"] + tgt_len = opt["tgt_sen_length"] + + _min_len = min(_min_len, opt["sentence_length"], opt["tgt_sen_length"]) + _max_len = max(_max_len, opt["sentence_length"], opt["tgt_sen_length"]) + + if src_len > self.max_sen_len or tgt_len > self.max_sen_len: + continue + + src_padding = np.zeros(shape=self.max_sen_len, dtype=np.int64) + tgt_padding = np.zeros(shape=self.max_sen_len, dtype=np.int64) + for i in range(src_len): + src_padding[i] = 1 + for j in range(tgt_len): + tgt_padding[j] = 1 + + tokens_count += opt["encoder_input"].shape[0] + tokens_count += opt["decoder_input"].shape[0] + tokens_count += opt["decoder_output"].shape[0] + unk_count += np.where(opt["encoder_input"] == self._src_dict.unk_index)[0].shape[0] + unk_count += np.where(opt["decoder_input"] == self._src_dict.unk_index)[0].shape[0] + unk_count += np.where(opt["decoder_output"] == self._src_dict.unk_index)[0].shape[0] + + encoder_input = self.padding(opt["encoder_input"], + self._src_dict.padding_index) + decoder_input = self.padding(opt["decoder_input"], + self._tgt_dict.padding_index) + decoder_output = self.padding(opt["decoder_output"], + self._tgt_dict.padding_index) + if encoder_input is None or decoder_input is None or decoder_output is None: + continue + + _min = np.min([np.min(encoder_input), + np.min(decoder_input), + 
np.min(decoder_output), _min]) + _max = np.max([np.max(encoder_input), + np.max(decoder_input), + np.max(decoder_output), _max]) + + example = { + "src_padding": src_padding, + "tgt_padding": tgt_padding, + "src": encoder_input, + "prev_opt": decoder_input, + "prev_padding": tgt_padding, + "target": decoder_output + } + self._add_example(example) + count += 1 + + print(f" | Shortest len = {_min_len}.") + print(f" | Longest len = {_max_len}.") + print(f" | Total sen = {count}.") + print(f" | Total token num={tokens_count}, " + f"{unk_count / tokens_count * 100}% replaced by .") diff --git a/model_zoo/mass/src/dataset/load_dataset.py b/model_zoo/mass/src/dataset/load_dataset.py new file mode 100644 index 0000000000..9d9d558cb6 --- /dev/null +++ b/model_zoo/mass/src/dataset/load_dataset.py @@ -0,0 +1,121 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Dataset loader to feed into model.""" +import os +import mindspore.common.dtype as mstype +import mindspore.dataset.engine as de +import mindspore.dataset.transforms.c_transforms as deC + + +def _load_dataset(input_files, batch_size, epoch_count=1, + sink_mode=False, sink_step=1, rank_size=1, rank_id=0, shuffle=True): + """ + Load dataset according to passed in params. + + Args: + input_files (list): Data files. + batch_size (int): Batch size. + epoch_count (int): Epoch count. 
+ sink_mode (bool): Whether enable sink mode. + sink_step (int): Step to sink. + rank_size (int): Rank size. + rank_id (int): Rank id. + shuffle (bool): Whether shuffle dataset. + + Returns: + Dataset, dataset instance. + """ + if not input_files: + raise FileNotFoundError("Require at least one dataset.") + + if not (schema_file and + os.path.exists(schema_file) + and os.path.isfile(schema_file) + and os.path.basename(schema_file).endswith(".json")): + raise FileNotFoundError("`dataset_schema` must be a existed json file.") + + if not isinstance(sink_mode, bool): + raise ValueError("`sink` must be type of bool.") + + for datafile in input_files: + print(f" | Loading {datafile}.") + + ds = de.TFRecordDataset( + input_files, + columns_list=[ + "src", "src_padding", + "prev_opt", "prev_padding", + "target", "tgt_padding" + ], + shuffle=shuffle, num_shards=rank_size, shard_id=rank_id, + shard_equal_rows=True, num_parallel_workers=8) + + ori_dataset_size = ds.get_dataset_size() + print(f" | Dataset size: {ori_dataset_size}.") + repeat_count = epoch_count + if sink_mode: + ds.set_dataset_size(sink_step * batch_size) + repeat_count = epoch_count * ori_dataset_size // ds.get_dataset_size() + + type_cast_op = deC.TypeCast(mstype.int32) + ds = ds.map(input_columns="src", operations=type_cast_op) + ds = ds.map(input_columns="src_padding", operations=type_cast_op) + ds = ds.map(input_columns="prev_opt", operations=type_cast_op) + ds = ds.map(input_columns="prev_padding", operations=type_cast_op) + ds = ds.map(input_columns="target", operations=type_cast_op) + ds = ds.map(input_columns="tgt_padding", operations=type_cast_op) + + ds = ds.rename( + input_columns=["src", + "src_padding", + "prev_opt", + "prev_padding", + "target", + "tgt_padding"], + output_columns=["source_eos_ids", + "source_eos_mask", + "target_sos_ids", + "target_sos_mask", + "target_eos_ids", + "target_eos_mask"] + ) + + ds = ds.batch(batch_size, drop_remainder=True) + ds = ds.repeat(repeat_count) + + 
ds.channel_name = 'transformer' + return ds + + +def load_dataset(data_files: list, batch_size: int, epoch_count: int, + sink_mode: bool, sink_step: int = 1, rank_size: int = 1, rank_id: int = 0, shuffle=True): + """ + Load dataset. + + Args: + data_files (list): Data files. + batch_size (int): Batch size. + epoch_count (int): Epoch count. + sink_mode (bool): Whether enable sink mode. + sink_step (int): Step to sink. + rank_size (int): Rank size. + rank_id (int): Rank id. + shuffle (bool): Whether shuffle dataset. + + Returns: + Dataset, dataset instance. + """ + return _load_dataset(data_files, batch_size, epoch_count, sink_mode, + sink_step, rank_size, rank_id, shuffle=shuffle) diff --git a/model_zoo/mass/src/dataset/mono_data_loader.py b/model_zoo/mass/src/dataset/mono_data_loader.py new file mode 100644 index 0000000000..13379a2f42 --- /dev/null +++ b/model_zoo/mass/src/dataset/mono_data_loader.py @@ -0,0 +1,109 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""Mono data loader.""" +import numpy as np + +from src.utils import Dictionary + +from .base import DataLoader +from .schema import SCHEMA +from ..language_model.base import LanguageModel +from ..language_model import LooseMaskedLanguageModel + + +class MonoLingualDataLoader(DataLoader): + """Loader for monolingual data.""" + _SCHEMA = SCHEMA + + def __init__(self, src_filepath: str, lang: str, dictionary: Dictionary, + language_model: LanguageModel = LooseMaskedLanguageModel(mask_ratio=0.3), + max_sen_len=66, min_sen_len=16): + super(MonoLingualDataLoader, self).__init__(max_sen_len=max_sen_len) + self._file_path = src_filepath + self._lang = lang + self._dictionary = dictionary + self._lm = language_model + self.max_sen_len = max_sen_len + self.min_sen_len = min_sen_len + + @property + def dict(self): + return self._dictionary + + def generate_padding_mask(self, sentence, length, exclude_mask=False): + """Generate padding mask vector.""" + src_padding = np.zeros(shape=self.max_sen_len, dtype=np.int64) + if exclude_mask: + pos = np.where(sentence == self._dictionary.padding_index)[0] + else: + pos = np.where((sentence == self._dictionary.padding_index) | (sentence == self._dictionary.mask_index))[0] + src_padding[0:length] = 1 + if pos.shape[0] != 0: + src_padding[pos] = 0 + return src_padding + + def _load(self): + _min_len = 9999999999 + _max_len = 0 + count = 0 + with open(self._file_path, "r") as _file: + print(f" | Processing corpus {self._file_path}.") + for _, _line in enumerate(_file): + tokens = [self._dictionary.index(t.replace(" ", "")) + for t in _line.strip().split(" ") if t] + # In mass code, it doesn't add to sen. 
+ tokens.append(self._dictionary.eos_index) + opt = self._lm.emit(sentence=np.array(tokens, dtype=np.int32), + vocabulary=self._dictionary) + + src_len = opt["sentence_length"] + _min_len = min(_min_len, opt["sentence_length"], opt["tgt_sen_length"]) + _max_len = max(_max_len, opt["sentence_length"], opt["tgt_sen_length"]) + + if src_len > self.max_sen_len: + continue + if src_len < self.min_sen_len: + continue + + src_padding = self.generate_padding_mask(opt["encoder_input"], + opt["sentence_length"], + exclude_mask=False) + tgt_padding = self.generate_padding_mask(opt["decoder_input"], + opt["tgt_sen_length"], + exclude_mask=True) + + encoder_input = self.padding(opt["encoder_input"], + self._dictionary.padding_index) + decoder_input = self.padding(opt["decoder_input"], + self._dictionary.padding_index) + decoder_output = self.padding(opt["decoder_output"], + self._dictionary.padding_index) + if encoder_input is None or decoder_input is None or decoder_output is None: + continue + + example = { + "src": encoder_input, + "src_padding": src_padding, + "prev_opt": decoder_input, + "prev_padding": tgt_padding, + "target": decoder_output, + "tgt_padding": tgt_padding, + } + self._add_example(example) + count += 1 + + print(f" | Shortest len = {_min_len}.") + print(f" | Longest len = {_max_len}.") + print(f" | Total sen = {count}.") diff --git a/model_zoo/mass/src/dataset/schema.py b/model_zoo/mass/src/dataset/schema.py new file mode 100644 index 0000000000..9e92d7979b --- /dev/null +++ b/model_zoo/mass/src/dataset/schema.py @@ -0,0 +1,24 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Define schema of mindrecord.""" + +SCHEMA = { + "src": {"type": "int64", "shape": [-1]}, + "src_padding": {"type": "int64", "shape": [-1]}, + "prev_opt": {"type": "int64", "shape": [-1]}, + "prev_padding": {"type": "int64", "shape": [-1]}, + "target": {"type": "int64", "shape": [-1]}, + "tgt_padding": {"type": "int64", "shape": [-1]}, +} diff --git a/model_zoo/mass/src/language_model/__init__.py b/model_zoo/mass/src/language_model/__init__.py new file mode 100644 index 0000000000..329e39c128 --- /dev/null +++ b/model_zoo/mass/src/language_model/__init__.py @@ -0,0 +1,26 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""Language model.""" +from .noise_channel_language_model import NoiseChannelLanguageModel +from .masked_language_model import MaskedLanguageModel +from .loose_masked_language_model import LooseMaskedLanguageModel +from .mass_language_model import MassLanguageModel + +__all__ = [ + "LooseMaskedLanguageModel", + "MassLanguageModel", + "MaskedLanguageModel", + "NoiseChannelLanguageModel" +] diff --git a/model_zoo/mass/src/language_model/base.py b/model_zoo/mass/src/language_model/base.py new file mode 100644 index 0000000000..1803a9ea13 --- /dev/null +++ b/model_zoo/mass/src/language_model/base.py @@ -0,0 +1,25 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""Base language model.""" + + +class LanguageModel: + """Define base language model.""" + + def __init__(self): + pass + + def emit(self, **kwargs): + raise NotImplementedError diff --git a/model_zoo/mass/src/language_model/loose_masked_language_model.py b/model_zoo/mass/src/language_model/loose_masked_language_model.py new file mode 100644 index 0000000000..eb7df52a5f --- /dev/null +++ b/model_zoo/mass/src/language_model/loose_masked_language_model.py @@ -0,0 +1,130 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Modified masked language model.""" +import numpy as np + +from src.utils import Dictionary +from .base import LanguageModel + + +class LooseMaskedLanguageModel(LanguageModel): + """ + Modified mask operation on sentence. + + If k is assigned, then mask sentence with length k. + Otherwise, use mask_ratio. + + Args: + k (int): Length of fragment. + mask_ratio (float): Mask ratio. + """ + + def __init__(self, k: int = None, mask_ratio=0.5, + mask_all_prob=None): + super(LooseMaskedLanguageModel, self).__init__() + self.mask_ratio = mask_ratio + self._k = k + self._threshold = mask_all_prob + + def emit(self, sentence: np.ndarray, vocabulary: Dictionary): + """ + Mask mono source sentence. 
+ + A sample used to train model is processed with following step: + + encoder input (source): [x1, x2, x3, x4, x5, x6, x7, x8, ] + masked encoder input: [x1, x2, x3, _, _, _, x7, x8, ] + decoder input: [ -, x3, x4, x5] + | | | | + V V V V + decoder output: [x3, x4, x5, x6] + + Notes: + A simple rule is made that source sentence starts without + but end with . + + Args: + vocabulary (Dictionary): Vocabulary. + sentence (np.ndarray): Raw sentence instance. + + Returns: + dict, an example. + """ + # If v=0, then u must equal to 0. [u, v) + u, v = self._get_masked_interval(sentence.shape[0], + self._k, self._threshold) + + encoder_input = sentence.copy() + right_shifted_sentence = np.concatenate(([vocabulary.bos_index], sentence[:-1])) + + if u == 0: + _len = v - u if v - u != 0 else sentence.shape[0] + decoder_input = right_shifted_sentence[:_len] + decoder_input[0] = vocabulary.mask_index + decoder_output = sentence[:_len].copy() + else: + decoder_input = right_shifted_sentence[u - 1:v] + decoder_input[0] = vocabulary.mask_index + decoder_output = sentence[u - 1:v].copy() + + if v == 0: + decoder_input[:] = vocabulary.mask_index + else: + encoder_input[np.arange(start=u, stop=v)] = vocabulary.mask_index + + if u != v and u > 1: + padding = np.array([vocabulary.padding_index] * (u - 1), dtype=np.int32) + decoder_input = np.concatenate((padding, decoder_input)) + decoder_output = np.concatenate((padding, decoder_output)) + + if decoder_input.shape[0] != decoder_output.shape[0]: + raise ValueError("seq len must equal.") + + return { + "sentence_length": sentence.shape[0], + "tgt_sen_length": decoder_output.shape[0], + "encoder_input": encoder_input, # end with + "decoder_input": decoder_input, + "decoder_output": decoder_output # end with + } + + def _get_masked_interval(self, length, fix_length=None, + threshold_to_mask_all=None): + """ + Generate a sequence length according to length and mask_ratio. + + Args: + length (int): Sequence length. 
+ + Returns: + Tuple[int, int], [start position, end position]. + """ + # Can not larger than sequence length. + # Mask_length belongs to [0, length]. + if fix_length is not None: + interval_length = min(length, fix_length) + else: + interval_length = min(length, round(self.mask_ratio * length)) + + _magic = np.random.random() + if threshold_to_mask_all is not None and _magic <= threshold_to_mask_all: + return 0, length + + # If not sequence to be masked, then return 0, 0. + if interval_length == 0: + return 0, 0 + # Otherwise, return start position and interval length. + start_pos = np.random.randint(low=0, high=length - interval_length + 1) + return start_pos, start_pos + interval_length diff --git a/model_zoo/mass/src/language_model/masked_language_model.py b/model_zoo/mass/src/language_model/masked_language_model.py new file mode 100644 index 0000000000..52aed8d53e --- /dev/null +++ b/model_zoo/mass/src/language_model/masked_language_model.py @@ -0,0 +1,128 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Masked language model.""" +import numpy as np + +from .base import LanguageModel + + +class MaskedLanguageModel(LanguageModel): + """ + Do mask operation on sentence. + + If k is assigned, then mask sentence with length k. + Otherwise, use mask_ratio. + + Args: + k (int): Length of fragment. + mask_ratio (float): Mask ratio. 
+ """ + + def __init__(self, k: int = None, mask_ratio=0.5, + mask_all_prob=None): + super(MaskedLanguageModel, self).__init__() + self.mask_ratio = mask_ratio + self._k = k + self._threshold = mask_all_prob + + def emit(self, sentence: np.ndarray, vocabulary): + """ + Mask mono source sentence. + + A sample used to train model is processed with following step: + + encoder input (source): [x1, x2, x3, x4, x5, x6, x7, x8, ] + masked encoder input: [x1, x2, _, _, _, x6, x7, x8, ] + decoder input: [ _, x3, x4] + | | | + V V V + decoder output: [ x3, x4, x5] + + Notes: + A simple rule is made that source sentence starts without + but end with . + + Args: + vocabulary (Dictionary): Vocabulary. + sentence (np.ndarray): Raw sentence instance. + + Returns: + dict, an example. + """ + encoder_input = sentence.copy() + seq_len = encoder_input.shape[0] + + # If v=0, then u must equal to 0. [u, v) + u, v = self._get_masked_interval(len(encoder_input), + self._k, self._threshold) + + if u == 0: + _len = v - u if v - u != 0 else seq_len + decoder_input = np.array([vocabulary.mask_index] * _len, dtype=np.int32) + decoder_input[1:] = encoder_input[:_len - 1].copy() + else: + decoder_input = np.array([vocabulary.mask_index] * (v - u), dtype=np.int32) + decoder_input[1:] = encoder_input[u:v - 1].copy() + + if v == 0: + decoder_output = encoder_input.copy() + encoder_input[:] = vocabulary.mask_index + else: + decoder_output = encoder_input[u:v].copy() + encoder_input[np.arange(start=u, stop=v)] = vocabulary.mask_index + + if u != v and u > 0: + padding = np.array([vocabulary.padding_index] * u, dtype=np.int32) + decoder_input = np.concatenate((padding, decoder_input)) + decoder_output = np.concatenate((padding, decoder_output)) + + assert decoder_input.shape[0] == decoder_output.shape[0], "seq len must equal." 
+ + return { + "sentence_length": seq_len, + "tgt_sen_length": decoder_output.shape[0], + "encoder_input": encoder_input, # end with + "decoder_input": decoder_input, + "decoder_output": decoder_output # end with + } + + def _get_masked_interval(self, length, fix_length=None, + threshold_to_mask_all=None): + """ + Generate a sequence length according to length and mask_ratio. + + Args: + length (int): Sequence length. + + Returns: + Tuple[int, int], [start position, end position]. + """ + # Can not larger than sequence length. + # Mask_length belongs to [0, length]. + if fix_length is not None: + interval_length = min(length, fix_length) + else: + interval_length = min(length, round(self.mask_ratio * length)) + + _magic = np.random.random() + if threshold_to_mask_all is not None and _magic <= threshold_to_mask_all: + return 0, length + + # If not sequence to be masked, then return 0, 0. + if interval_length == 0: + return 0, 0 + # Otherwise, return start position and interval length. + start_pos = np.random.randint(low=0, high=length - interval_length + 1) + return start_pos, start_pos + interval_length diff --git a/model_zoo/mass/src/language_model/mass_language_model.py b/model_zoo/mass/src/language_model/mass_language_model.py new file mode 100644 index 0000000000..68b79265f8 --- /dev/null +++ b/model_zoo/mass/src/language_model/mass_language_model.py @@ -0,0 +1,202 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""Masked language model.""" +import numpy as np + +from .base import LanguageModel + + +class MassLanguageModel(LanguageModel): + """ + Do mask operation on sentence. + + If k is assigned, then mask sentence with length k. + Otherwise, use mask_ratio. + + In mass paper, mask_ratio:keep_ratio:random_ratio=8:1:1, + fragment_ratio=0.5. + + Args: + fragment_ratio (float): Masked length of fragment. + mask_ratio (float): Total mask ratio. + keep_ratio (float): Keep ratio. + random_ratio (float): Random replacement ratio. + mask_all_prob (float): Mask all ratio. + """ + + def __init__(self, fragment_ratio: float = 0.5, + mask_ratio: float = 0.8, + keep_ratio: float = 0.1, + random_ratio: float = 0.1, + mask_all_prob=None): + if mask_ratio + keep_ratio + random_ratio > 1: + raise ValueError("The sum of `mask_ratio`, `keep_ratio` and `random_ratio` must less or equal to 1.") + + super(MassLanguageModel, self).__init__() + self.fragment_ratio = fragment_ratio + self.keep_ratio = keep_ratio + self.random_ratio = random_ratio + self._threshold = mask_all_prob + + def emit(self, sentence: np.ndarray, vocabulary): + """ + Mask mono source sentence. + + A sample used to train model is processed with following step: + + encoder input (source): [x1, x2, x3, x4, x5, x6, x7, x8, ] + masked encoder input: [x1, x2, _, _, _, x6, x7, x8, ] + decoder input: [ _, x3, x4] + | | | + V V V + decoder output: [ x3, x4, x5] + + Notes: + A simple rule is made that source sentence starts without + but end with . + + Args: + vocabulary (Dictionary): Vocabulary. + sentence (np.ndarray): Raw sentence instance. + + Returns: + dict, an example. + """ + encoder_input = sentence.copy() + seq_len = encoder_input.shape[0] + + # If v=0, then u must equal to 0. 
[u, v) + u, v = self._get_masked_interval( + len(encoder_input), + threshold_to_mask_all=self._threshold + ) + + if u == 0: + _len = v - u if v - u != 0 else seq_len + decoder_input = np.array([vocabulary.mask_index] * _len, dtype=np.int32) + decoder_input[1:] = encoder_input[:_len - 1].copy() + else: + decoder_input = np.array([vocabulary.mask_index] * (v - u), dtype=np.int32) + decoder_input[1:] = encoder_input[u:v - 1].copy() + + if v == 0: + decoder_output = encoder_input.copy() + encoder_input[:] = vocabulary.mask_index + else: + decoder_output = encoder_input[u:v].copy() + encoder_input[np.arange(start=u, stop=v)] = vocabulary.mask_index + + if u != v and u > 0: + padding = np.array([vocabulary.padding_index] * u, dtype=np.int32) + decoder_input = np.concatenate((padding, decoder_input)) + decoder_output = np.concatenate((padding, decoder_output)) + + assert decoder_input.shape[0] == decoder_output.shape[0], "seq len must equal." + + # Get masked tokens positions. + src_idx = np.where(encoder_input == vocabulary.mask_index)[0] + if src_idx.shape[0] != 0: + encoder_input = self._replace(encoder_input.copy(), + replacement=sentence, + position=src_idx, + vocabulary=vocabulary, + repl_prob=self.keep_ratio, + random_prob=self.random_ratio) + + prev_opt_idx = np.where(decoder_input != vocabulary.padding_index)[0] + if prev_opt_idx.shape[0] != 0: + decoder_input = self._replace(decoder_input.copy(), + replacement=vocabulary.mask_index, + position=prev_opt_idx, + vocabulary=vocabulary, + repl_prob=self.keep_ratio, + random_prob=self.random_ratio) + + return { + "sentence_length": seq_len, + "tgt_sen_length": decoder_output.shape[0], + "encoder_input": encoder_input, # end with + "decoder_input": decoder_input, + "decoder_output": decoder_output # end with + } + + @staticmethod + def _replace(sentence, replacement, position, vocabulary, repl_prob, random_prob): + """ + Do replacement randomly according to mass paper. + + Args: + sentence (np.ndarray): Sentence. 
+ replacement (Union[int, np.ndarray]): Replacement char. + position (np.ndarray): Position to be replaced. + vocabulary (Dictionary): Vocabulary. + repl_prob (float): Replace to mask prob. + random_prob (float): Replace randomly prob. + + Returns: + np.ndarray, a sentence. + """ + _probs = [repl_prob, random_prob] + _repl_len, _random_len = np.floor( + np.array(_probs) * position.shape[0] + ).astype(np.int32) + + if _repl_len + _random_len >= position.shape[0]: + return sentence + + if 0 < _repl_len < position.shape[0]: + _repl_idx = np.random.choice(a=position, size=_repl_len, replace=False) + if isinstance(replacement, np.ndarray): + sentence[_repl_idx] = replacement[_repl_idx] + else: + sentence[_repl_idx] = replacement + + if 0 < _random_len < position.shape[0]: + _random_idx = np.random.choice(a=position, size=_random_len, replace=False) + sentence[_random_idx] = np.random.randint( + low=5, high=vocabulary.size, + size=_random_idx.shape[0], dtype=np.int32 + ) + + return sentence + + def _get_masked_interval(self, length, fix_length=None, + threshold_to_mask_all=None): + """ + Generate a sequence length according to length and mask_ratio. + + Args: + length (int): Sequence length. + + Returns: + Tuple[int, int], [start position, end position]. + """ + # Can not larger than sequence length. + # Mask_length belongs to [0, length]. + if fix_length is not None: + interval_length = min(length, fix_length) + else: + interval_length = min(length, round(self.fragment_ratio * length)) + + _magic = np.random.random() + if threshold_to_mask_all is not None and _magic <= threshold_to_mask_all: + return 0, length + + # If not sequence to be masked, then return 0, 0. + if interval_length == 0: + return 0, 0 + # Otherwise, return start position and interval length. 
+ start_pos = np.random.randint(low=0, high=length - interval_length + 1) + return start_pos, start_pos + interval_length diff --git a/model_zoo/mass/src/language_model/noise_channel_language_model.py b/model_zoo/mass/src/language_model/noise_channel_language_model.py new file mode 100644 index 0000000000..2da89b659e --- /dev/null +++ b/model_zoo/mass/src/language_model/noise_channel_language_model.py @@ -0,0 +1,72 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Noise channel language model.""" +import numpy as np + +from .base import LanguageModel + + +class NoiseChannelLanguageModel(LanguageModel): + """Do mask on bilingual data.""" + + def __init__(self, add_noise_prob: float = 0.1): + super(NoiseChannelLanguageModel, self).__init__() + self._noisy_prob = add_noise_prob + + def emit(self, sentence: np.ndarray, target: np.ndarray, + mask_symbol_idx: int, + bos_symbol_idx: int): + """ + Add noise to sentence randomly. + + For example, given a sentence pair: + source sentence: [x1, x2, x3, x4, x5, x6, ] + target sentence: [y1, y2, y3, y4, ] + + After do random mask, data is looked like: + encoder input (source): [x1, x2, _, x4, x5, _, ] + decoder input: [, y1, y2, y3, y4] + | | | | | + V V V V V + decoder output: [ y1, y2, y3, y4, ] + + Args: + sentence (np.ndarray): Raw sentence. + target (np.ndarray): Target output (prediction). 
+ mask_symbol_idx (int): Index of MASK symbol. + bos_symbol_idx (int): Index of bos symbol. + + Returns: + dict, an example. + """ + encoder_input = sentence.copy() + tgt_seq_len = target.shape[0] + + for i, _ in enumerate(encoder_input): + _prob = np.random.random() + if _prob < self._noisy_prob: + encoder_input[i] = mask_symbol_idx + + decoder_input = np.empty(shape=tgt_seq_len, dtype=np.int64) + decoder_input[1:] = target[:-1] + decoder_input[0] = bos_symbol_idx + + return { + "sentence_length": encoder_input.shape[0], + "tgt_sen_length": tgt_seq_len, + "encoder_input": encoder_input, # end with + "decoder_input": decoder_input, # start with + "decoder_output": target # end with + } diff --git a/model_zoo/mass/src/transformer/__init__.py b/model_zoo/mass/src/transformer/__init__.py new file mode 100644 index 0000000000..7912e7f0dd --- /dev/null +++ b/model_zoo/mass/src/transformer/__init__.py @@ -0,0 +1,34 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""Transformer model module.""" +from .transformer import Transformer +from .encoder import TransformerEncoder +from .decoder import TransformerDecoder +from .beam_search import BeamSearchDecoder +from .transformer_for_train import TransformerTraining, LabelSmoothedCrossEntropyCriterion, \ + TransformerNetworkWithLoss, TransformerTrainOneStepWithLossScaleCell +from .infer_mass import infer + +__all__ = [ + "infer", + "TransformerTraining", + "LabelSmoothedCrossEntropyCriterion", + "TransformerTrainOneStepWithLossScaleCell", + "TransformerNetworkWithLoss", + "Transformer", + "TransformerEncoder", + "TransformerDecoder", + "BeamSearchDecoder" +] diff --git a/model_zoo/mass/src/transformer/beam_search.py b/model_zoo/mass/src/transformer/beam_search.py new file mode 100644 index 0000000000..0c48aa3cf0 --- /dev/null +++ b/model_zoo/mass/src/transformer/beam_search.py @@ -0,0 +1,363 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Beam search decoder.""" +import numpy as np + +import mindspore.common.dtype as mstype +import mindspore.nn as nn +from mindspore.ops import operations as P +from mindspore.common.tensor import Tensor + +INF = 1. * 1e9 + + +class LengthPenalty(nn.Cell): + """ + Length penalty. + + Args: + weight (float): The length penalty weight. 
+ compute_type (mstype): Mindspore data type. Default: mstype.float32. + """ + + def __init__(self, weight=1.0, compute_type=mstype.float32): + super(LengthPenalty, self).__init__() + self.weight = weight + + self.add = P.TensorAdd() + self.pow = P.Pow() + self.div = P.RealDiv() + + self.five = Tensor(5.0, mstype.float32) + self.six = Tensor(6.0, mstype.float32) + + self.cast = P.Cast() + + def construct(self, length_tensor): + """ + Process source sentence + + Inputs: + length_tensor (Tensor): the input tensor. + + Returns: + Tensor, after punishment of length. + """ + length_tensor = self.cast(length_tensor, mstype.float32) + output = self.add(length_tensor, self.five) + output = self.div(output, self.six) + output = self.pow(output, self.weight) + return output + + +class TileBeam(nn.Cell): + """ + Beam Tile operation. + + Args: + beam_width (int): The Number of beam. + compute_type (mstype): Mindspore data type. Default: mstype.float32. + """ + + def __init__(self, beam_width, compute_type=mstype.float32): + super(TileBeam, self).__init__() + self.beam_width = beam_width + + self.expand = P.ExpandDims() + self.tile = P.Tile() + self.reshape = P.Reshape() + self.shape = P.Shape() + + def construct(self, input_tensor): + """ + Process source sentence + + Inputs: + input_tensor (Tensor): with shape (N, T, D). + + Returns: + Tensor, tiled tensor. + """ + shape = self.shape(input_tensor) + # add an dim + input_tensor = self.expand(input_tensor, 1) + # get tile shape: [1, beam, ...] + # shape = self.shape(input_tensor) + tile_shape = (1,) + (self.beam_width,) + for _ in range(len(shape) - 1): + tile_shape = tile_shape + (1,) + # tile + output = self.tile(input_tensor, tile_shape) + # reshape to [batch*beam, ...] + out_shape = (shape[0] * self.beam_width,) + shape[1:] + output = self.reshape(output, out_shape) + + return output + + +class Mod(nn.Cell): + """ + Mod operation. + + Args: + compute_type (mstype): Mindspore data type. Default: mstype.float32. 
+ """ + + def __init__(self, + compute_type=mstype.float32): + super(Mod, self).__init__() + self.compute_type = compute_type + + self.floor_div = P.FloorDiv() + self.sub = P.Sub() + self.multiply = P.Mul() + + def construct(self, input_x, input_y): + """ + Get the remainder of input_x and input_y. + + Inputs: + input_x (Tensor): Divisor. + input_y (Tensor): Dividend. + + Returns: + Tensor, remainder. + """ + x = self.floor_div(input_x, input_y) + x = self.multiply(x, input_y) + x = self.sub(input_x, x) + return x + + +class BeamSearchDecoder(nn.Cell): + """ + Beam search decoder. + + Args: + batch_size (int): Batch size of input dataset. + seq_length (int): Length of input sequence. + vocab_size (int): The shape of each embedding vector. + decoder (Cell): The transformrer decoder. + beam_width (int): Beam width for beam search in inferring. Default: 4. + length_penalty_weight (float): Penalty for sentence length. Default: 1.0. + max_decode_length (int): Max decode length for inferring. Default: 64. + sos_id (int): The index of start label . Default: 1. + eos_id (int): The index of end label . Default: 2. + compute_type (mstype): Compute type in TransformerAttention. + Default: mstype.float32. 
+ """ + + def __init__(self, + batch_size, + seq_length, + vocab_size, + decoder, + beam_width=4, + length_penalty_weight=1.0, + max_decode_length=64, + sos_id=1, + eos_id=2): + super(BeamSearchDecoder, self).__init__(auto_prefix=False) + + self.batch_size = batch_size + self.vocab_size = vocab_size + self.beam_width = beam_width + self.length_penalty_weight = length_penalty_weight + self.max_decode_length = max_decode_length + + self.decoder = decoder + + self.add = P.TensorAdd() + self.expand = P.ExpandDims() + self.reshape = P.Reshape() + self.shape_flat = (-1,) + self.shape = P.Shape() + + self.zero_tensor = Tensor(np.zeros([batch_size, beam_width]), mstype.float32) + self.ninf_tensor = Tensor(np.full([batch_size, beam_width], -INF), mstype.float32) + + self.select = P.Select() + self.flat_shape = (batch_size, beam_width * vocab_size) + self.topk = P.TopK(sorted=True) + self.floor_div = P.FloorDiv() + self.vocab_size_tensor = Tensor(self.vocab_size, mstype.int32) + self.real_div = P.RealDiv() + self.mod = Mod() + self.equal = P.Equal() + self.eos_ids = Tensor(np.full([batch_size, beam_width], eos_id), mstype.int32) + + beam_ids = np.tile(np.arange(beam_width).reshape((1, beam_width)), [batch_size, 1]) + self.beam_ids = Tensor(beam_ids, mstype.int32) + + batch_ids = np.arange(batch_size * beam_width).reshape((batch_size, beam_width)) // beam_width + self.batch_ids = Tensor(batch_ids, mstype.int32) + + self.concat = P.Concat(axis=-1) + self.gather_nd = P.GatherNd() + + # init inputs and states + self.start_ids = Tensor(np.full([batch_size * beam_width, 1], sos_id), mstype.int32) + self.init_seq = Tensor(np.full([batch_size, beam_width, 1], sos_id), mstype.int32) + + init_scores = np.tile(np.array([[0.] 
+ [-INF] * (beam_width - 1)]), [batch_size, 1]) + + self.init_total_log_probs = Tensor(np.zeros([batch_size, beam_width, 1]), mstype.float32) + self.init_scores = Tensor(init_scores, mstype.float32) + + self.init_attention = Tensor(np.zeros([batch_size, beam_width, seq_length, 1]), mstype.float32) + self.init_finished = Tensor(np.zeros([batch_size, beam_width], dtype=np.bool)) + self.init_length = Tensor(np.zeros([batch_size, beam_width], dtype=np.int32)) + + self.length_penalty = LengthPenalty(weight=length_penalty_weight) + + self.one = Tensor(1, mstype.int32) + self.prob_concat = P.Concat(axis=1) + + def one_step(self, cur_input_ids, enc_states, enc_attention_mask, state_log_probs, state_seq, state_finished, + state_length, entire_log_probs): + """ + Beam search one_step output. + + Inputs: + cur_input_ids (Tensor): with shape (batch_size * beam_width, m). + enc_states (Tensor): with shape (batch_size * beam_width, T, D). + enc_attention_mask (Tensor): with shape (batch_size * beam_width, T, D). + state_log_probs (Tensor): with shape (batch_size, beam_width). + state_seq (Tensor): with shape (batch_size, beam_width, m). + state_finished (Tensor): with shape (batch_size, beam_width). + state_length (Tensor): with shape (batch_size, beam_width). + entire_log_probs (Tensor): with shape (batch_size, beam_width, vocab_size). + + Return: + Update input parameters. 
+ """ + # log_probs, [batch_size * beam_width, 1, V] + log_probs = self.decoder(cur_input_ids, enc_states, enc_attention_mask) + # log_probs: [batch_size, beam_width, V] + log_probs = self.reshape(log_probs, (self.batch_size, self.beam_width, self.vocab_size)) + + # select topk indices, [batch_size, beam_width, V] + total_log_probs = self.add(log_probs, self.expand(state_log_probs, -1)) + + # mask finished beams, [batch_size, beam_width] + # t-1 has finished + mask_tensor = self.select(state_finished, self.ninf_tensor, self.zero_tensor) + # save the t-1 probability + total_log_probs = self.add(total_log_probs, self.expand(mask_tensor, -1)) + # [batch, beam*vocab] + flat_scores = self.reshape(total_log_probs, self.flat_shape) + # select topk, [batch, beam] + topk_scores, topk_indices = self.topk(flat_scores, self.beam_width) + + # convert to beam and word indices, [batch, beam] + beam_indices = self.floor_div(topk_indices, self.vocab_size_tensor) + word_indices = self.mod(topk_indices, self.vocab_size_tensor) + + current_word_pro = self.gather_nd( + log_probs, + self.concat((self.expand(self.batch_ids, -1), + self.expand(beam_indices, -1), + self.expand(word_indices, -1))) + ) + # [batch, beam] + current_word_pro = self.reshape(current_word_pro, (self.batch_size, self.beam_width)) + + # mask finished indices, [batch, beam] + beam_indices = self.select(state_finished, self.beam_ids, beam_indices) + word_indices = self.select(state_finished, self.eos_ids, word_indices) + topk_scores = self.select(state_finished, state_log_probs, topk_scores) + + current_word_pro = self.select(state_finished, self.ninf_tensor, current_word_pro) + + # sort according to scores with -inf for finished beams, [batch, beam] + # t ends + tmp_log_probs = self.select( + self.equal(word_indices, self.eos_ids), + self.ninf_tensor, + topk_scores) + + _, tmp_indices = self.topk(tmp_log_probs, self.beam_width) + # update, [batch_size, beam_width, 2] + tmp_gather_indices = 
self.concat((self.expand(self.batch_ids, -1), self.expand(tmp_indices, -1))) + # [batch_size, beam_width] + beam_indices = self.gather_nd(beam_indices, tmp_gather_indices) + word_indices = self.gather_nd(word_indices, tmp_gather_indices) + topk_scores = self.gather_nd(topk_scores, tmp_gather_indices) + # [batch_size, beam_width] + sorted_current_word_pro = self.gather_nd(current_word_pro, tmp_gather_indices) + + # gather indices for selecting alive beams + gather_indices = self.concat((self.expand(self.batch_ids, -1), self.expand(beam_indices, -1))) + + # length add 1 if not finished in the previous step, [batch_size, beam_width] + length_add = self.add(state_length, self.one) + state_length = self.select(state_finished, state_length, length_add) + state_length = self.gather_nd(state_length, gather_indices) + + # concat seq + seq = self.gather_nd(state_seq, gather_indices) + state_seq = self.concat((seq, self.expand(word_indices, -1))) + # update the probability of entire_log_probs + selected_entire_log_probs = self.gather_nd(entire_log_probs, gather_indices) + entire_log_probs = self.concat((selected_entire_log_probs, + self.expand(sorted_current_word_pro, -1))) + + # new finished flag and log_probs + state_finished = self.equal(word_indices, self.eos_ids) + state_log_probs = topk_scores + cur_input_ids = self.reshape(state_seq, (self.batch_size * self.beam_width, -1)) + + return cur_input_ids, state_log_probs, state_seq, state_finished, state_length, entire_log_probs + + def construct(self, enc_states, enc_attention_mask): + """ + Process source sentence + + Inputs: + enc_states (Tensor): Output of transformer encoder with shape (N, T, D). + enc_attention_mask (Tensor): encoder attention mask with shape (N, T, T). + + Returns: + Tensor, predictions output and prediction probs. 
+ """ + cur_input_ids = self.start_ids + # beam search states + state_log_probs = self.init_scores + state_seq = self.init_seq + state_finished = self.init_finished + state_length = self.init_length + entire_log_probs = self.init_total_log_probs + + for _ in range(self.max_decode_length): + # run one step decoder to get outputs of the current step + # shape [batch*beam, 1, vocab] + cur_input_ids, state_log_probs, state_seq, state_finished, state_length, entire_log_probs = self.one_step( + cur_input_ids, enc_states, enc_attention_mask, state_log_probs, + state_seq, state_finished, state_length, entire_log_probs) + + # add length penalty scores + penalty_len = self.length_penalty(state_length) + # return penalty_len + log_probs = self.real_div(state_log_probs, penalty_len) + + # sort according to scores + _, top_beam_indices = self.topk(log_probs, self.beam_width) + gather_indices = self.concat((self.expand(self.batch_ids, -1), self.expand(top_beam_indices, -1))) + # sort sequence and attention scores + predicted_ids = self.gather_nd(state_seq, gather_indices) + # take the first one + predicted_ids = predicted_ids[::, 0:1:1, ::] + + return predicted_ids, entire_log_probs diff --git a/model_zoo/mass/src/transformer/components.py b/model_zoo/mass/src/transformer/components.py new file mode 100644 index 0000000000..2efa1ee757 --- /dev/null +++ b/model_zoo/mass/src/transformer/components.py @@ -0,0 +1,66 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Components of model.""" +import mindspore.common.dtype as mstype +import mindspore.nn as nn +from mindspore.ops import operations as P + + +class SaturateCast(nn.Cell): + """Cast wrapper.""" + + def __init__(self, dst_type=mstype.float32): + super(SaturateCast, self).__init__() + self.cast = P.Cast() + self.dst_type = dst_type + + def construct(self, x): + return self.cast(x, self.dst_type) + + +class LayerNorm(nn.Cell): + """ + Do layer norm. + + Args: + in_channels (int): In channels number of layer norm. + return_2d (bool): Whether return 2d tensor. + + Returns: + Tensor, output. + """ + + def __init__(self, in_channels=None, return_2d=False): + super(LayerNorm, self).__init__() + self.return_2d = return_2d + self.layer_norm = nn.LayerNorm((in_channels,)) + self.cast = P.Cast() + self.get_dtype = P.DType() + self.reshape = P.Reshape() + self.get_shape = P.Shape() + + def construct(self, input_tensor): + shape = self.get_shape(input_tensor) + batch_size = shape[0] + max_len = shape[1] + embed_dim = shape[2] + + output = self.reshape(input_tensor, (-1, embed_dim)) + output = self.cast(output, mstype.float32) + output = self.layer_norm(output) + output = self.cast(output, self.get_dtype(input_tensor)) + if not self.return_2d: + output = self.reshape(output, (batch_size, max_len, embed_dim)) + return output diff --git a/model_zoo/mass/src/transformer/create_attn_mask.py b/model_zoo/mass/src/transformer/create_attn_mask.py new file mode 100644 index 0000000000..160e7ec3fb --- /dev/null +++ b/model_zoo/mass/src/transformer/create_attn_mask.py @@ -0,0 +1,76 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Create mask matrix for inputs.""" +import numpy as np +import mindspore.common.dtype as mstype +from mindspore import nn +from mindspore.ops import operations as P +from mindspore.common.tensor import Tensor + + +class CreateAttentionMaskFromInputMask(nn.Cell): + """ + Create attention mask according to input mask. + + Args: + config (TransformerConfig): Config class. + + Returns: + Tensor, shape of (N, T, T). + """ + + def __init__(self, config): + super(CreateAttentionMaskFromInputMask, self).__init__() + self.input_mask_from_dataset = config.input_mask_from_dataset + self.input_mask = None + + assert self.input_mask_from_dataset + + self.cast = P.Cast() + self.shape = P.Shape() + self.reshape = P.Reshape() + self.batch_matmul = P.BatchMatMul() + self.multiply = P.Mul() + self.shape = P.Shape() + # mask future positions + ones = np.ones(shape=(config.batch_size, config.seq_length, config.seq_length)) + self.lower_triangle_mask = Tensor(np.tril(ones), dtype=mstype.float32) + + def construct(self, input_mask, mask_future=False): + """ + Construct network. + + Args: + input_mask (Tensor): Tensor mask vectors with shape (N, T). + mask_future (bool): Whether mask future (for decoder training). + + Returns: + Tensor, shape of (N, T, T). + """ + input_shape = self.shape(input_mask) + # Add this for infer as the seq_length will increase. 
+ shape_right = (input_shape[0], 1, input_shape[1]) + shape_left = input_shape + (1,) + + input_mask = self.cast(input_mask, mstype.float32) + mask_left = self.reshape(input_mask, shape_left) + mask_right = self.reshape(input_mask, shape_right) + + attention_mask = self.batch_matmul(mask_left, mask_right) + + if mask_future: + attention_mask = self.multiply(attention_mask, self.lower_triangle_mask) + + return attention_mask diff --git a/model_zoo/mass/src/transformer/decoder.py b/model_zoo/mass/src/transformer/decoder.py new file mode 100644 index 0000000000..3e18dcf25f --- /dev/null +++ b/model_zoo/mass/src/transformer/decoder.py @@ -0,0 +1,221 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Decoder of Transformer.""" +import mindspore.common.dtype as mstype +from mindspore import nn + +from .feed_forward_network import FeedForwardNet +from .self_attention import SelfAttention +from .components import LayerNorm + + +class DecoderCell(nn.Cell): + """ + Decoder cells used in Transformer. + + Args: + attn_embed_dim (int): Dimensions of attention weight, e.g. Q, K, V. + num_attn_heads (int): Attention heads number. + intermediate_size (int): Hidden size in FFN. + attn_dropout_prob (float): Dropout rate in attention layer. Default: 0.1. + initializer_range (float): Initial range. Default: 0.02. 
+ dropout_prob (float): Dropout rate between layers. Default: 0.1. + hidden_act (str): Activation function in FFN. Default: "relu". + compute_type (mstype): Mindspore data type. Default: mstype.float32. + + Returns: + Tensor, output with shape (N, T', D). + """ + + def __init__(self, + attn_embed_dim=768, + num_attn_heads=12, + intermediate_size=3072, + attn_dropout_prob=0.02, + initializer_range=0.02, + dropout_prob=0.1, + hidden_act="relu", + compute_type=mstype.float32): + super(DecoderCell, self).__init__() + self.masked_attn = SelfAttention( + attn_embed_dim=attn_embed_dim, + num_attn_heads=num_attn_heads, + attn_dropout_prob=attn_dropout_prob, + initializer_range=initializer_range, + dropout_prob=dropout_prob, + compute_type=compute_type) + self.enc_dec_attn = SelfAttention( + attn_embed_dim=attn_embed_dim, + num_attn_heads=num_attn_heads, + attn_dropout_prob=attn_dropout_prob, + initializer_range=initializer_range, + dropout_prob=dropout_prob, + compute_type=compute_type) + self.feed_forward_net = FeedForwardNet( + in_channels=attn_embed_dim, + hidden_size=intermediate_size, + out_channels=attn_embed_dim, + hidden_act=hidden_act, + initializer_range=initializer_range, + hidden_dropout_prob=dropout_prob, + compute_type=compute_type) + + def construct(self, queries, attention_mask, encoder_out, enc_attention_mask): + """ + Construct network. + + Args: + queries (Tensor): With shape (N, T', D). + attention_mask (Tensor): With shape (N, T', T'). + encoder_out (Tensor): With shape (N, T, D). + enc_attention_mask (Tensor): With shape (N, T, T). + + Returns: + Tensor, output. + """ + attention_output = self.masked_attn( + queries, queries, queries, + attention_mask + ) + attention_output = self.enc_dec_attn( + attention_output, # (N, T', D) + encoder_out, encoder_out, # (N, T, D) + enc_attention_mask # (N, T, T) + ) + output = self.feed_forward_net(attention_output) + return output + + +class TransformerDecoder(nn.Cell): + """ + Implements of Transformer decoder. 
+ + Args: + attn_embed_dim (int): Dimensions of attention layer. + decoder_layers (int): Decoder layers. + num_attn_heads (int): Attention heads number. + intermediate_size (int): Hidden size of FFN. + attn_dropout_prob (float): Dropout rate in attention. Default: 0.1. + initializer_range (float): Initial range. Default: 0.02. + dropout_prob (float): Dropout rate between layers. Default: 0.1. + hidden_act (str): Non-linear activation function in FFN. Default: "relu". + compute_type (mstype): Mindspore data type. Default: mstype.float32. + + Returns: + Tensor, shape of (N, T', D). + """ + + def __init__(self, + attn_embed_dim, + decoder_layers, + num_attn_heads, + intermediate_size, + attn_dropout_prob=0.1, + initializer_range=0.02, + dropout_prob=0.1, + hidden_act="relu", + compute_type=mstype.float32): + super(TransformerDecoder, self).__init__() + self.num_layers = decoder_layers + self.attn_embed_dim = attn_embed_dim + + self.layer0 = DecoderCell( + attn_embed_dim=attn_embed_dim, + num_attn_heads=num_attn_heads, + intermediate_size=intermediate_size, + attn_dropout_prob=attn_dropout_prob, + initializer_range=initializer_range, + dropout_prob=dropout_prob, + hidden_act=hidden_act, + compute_type=compute_type + ) + self.layer1 = DecoderCell( + attn_embed_dim=attn_embed_dim, + num_attn_heads=num_attn_heads, + intermediate_size=intermediate_size, + attn_dropout_prob=attn_dropout_prob, + initializer_range=initializer_range, + dropout_prob=dropout_prob, + hidden_act=hidden_act, + compute_type=compute_type + ) + self.layer2 = DecoderCell( + attn_embed_dim=attn_embed_dim, + num_attn_heads=num_attn_heads, + intermediate_size=intermediate_size, + attn_dropout_prob=attn_dropout_prob, + initializer_range=initializer_range, + dropout_prob=dropout_prob, + hidden_act=hidden_act, + compute_type=compute_type + ) + self.layer3 = DecoderCell( + attn_embed_dim=attn_embed_dim, + num_attn_heads=num_attn_heads, + intermediate_size=intermediate_size, + 
attn_dropout_prob=attn_dropout_prob, + initializer_range=initializer_range, + dropout_prob=dropout_prob, + hidden_act=hidden_act, + compute_type=compute_type + ) + self.layer4 = DecoderCell( + attn_embed_dim=attn_embed_dim, + num_attn_heads=num_attn_heads, + intermediate_size=intermediate_size, + attn_dropout_prob=attn_dropout_prob, + initializer_range=initializer_range, + dropout_prob=dropout_prob, + hidden_act=hidden_act, + compute_type=compute_type + ) + self.layer5 = DecoderCell( + attn_embed_dim=attn_embed_dim, + num_attn_heads=num_attn_heads, + intermediate_size=intermediate_size, + attn_dropout_prob=attn_dropout_prob, + initializer_range=initializer_range, + dropout_prob=dropout_prob, + hidden_act=hidden_act, + compute_type=compute_type + ) + + self.layer_preprocess = LayerNorm(in_channels=attn_embed_dim, + return_2d=False) + + def construct(self, input_tensor, attention_mask, encoder_out, enc_attention_mask): + """ + Construct network. + + Args: + input_tensor (Tensor): With shape of (N, T', D). + attention_mask (Tensor): With shape of (N, T', T'). + encoder_out (Tensor): With shape of (N, T, D). + enc_attention_mask (Tensor): With shape of (N, T, T). + + Returns: + Tensor, shape of (N, T', D). + """ + prev_output = input_tensor + prev_output = self.layer0(prev_output, attention_mask, encoder_out, enc_attention_mask) + prev_output = self.layer1(prev_output, attention_mask, encoder_out, enc_attention_mask) + prev_output = self.layer2(prev_output, attention_mask, encoder_out, enc_attention_mask) + prev_output = self.layer3(prev_output, attention_mask, encoder_out, enc_attention_mask) + prev_output = self.layer4(prev_output, attention_mask, encoder_out, enc_attention_mask) + prev_output = self.layer5(prev_output, attention_mask, encoder_out, enc_attention_mask) + + # Add layer norm, and full connection layer. 
+ prev_output = self.layer_preprocess(prev_output) + return prev_output diff --git a/model_zoo/mass/src/transformer/embedding.py b/model_zoo/mass/src/transformer/embedding.py new file mode 100644 index 0000000000..bdce540416 --- /dev/null +++ b/model_zoo/mass/src/transformer/embedding.py @@ -0,0 +1,81 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Embedding.""" +import numpy as np +import mindspore.common.dtype as mstype +from mindspore import nn +from mindspore.ops import operations as P +from mindspore.common.tensor import Tensor +from mindspore.common.parameter import Parameter + + +class EmbeddingLookup(nn.Cell): + """Embeddings lookup table with a fixed dictionary and size.""" + + def __init__(self, + vocab_size, + embed_dim, + use_one_hot_embeddings=False): + """ + Embeddings lookup table with a fixed dictionary and size. + + Args: + vocab_size (int): Size of the dictionary of embeddings. + embed_dim (int): The size of word embedding. + use_one_hot_embeddings (bool): Whether use one-hot embedding. Default: False. + """ + super(EmbeddingLookup, self).__init__() + self.embedding_dim = embed_dim + self.vocab_size = vocab_size + self.use_one_hot_embeddings = use_one_hot_embeddings + + init_weight = np.random.normal(0, embed_dim ** -0.5, size=[vocab_size, embed_dim]) + # 0 is Padding index, thus init it as 0. 
+ init_weight[0, :] = 0 + self.embedding_table = Parameter(Tensor(init_weight), + name='embedding_table') + self.expand = P.ExpandDims() + self.gather = P.GatherV2() + self.one_hot = P.OneHot() + self.on_value = Tensor(1.0, mstype.float32) + self.off_value = Tensor(0.0, mstype.float32) + self.array_mul = P.MatMul() + self.reshape = P.Reshape() + self.get_shape = P.Shape() + + def construct(self, input_ids): + """ + Construct network. + + Args: + input_ids (Tensor): A batch of sentences with shape (N, T). + + Returns: + Tensor, word embeddings with shape (N, T, D) + """ + _shape = self.get_shape(input_ids) # (N, T). + _batch_size = _shape[0] + _max_len = _shape[1] + + flat_ids = self.reshape(input_ids, (_batch_size * _max_len,)) + if self.use_one_hot_embeddings: + one_hot_ids = self.one_hot(flat_ids, self.vocab_size, self.on_value, self.off_value) + output_for_reshape = self.array_mul( + one_hot_ids, self.embedding_table) + else: + output_for_reshape = self.gather(self.embedding_table, flat_ids, 0) + + output = self.reshape(output_for_reshape, (_batch_size, _max_len, self.embedding_dim)) + return output, self.embedding_table diff --git a/model_zoo/mass/src/transformer/encoder.py b/model_zoo/mass/src/transformer/encoder.py new file mode 100644 index 0000000000..35a112a2c3 --- /dev/null +++ b/model_zoo/mass/src/transformer/encoder.py @@ -0,0 +1,179 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""Encoder of Transformer.""" +import mindspore.common.dtype as mstype +from mindspore import nn + +from .feed_forward_network import FeedForwardNet +from .self_attention import SelfAttention +from .components import LayerNorm + + +class EncoderCell(nn.Cell): + """ + Single Encoder layer. + + Layer structure is as below: + -> pre_LayerNorm + -> Multi-head Self-Attention + -> Dropout & Add + -> pre_LayerNorm + -> Fc1 + -> Activation Function + -> Dropout + -> Fc2 + -> Dropout & Add + + Args: + attn_embed_dim (int): Dimensions of attention weights. + num_attn_heads (int): Heads number. + intermediate_size (int): Hidden size in FFN. + attention_dropout_prob (float): Dropout rate in attention layer. + initializer_range (float): Initial range. + hidden_dropout_prob (float): Dropout rate in FFN. + hidden_act (str): Activation function in FFN. + compute_type (mstype): Mindspore data type. + + Returns: + Tensor, shape of (N, T, D). + """ + + def __init__(self, + attn_embed_dim=768, + num_attn_heads=12, + intermediate_size=3072, + attention_dropout_prob=0.02, + initializer_range=0.02, + hidden_dropout_prob=0.1, + hidden_act="relu", + compute_type=mstype.float32): + super(EncoderCell, self).__init__() + self.attention = SelfAttention( + attn_embed_dim=attn_embed_dim, + num_attn_heads=num_attn_heads, + attn_dropout_prob=attention_dropout_prob, + initializer_range=initializer_range, + dropout_prob=hidden_dropout_prob, + compute_type=compute_type) + self.feed_forward_net = FeedForwardNet( + in_channels=attn_embed_dim, + hidden_size=intermediate_size, + out_channels=attn_embed_dim, + hidden_act=hidden_act, + initializer_range=initializer_range, + hidden_dropout_prob=hidden_dropout_prob, + dropout=hidden_dropout_prob, + compute_type=compute_type) + + def construct(self, queries, attention_mask): + """ + Construct network. + + Args: + queries (Tensor): Shape (N, T, D). 
+ attention_mask (Tensor): Shape (N, T, T'). + + Returns: + Tensor, shape (N, T, D). + """ + attention_output = self.attention(queries, queries, queries, + attention_mask) # (N, T, D) + output = self.feed_forward_net(attention_output) # (N, T, D) + return output + + +class TransformerEncoder(nn.Cell): + """ + Implements of Transformer encoder. + + According to Google Tensor2Tensor lib experience, they found that + put layer norm behind the multi-head self-attention and ffn would + make model more robust. + + Thus, we take the same action. + + Encoder layer structure is as below: + -> pre_LayerNorm + -> Multi-head Self-Attention + -> Dropout & Add + -> pre_LayerNorm + -> Fc1 + -> Activation Function + -> Dropout + -> Fc2 + -> Dropout & Add + + Args: + attn_embed_dim (int): Dimensions of attention weights. + encoder_layers (int): Encoder layers. + num_attn_heads (int): Heads number. + intermediate_size (int): Hidden size in FFN. + attention_dropout_prob (float): Dropout rate in attention. + initializer_range (float): Initial range. + hidden_dropout_prob (float): Dropout rate in FFN. + hidden_act (str): Activation function. + compute_type (mstype): Mindspore data type. + + Returns: + Tensor, shape of (N, T, D). 
+ """ + + def __init__(self, + attn_embed_dim, + encoder_layers, + num_attn_heads=12, + intermediate_size=3072, + attention_dropout_prob=0.1, + initializer_range=0.02, + hidden_dropout_prob=0.1, + hidden_act="relu", + compute_type=mstype.float32): + super(TransformerEncoder, self).__init__() + self.num_layers = encoder_layers + + layers = [] + for _ in range(encoder_layers): + layer = EncoderCell( + attn_embed_dim=attn_embed_dim, + num_attn_heads=num_attn_heads, + intermediate_size=intermediate_size, + attention_dropout_prob=attention_dropout_prob, + initializer_range=initializer_range, + hidden_dropout_prob=hidden_dropout_prob, + hidden_act=hidden_act, + compute_type=compute_type + ) + layers.append(layer) + + self.layers = nn.CellList(layers) + self.layer_norm = LayerNorm(in_channels=attn_embed_dim) + + def construct(self, input_tensor, attention_mask): + """ + Construct network. + + Args: + input_tensor (Tensor): Shape (N, T, D). + attention_mask (Tensor): Shape (N, T, T). + + Returns: + Tensor, shape (N, T, D). + """ + prev_output = input_tensor + for layer_module in self.layers: + prev_output = layer_module(prev_output, + attention_mask) # (N, T, D) + prev_output = self.layer_norm(prev_output) # (N, T, D) + return prev_output diff --git a/model_zoo/mass/src/transformer/feed_forward_network.py b/model_zoo/mass/src/transformer/feed_forward_network.py new file mode 100644 index 0000000000..ca42e6a3e7 --- /dev/null +++ b/model_zoo/mass/src/transformer/feed_forward_network.py @@ -0,0 +1,92 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Forward network with two fc layers.""" +import mindspore.common.dtype as mstype +from mindspore import nn +from mindspore.common.initializer import TruncatedNormal +from mindspore.ops import operations as P + +from .residual_conn import ResidualConnection +from .components import LayerNorm + + +class FeedForwardNet(nn.Cell): + """ + Feed Forward Network (contain 2 fc layers). + + Args: + in_channels (int): Dimensions of input matrix. + hidden_size (int): Hidden size. + out_channels (int): Dimensions of output matrix. + hidden_act (str): Activation function. + initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. + hidden_dropout_prob (float): The dropout probability for hidden outputs. Default: 0.1. + dropout (float): Dropout in residual block. Default: 0.1. + compute_type (mstype): Compute type in FeedForward. Default: mstype.float32. + + Returns: + Tensor, shape of (N, T, D). 
+ """ + + def __init__(self, + in_channels, + hidden_size, + out_channels, + hidden_act="relu", + initializer_range=0.02, + hidden_dropout_prob=0.1, + dropout=None, + compute_type=mstype.float32): + super(FeedForwardNet, self).__init__() + + self.fc1 = nn.Dense(in_channels, + hidden_size, + activation=hidden_act, + weight_init=TruncatedNormal(initializer_range)).to_float(compute_type) + self.fc2 = nn.Dense(hidden_size, + out_channels, + weight_init=TruncatedNormal(initializer_range)).to_float(compute_type) + + self.layer_norm = LayerNorm(in_channels=in_channels, + return_2d=True) + self.residual = ResidualConnection( + dropout_prob=hidden_dropout_prob if dropout is None else dropout + ) + self.get_shape = P.Shape() + self.reshape = P.Reshape() + self.dropout = nn.Dropout(keep_prob=1 - hidden_dropout_prob) + + def construct(self, input_tensor): + """ + Construct network. + + Args: + input_tensor (Tensor): Shape (N, T, D). + + Returns: + Tensor, (N, T, D). + """ + shape = self.get_shape(input_tensor) + batch_size = shape[0] + max_len = shape[1] + embed_dim = shape[2] + + output = self.layer_norm(input_tensor) + output = self.fc1(output) + output = self.dropout(output) + output = self.fc2(output) # (-1, D) + output = self.residual(self.reshape(output, (batch_size, max_len, embed_dim)), + input_tensor) # (N, T, D) + return output diff --git a/model_zoo/mass/src/transformer/grad_clip.py b/model_zoo/mass/src/transformer/grad_clip.py new file mode 100644 index 0000000000..33a169967e --- /dev/null +++ b/model_zoo/mass/src/transformer/grad_clip.py @@ -0,0 +1,67 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Gradient clip.""" +import mindspore.nn as nn +from mindspore.ops import operations as P +from mindspore.ops import functional as F +from mindspore.ops import composite as C + +GRADIENT_CLIP_TYPE = 1 +GRADIENT_CLIP_VALUE = 8.0 + + +class ClipGradients(nn.Cell): + """ + Clip gradients. + + Returns: + List, a list of clipped_grad tuples. + """ + + def __init__(self): + super(ClipGradients, self).__init__() + self.clip_by_norm = nn.ClipByNorm() + self.cast = P.Cast() + self.dtype = P.DType() + + def construct(self, + grads, + clip_type, + clip_value): + """ + Construct gradient clip network. + + Args: + grads (list): List of gradient tuples. + clip_type (Tensor): The way to clip, 'value' or 'norm'. + clip_value (Tensor): Specifies how much to clip. + + Returns: + List, a list of clipped_grad tuples. 
+ """ + if clip_type != 0 and clip_type != 1: # pylint: disable=R1714 + return grads + + new_grads = () + for grad in grads: + dt = self.dtype(grad) + if clip_type == 0: + t = C.clip_by_value(grad, self.cast(F.tuple_to_array((-clip_value,)), dt), + self.cast(F.tuple_to_array((clip_value,)), dt)) + else: + t = self.clip_by_norm(grad, self.cast(F.tuple_to_array((clip_value,)), dt)) + new_grads = new_grads + (t,) + + return new_grads diff --git a/model_zoo/mass/src/transformer/infer_mass.py b/model_zoo/mass/src/transformer/infer_mass.py new file mode 100644 index 0000000000..54a0b4e54f --- /dev/null +++ b/model_zoo/mass/src/transformer/infer_mass.py @@ -0,0 +1,158 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Infer api.""" +import time + +import mindspore.nn as nn +import mindspore.common.dtype as mstype +from mindspore.common.tensor import Tensor +from mindspore.train.model import Model + +from mindspore import context + +from src.dataset import load_dataset +from .transformer_for_infer import TransformerInferModel +from ..utils.load_weights import load_infer_weights + +context.set_context( + mode=context.GRAPH_MODE, + save_graphs=False, + device_target="Ascend", + reserve_class_name_in_scope=False) + + +class TransformerInferCell(nn.Cell): + """ + Encapsulation class of transformer network infer. 
+ + Args: + network (nn.Cell): Transformer model. + + Returns: + Tuple[Tensor, Tensor], predicted_ids and predicted_probs. + """ + + def __init__(self, network): + super(TransformerInferCell, self).__init__(auto_prefix=False) + self.network = network + + def construct(self, + source_ids, + source_mask): + """Defines the computation performed.""" + + predicted_ids, predicted_probs = self.network(source_ids, + source_mask) + + return predicted_ids, predicted_probs + + +def transformer_infer(config, dataset): + """ + Run infer with Transformer. + + Args: + config (TransformerConfig): Config. + dataset (Dataset): Dataset. + + Returns: + List[Dict], prediction, each example has 4 keys, "source", + "target", "prediction" and "prediction_prob". + """ + tfm_model = TransformerInferModel(config=config, use_one_hot_embeddings=False) + tfm_model.init_parameters_data() + + params = tfm_model.trainable_params() + weights = load_infer_weights(config) + + for param in params: + value = param.default_input + name = param.name + if name not in weights: + raise ValueError(f"{name} is not found in weights.") + + with open("weight_after_deal.txt", "a+") as f: + weights_name = name + f.write(weights_name + "\n") + if isinstance(value, Tensor): + print(name, value.asnumpy().shape) + if weights_name in weights: + assert weights_name in weights + param.default_input = Tensor(weights[weights_name], mstype.float32) + else: + raise ValueError(f"{weights_name} is not found in checkpoint.") + else: + raise TypeError(f"Type of {weights_name} is not Tensor.") + + print(" | Load weights successfully.") + tfm_infer = TransformerInferCell(tfm_model) + model = Model(tfm_infer) + + predictions = [] + probs = [] + source_sentences = [] + target_sentences = [] + for batch in dataset.create_dict_iterator(): + source_sentences.append(batch["source_eos_ids"]) + target_sentences.append(batch["target_eos_ids"]) + + source_ids = Tensor(batch["source_eos_ids"], mstype.int32) + source_mask = 
Tensor(batch["source_eos_mask"], mstype.int32) + + start_time = time.time() + predicted_ids, entire_probs = model.predict(source_ids, source_mask) + print(f" | Batch size: {config.batch_size}, " + f"Time cost: {time.time() - start_time}.") + + predictions.append(predicted_ids.asnumpy()) + probs.append(entire_probs.asnumpy()) + + output = [] + for inputs, ref, batch_out, batch_probs in zip(source_sentences, + target_sentences, + predictions, + probs): + for i in range(config.batch_size): + if batch_out.ndim == 3: + batch_out = batch_out[:, 0] + + example = { + "source": inputs[i].tolist(), + "target": ref[i].tolist(), + "prediction": batch_out[i].tolist(), + "prediction_prob": batch_probs[i].tolist() + } + output.append(example) + + return output + + +def infer(config): + """ + Transformer infer api. + + Args: + config (TransformerConfig): Config. + + Returns: + list, prediction results of the test dataset. + """ + eval_dataset = load_dataset(data_files=config.test_dataset, + batch_size=config.batch_size, + epoch_count=1, + sink_mode=config.dataset_sink_mode, + shuffle=False) if config.test_dataset else None + prediction = transformer_infer(config, eval_dataset) + return prediction diff --git a/model_zoo/mass/src/transformer/multi_head_attention.py b/model_zoo/mass/src/transformer/multi_head_attention.py new file mode 100644 index 0000000000..dbdf1716cf --- /dev/null +++ b/model_zoo/mass/src/transformer/multi_head_attention.py @@ -0,0 +1,226 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Multi-Head Self-Attention block.""" +import math + +import mindspore.common.dtype as mstype +import mindspore.nn as nn +import mindspore.ops.functional as F +from mindspore.common.initializer import TruncatedNormal +from mindspore.common.tensor import Tensor +from mindspore.ops import operations as P +from .components import SaturateCast + + +class MultiHeadAttention(nn.Cell): + """ + Implementation of multi-head self-attention. + + In the encoder, the calculation of single-head self-attention is as below. + + Inputs: [x1, x2, x3, x4...] (xi is a word embedding, with shape T*D, Inputs's shape is N*T*D); + Weights: Wq(D*embed_dim), Wk(D*embed_dim), Wv(D*embed_dim); + + Query, key, value are calculated in below formula: + Q = Input * Wq (N*T*embed_dim); + K = Input * Wk (N*T*embed_dim); + V = Input * Wv (N*T*embed_dim); + + Then, attention score is calculated: + A = K * Q.T (qi is dotted with each ki, A's shape is N*T*T. + e.g. q1 is dotted with k1, k2, k3, k4, + then vector of [a1.1, a1.2, a1.3, a1.4] will be available. + ai,j represents the importance of the j-th word embedding to the i-th.) + + A^ = Soft-max(A) (Normalize the score, N*T*T). + + Finally, the output of self-attention cell is: + O = A^ * V (N*T*embed_dim, each word embedding was represented with self-attention.) + + Multi-head self-attention is the same as single-head self-attention except that + Wq, Wk, Wv are repeated `head_num` times. + + In our implementation, Wq = Wk = Wv = attn_embed_dim // num_attn_heads. + + Args: + src_dim (int): Dimensions of queries. + tgt_dim (int): Dimensions of keys and values. + attn_embed_dim (int): Dimensions of attention weight, e.g. Q, K, V. + num_attn_heads (int): Attention heads number. Default: 1. + query_act (str): Activation function for Q. Default: None. 
+ key_act (str): Activation function for K. Default: None. + value_act (str): Activation function for V. Default: None. + has_attention_mask (bool): Whether has attention mask. Default: True. + attention_dropout_prob (float): Dropout rate in attention. Default: 0.1. + initializer_range (float): Initial range. + do_return_2d_tensor (bool): Whether return 2d matrix. Default: True. + compute_type (mstype): Mindspore data type. Default: mstype.float32. + + Returns: + Tensor, with shape (N, T, D). + """ + + def __init__(self, + src_dim, + tgt_dim, + attn_embed_dim, + num_attn_heads=1, + query_act=None, + key_act=None, + value_act=None, + out_act=None, + has_attention_mask=True, + attention_dropout_prob=0.0, + initializer_range=0.02, + do_return_2d_tensor=True, + compute_type=mstype.float32): + super(MultiHeadAttention, self).__init__() + if attn_embed_dim % num_attn_heads != 0: + raise ValueError(f"The hidden size {attn_embed_dim} is not a multiple of the " + f"number of attention heads {num_attn_heads}") + + self.attn_embed_dim = attn_embed_dim + self.num_attn_heads = num_attn_heads + self.size_per_head = attn_embed_dim // num_attn_heads + self.src_dim = src_dim + self.tgt_dim = tgt_dim + self.has_attention_mask = has_attention_mask + + if attn_embed_dim != self.num_attn_heads * self.size_per_head: + raise ValueError("`attn_embed_dim` must be divided by num_attn_heads.") + + self.scores_mul = Tensor([1.0 / math.sqrt(float(self.size_per_head))], + dtype=compute_type) + self.reshape = P.Reshape() + + self.query_layer = nn.Dense(src_dim, + attn_embed_dim, + activation=query_act, + has_bias=True, + weight_init=TruncatedNormal(initializer_range)).to_float(compute_type) + self.key_layer = nn.Dense(tgt_dim, + attn_embed_dim, + activation=key_act, + has_bias=True, + weight_init=TruncatedNormal(initializer_range)).to_float(compute_type) + self.value_layer = nn.Dense(tgt_dim, + attn_embed_dim, + activation=value_act, + has_bias=True, + 
weight_init=TruncatedNormal(initializer_range)).to_float(compute_type) + self.out_layer = nn.Dense(attn_embed_dim, + attn_embed_dim, + activation=out_act, + has_bias=True, + weight_init=TruncatedNormal(initializer_range)).to_float(compute_type) + + self.matmul_trans_b = P.BatchMatMul(transpose_b=True) + self.multiply = P.Mul() + self.transpose = P.Transpose() + self.multiply_data = Tensor([-10000.0], dtype=compute_type) + self.matmul = P.BatchMatMul() + + self.softmax = nn.Softmax() + self.dropout = nn.Dropout(1 - attention_dropout_prob) + + if self.has_attention_mask: + self.expand_dims = P.ExpandDims() + self.sub = P.Sub() + self.add = P.TensorAdd() + self.cast = P.Cast() + self.get_dtype = P.DType() + + self.do_return_2d_tensor = do_return_2d_tensor + self.cast_compute_type = SaturateCast(dst_type=compute_type) + self.softmax_cast = P.Cast() + self.get_shape = P.Shape() + self.transpose_orders = (0, 2, 1, 3) + + def construct(self, queries, keys, values, attention_mask): + """ + Construct network. + + For self attention operation, T==T'. + For encoder-decoder-attention, T!=T' + + Args: + queries (Tensor): Input queries, with shape (N, T, D). + keys (Tensor): Input keys, with shape (N, T', D). + values (Tensor): Input values, with shape (N, T', D). + attention_mask (Tensor): Mask matrix, with shape (N, T, T'). + + Returns: + Tensor, with shape (N, T, D). 
+ """ + q_shape = self.get_shape(queries) # (N, T, D) + batch_size = q_shape[0] + src_max_len = q_shape[1] + + k_shape = self.get_shape(keys) # (N, T', D) + tgt_max_len = k_shape[1] + + _src_4d_shape = (batch_size, src_max_len, self.num_attn_heads, self.size_per_head) + _tgt_4d_shape = (batch_size, tgt_max_len, self.num_attn_heads, self.size_per_head) + + queries_2d = self.reshape(queries, (-1, self.src_dim)) + keys_2d = self.reshape(keys, (-1, self.tgt_dim)) + values_2d = self.reshape(values, (-1, self.tgt_dim)) + + query_out = self.query_layer(queries_2d) # (N*T, D)*(D, D) -> (N*T, D) + key_out = self.key_layer(keys_2d) # (N*T, D)*(D, D) -> (N*T, D) + value_out = self.value_layer(values_2d) # (N*T, D)*(D, D) -> (N*T, D) + + query_out = self.multiply(query_out, self.scores_mul) + + query_layer = self.reshape(query_out, _src_4d_shape) + query_layer = self.transpose(query_layer, self.transpose_orders) # (N, h, T, D') + key_layer = self.reshape(key_out, _tgt_4d_shape) + key_layer = self.transpose(key_layer, self.transpose_orders) # (N, h, T', D') + value_layer = self.reshape(value_out, _tgt_4d_shape) + value_layer = self.transpose(value_layer, self.transpose_orders) # (N, h, T', D') + + # (N, h, T, D')*(N, h, D', T') -> (N, h, T, T') + attention_scores = self.matmul_trans_b(query_layer, key_layer) + + if self.has_attention_mask: + attention_mask = self.expand_dims(attention_mask, 1) + multiply_out = self.sub( + self.cast(F.tuple_to_array((1.0,)), self.get_dtype(attention_scores)), + self.cast(attention_mask, self.get_dtype(attention_scores)) + ) # make mask position into 1, unmask position into 0. 
+ adder = self.multiply(multiply_out, self.multiply_data) + adder = self.softmax_cast(adder, mstype.float32) + attention_scores = self.softmax_cast(attention_scores, mstype.float32) + attention_scores = self.add(adder, attention_scores) + + attention_scores = self.softmax_cast(attention_scores, mstype.float32) + attention_prob = self.softmax(attention_scores) + attention_prob = self.softmax_cast(attention_prob, self.get_dtype(key_layer)) + attention_prob = self.dropout(attention_prob) + + # (N, h, T, T')*(N, h, T', D') -> (N, h, T, D') + context_layer = self.matmul(attention_prob, value_layer) + context_layer = self.transpose(context_layer, self.transpose_orders) # (N, T, h, D') + context_layer = self.reshape(context_layer, + (batch_size * src_max_len, self.attn_embed_dim)) # (N*T, D) + + context_layer = self.out_layer(context_layer) + + if not self.do_return_2d_tensor: + context_layer = self.reshape( + context_layer, (batch_size, src_max_len, self.attn_embed_dim) + ) # (N, T, D) + + return context_layer diff --git a/model_zoo/mass/src/transformer/positional_embedding.py b/model_zoo/mass/src/transformer/positional_embedding.py new file mode 100644 index 0000000000..317077aef7 --- /dev/null +++ b/model_zoo/mass/src/transformer/positional_embedding.py @@ -0,0 +1,82 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""Positional Embedding.""" +import numpy as np +from mindspore import nn +from mindspore import Tensor +import mindspore.common.dtype as mstype +from mindspore.ops import operations as P + + +def position_encoding(length, depth, + min_timescale=1, + max_timescale=1e4): + """ + Create Tensor of sinusoids of different frequencies. + + Args: + length (int): Length of the Tensor to create, i.e. Number of steps. + depth (int): Dimensions of embedding. + min_timescale (float): Minimum time scale. + max_timescale (float): Maximum time scale. + + Returns: + Tensor of shape (T, D) + """ + depth = depth // 2 + positions = np.arange(length, dtype=np.float32) + log_timescale_increment = (np.log(max_timescale / min_timescale) / (depth - 1)) + inv_timescales = min_timescale * np.exp( + np.arange(depth, dtype=np.float32) * -log_timescale_increment) + scaled_time = np.expand_dims(positions, 1) * np.expand_dims(inv_timescales, 0) + # instead of using SIN and COS interleaved + # it's the same to first use SIN then COS + # as they are applied to the same position + x = np.concatenate([np.sin(scaled_time), np.cos(scaled_time)], axis=1) + return x + + +class PositionalEmbedding(nn.Cell): + """ + Add positional info to word embeddings. + + Args: + embedding_size (int): Size of word embedding. + max_position_embeddings (int): Maximum step in this model. + + Returns: + Tensor, shape of (N, T, D). 
+ """ + + def __init__(self, + embedding_size, + max_position_embeddings=512): + super(PositionalEmbedding, self).__init__() + self.add = P.TensorAdd() + self.expand_dims = P.ExpandDims() + self.position_embedding_table = Tensor( + position_encoding(max_position_embeddings, embedding_size), + mstype.float32 + ) + self.gather = P.GatherV2() + self.get_shape = P.Shape() + + def construct(self, word_embeddings): + input_shape = self.get_shape(word_embeddings) + input_len = input_shape[1] + position_embeddings = self.position_embedding_table[0:input_len:1, ::] + position_embeddings = self.expand_dims(position_embeddings, 0) + output = self.add(word_embeddings, position_embeddings) + return output diff --git a/model_zoo/mass/src/transformer/residual_conn.py b/model_zoo/mass/src/transformer/residual_conn.py new file mode 100644 index 0000000000..9d75a9b0c2 --- /dev/null +++ b/model_zoo/mass/src/transformer/residual_conn.py @@ -0,0 +1,49 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Residual block.""" +import mindspore.nn as nn +from mindspore.ops import operations as P + + +class ResidualConnection(nn.Cell): + """ + Add residual to output. + + Args: + dropout_prob (float): Dropout rate. + + Returns: + Tensor, with same shape of hidden_tensor. 
+ """ + + def __init__(self, dropout_prob=0.1): + super(ResidualConnection, self).__init__() + self.add = P.TensorAdd() + self.dropout = nn.Dropout(1 - dropout_prob) + + def construct(self, hidden_tensor, residual): + """ + Construct network. + + Args: + hidden_tensor (Tensor): Hidden tensor. + residual (Tensor): Input tensor. + + Returns: + Tensor, which has the same shape with hidden_tensor and residual. + """ + output = self.dropout(hidden_tensor) + output = self.add(output, residual) + return output diff --git a/model_zoo/mass/src/transformer/self_attention.py b/model_zoo/mass/src/transformer/self_attention.py new file mode 100644 index 0000000000..5a21c5aaf3 --- /dev/null +++ b/model_zoo/mass/src/transformer/self_attention.py @@ -0,0 +1,86 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Self-Attention block.""" +import mindspore.common.dtype as mstype +from mindspore import nn + +from .multi_head_attention import MultiHeadAttention +from .residual_conn import ResidualConnection +from .components import LayerNorm + + +class SelfAttention(nn.Cell): + """ + Self-Attention. + + Layer norm -> Multi-Head Self-Attention -> Add & Dropout. + + Args: + attn_embed_dim (int): Dimensions of attention weight, e.g. Q, K, V. + num_attn_heads (int): Attention heads number. Default: 1. + attn_dropout_prob (float): Dropout rate in attention. 
Default: 0.1. + initializer_range (float): Initial range. + dropout_prob (float): Dropout rate. + has_attention_mask (bool): Whether has attention mask. + compute_type (mstype): Mindspore data type. Default: mstype.float32. + + Returns: + Tensor, shape (N, T, D). + """ + + def __init__(self, + attn_embed_dim, + num_attn_heads, + attn_dropout_prob=0.1, + initializer_range=0.02, + dropout_prob=0.1, + has_attention_mask=True, + compute_type=mstype.float32): + super(SelfAttention, self).__init__() + self.multi_head_self_attention = MultiHeadAttention( + src_dim=attn_embed_dim, + tgt_dim=attn_embed_dim, + attn_embed_dim=attn_embed_dim, + num_attn_heads=num_attn_heads, + attention_dropout_prob=attn_dropout_prob, + initializer_range=initializer_range, + has_attention_mask=has_attention_mask, + do_return_2d_tensor=False, + compute_type=compute_type) + + self.layer_norm = LayerNorm(in_channels=attn_embed_dim) + self.residual = ResidualConnection(dropout_prob=dropout_prob) + + def construct(self, queries, keys, values, attention_mask): + """ + Construct self-attention block. + + Layer norm -> Multi-Head Self-Attention -> Add & Dropout. + + Args: + queries (Tensor): Shape (N, T, D). + keys (Tensor): Shape (N, T', D). + values (Tensor): Shape (N, T', D). + attention_mask (Tensor): Shape (N, T, T'). + + Returns: + Tensor, shape (N, T, D). + """ + q = self.layer_norm(queries) # (N, T, D) + attention_output = self.multi_head_self_attention( + q, keys, values, attention_mask + ) # (N, T, D) + q = self.residual(attention_output, queries) + return q diff --git a/model_zoo/mass/src/transformer/transformer.py b/model_zoo/mass/src/transformer/transformer.py new file mode 100644 index 0000000000..97d682f29b --- /dev/null +++ b/model_zoo/mass/src/transformer/transformer.py @@ -0,0 +1,166 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Transformer model addressed by Vaswani et al., 2017.""" +import copy +import math + +from mindspore import nn, Tensor +from mindspore.ops import operations as P +from mindspore.common import dtype as mstype + +from config.config import TransformerConfig + +from .encoder import TransformerEncoder +from .decoder import TransformerDecoder +from .create_attn_mask import CreateAttentionMaskFromInputMask +from .embedding import EmbeddingLookup +from .positional_embedding import PositionalEmbedding +from .components import SaturateCast + + +class Transformer(nn.Cell): + """ + Transformer with encoder and decoder. + + In Transformer, we define T = src_max_len, T' = tgt_max_len. + + Args: + config (TransformerConfig): Model config. + is_training (bool): Whether is training. + use_one_hot_embeddings (bool): Whether use one-hot embedding. + + Returns: + Tuple[Tensor], network outputs. 
+ """ + + def __init__(self, + config: TransformerConfig, + is_training: bool, + use_one_hot_embeddings: bool = False, + use_positional_embedding: bool = True): + super(Transformer, self).__init__() + + self.use_positional_embedding = use_positional_embedding + config = copy.deepcopy(config) + self.is_training = is_training + if not is_training: + config.hidden_dropout_prob = 0.0 + config.attention_dropout_prob = 0.0 + + self.input_mask_from_dataset = config.input_mask_from_dataset + self.batch_size = config.batch_size + self.max_positions = config.seq_length + self.attn_embed_dim = config.hidden_size + self.num_layers = config.num_hidden_layers + self.word_embed_dim = config.hidden_size + + self.last_idx = self.num_layers - 1 + + self.embedding_lookup = EmbeddingLookup( + vocab_size=config.vocab_size, + embed_dim=self.word_embed_dim, + use_one_hot_embeddings=use_one_hot_embeddings) + + if self.use_positional_embedding: + self.positional_embedding = PositionalEmbedding( + embedding_size=self.word_embed_dim, + max_position_embeddings=config.max_position_embeddings) + + self.encoder = TransformerEncoder( + attn_embed_dim=self.attn_embed_dim, + encoder_layers=self.num_layers, + num_attn_heads=config.num_attention_heads, + intermediate_size=config.intermediate_size, + attention_dropout_prob=config.attention_dropout_prob, + initializer_range=config.initializer_range, + hidden_dropout_prob=config.hidden_dropout_prob, + hidden_act=config.hidden_act, + compute_type=config.compute_type) + + self.decoder = TransformerDecoder( + attn_embed_dim=self.attn_embed_dim, + decoder_layers=self.num_layers, + num_attn_heads=config.num_attention_heads, + intermediate_size=config.intermediate_size, + attn_dropout_prob=config.attention_dropout_prob, + initializer_range=config.initializer_range, + dropout_prob=config.hidden_dropout_prob, + hidden_act=config.hidden_act, + compute_type=config.compute_type) + + self.cast = P.Cast() + self.dtype = config.dtype + self.cast_compute_type = 
SaturateCast(dst_type=config.compute_type) + self.slice = P.StridedSlice() + self.dropout = nn.Dropout(keep_prob=1 - config.hidden_dropout_prob) + + self._create_attention_mask_from_input_mask = CreateAttentionMaskFromInputMask(config) + + self.scale = Tensor([math.sqrt(float(self.word_embed_dim))], + dtype=mstype.float32) + self.multiply = P.Mul() + + def construct(self, source_ids, source_mask, target_ids, target_mask): + """ + Construct network. + + In this method, T = src_max_len, T' = tgt_max_len. + + Args: + source_ids (Tensor): Source sentences with shape (N, T). + source_mask (Tensor): Source sentences padding mask with shape (N, T), + where 0 indicates padding position. + target_ids (Tensor): Target sentences with shape (N, T'). + target_mask (Tensor): Target sentences padding mask with shape (N, T'), + where 0 indicates padding position. + + Returns: + Tuple[Tensor], network outputs. + """ + # Process source sentences. + src_embeddings, embedding_tables = self.embedding_lookup(source_ids) + src_embeddings = self.multiply(src_embeddings, self.scale) + if self.use_positional_embedding: + src_embeddings = self.positional_embedding(src_embeddings) + src_embeddings = self.dropout(src_embeddings) + + # Attention mask with shape (N, T, T). + enc_attention_mask = self._create_attention_mask_from_input_mask(source_mask) + # Transformer encoder. + encoder_output = self.encoder( + self.cast_compute_type(src_embeddings), # (N, T, D). + self.cast_compute_type(enc_attention_mask) # (N, T, T). + ) + + # Process target sentences. + tgt_embeddings, _ = self.embedding_lookup(target_ids) + tgt_embeddings = self.multiply(tgt_embeddings, self.scale) + if self.use_positional_embedding: + tgt_embeddings = self.positional_embedding(tgt_embeddings) + tgt_embeddings = self.dropout(tgt_embeddings) + + # Attention mask with shape (N, T', T'). + tgt_attention_mask = self._create_attention_mask_from_input_mask( + target_mask, True + ) + # Transformer decoder. 
+ decoder_output = self.decoder( + self.cast_compute_type(tgt_embeddings), # (N, T', D) + self.cast_compute_type(tgt_attention_mask), # (N, T', T') + encoder_output, # (N, T, D) + enc_attention_mask # (N, T, T) + ) + + return encoder_output, decoder_output, embedding_tables diff --git a/model_zoo/mass/src/transformer/transformer_for_infer.py b/model_zoo/mass/src/transformer/transformer_for_infer.py new file mode 100644 index 0000000000..8b1a1c4667 --- /dev/null +++ b/model_zoo/mass/src/transformer/transformer_for_infer.py @@ -0,0 +1,331 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Transformer for infer.""" +import math +import copy +import numpy as np +import mindspore.common.dtype as mstype +import mindspore.nn as nn +from mindspore.ops import operations as P +from mindspore.common.tensor import Tensor + +from .beam_search import BeamSearchDecoder, TileBeam +from .embedding import EmbeddingLookup +from .positional_embedding import PositionalEmbedding +from .components import SaturateCast +from .create_attn_mask import CreateAttentionMaskFromInputMask +from .decoder import TransformerDecoder +from .encoder import TransformerEncoder + + +class PredLogProbs(nn.Cell): + """ + Get log probs. + + Args: + batch_size (int): Batch size of input dataset. + seq_length (int): The length of sequences. 
+ width (int): Number of parameters of a layer. + compute_type (int): Type of input type. + dtype (int): Type of MindSpore output type. + """ + + def __init__(self, + batch_size, + seq_length, + width, + compute_type=mstype.float32, + dtype=mstype.float32): + super(PredLogProbs, self).__init__() + self.batch_size = batch_size + self.seq_length = seq_length + self.width = width + self.compute_type = compute_type + self.dtype = dtype + + self.reshape = P.Reshape() + self.matmul = P.MatMul(transpose_b=True) + self.log_softmax = nn.LogSoftmax(axis=-1) + self.shape_flat_sequence_tensor = (self.batch_size * self.seq_length, self.width) + self.cast = P.Cast() + + def construct(self, input_tensor, output_weights): + """ + Calculate the log_softmax. + + Inputs: + input_tensor (Tensor): A batch of sentences with shape (N, T). + output_weights (Tensor): A batch of masks with shape (N, T). + + Returns: + Tensor, the prediction probability with shape (N, T'). + """ + input_tensor = self.reshape(input_tensor, self.shape_flat_sequence_tensor) + input_tensor = self.cast(input_tensor, self.compute_type) + output_weights = self.cast(output_weights, self.compute_type) + + logits = self.matmul(input_tensor, output_weights) + logits = self.cast(logits, self.dtype) + + log_probs = self.log_softmax(logits) + return log_probs + + +class TransformerDecoderStep(nn.Cell): + """ + Multi-layer transformer decoder step. + + Args: + config (TransformerConfig): The config of Transformer. + num_hidden_layers (int): The numbers of hidden layers. + attn_embed_dim (int): Dimensions of attention weights. + num_attn_heads (int): Heads number. Default: 12. + seq_length (int): The length of a sequence. + intermediate_size: Hidden size in FFN. + attn_dropout_prob (float): Dropout rate in attention. Default: 0.1. + initializer_range (float): Initial range. + hidden_dropout_prob (float): Dropout rate in FFN. + hidden_act (str): Activation function in FFN. + compute_type (mstype): Mindspore data type. 
Default: mstype.float32. + embedding_lookup (function): Embeddings lookup operation. Default: None. + positional_embedding (function): Position Embedding operation. Default: None. + projection (function): Function to get log probs. Default: None. + """ + + def __init__(self, + config, + num_hidden_layers, + attn_embed_dim, + num_attn_heads=12, + seq_length=64, + intermediate_size=3072, + attn_dropout_prob=0.1, + initializer_range=0.02, + hidden_dropout_prob=0.1, + hidden_act="relu", + compute_type=mstype.float32, + embedding_lookup=None, + positional_embedding=None, + projection=None): + super(TransformerDecoderStep, self).__init__(auto_prefix=False) + self.embedding_lookup = embedding_lookup + self.positional_embedding = positional_embedding + self.projection = projection + self.seq_length = seq_length + self.decoder = TransformerDecoder( + attn_embed_dim=attn_embed_dim, + num_attn_heads=num_attn_heads, + decoder_layers=num_hidden_layers, + intermediate_size=intermediate_size, + attn_dropout_prob=attn_dropout_prob, + initializer_range=initializer_range, + dropout_prob=hidden_dropout_prob, + hidden_act=hidden_act, + compute_type=compute_type) + + self.ones_like = P.OnesLike() + self.shape = P.Shape() + + self._create_attention_mask_from_input_mask = CreateAttentionMaskFromInputMask(config) + self.expand = P.ExpandDims() + self.multiply = P.Mul() + + ones = np.ones(shape=(seq_length, seq_length)) + self.future_mask = Tensor(np.tril(ones), dtype=mstype.float32) + + self.cast_compute_type = SaturateCast(dst_type=compute_type) + self.scale = Tensor([math.sqrt(float(attn_embed_dim))], dtype=mstype.float32) + + def construct(self, input_ids, enc_states, enc_attention_mask): + """ + Get log probs. + + Args: + input_ids: [batch_size * beam_width, m] + enc_states: [batch_size * beam_width, T, D] + enc_attention_mask: [batch_size * beam_width, T, D] + + Returns: + Tensor, the log_probs. [batch_size * beam_width, 1, Vocabulary_Dimension] + """ + + # process embedding. 
input_embedding: [batch_size * beam_width, m, D], embedding_tables: [V, D] + input_embedding, embedding_tables = self.embedding_lookup(input_ids) + input_embedding = self.multiply(input_embedding, self.scale) + input_embedding = self.positional_embedding(input_embedding) + input_embedding = self.cast_compute_type(input_embedding) + + input_shape = self.shape(input_ids) + input_len = input_shape[1] + # [m,m] + future_mask = self.future_mask[0:input_len:1, 0:input_len:1] + # [batch_size * beam_width, m] + input_mask = self.ones_like(input_ids) + # [batch_size * beam_width, m, m] + input_mask = self._create_attention_mask_from_input_mask(input_mask) + # [batch_size * beam_width, m, m] + input_mask = self.multiply(input_mask, self.expand(future_mask, 0)) + input_mask = self.cast_compute_type(input_mask) + + # [batch_size * beam_width, m, D] + enc_attention_mask = enc_attention_mask[::, 0:input_len:1, ::] + + # call TransformerDecoder: [batch_size * beam_width, m, D] + decoder_output = self.decoder(input_embedding, input_mask, enc_states, enc_attention_mask) + + # take the last step, [batch_size * beam_width, 1, D] + decoder_output = decoder_output[::, input_len - 1:input_len:1, ::] + + # projection and log_prob + log_probs = self.projection(decoder_output, embedding_tables) + + # [batch_size * beam_width, 1, vocabulary_size] + return log_probs + + +class TransformerInferModel(nn.Cell): + """ + Transformer Infer. + + Args: + config (TransformerConfig): The config of Transformer. + use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False. 
+ """ + + def __init__(self, + config, + use_one_hot_embeddings=False): + super(TransformerInferModel, self).__init__() + config = copy.deepcopy(config) + config.hidden_dropout_prob = 0.0 + config.attention_dropout_prob = 0.0 + + self.input_mask_from_dataset = config.input_mask_from_dataset + self.batch_size = config.batch_size + self.seq_length = config.seq_length + self.hidden_size = config.hidden_size + self.num_hidden_layers = config.num_hidden_layers + self.embedding_size = config.hidden_size + self.attn_embed_dim = config.hidden_size + self.num_layers = config.num_hidden_layers + self.last_idx = self.num_hidden_layers - 1 + + self.embedding_lookup = EmbeddingLookup( + vocab_size=config.vocab_size, + embed_dim=self.embedding_size, + use_one_hot_embeddings=use_one_hot_embeddings) + + self.positional_embedding = PositionalEmbedding( + embedding_size=self.embedding_size, + max_position_embeddings=config.max_position_embeddings) + # use for infer + self.projection = PredLogProbs( + batch_size=config.batch_size * config.beam_width, + seq_length=1, + width=self.hidden_size, + compute_type=config.compute_type) + + self.encoder = TransformerEncoder( + attn_embed_dim=self.attn_embed_dim, + encoder_layers=self.num_layers, + num_attn_heads=config.num_attention_heads, + intermediate_size=config.intermediate_size, + attention_dropout_prob=config.attention_dropout_prob, + initializer_range=config.initializer_range, + hidden_dropout_prob=config.hidden_dropout_prob, + hidden_act=config.hidden_act, + compute_type=config.compute_type) + + decoder_cell = TransformerDecoderStep( + config=config, + num_hidden_layers=config.num_hidden_layers, + attn_embed_dim=self.attn_embed_dim, + seq_length=config.seq_length, + num_attn_heads=config.num_attention_heads, + intermediate_size=config.intermediate_size, + hidden_dropout_prob=config.hidden_dropout_prob, + compute_type=config.compute_type, + initializer_range=config.initializer_range, + hidden_act="relu", + 
embedding_lookup=self.embedding_lookup, + positional_embedding=self.positional_embedding, + attn_dropout_prob=config.attention_dropout_prob, + projection=self.projection + ) + + # link beam_search after decoder + self.decoder = BeamSearchDecoder( + batch_size=config.batch_size, + seq_length=config.seq_length, + vocab_size=config.vocab_size, + decoder=decoder_cell, + beam_width=config.beam_width, + length_penalty_weight=config.length_penalty_weight, + max_decode_length=config.max_decode_length) + + self.decoder.add_flags(loop_can_unroll=True) + + self.cast = P.Cast() + self.dtype = config.dtype + self.cast_compute_type = SaturateCast(dst_type=config.compute_type) + self.expand = P.ExpandDims() + self.multiply = P.Mul() + + self._create_attention_mask_from_input_mask = CreateAttentionMaskFromInputMask(config) + + # use for infer + self.tile_beam = TileBeam(beam_width=config.beam_width) + ones = np.ones(shape=(config.batch_size, config.max_decode_length)) + self.encode_mask = Tensor(ones, dtype=mstype.float32) + + self.scale = Tensor([math.sqrt(float(self.embedding_size))], + dtype=mstype.float32) + self.reshape = P.Reshape() + + def construct(self, source_ids, source_mask, target_ids=None, target_mask=None): + """ + Process source sentence + + Inputs: + source_ids (Tensor): Source sentences with shape (N, T). + source_mask (Tensor): Source sentences padding mask with shape (N, T), + where 0 indicates padding position. + + Returns: + Tensor, Predictions with shape (N, T'). 
+ """ + # word_embeddings + src_embeddings, _ = self.embedding_lookup(source_ids) + src_embeddings = self.multiply(src_embeddings, self.scale) + # position_embeddings + src_embeddings = self.positional_embedding(src_embeddings) + # attention mask, [batch_size, seq_length, seq_length] + enc_attention_mask = self._create_attention_mask_from_input_mask(source_mask) + # encode + encoder_output = self.encoder(self.cast_compute_type(src_embeddings), + self.cast_compute_type(enc_attention_mask)) + + # bean search for encoder output + beam_encoder_output = self.tile_beam(encoder_output) + # [batch_size, T, D] + enc_attention_mask = self.multiply( + enc_attention_mask[::, 0:1:1, ::], + self.expand(self.encode_mask, -1)) + # [N*batch_size, T, D] + beam_enc_attention_mask = self.tile_beam(enc_attention_mask) + beam_enc_attention_mask = self.cast_compute_type(beam_enc_attention_mask) + predicted_ids, predicted_probs = self.decoder(beam_encoder_output, beam_enc_attention_mask) + predicted_ids = self.reshape(predicted_ids, (self.batch_size, -1)) + return predicted_ids, predicted_probs diff --git a/model_zoo/mass/src/transformer/transformer_for_train.py b/model_zoo/mass/src/transformer/transformer_for_train.py new file mode 100644 index 0000000000..eb75e2d7b9 --- /dev/null +++ b/model_zoo/mass/src/transformer/transformer_for_train.py @@ -0,0 +1,348 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""Transformer for training.""" +from mindspore import nn +from mindspore.ops import operations as P +from mindspore.ops import functional as F +from mindspore.ops import composite as C +from mindspore.common.tensor import Tensor +from mindspore.common.parameter import Parameter, ParameterTuple +from mindspore.common import dtype as mstype +from mindspore.nn.wrap.grad_reducer import DistributedGradReducer +from mindspore.train.parallel_utils import ParallelMode +from mindspore.parallel._utils import _get_device_num, _get_parallel_mode, _get_mirror_mean + +from .transformer import Transformer +from .grad_clip import GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE, ClipGradients + + +class PredLogProbs(nn.Cell): + """ + Get log probs. + + Args: + config (TransformerConfig): The config of Transformer. + + Returns: + Tensor, masked lm output. + """ + + def __init__(self, config): + super(PredLogProbs, self).__init__() + self.width = config.hidden_size + self.reshape = P.Reshape() + + self.matmul = P.MatMul(transpose_b=True) + self.log_softmax = nn.LogSoftmax(axis=-1) + self.shape_flat_sequence_tensor = (config.batch_size * config.seq_length, self.width) + self.cast = P.Cast() + self.compute_type = config.compute_type + self.dtype = config.dtype + self.get_shape = P.Shape() + + def construct(self, input_tensor, output_weights): + """ + Construct network. + + Args: + input_tensor (Tensor): Tensor. + output_weights (Tensor): Tensor. + + Returns: + Tensor, masked lm output. 
+ """ + shape = self.get_shape(input_tensor) + + input_tensor = self.reshape(input_tensor, (shape[0] * shape[1], shape[2])) + input_tensor = self.cast(input_tensor, self.compute_type) + output_weights = self.cast(output_weights, self.compute_type) + + logits = self.matmul(input_tensor, output_weights) + logits = self.cast(logits, self.dtype) + + log_probs = self.log_softmax(logits) + return log_probs + + +class TransformerTraining(nn.Cell): + """ + Transformer training network. + + Args: + config (TransformerConfig): The config of Transformer. + is_training (bool): Specifies whether to use the training mode. + use_one_hot_embeddings (bool): Specifies whether to use one-hot for embeddings. + + Returns: + Tensor, prediction_scores, seq_relationship_score. + """ + + def __init__(self, config, is_training, use_one_hot_embeddings): + super(TransformerTraining, self).__init__() + self.transformer = Transformer(config, is_training, use_one_hot_embeddings) + self.projection = PredLogProbs(config) + + def construct(self, source_ids, source_mask, target_ids, target_mask): + """ + Construct network. + + Args: + source_ids (Tensor): Source sentence. + source_mask (Tensor): Source padding mask. + target_ids (Tensor): Target sentence. + target_mask (Tensor): Target padding mask. + + Returns: + Tensor, prediction_scores, seq_relationship_score. + """ + _, decoder_outputs, embedding_table = \ + self.transformer(source_ids, source_mask, target_ids, target_mask) + prediction_scores = self.projection(decoder_outputs, + embedding_table) + return prediction_scores + + +class LabelSmoothedCrossEntropyCriterion(nn.Cell): + """ + Label Smoothed Cross-Entropy Criterion. + + Args: + config (TransformerConfig): The config of Transformer. + + Returns: + Tensor, final loss. 
+ """ + + def __init__(self, config): + super(LabelSmoothedCrossEntropyCriterion, self).__init__() + self.vocab_size = config.vocab_size + self.onehot = P.OneHot() + self.on_value = Tensor(float(1 - config.label_smoothing), mstype.float32) + self.off_value = Tensor(config.label_smoothing / float(self.vocab_size - 1), mstype.float32) + self.reduce_sum = P.ReduceSum() + self.reduce_mean = P.ReduceMean() + self.reshape = P.Reshape() + self.last_idx = (-1,) + self.flatten = P.Flatten() + self.neg = P.Neg() + self.cast = P.Cast() + self.flat_shape = (config.batch_size * config.seq_length,) + self.get_shape = P.Shape() + + def construct(self, prediction_scores, label_ids, label_weights): + """ + Construct network to calculate loss. + + Args: + prediction_scores (Tensor): Prediction scores. + label_ids (Tensor): Labels. + label_weights (Tensor): Mask tensor. + + Returns: + Tensor, final loss. + """ + label_shape = self.get_shape(label_ids) + + label_ids = self.reshape(label_ids, (label_shape[0] * label_shape[1],)) + label_weights = self.cast( + self.reshape(label_weights, (label_shape[0] * label_shape[1],)), + mstype.float32 + ) + one_hot_labels = self.onehot(label_ids, self.vocab_size, self.on_value, self.off_value) + + per_example_loss = self.neg(self.reduce_sum(prediction_scores * one_hot_labels, self.last_idx)) + numerator = self.reduce_sum(label_weights * per_example_loss, ()) + denominator = self.reduce_sum(label_weights, ()) + self.cast(F.tuple_to_array((1e-5,)), mstype.float32) + loss = numerator / denominator + + return loss + + +class TransformerNetworkWithLoss(nn.Cell): + """ + Provide transformer training loss through network. + + Args: + config (BertConfig): The config of Transformer. + is_training (bool): Specifies whether to use the training mode. + use_one_hot_embeddings (bool): Specifies whether to use one-hot for embeddings. Default: False. + + Returns: + Tensor, the loss of the network. 
+ """ + + def __init__(self, config, is_training, use_one_hot_embeddings=False): + super(TransformerNetworkWithLoss, self).__init__() + self.transformer = TransformerTraining(config, is_training, use_one_hot_embeddings) + self.loss = LabelSmoothedCrossEntropyCriterion(config) + self.cast = P.Cast() + + def construct(self, + source_ids, + source_mask, + target_ids, + target_mask, + label_ids, + label_weights): + prediction_scores = self.transformer(source_ids, source_mask, target_ids, target_mask) + total_loss = self.loss(prediction_scores, label_ids, label_weights) + return self.cast(total_loss, mstype.float32) + + +grad_scale = C.MultitypeFuncGraph("grad_scale") +reciprocal = P.Reciprocal() + + +@grad_scale.register("Tensor", "Tensor") +def tensor_grad_scale(scale, grad): + return grad * F.cast(reciprocal(scale), F.dtype(grad)) + + +class TransformerTrainOneStepWithLossScaleCell(nn.Cell): + """ + Encapsulation class of Transformer network training. + + Append an optimizer to the training network after that the construct + function can be called to create the backward graph. + + Args: + network: Cell. The training network. Note that loss function should have + been added. + optimizer: Optimizer. Optimizer for updating the weights. + + Returns: + Tuple[Tensor, Tensor, Tensor], loss, overflow, sen. 
+ """ + + def __init__(self, network, optimizer, scale_update_cell=None): + + super(TransformerTrainOneStepWithLossScaleCell, self).__init__(auto_prefix=False) + self.network = network + self.network.add_flags(defer_inline=True) + self.weights = ParameterTuple(network.trainable_params()) + self.optimizer = optimizer + self.grad = C.GradOperation('grad', get_by_list=True, + sens_param=True) + self.reducer_flag = False + self.all_reduce = P.AllReduce() + + self.parallel_mode = _get_parallel_mode() + if self.parallel_mode not in ParallelMode.MODE_LIST: + raise ValueError("Parallel mode does not support: ", self.parallel_mode) + if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]: + self.reducer_flag = True + self.grad_reducer = None + if self.reducer_flag: + mean = _get_mirror_mean() + degree = _get_device_num() + self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree) + self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE) + self.clip_gradients = ClipGradients() + self.cast = P.Cast() + self.alloc_status = P.NPUAllocFloatStatus() + self.get_status = P.NPUGetFloatStatus() + self.clear_before_grad = P.NPUClearFloatStatus() + self.reduce_sum = P.ReduceSum(keep_dims=False) + self.depend_parameter_use = P.ControlDepend(depend_mode=1) + self.base = Tensor(1, mstype.float32) + self.less_equal = P.LessEqual() + self.hyper_map = C.HyperMap() + + self.loss_scale = None + self.loss_scaling_manager = scale_update_cell + if scale_update_cell: + self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32), + name="loss_scale") + self.add_flags(has_effect=True) + + def construct(self, + source_eos_ids, + source_eos_mask, + target_sos_ids, + target_sos_mask, + target_eos_ids, + target_eos_mask, + sens=None): + """ + Construct network. + + Args: + source_eos_ids (Tensor): Source sentence. + source_eos_mask (Tensor): Source padding mask. + target_sos_ids (Tensor): Target sentence. 
+ target_sos_mask (Tensor): Target padding mask. + target_eos_ids (Tensor): Prediction sentence. + target_eos_mask (Tensor): Prediction padding mask. + sens (Tensor): Loss sen. + + Returns: + Tuple[Tensor, Tensor, Tensor], loss, overflow, sen. + """ + source_ids = source_eos_ids + source_mask = source_eos_mask + target_ids = target_sos_ids + target_mask = target_sos_mask + label_ids = target_eos_ids + label_weights = target_eos_mask + + weights = self.weights + loss = self.network(source_ids, + source_mask, + target_ids, + target_mask, + label_ids, + label_weights) + # Alloc status. + init = self.alloc_status() + # Clear overflow buffer. + self.clear_before_grad(init) + if sens is None: + scaling_sens = self.loss_scale + else: + scaling_sens = sens + grads = self.grad(self.network, weights)(source_ids, + source_mask, + target_ids, + target_mask, + label_ids, + label_weights, + self.cast(scaling_sens, + mstype.float32)) + + grads = self.hyper_map(F.partial(grad_scale, scaling_sens), grads) + grads = self.clip_gradients(grads, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE) + if self.reducer_flag: + # Apply grad reducer on grads. + grads = self.grad_reducer(grads) + self.get_status(init) + flag_sum = self.reduce_sum(init, (0,)) + + if self.is_distributed: + # Sum overflow flag over devices. 
+ flag_reduce = self.all_reduce(flag_sum) + cond = self.less_equal(self.base, flag_reduce) + else: + cond = self.less_equal(self.base, flag_sum) + + overflow = cond + if sens is None: + overflow = self.loss_scaling_manager(self.loss_scale, cond) + if overflow: + succ = False + else: + succ = self.optimizer(grads) + + ret = (loss, cond, scaling_sens) + return F.depend(ret, succ) diff --git a/model_zoo/mass/src/utils/__init__.py b/model_zoo/mass/src/utils/__init__.py new file mode 100644 index 0000000000..f78be57b22 --- /dev/null +++ b/model_zoo/mass/src/utils/__init__.py @@ -0,0 +1,35 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""Utils for mass model.""" +from .dictionary import Dictionary +from .ppl_score import ngram_ppl +from .lr_scheduler import square_root_schedule +from .loss_monitor import LossCallBack +from .byte_pair_encoding import bpe_encode +from .initializer import zero_weight, one_weight, normal_weight, weight_variable +from .rouge_score import rouge + +__all__ = [ + "Dictionary", + "rouge", + "bpe_encode", + "ngram_ppl", + "square_root_schedule", + "LossCallBack", + "one_weight", + "zero_weight", + "normal_weight", + "weight_variable" +] diff --git a/model_zoo/mass/src/utils/byte_pair_encoding.py b/model_zoo/mass/src/utils/byte_pair_encoding.py new file mode 100644 index 0000000000..fb0e34a30d --- /dev/null +++ b/model_zoo/mass/src/utils/byte_pair_encoding.py @@ -0,0 +1,52 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""BPE.""" +import os +import subprocess + +ENCODER = "subword-nmt apply-bpe -c {codes} -i {input} -o {output}" +LEARN_DICT = "subword-nmt get-vocab -i {input} -o {dict_path}" + + +def bpe_encode(codes_path, src_path, output_path, dict_path): + """ + Do bpe. + + Args: + codes_path (str): BPE codes file. + src_path (str): Source text file path. + output_path (str): Output path. + dict_path (str): Dict path. 
+ """ + if not (os.path.isabs(codes_path) + and os.path.isabs(src_path) + and os.path.isabs(output_path) + and os.path.isabs(dict_path)): + raise ValueError("Absolute path is required.") + + if not (os.path.exists(os.path.dirname(codes_path)) + and os.path.exists(os.path.dirname(src_path)) + and os.path.exists(os.path.dirname(output_path)) + and os.path.exists(os.path.dirname(dict_path))): + raise FileNotFoundError("Dir not found.") + + # Encoding. + print(f" | Applying BPE encoding.") + subprocess.call(ENCODER.format(codes=codes_path, input=src_path, output=output_path), + shell=True) + print(f" | Fetching vocabulary from single file.") + # Learn vocab. + subprocess.call(LEARN_DICT.format(input=output_path, dict_path=dict_path), + shell=True) diff --git a/model_zoo/mass/src/utils/dictionary.py b/model_zoo/mass/src/utils/dictionary.py new file mode 100644 index 0000000000..5ccfbd4ea2 --- /dev/null +++ b/model_zoo/mass/src/utils/dictionary.py @@ -0,0 +1,276 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""Vocabulary.""" +from typing import List +import numpy as np + +CUBE_SIZE = 16 +REPLACE_THRESHOLD = 200 + + +class Dictionary: + """Dictionary for mono lingual dataset.""" + + def __init__(self, max_size=46000, bos="", eos="", unk="", + mask="", padding=""): + self._bos = bos + self._eos = eos + self._unk = unk + self._mask = mask + self._padding = padding + self._symbols = [] + self._frequency = [] + self._mapping = {} + self._init_symbols() + self.is_learning = False + self.max_vocab_size = max_size + + def shrink(self, threshold=50): + """ + Shrink dataset into a small one. + + Args: + threshold (int): Threshold that determinate whether to + drop the word. + + Returns: + Dictionary, a new dict. + """ + _new_dict = Dictionary() + + freq_idx = [(f, i) for i, f in enumerate(self._frequency)] + freq_idx = sorted(freq_idx, key=lambda x: x[0], reverse=True) + + freqs = np.array(self._frequency, dtype=np.int) + filtered_count = np.where(freqs <= threshold)[0].shape[0] + + left_count = self.size - filtered_count + if left_count % CUBE_SIZE != 0: + supplement = CUBE_SIZE - left_count % CUBE_SIZE + if supplement <= filtered_count: + filtered_count -= supplement + + for f, i in freq_idx: + if f <= threshold and filtered_count > 0: + filtered_count -= 1 + continue + _new_dict.add_symbol(self._symbols[i], f) + + return _new_dict + + def set_to_learn(self, learn: bool): + self.is_learning = learn + + def is_empty(self): + if self.size <= 4: + if sum(self._frequency) == 0: + return True + return False + + @property + def symbols(self): + return self._symbols + + @property + def frequency(self): + return self._frequency + + @property + def size(self): + return len(self._symbols) + + @property + def mask(self): + return self._mask + + @property + def eos(self): + return self._eos + + @property + def bos(self): + return self._bos + + @property + def unk(self): + return self._unk + + @property + def 
padding(self): + return self._padding + + @property + def padding_index(self): + return self._padding_index + + @property + def mask_index(self): + return self._mask_index + + @property + def eos_index(self): + return self._eos_index + + @property + def bos_index(self): + return self._bos_index + + @property + def unk_index(self): + return self._unk_index + + def _init_symbols(self): + self._padding_index = self.add_symbol(self._padding, 0) # 0 + self._bos_index = self.add_symbol(self._bos, 0) # 1 + self._eos_index = self.add_symbol(self._eos, 0) # 2 + self._unk_index = self.add_symbol(self._unk, 0) # 3 + self._mask_index = self.add_symbol(self._mask, 0) # 4 + + def __contains__(self, symbol): + return symbol in self._mapping + + def __getitem__(self, idx): + if 0 <= idx < self.size: + return self._symbols[idx] + return self._unk + + def __len__(self): + return self.size + + def index(self, symbol: str): + """ + Return id according to symbol. + + Args: + symbol (str): Symbol. + + Returns: + int, id. + """ + idx = self._mapping.get(symbol) + if idx is None: + if self.is_learning and symbol.isalpha(): + if self.max_vocab_size <= self.size: + return self.add_symbol(symbol) + + if symbol.lower() in self._mapping: + return self._mapping.get(symbol.lower()) + + idx = self._mapping.get(symbol.lower()) + if idx is not None: + freq = self._frequency[idx] + # If lower symbol in vocabulary and + # its frequency larger than `REPLACE_THRESHOLD`, + # then replace symbol by lower symbol. + if freq >= REPLACE_THRESHOLD: + return idx + return self.unk_index + return idx + + def add_symbol(self, symbol, times=1): + """ + Add symbol to dict. + + Args: + symbol (str): Symbol. + times (int): Frequency. + + Returns: + int, token id. 
+ """ + if symbol in self._mapping: + idx = self._mapping[symbol] + self._frequency[idx] = self._frequency[idx] + times + return idx + + idx = len(self._symbols) + self._mapping[symbol] = idx + self._symbols.append(symbol) + self._frequency.append(times) + return idx + + @classmethod + def load_from_text(cls, filepaths: List[str]): + """ + Load dict from text which is in format of [word, freq]. + + Args: + filepaths (str): Dict list. + + Returns: + Dictionary, dict instance. + """ + _dict = cls() + for filepath in filepaths: + with open(filepath, "r", encoding="utf-8") as f: + for _, line in enumerate(f): + line = line.strip() + if line is None: + continue + try: + word, freq = line.split(" ") + _dict.add_symbol(word, times=int(freq)) + except ValueError: + continue + + return _dict + + @classmethod + def load_from_persisted_dict(cls, filepath): + """ + Load dict from binary file. + + Args: + filepath (str): File path. + + Returns: + Dictionary, dict instance. + """ + import pickle + with open(filepath, "rb") as f: + return pickle.load(f) + + def persistence(self, path): + """Save dict to binary file.""" + import pickle + with open(path, "wb") as _dict: + pickle.dump(self, _dict, protocol=1) + + def merge_dict(self, other, new_dict=False): + """Merge two dict.""" + if other.is_empty(): + return self + + if new_dict: + _dict = Dictionary() + + for s, f in zip(self.symbols, self.frequency): + _dict.add_symbol(s, times=f) + for s, f in zip(other.symbols, other.frequency): + _dict.add_symbol(s, times=f) + return _dict + + for s, f in zip(other.symbols, other.frequency): + self.add_symbol(s, times=f) + + return self + + def export(self, path): + """Save text-like vocabulary.""" + _lines = [] + for token, freq in zip(self._symbols, self._frequency): + _lines.append(f"{token} {freq}") + with open(path, "w") as f: + f.writelines(_lines) diff --git a/model_zoo/mass/src/utils/initializer.py b/model_zoo/mass/src/utils/initializer.py new file mode 100644 index 
0000000000..d1b5ba92ba --- /dev/null +++ b/model_zoo/mass/src/utils/initializer.py @@ -0,0 +1,108 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Initializer.""" +import math +import numpy as np + +from mindspore import Tensor + + +def _compute_fans(shape): + """ + Computes the number of input and output units for a weight shape. + + Args: + shape (tuple): Integer shape tuple or TF tensor shape. + + Returns: + tuple, integer scalars (fan_in, fan_out). + """ + if not shape: + fan_in = fan_out = 1 + elif len(shape) == 1: + fan_in = fan_out = shape[0] + elif len(shape) == 2: + fan_in = shape[0] + fan_out = shape[1] + else: + # Assuming convolution kernels (2D, 3D, or more). + # kernel shape: (..., input_depth, depth) + receptive_field_size = 1 + for dim in shape[:-2]: + receptive_field_size *= dim + fan_in = shape[-2] * receptive_field_size + fan_out = shape[-1] * receptive_field_size + return int(fan_in), int(fan_out) + + +def weight_variable(shape): + """ + Generate weight var. + + Args: + shape (tuple): Shape. + + Returns: + Tensor, var. + """ + scale_shape = shape + fan_in, fan_out = _compute_fans(scale_shape) + scale = 1.0 / max(1., (fan_in + fan_out) / 2.) 
+ limit = math.sqrt(3.0 * scale) + values = np.random.uniform(-limit, limit, shape).astype(np.float32) + return Tensor(values) + + +def one_weight(shape): + """ + Generate weight with ones. + + Args: + shape (tuple): Shape. + + Returns: + Tensor, var. + """ + ones = np.ones(shape).astype(np.float32) + return Tensor(ones) + + +def zero_weight(shape): + """ + Generate weight with zeros. + + Args: + shape (tuple): Shape. + + Returns: + Tensor, var. + """ + zeros = np.zeros(shape).astype(np.float32) + return Tensor(zeros) + + +def normal_weight(shape, num_units): + """ + Generate weight with normal dist. + + Args: + shape (tuple): Shape. + num_units (int): Dimension. + + Returns: + Tensor, var. + """ + norm = np.random.normal(0.0, num_units ** -0.5, shape).astype(np.float32) + return Tensor(norm) diff --git a/model_zoo/mass/src/utils/load_weights.py b/model_zoo/mass/src/utils/load_weights.py new file mode 100644 index 0000000000..c5b30fefe6 --- /dev/null +++ b/model_zoo/mass/src/utils/load_weights.py @@ -0,0 +1,52 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Weight loader.""" +import numpy as np +from mindspore.train.serialization import load_checkpoint + + +def load_infer_weights(config): + """ + Load weights from ckpt or npz. + + Args: + config (TransformerConfig): Config. + + Returns: + dict, weights. 
+ """ + model_path = config.existed_ckpt + if model_path.endswith(".npz"): + ms_ckpt = np.load(model_path) + is_npz = True + else: + ms_ckpt = load_checkpoint(model_path) + is_npz = False + weights = {} + with open("variable_after_deal.txt", "a") as f: + for param_name in ms_ckpt: + infer_name = param_name.replace("transformer.transformer.", "") + if not infer_name.startswith("encoder"): + if infer_name.startswith("decoder.layers."): + infer_name = infer_name.replace("decoder.layers.", "decoder.layer") + infer_name = "decoder.decoder." + infer_name + if is_npz: + weights[infer_name] = ms_ckpt[param_name] + else: + weights[infer_name] = ms_ckpt[param_name].data.asnumpy() + f.write(infer_name) + f.write("\n") + f.close() + return weights diff --git a/model_zoo/mass/src/utils/loss_monitor.py b/model_zoo/mass/src/utils/loss_monitor.py new file mode 100644 index 0000000000..80b95c0c12 --- /dev/null +++ b/model_zoo/mass/src/utils/loss_monitor.py @@ -0,0 +1,62 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Loss monitor.""" +import time +from mindspore.train.callback import Callback +from config import TransformerConfig + + +class LossCallBack(Callback): + """ + Monitor the loss in training. + + If the loss is NAN or INF terminating training. + + Note: + If per_print_times is 0 do not print loss. 
+ + Args: + per_print_times (int): Print loss every times. Default: 1. + """ + time_stamp_init = False + time_stamp_first = 0 + + def __init__(self, config: TransformerConfig, per_print_times: int = 1): + super(LossCallBack, self).__init__() + if not isinstance(per_print_times, int) or per_print_times < 0: + raise ValueError("print_step must be int and >= 0.") + self.config = config + self._per_print_times = per_print_times + + if not self.time_stamp_init: + self.time_stamp_first = self._get_ms_timestamp() + self.time_stamp_init = True + + def step_end(self, run_context): + cb_params = run_context.original_args() + file_name = "./loss.log" + with open(file_name, "a+") as f: + time_stamp_current = self._get_ms_timestamp() + f.write("time: {}, epoch: {}, step: {}, outputs are {}.\n".format( + time_stamp_current - self.time_stamp_first, + cb_params.cur_epoch_num, + cb_params.cur_step_num, + str(cb_params.net_outputs) + )) + + @staticmethod + def _get_ms_timestamp(): + t = time.time() + return int(round(t * 1000)) diff --git a/model_zoo/mass/src/utils/lr_scheduler.py b/model_zoo/mass/src/utils/lr_scheduler.py new file mode 100644 index 0000000000..44ef397fdd --- /dev/null +++ b/model_zoo/mass/src/utils/lr_scheduler.py @@ -0,0 +1,107 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""Learning scheduler.""" +from math import ceil + +import numpy as np + + +def square_root_schedule(lr, update_num, decay_start_step, + warmup_steps=2000, + min_lr=1e-5): + """ + Decay the LR based on the ISR(inverse square root). + + During warm-up:: + lrs = np.linspace(0, lr, warmup_steps) + + After warm-up: + decay_factor = lr * sqrt(warmup_steps) + lr = decay_factor / sqrt(step) if step >= decay_start_step else lr + + Args: + lr (float): Init learning rate. + update_num (int): Total steps. + decay_start_step (int): Decay begins after `decay_start_step` steps. + warmup_steps (int): Warm up steps. + min_lr (float): Min learning rate. + + Returns: + np.ndarray, learning rate array. + """ + warmup_end_lr = lr + warmup_init_lr = 0 if warmup_steps > 0 else warmup_end_lr + + # If warmup_init_lr > lr, then lr_step is negative. + # Otherwise, it's positive. + lr_step = (warmup_end_lr - warmup_init_lr) / warmup_steps + decay_factor = lr * warmup_steps ** 0.5 + + lrs = np.empty(shape=update_num, dtype=np.float32) + _start_step = 0 + if 0 < warmup_steps < update_num: + lrs[:warmup_steps] = np.linspace(warmup_init_lr, warmup_end_lr, warmup_steps) + _start_step = warmup_steps + + for step in range(_start_step, update_num): + if step < warmup_steps: + _lr = warmup_init_lr + step * lr_step + elif step < decay_start_step: + _lr = lr + else: + _lr = decay_factor * step ** -0.5 + if _lr < min_lr: + _lr = min_lr + lrs[step] = _lr + + return lrs + + +def polynomial_decay_scheduler(lr, min_lr, decay_steps, total_update_num, warmup_steps=1000, power=1.0): + """ + Implements of polynomial decay learning rate scheduler which cycles by default. + + Args: + lr (float): Initial learning rate. + warmup_steps (int): Warmup steps. + decay_steps (int): Decay steps. + total_update_num (int): Total update steps. + min_lr (float): Min learning. + power (float): Power factor. 
+ + Returns: + np.ndarray, learning rate of each step. + """ + lrs = np.zeros(shape=total_update_num, dtype=np.float32) + + if decay_steps <= 0: + raise ValueError("`decay_steps` must larger than 1.") + + _start_step = 0 + if 0 < warmup_steps < total_update_num: + warmup_end_lr = lr + warmup_init_lr = 0 if warmup_steps > 0 else warmup_end_lr + lrs[:warmup_steps] = np.linspace(warmup_init_lr, warmup_end_lr, warmup_steps) + _start_step = warmup_steps + + decay_steps = decay_steps + for step in range(_start_step, total_update_num): + _step = step - _start_step # 2999 + ratio = ceil(_step / decay_steps) # 3 + ratio = 1 if ratio < 1 else ratio + _decay_steps = decay_steps * ratio # 3000 + lrs[step] = (lr - min_lr) * pow(1 - _step / _decay_steps, power) + min_lr + + return lrs diff --git a/model_zoo/mass/src/utils/ppl_score.py b/model_zoo/mass/src/utils/ppl_score.py new file mode 100644 index 0000000000..2e5d6e6642 --- /dev/null +++ b/model_zoo/mass/src/utils/ppl_score.py @@ -0,0 +1,64 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Calculate Perplexity score under N-gram language model.""" +from typing import Union + +import numpy as np + +NINF = -1.0 * 1e9 + + +def ngram_ppl(prob: Union[np.ndarray, list], log_softmax=False, index: float = np.e): + """ + Calculate Perplexity(PPL) score under N-gram language model. 
+ + Please make sure the sum of `prob` is 1. + Otherwise, assign `normalize=True`. + + The number of N is depended by model. + + Args: + prob (Union[list, np.ndarray]): Prediction probability + of the sentence. + log_softmax (bool): If sum of `prob` is not 1, please + set normalize=True. + index (float): Base number of log softmax. + + Returns: + float, ppl score. + """ + eps = 1e-8 + if not isinstance(prob, (np.ndarray, list)): + raise TypeError("`prob` must be type of list or np.ndarray.") + if not isinstance(prob, np.ndarray): + prob = np.array(prob) + if prob.shape[0] == 0: + raise ValueError("`prob` length must greater than 0.") + + p = 1.0 + sen_len = 0 + for t in range(prob.shape[0]): + s = prob[t] + if s <= NINF: + break + if log_softmax: + s = np.power(index, s) + p *= (1 / (s + eps)) + sen_len += 1 + + if sen_len == 0: + return np.inf + + return pow(p, 1 / sen_len) diff --git a/model_zoo/mass/src/utils/preprocess.py b/model_zoo/mass/src/utils/preprocess.py new file mode 100644 index 0000000000..04f7eeaf5c --- /dev/null +++ b/model_zoo/mass/src/utils/preprocess.py @@ -0,0 +1,127 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""Create pre-training dataset.""" +import os +from multiprocessing import Pool, cpu_count + +from src.dataset import MonoLingualDataLoader +from src.language_model import LooseMaskedLanguageModel + + +def _create_pre_train(text_file, vocabulary, output_folder_path, + mask_ratio, + mask_all_prob, + min_sen_len, + max_sen_len, + suffix, + dataset_type): + """ + Create pre-training dataset. + + Args: + text_file (str): Text file path. + vocabulary (Dictionary): Vocab instance. + output_folder_path (str): Output folder path. + mask_ratio (float): Mask ratio. + mask_all_prob (float): Mask all ratio. + min_sen_len (int): Minimum sentence length. + max_sen_len (int): Maximum sentence length. + suffix (str): Suffix of output file. + dataset_type (str): Tfrecord or mindrecord. + """ + suffix = suffix if not suffix else "_" + suffix + loader = MonoLingualDataLoader( + src_filepath=text_file, + lang="en", dictionary=vocabulary, + language_model=LooseMaskedLanguageModel(mask_ratio=mask_ratio, mask_all_prob=mask_all_prob), + max_sen_len=max_sen_len, min_sen_len=min_sen_len + ) + src_file_name = os.path.basename(text_file) + if dataset_type.lower() == "tfrecord": + file_name = os.path.join( + output_folder_path, + src_file_name.replace('.txt', f'_len_{max_sen_len}{suffix}.tfrecord') + ) + loader.write_to_tfrecord(path=file_name) + else: + file_name = os.path.join( + output_folder_path, + src_file_name.replace('.txt', f'_len_{max_sen_len}{suffix}.mindrecord') + ) + loader.write_to_mindrecord(path=file_name) + + +def create_pre_training_dataset(folder_path, + output_folder_path, + vocabulary, + prefix, suffix="", + mask_ratio=0.3, + mask_all_prob=None, + min_sen_len=7, + max_sen_len=82, + dataset_type="tfrecord", + cores=2): + """ + Create pre-training dataset. + + Args: + folder_path (str): Text file folder path. + vocabulary (Dictionary): Vocab instance. 
+ output_folder_path (str): Output folder path. + mask_ratio (float): Mask ratio. + mask_all_prob (float): Mask all ratio. + min_sen_len (int): Minimum sentence length. + max_sen_len (int): Maximum sentence length. + prefix (str): Prefix of text file. + suffix (str): Suffix of output file. + dataset_type (str): Tfrecord or mindrecord. + cores (int): Cores to use. + """ + # Second step of data preparation. + # Create mono zh-zh train MindRecord. + if not os.path.exists(output_folder_path): + raise NotADirectoryError(f"`output_folder_path` is not existed.") + if not os.path.isdir(output_folder_path): + raise NotADirectoryError(f"`output_folder_path` must be a dir.") + + data_file = [] + dirs = os.listdir(folder_path) + for file in dirs: + if file.startswith(prefix) and file.endswith(".txt"): + data_file.append(os.path.join(folder_path, file)) + + if not data_file: + raise FileNotFoundError("No available text file found.") + + args_groups = [] + for text_file in data_file: + args_groups.append((text_file, + vocabulary, + output_folder_path, + mask_ratio, + mask_all_prob, + min_sen_len, + max_sen_len, + suffix, + dataset_type)) + + cores = min(cores, cpu_count()) + pool = Pool(cores) + for arg in args_groups: + pool.apply_async(_create_pre_train, args=arg) + pool.close() + pool.join() + + print(f" | Generate Dataset for Pre-training is done.") diff --git a/model_zoo/mass/src/utils/rouge_score.py b/model_zoo/mass/src/utils/rouge_score.py new file mode 100644 index 0000000000..f453b5d2e1 --- /dev/null +++ b/model_zoo/mass/src/utils/rouge_score.py @@ -0,0 +1,60 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Calculate ROUGE score.""" +from typing import List +from rouge import Rouge + +H_PATH = "summaries.txt" +R_PATH = "references.txt" + + +def rouge(hypothesis: List[str], target: List[str]): + """ + Calculate ROUGE score. + + Args: + hypothesis (List[str]): Inference result. + target (List[str]): Reference. + """ + + def cut(s): + idx = s.find("") + if idx != -1: + s = s[:idx] + return s + + if not hypothesis or not target: + raise ValueError(f"`hypothesis` and `target` can not be None.") + + edited_hyp = [] + edited_ref = [] + for h, r in zip(hypothesis, target): + h = cut(h).replace("", "").strip() + r = cut(r).replace("", "").strip() + edited_hyp.append(h + "\n") + edited_ref.append(r + "\n") + + _rouge = Rouge() + scores = _rouge.get_scores(edited_hyp, target, avg=True) + print(" | ROUGE Score:") + print(f" | RG-1(F): {scores['rouge-1']['f'] * 100:8.2f}") + print(f" | RG-2(F): {scores['rouge-2']['f'] * 100:8.2f}") + print(f" | RG-L(F): {scores['rouge-l']['f'] * 100:8.2f}") + + with open(H_PATH, "w") as f: + f.writelines(edited_hyp) + + with open(R_PATH, "w") as f: + f.writelines(edited_ref) diff --git a/model_zoo/mass/tokenize_corpus.py b/model_zoo/mass/tokenize_corpus.py new file mode 100644 index 0000000000..4717cfdd12 --- /dev/null +++ b/model_zoo/mass/tokenize_corpus.py @@ -0,0 +1,97 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Tokenizer.""" +import os +import argparse +from typing import Callable +from multiprocessing import Pool + +parser = argparse.ArgumentParser(description='Corpus tokenizer which text file must end with `.txt`.') +parser.add_argument("--corpus_folder", type=str, default="", required=True, + help="Corpus folder path, if multi-folders are provided, use ',' split folders.") +parser.add_argument("--output_folder", type=str, default="", required=True, + help="Output folder path.") +parser.add_argument("--tokenizer", type=str, default="nltk", required=False, + help="Tokenizer to be used, nltk or jieba, if nltk is not installed fully, " + "use jieba instead.") +parser.add_argument("--pool_size", type=int, default=2, required=False, + help="Processes pool size.") + +TOKENIZER = Callable + + +def create_tokenized_sentences(file_path, tokenized_file): + """ + Create tokenized sentences. + + Args: + file_path (str): Text file. + tokenized_file (str): Output file. 
+ """ + global TOKENIZER + + print(f" | Processing {file_path}.") + tokenized_sen = [] + with open(file_path, "r") as file: + for sen in file: + tokens = TOKENIZER(sen) + tokens = [t for t in tokens if t != " "] + if len(tokens) > 175: + continue + tokenized_sen.append(" ".join(tokens) + "\n") + + with open(tokenized_file, "w") as file: + file.writelines(tokenized_sen) + print(f" | Wrote to {tokenized_file}.") + + +def tokenize(): + """Tokenizer.""" + global TOKENIZER + + args, _ = parser.parse_known_args() + src_folder = args.corpus_folder.split(",") + + try: + from nltk.tokenize import word_tokenize + + TOKENIZER = word_tokenize + except (ImportError, ModuleNotFoundError, LookupError): + try: + import jieba + except Exception as e: + raise e + + print(" | NLTK is not found, use jieba instead.") + TOKENIZER = jieba.cut + + if args.tokenizer == "jieba": + import jieba + TOKENIZER = jieba.cut + + pool = Pool(args.pool_size) + for folder in src_folder: + for file in os.listdir(folder): + if not file.endswith(".txt"): + continue + file_path = os.path.join(folder, file) + out_path = os.path.join(args.output_folder, file.replace(".txt", "_tokenized.txt")) + pool.apply_async(create_tokenized_sentences, (file_path, out_path,)) + pool.close() + pool.join() + + +if __name__ == '__main__': + tokenize() diff --git a/model_zoo/mass/train.py b/model_zoo/mass/train.py new file mode 100644 index 0000000000..05b96ddae3 --- /dev/null +++ b/model_zoo/mass/train.py @@ -0,0 +1,330 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Train api.""" +import os +import argparse +import pickle + +import numpy as np + +import mindspore.common.dtype as mstype +from mindspore.common.tensor import Tensor +from mindspore.nn import Momentum +from mindspore.nn.optim import Adam, Lamb +from mindspore.train.model import Model +from mindspore.train.loss_scale_manager import DynamicLossScaleManager +from mindspore.train.callback import CheckpointConfig, ModelCheckpoint +from mindspore import context, ParallelMode, Parameter +from mindspore.communication import management as MultiAscend +from mindspore.train.serialization import load_checkpoint + +from config import TransformerConfig +from src.dataset import load_dataset +from src.transformer import TransformerNetworkWithLoss, TransformerTrainOneStepWithLossScaleCell +from src.transformer.infer_mass import infer +from src.utils import LossCallBack +from src.utils import one_weight, zero_weight, weight_variable +from src.utils import square_root_schedule +from src.utils.lr_scheduler import polynomial_decay_scheduler + +parser = argparse.ArgumentParser(description='MASS train entry point.') +parser.add_argument("--config", type=str, required=True, help="model config json file path.") + +device_id = os.getenv('DEVICE_ID', None) +if device_id is None: + raise RuntimeError("`DEVICE_ID` can not be None.") + +device_id = int(device_id) +context.set_context( + mode=context.GRAPH_MODE, + device_target="Ascend", + reserve_class_name_in_scope=False, + device_id=device_id) + + +def get_config(config): + config = TransformerConfig.from_json_file(config) + config.compute_type = mstype.float16 + config.dtype = mstype.float32 + return config + + +def _train(model, config: TransformerConfig, + pre_training_dataset=None, fine_tune_dataset=None, test_dataset=None, + callbacks: list = None): + 
""" + Train model. + + Args: + model (Model): MindSpore model instance. + config (TransformerConfig): Config of mass model. + pre_training_dataset (Dataset): Pre-training dataset. + fine_tune_dataset (Dataset): Fine-tune dataset. + test_dataset (Dataset): Test dataset. + callbacks (list): A list of callbacks. + """ + callbacks = callbacks if callbacks else [] + + if pre_training_dataset is not None: + print(" | Start pre-training job.") + epoch_size = pre_training_dataset.get_repeat_count() + if os.getenv("RANK_SIZE") is not None and int(os.getenv("RANK_SIZE")) > 1: + print(f" | Rank {MultiAscend.get_rank()} Call model train.") + model.train(epoch_size, pre_training_dataset, + callbacks=callbacks, dataset_sink_mode=config.dataset_sink_mode) + # Test the accuracy of the model. + if test_dataset is not None: + print(" | Start test job.") + result = infer(_config) + with open("validation_res_after_pre_training.bin", "wb") as f: + pickle.dump(result, f, 1) + + if fine_tune_dataset is not None: + print(" | Start fine-tuning job.") + epoch_size = fine_tune_dataset.get_repeat_count() + + model.train(epoch_size, fine_tune_dataset, + callbacks=callbacks, dataset_sink_mode=config.dataset_sink_mode) + + # Test the accuracy of the model. + if test_dataset is not None: + print(" | Start test job.") + result = infer(_config) + with open("validation_res_after_pre_training.bin", "wb") as f: + pickle.dump(result, f, 1) + + +def _build_training_pipeline(config: TransformerConfig, + pre_training_dataset=None, + fine_tune_dataset=None, + test_dataset=None): + """ + Build training pipeline. + + Args: + config (TransformerConfig): Config of mass model. + pre_training_dataset (Dataset): Pre-training dataset. + fine_tune_dataset (Dataset): Fine-tune dataset. + test_dataset (Dataset): Test dataset. 
+ """ + net_with_loss = TransformerNetworkWithLoss(config, is_training=True) + + if config.existed_ckpt: + if config.existed_ckpt.endswith(".npz"): + weights = np.load(config.existed_ckpt) + else: + weights = load_checkpoint(config.existed_ckpt) + for param in net_with_loss.trainable_params(): + weights_name = param.name + if weights_name not in weights: + raise ValueError(f"Param {weights_name} is not found in ckpt file.") + + if isinstance(weights[weights_name], Parameter): + param.default_input = weights[weights_name].default_input + elif isinstance(weights[weights_name], Tensor): + param.default_input = Tensor(weights[weights_name].asnumpy(), config.dtype) + elif isinstance(weights[weights_name], np.ndarray): + param.default_input = Tensor(weights[weights_name], config.dtype) + else: + param.default_input = weights[weights_name] + else: + for param in net_with_loss.trainable_params(): + name = param.name + value = param.default_input + if isinstance(value, Tensor): + if name.endswith(".gamma"): + param.default_input = one_weight(value.asnumpy().shape) + elif name.endswith(".beta") or name.endswith(".bias"): + param.default_input = zero_weight(value.asnumpy().shape) + else: + param.default_input = weight_variable(value.asnumpy().shape) + + dataset = pre_training_dataset if pre_training_dataset is not None \ + else fine_tune_dataset + + if dataset is None: + raise ValueError("pre-training dataset or fine-tuning dataset must be provided one.") + + update_steps = dataset.get_repeat_count() * dataset.get_dataset_size() + if config.lr_scheduler == "isr": + lr = Tensor(square_root_schedule(lr=config.lr, + update_num=update_steps, + decay_start_step=config.decay_start_step, + warmup_steps=config.warmup_steps, + min_lr=config.min_lr), dtype=mstype.float32) + elif config.lr_scheduler == "poly": + lr = Tensor(polynomial_decay_scheduler(lr=config.lr, + min_lr=config.min_lr, + decay_steps=config.decay_steps, + total_update_num=update_steps, + 
warmup_steps=config.warmup_steps, + power=config.poly_lr_scheduler_power), dtype=mstype.float32) + else: + lr = config.lr + + if config.optimizer.lower() == "adam": + optimizer = Adam(net_with_loss.trainable_params(), lr, beta1=0.9, beta2=0.98) + elif config.optimizer.lower() == "lamb": + optimizer = Lamb(net_with_loss.trainable_params(), decay_steps=12000, + start_learning_rate=config.lr, end_learning_rate=config.min_lr, + power=10.0, warmup_steps=config.warmup_steps, weight_decay=0.01, + eps=1e-6) + elif config.optimizer.lower() == "momentum": + optimizer = Momentum(net_with_loss.trainable_params(), lr, momentum=0.9) + else: + raise ValueError(f"optimizer only support `adam` and `momentum` now.") + + # Dynamic loss scale. + scale_manager = DynamicLossScaleManager(init_loss_scale=config.init_loss_scale, + scale_factor=config.loss_scale_factor, + scale_window=config.scale_window) + net_with_grads = TransformerTrainOneStepWithLossScaleCell( + network=net_with_loss, optimizer=optimizer, + scale_update_cell=scale_manager.get_update_cell() + ) + net_with_grads.set_train(True) + model = Model(net_with_grads) + loss_monitor = LossCallBack(config) + ckpt_config = CheckpointConfig(save_checkpoint_steps=config.save_ckpt_steps, + keep_checkpoint_max=config.keep_ckpt_max) + + rank_size = os.getenv('RANK_SIZE') + callbacks = [loss_monitor] + if rank_size is not None and int(rank_size) > 1 and MultiAscend.get_rank() % 8 == 0: + ckpt_callback = ModelCheckpoint( + prefix=config.ckpt_prefix, + directory=os.path.join(config.ckpt_path, 'ckpt_{}'.format(os.getenv('DEVICE_ID'))), + config=ckpt_config) + callbacks.append(ckpt_callback) + + if rank_size is None or int(rank_size) == 1: + ckpt_callback = ModelCheckpoint( + prefix=config.ckpt_prefix, + directory=os.path.join(config.ckpt_path, 'ckpt_{}'.format(os.getenv('DEVICE_ID'))), + config=ckpt_config) + callbacks.append(ckpt_callback) + + print(f" | ALL SET, PREPARE TO TRAIN.") + _train(model=model, config=config, + 
pre_training_dataset=pre_training_dataset, + fine_tune_dataset=fine_tune_dataset, + test_dataset=test_dataset, + callbacks=callbacks) + + +def _setup_parallel_env(): + context.reset_auto_parallel_context() + MultiAscend.init() + context.set_auto_parallel_context( + parallel_mode=ParallelMode.DATA_PARALLEL, + device_num=MultiAscend.get_group_size(), + parameter_broadcast=True, + mirror_mean=True + ) + + +def train_parallel(config: TransformerConfig): + """ + Train model with multi ascend chips. + + Args: + config (TransformerConfig): Config for MASS model. + """ + _setup_parallel_env() + + print(f" | Starting training on {os.getenv('RANK_SIZE', None)} devices.") + + pre_train_dataset = load_dataset( + data_files=config.pre_train_dataset, + batch_size=config.batch_size, epoch_count=config.epochs, + sink_mode=config.dataset_sink_mode, + sink_step=config.dataset_sink_step, + rank_size=MultiAscend.get_group_size(), + rank_id=MultiAscend.get_rank() + ) if config.pre_train_dataset else None + fine_tune_dataset = load_dataset( + data_files=config.fine_tune_dataset, + batch_size=config.batch_size, epoch_count=config.epochs, + sink_mode=config.dataset_sink_mode, + sink_step=config.dataset_sink_step, + rank_size=MultiAscend.get_group_size(), + rank_id=MultiAscend.get_rank() + ) if config.fine_tune_dataset else None + test_dataset = load_dataset( + data_files=config.test_dataset, + batch_size=config.batch_size, epoch_count=config.epochs, + sink_mode=config.dataset_sink_mode, + sink_step=config.dataset_sink_step, + rank_size=MultiAscend.get_group_size(), + rank_id=MultiAscend.get_rank() + ) if config.test_dataset else None + + _build_training_pipeline(config=config, + pre_training_dataset=pre_train_dataset, + fine_tune_dataset=fine_tune_dataset, + test_dataset=test_dataset) + + +def train_single(config: TransformerConfig): + """ + Train model on single device. + + Args: + config (TransformerConfig): Config for model. 
+ """ + print(" | Starting training on single device.") + pre_train_dataset = load_dataset(data_files=config.pre_train_dataset, + batch_size=config.batch_size, + epoch_count=config.epochs, + sink_mode=config.dataset_sink_mode, + sink_step=config.dataset_sink_step) if config.pre_train_dataset else None + fine_tune_dataset = load_dataset(data_files=config.fine_tune_dataset, + batch_size=config.batch_size, + epoch_count=config.epochs, + sink_mode=config.dataset_sink_mode, + sink_step=config.dataset_sink_step) if config.fine_tune_dataset else None + test_dataset = load_dataset(data_files=config.test_dataset, + batch_size=config.batch_size, + epoch_count=config.epochs, + sink_mode=config.dataset_sink_mode, + sink_step=config.dataset_sink_step) if config.test_dataset else None + + _build_training_pipeline(config=config, + pre_training_dataset=pre_train_dataset, + fine_tune_dataset=fine_tune_dataset, + test_dataset=test_dataset) + + +def _check_args(config): + if not os.path.exists(config): + raise FileNotFoundError("`config` is not existed.") + if not isinstance(config, str): + raise ValueError("`config` must be type of str.") + + +if __name__ == '__main__': + _rank_size = os.getenv('RANK_SIZE') + + args, _ = parser.parse_known_args() + _check_args(args.config) + _config = get_config(args.config) + + np.random.seed(_config.random_seed) + context.set_context(save_graphs=_config.save_graphs) + + if _rank_size is not None and int(_rank_size) > 1: + train_parallel(_config) + else: + train_single(_config) diff --git a/model_zoo/mass/weights_average.py b/model_zoo/mass/weights_average.py new file mode 100644 index 0000000000..911181ba45 --- /dev/null +++ b/model_zoo/mass/weights_average.py @@ -0,0 +1,81 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Weight average.""" +import os +import argparse +import numpy as np +from mindspore.train.serialization import load_checkpoint + +parser = argparse.ArgumentParser(description='transformer') +parser.add_argument("--input_files", type=str, default=None, required=False, + help="Multi ckpt files path.") +parser.add_argument("--input_folder", type=str, default=None, required=False, + help="Ckpt files folder.") +parser.add_argument("--output_file", type=str, default=None, required=True, + help="Output model file path.") + + +def average_me_models(ckpt_list): + """ + Average multi ckpt params. + + Args: + ckpt_list (list): Ckpt paths. + + Returns: + dict, params dict. 
+ """ + avg_model = {} + # load all checkpoint + for ckpt in ckpt_list: + if not ckpt.endswith(".ckpt"): + continue + if not os.path.exists(ckpt): + raise FileNotFoundError(f"Checkpoint file is not existed.") + + print(f" | Loading ckpt from {ckpt}.") + ms_ckpt = load_checkpoint(ckpt) + for param_name in ms_ckpt: + if param_name not in avg_model: + avg_model[param_name] = [] + avg_model[param_name].append(ms_ckpt[param_name].data.asnumpy()) + + for name in avg_model: + avg_model[name] = sum(avg_model[name]) / float(len(ckpt_list)) + + return avg_model + + +def main(): + """Entry point.""" + args, _ = parser.parse_known_args() + + if not args.input_files and not args.input_folder: + raise ValueError("`--input_files` or `--input_folder` must be provided one as least.") + + ckpt_list = [] + if args.input_files: + ckpt_list.extend(args.input_files.split(",")) + + if args.input_folder and os.path.exists(args.input_folder) and os.path.isdir(args.input_folder): + for file in os.listdir(args.input_folder): + ckpt_list.append(os.path.join(args.input_folder, file)) + + avg_weights = average_me_models(ckpt_list) + np.savez(args.output_file, **avg_weights) + + +if __name__ == '__main__': + main() diff --git a/model_zoo/mobilenetv2/src/mobilenetV2.py b/model_zoo/mobilenetv2/src/mobilenetV2.py index df35c5f369..5b1b4cc5ef 100644 --- a/model_zoo/mobilenetv2/src/mobilenetV2.py +++ b/model_zoo/mobilenetv2/src/mobilenetV2.py @@ -267,21 +267,21 @@ class MobileNetV2(nn.Cell): if isinstance(m, (nn.Conv2d, DepthwiseConv)): n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels m.weight.set_parameter_data(Tensor(np.random.normal(0, np.sqrt(2. 
/ n), - m.weight.data.shape()).astype("float32"))) + m.weight.data.shape).astype("float32"))) if m.bias is not None: m.bias.set_parameter_data( - Tensor(np.zeros(m.bias.data.shape(), dtype="float32"))) + Tensor(np.zeros(m.bias.data.shape, dtype="float32"))) elif isinstance(m, nn.BatchNorm2d): m.gamma.set_parameter_data( - Tensor(np.ones(m.gamma.data.shape(), dtype="float32"))) + Tensor(np.ones(m.gamma.data.shape, dtype="float32"))) m.beta.set_parameter_data( - Tensor(np.zeros(m.beta.data.shape(), dtype="float32"))) + Tensor(np.zeros(m.beta.data.shape, dtype="float32"))) elif isinstance(m, nn.Dense): m.weight.set_parameter_data(Tensor(np.random.normal( - 0, 0.01, m.weight.data.shape()).astype("float32"))) + 0, 0.01, m.weight.data.shape).astype("float32"))) if m.bias is not None: m.bias.set_parameter_data( - Tensor(np.zeros(m.bias.data.shape(), dtype="float32"))) + Tensor(np.zeros(m.bias.data.shape, dtype="float32"))) def mobilenet_v2(**kwargs): diff --git a/model_zoo/mobilenetv3/src/mobilenetV3.py b/model_zoo/mobilenetv3/src/mobilenetV3.py index 820e60493f..61b63f9ea1 100644 --- a/model_zoo/mobilenetv3/src/mobilenetV3.py +++ b/model_zoo/mobilenetv3/src/mobilenetV3.py @@ -322,21 +322,21 @@ class MobileNetV3(nn.Cell): if isinstance(m, (nn.Conv2d)): n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels m.weight.set_parameter_data(Tensor(np.random.normal(0, np.sqrt(2. 
/ n), - m.weight.data.shape()).astype("float32"))) + m.weight.data.shape).astype("float32"))) if m.bias is not None: m.bias.set_parameter_data( - Tensor(np.zeros(m.bias.data.shape(), dtype="float32"))) + Tensor(np.zeros(m.bias.data.shape, dtype="float32"))) elif isinstance(m, nn.BatchNorm2d): m.gamma.set_parameter_data( - Tensor(np.ones(m.gamma.data.shape(), dtype="float32"))) + Tensor(np.ones(m.gamma.data.shape, dtype="float32"))) m.beta.set_parameter_data( - Tensor(np.zeros(m.beta.data.shape(), dtype="float32"))) + Tensor(np.zeros(m.beta.data.shape, dtype="float32"))) elif isinstance(m, nn.Dense): m.weight.set_parameter_data(Tensor(np.random.normal( - 0, 0.01, m.weight.data.shape()).astype("float32"))) + 0, 0.01, m.weight.data.shape).astype("float32"))) if m.bias is not None: m.bias.set_parameter_data( - Tensor(np.zeros(m.bias.data.shape(), dtype="float32"))) + Tensor(np.zeros(m.bias.data.shape, dtype="float32"))) def mobilenet_v3(model_name, **kwargs): diff --git a/example/resnet101_imagenet2012/README.md b/model_zoo/resnet101/README.md similarity index 77% rename from example/resnet101_imagenet2012/README.md rename to model_zoo/resnet101/README.md index 6ccaf5f6b6..86744be372 100644 --- a/example/resnet101_imagenet2012/README.md +++ b/model_zoo/resnet101/README.md @@ -20,19 +20,24 @@ This is an example of training ResNet101 with ImageNet dataset in MindSpore. └─validation_preprocess ``` -## Example structure +## Structure ```shell . 
-├── crossentropy.py # CrossEntropy loss function -├── config.py # parameter configuration -├── dataset.py # data preprocessing -├── eval.py # eval net -├── lr_generator.py # generate learning rate -├── run_distribute_train.sh # launch distributed training(8p) -├── run_infer.sh # launch evaluating -├── run_standalone_train.sh # launch standalone training(1p) -└── train.py # train net +└─resnet101 + ├─README.md + ├─scripts + ├─run_standalone_train.sh # launch standalone training(1p) + ├─run_distribute_train.sh # launch distributed training(8p) + └─run_eval.sh # launch evaluating + ├─src + ├─config.py # parameter configuration + ├─crossentropy.py # CrossEntropy loss function + ├─dataset.py # data preprocessing + ├─lr_generator.py # generate learning rate + ├─resnet101.py # resnet101 backbone + ├─eval.py # eval net + └─train.py # train net ``` ## Parameter configuration @@ -95,7 +100,7 @@ sh run_standalone_train.sh dataset/ilsvrc ./ckpt/pretrained.ckpt #### Result -Training result will be stored in the example path, whose folder name begins with "train" or "train_parallel". You can find checkpoint file together with result like the followings in log. +Training result will be stored in the scripts path, whose folder name begins with "train" or "train_parallel". You can find checkpoint file together with result like the followings in log. ``` @@ -119,14 +124,14 @@ epoch: 70 step: 5004, loss is 1.8717369 ``` ``` # infer -sh run_infer.sh [VALIDATION_DATASET_PATH] [CHECKPOINT_PATH] +sh run_eval.sh [VALIDATION_DATASET_PATH] [CHECKPOINT_PATH] ``` #### Launch ```bash # infer with checkpoint -sh run_infer.sh dataset/validation_preprocess/ train_parallel0/resnet-120_5004.ckpt +sh run_eval.sh dataset/validation_preprocess/ train_parallel0/resnet-120_5004.ckpt ``` @@ -135,7 +140,7 @@ sh run_infer.sh dataset/validation_preprocess/ train_parallel0/resnet-120_5004.c #### Result -Inference result will be stored in the example path, whose folder name is "infer". 
Under this, you can find result like the followings in log. +Inference result will be stored in the scripts path, whose folder name is "eval". Under this, you can find result like the followings in log. ``` result: {'top_5_accuracy': 0.9429417413572343, 'top_1_accuracy': 0.7853513124199744} ckpt=train_parallel0/resnet-120_5004.ckpt diff --git a/example/resnet101_imagenet2012/eval.py b/model_zoo/resnet101/eval.py similarity index 95% rename from example/resnet101_imagenet2012/eval.py rename to model_zoo/resnet101/eval.py index 88d942866b..73c0289ebd 100755 --- a/example/resnet101_imagenet2012/eval.py +++ b/model_zoo/resnet101/eval.py @@ -19,16 +19,16 @@ import os import argparse import random import numpy as np -from dataset import create_dataset -from config import config from mindspore import context -from mindspore.model_zoo.resnet import resnet101 from mindspore.parallel._auto_parallel_context import auto_parallel_context from mindspore.train.model import Model, ParallelMode from mindspore.train.serialization import load_checkpoint, load_param_into_net import mindspore.dataset.engine as de from mindspore.communication.management import init -from crossentropy import CrossEntropy +from src.resnet101 import resnet101 +from src.dataset import create_dataset +from src.config import config +from src.crossentropy import CrossEntropy random.seed(1) np.random.seed(1) diff --git a/example/resnet101_imagenet2012/run_distribute_train.sh b/model_zoo/resnet101/scripts/run_distribute_train.sh similarity index 96% rename from example/resnet101_imagenet2012/run_distribute_train.sh rename to model_zoo/resnet101/scripts/run_distribute_train.sh index 8f8021202d..65790b88c1 100755 --- a/example/resnet101_imagenet2012/run_distribute_train.sh +++ b/model_zoo/resnet101/scripts/run_distribute_train.sh @@ -67,8 +67,9 @@ do export RANK_ID=$i rm -rf ./train_parallel$i mkdir ./train_parallel$i - cp *.py ./train_parallel$i + cp ../*.py ./train_parallel$i cp *.sh ./train_parallel$i + cp -r 
../src ./train_parallel$i cd ./train_parallel$i || exit echo "start training for rank $RANK_ID, device $DEVICE_ID" env > env.log diff --git a/example/resnet101_imagenet2012/run_infer.sh b/model_zoo/resnet101/scripts/run_eval.sh similarity index 87% rename from example/resnet101_imagenet2012/run_infer.sh rename to model_zoo/resnet101/scripts/run_eval.sh index b82427e15f..88f5d364ce 100755 --- a/example/resnet101_imagenet2012/run_infer.sh +++ b/model_zoo/resnet101/scripts/run_eval.sh @@ -16,7 +16,7 @@ if [ $# != 2 ] then - echo "Usage: sh run_infer.sh [DATASET_PATH] [CHECKPOINT_PATH]" + echo "Usage: sh run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH]" exit 1 fi @@ -50,14 +50,15 @@ export DEVICE_ID=0 export RANK_SIZE=$DEVICE_NUM export RANK_ID=0 -if [ -d "infer" ]; +if [ -d "eval" ]; then - rm -rf ./infer + rm -rf ./eval fi -mkdir ./infer -cp *.py ./infer -cp *.sh ./infer -cd ./infer || exit +mkdir ./eval +cp ../*.py ./eval +cp *.sh ./eval +cp -r ../src ./eval +cd ./eval || exit env > env.log echo "start infering for device $DEVICE_ID" python eval.py --do_eval=True --dataset_path=$PATH1 --checkpoint_path=$PATH2 &> log & diff --git a/example/resnet101_imagenet2012/run_standalone_train.sh b/model_zoo/resnet101/scripts/run_standalone_train.sh similarity index 97% rename from example/resnet101_imagenet2012/run_standalone_train.sh rename to model_zoo/resnet101/scripts/run_standalone_train.sh index 7db8b5d7bc..7214d114d5 100755 --- a/example/resnet101_imagenet2012/run_standalone_train.sh +++ b/model_zoo/resnet101/scripts/run_standalone_train.sh @@ -58,8 +58,9 @@ then rm -rf ./train fi mkdir ./train -cp *.py ./train +cp ../*.py ./train cp *.sh ./train +cp -r ../src ./train cd ./train || exit echo "start training for device $DEVICE_ID" env > env.log diff --git a/example/resnet101_imagenet2012/config.py b/model_zoo/resnet101/src/config.py similarity index 100% rename from example/resnet101_imagenet2012/config.py rename to model_zoo/resnet101/src/config.py diff --git 
a/example/resnet101_imagenet2012/crossentropy.py b/model_zoo/resnet101/src/crossentropy.py similarity index 100% rename from example/resnet101_imagenet2012/crossentropy.py rename to model_zoo/resnet101/src/crossentropy.py diff --git a/example/resnet101_imagenet2012/dataset.py b/model_zoo/resnet101/src/dataset.py similarity index 99% rename from example/resnet101_imagenet2012/dataset.py rename to model_zoo/resnet101/src/dataset.py index 31377cfc12..b2a074a535 100755 --- a/example/resnet101_imagenet2012/dataset.py +++ b/model_zoo/resnet101/src/dataset.py @@ -20,7 +20,7 @@ import mindspore.common.dtype as mstype import mindspore.dataset.engine as de import mindspore.dataset.transforms.vision.c_transforms as C import mindspore.dataset.transforms.c_transforms as C2 -from config import config +from src.config import config def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32): """ diff --git a/example/resnet101_imagenet2012/lr_generator.py b/model_zoo/resnet101/src/lr_generator.py similarity index 100% rename from example/resnet101_imagenet2012/lr_generator.py rename to model_zoo/resnet101/src/lr_generator.py diff --git a/model_zoo/resnet101/src/resnet101.py b/model_zoo/resnet101/src/resnet101.py new file mode 100755 index 0000000000..33f10fd6cb --- /dev/null +++ b/model_zoo/resnet101/src/resnet101.py @@ -0,0 +1,261 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""ResNet101.""" +import numpy as np +import mindspore.nn as nn +from mindspore.ops import operations as P +from mindspore.common.tensor import Tensor + + +def _weight_variable(shape, factor=0.01): + init_value = np.random.randn(*shape).astype(np.float32) * factor + return Tensor(init_value) + + +def _conv3x3(in_channel, out_channel, stride=1): + weight_shape = (out_channel, in_channel, 3, 3) + weight = _weight_variable(weight_shape) + return nn.Conv2d(in_channel, out_channel, + kernel_size=3, stride=stride, padding=0, pad_mode='same', weight_init=weight) + + +def _conv1x1(in_channel, out_channel, stride=1): + weight_shape = (out_channel, in_channel, 1, 1) + weight = _weight_variable(weight_shape) + return nn.Conv2d(in_channel, out_channel, + kernel_size=1, stride=stride, padding=0, pad_mode='same', weight_init=weight) + + +def _conv7x7(in_channel, out_channel, stride=1): + weight_shape = (out_channel, in_channel, 7, 7) + weight = _weight_variable(weight_shape) + return nn.Conv2d(in_channel, out_channel, + kernel_size=7, stride=stride, padding=0, pad_mode='same', weight_init=weight) + + +def _bn(channel): + return nn.BatchNorm2d(channel, eps=1e-4, momentum=0.9, + gamma_init=1, beta_init=0, moving_mean_init=0, moving_var_init=1) + + +def _bn_last(channel): + return nn.BatchNorm2d(channel, eps=1e-4, momentum=0.9, + gamma_init=0, beta_init=0, moving_mean_init=0, moving_var_init=1) + + +def _fc(in_channel, out_channel): + weight_shape = (out_channel, in_channel) + weight = _weight_variable(weight_shape) + return nn.Dense(in_channel, out_channel, has_bias=True, weight_init=weight, bias_init=0) + + +class ResidualBlock(nn.Cell): + """ + ResNet V1 residual block definition. + + Args: + in_channel (int): Input channel. + out_channel (int): Output channel. + stride (int): Stride size for the first convolutional layer. Default: 1. + + Returns: + Tensor, output tensor. 
+ + Examples: + >>> ResidualBlock(3, 256, stride=2) + """ + expansion = 4 + + def __init__(self, + in_channel, + out_channel, + stride=1): + super(ResidualBlock, self).__init__() + + channel = out_channel // self.expansion + self.conv1 = _conv1x1(in_channel, channel, stride=1) + self.bn1 = _bn(channel) + + self.conv2 = _conv3x3(channel, channel, stride=stride) + self.bn2 = _bn(channel) + + self.conv3 = _conv1x1(channel, out_channel, stride=1) + self.bn3 = _bn_last(out_channel) + + self.relu = nn.ReLU() + + self.down_sample = False + + if stride != 1 or in_channel != out_channel: + self.down_sample = True + self.down_sample_layer = None + + if self.down_sample: + self.down_sample_layer = nn.SequentialCell([_conv1x1(in_channel, out_channel, stride), + _bn(out_channel)]) + self.add = P.TensorAdd() + + def construct(self, x): + identity = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.down_sample: + identity = self.down_sample_layer(identity) + + out = self.add(out, identity) + out = self.relu(out) + + return out + + +class ResNet(nn.Cell): + """ + ResNet architecture. + + Args: + block (Cell): Block for network. + layer_nums (list): Numbers of block in different layers. + in_channels (list): Input channel in each layer. + out_channels (list): Output channel in each layer. + strides (list): Stride size in each layer. + num_classes (int): The number of classes that the training images are belonging to. + Returns: + Tensor, output tensor. 
+ + Examples: + >>> ResNet(ResidualBlock, + >>> [3, 4, 6, 3], + >>> [64, 256, 512, 1024], + >>> [256, 512, 1024, 2048], + >>> [1, 2, 2, 2], + >>> 10) + """ + + def __init__(self, + block, + layer_nums, + in_channels, + out_channels, + strides, + num_classes): + super(ResNet, self).__init__() + + if not len(layer_nums) == len(in_channels) == len(out_channels) == 4: + raise ValueError("the length of layer_num, in_channels, out_channels list must be 4!") + + self.conv1 = _conv7x7(3, 64, stride=2) + self.bn1 = _bn(64) + self.relu = P.ReLU() + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, pad_mode="same") + + self.layer1 = self._make_layer(block, + layer_nums[0], + in_channel=in_channels[0], + out_channel=out_channels[0], + stride=strides[0]) + self.layer2 = self._make_layer(block, + layer_nums[1], + in_channel=in_channels[1], + out_channel=out_channels[1], + stride=strides[1]) + self.layer3 = self._make_layer(block, + layer_nums[2], + in_channel=in_channels[2], + out_channel=out_channels[2], + stride=strides[2]) + self.layer4 = self._make_layer(block, + layer_nums[3], + in_channel=in_channels[3], + out_channel=out_channels[3], + stride=strides[3]) + + self.mean = P.ReduceMean(keep_dims=True) + self.flatten = nn.Flatten() + self.end_point = _fc(out_channels[3], num_classes) + + def _make_layer(self, block, layer_num, in_channel, out_channel, stride): + """ + Make stage network of ResNet. + + Args: + block (Cell): Resnet block. + layer_num (int): Layer number. + in_channel (int): Input channel. + out_channel (int): Output channel. + stride (int): Stride size for the first convolutional layer. + + Returns: + SequentialCell, the output layer. 
+ + Examples: + >>> _make_layer(ResidualBlock, 3, 128, 256, 2) + """ + layers = [] + + resnet_block = block(in_channel, out_channel, stride=stride) + layers.append(resnet_block) + + for _ in range(1, layer_num): + resnet_block = block(out_channel, out_channel, stride=1) + layers.append(resnet_block) + + return nn.SequentialCell(layers) + + def construct(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + c1 = self.maxpool(x) + + c2 = self.layer1(c1) + c3 = self.layer2(c2) + c4 = self.layer3(c3) + c5 = self.layer4(c4) + + out = self.mean(c5, (2, 3)) + out = self.flatten(out) + out = self.end_point(out) + + return out + +def resnet101(class_num=1001): + """ + Get ResNet101 neural network. + + Args: + class_num (int): Class number. + + Returns: + Cell, cell instance of ResNet101 neural network. + + Examples: + >>> net = resnet101(1001) + """ + return ResNet(ResidualBlock, + [3, 4, 23, 3], + [64, 256, 512, 1024], + [256, 512, 1024, 2048], + [1, 2, 2, 2], + class_num) diff --git a/example/resnet101_imagenet2012/train.py b/model_zoo/resnet101/train.py similarity index 93% rename from example/resnet101_imagenet2012/train.py rename to model_zoo/resnet101/train.py index e3d6adb267..1cd3627a11 100755 --- a/example/resnet101_imagenet2012/train.py +++ b/model_zoo/resnet101/train.py @@ -17,12 +17,8 @@ import os import argparse import random import numpy as np -from dataset import create_dataset -from lr_generator import warmup_cosine_annealing_lr -from config import config from mindspore import context from mindspore import Tensor -from mindspore.model_zoo.resnet import resnet101 from mindspore.parallel._auto_parallel_context import auto_parallel_context from mindspore.nn.optim.momentum import Momentum from mindspore.train.model import Model, ParallelMode @@ -33,7 +29,11 @@ import mindspore.dataset.engine as de from mindspore.communication.management import init import mindspore.nn as nn import mindspore.common.initializer as weight_init -from crossentropy 
import CrossEntropy +from src.resnet101 import resnet101 +from src.dataset import create_dataset +from src.lr_generator import warmup_cosine_annealing_lr +from src.config import config +from src.crossentropy import CrossEntropy random.seed(1) np.random.seed(1) @@ -66,12 +66,12 @@ if __name__ == '__main__': for _, cell in net.cells_and_names(): if isinstance(cell, nn.Conv2d): cell.weight.default_input = weight_init.initializer(weight_init.XavierUniform(), - cell.weight.default_input.shape(), - cell.weight.default_input.dtype()).to_tensor() + cell.weight.default_input.shape, + cell.weight.default_input.dtype).to_tensor() if isinstance(cell, nn.Dense): cell.weight.default_input = weight_init.initializer(weight_init.TruncatedNormal(), - cell.weight.default_input.shape(), - cell.weight.default_input.dtype()).to_tensor() + cell.weight.default_input.shape, + cell.weight.default_input.dtype).to_tensor() if not config.label_smooth: config.label_smooth_factor = 0.0 loss = CrossEntropy(smooth_factor=config.label_smooth_factor, num_classes=config.class_num) diff --git a/model_zoo/ssd/README.md b/model_zoo/ssd/README.md new file mode 100644 index 0000000000..ded107e499 --- /dev/null +++ b/model_zoo/ssd/README.md @@ -0,0 +1,119 @@ +# SSD Example + +## Description + +SSD network based on MobileNetV2, with support for training and evaluation. + +## Requirements + +- Install [MindSpore](https://www.mindspore.cn/install/en). + +- Dataset + + We use coco2017 as training dataset in this example by default, and you can also use your own datasets. + + 1. If coco dataset is used. **Select dataset to coco when run script.** + Install Cython and pycocotool. + + ``` + pip install Cython + + pip install pycocotools + ``` + And change the coco_root and other settings you need in `config.py`. The directory structure is as follows: + + + ``` + . + └─cocodataset + ├─annotations + ├─instance_train2017.json + └─instance_val2017.json + ├─val2017 + └─train2017 + ``` + + 2. 
If your own dataset is used. **Select dataset to other when run script.** + Organize the dataset information into a TXT file, each row in the file is as follows: + + ``` + train2017/0000001.jpg 0,259,401,459,7 35,28,324,201,2 0,30,59,80,2 + ``` + + Each row is an image annotation which split by space, the first column is a relative path of image, the others are box and class information of the format [xmin,ymin,xmax,ymax,class]. We read image from an image path joined by the `image_dir`(dataset directory) and the relative path in `anno_path`(the TXT file path), `image_dir` and `anno_path` are set in `config.py`. + + +## Running the example + +### Training + +To train the model, run `train.py`. If the `mindrecord_dir` is empty, it will generate [mindrecord](https://www.mindspore.cn/tutorial/en/master/use/data_preparation/converting_datasets.html) files by `coco_root`(coco dataset) or `image_dir` and `anno_path`(own dataset). **Note if mindrecord_dir isn't empty, it will use mindrecord_dir instead of raw images.** + + +- Stand alone mode + + ``` + python train.py --dataset coco + + ``` + + You can run ```python train.py -h``` to get more information. + + +- Distribute mode + + ``` + sh run_distribute_train.sh 8 500 0.2 coco /data/hccl.json + ``` + + The input parameters are device numbers, epoch size, learning rate, dataset mode and [hccl json configuration file](https://www.mindspore.cn/tutorial/en/master/advanced_use/distributed_training.html). **It is better to use absolute path.** + +You will get the loss value of each step as following: + +``` +epoch: 1 step: 458, loss is 3.1681802 +epoch time: 228752.4654865265, per step time: 499.4595316299705 +epoch: 2 step: 458, loss is 2.8847265 +epoch time: 38912.93382644653, per step time: 84.96273761232868 +epoch: 3 step: 458, loss is 2.8398118 +epoch time: 38769.184827804565, per step time: 84.64887516987896 +... 
+ +epoch: 498 step: 458, loss is 0.70908034 +epoch time: 38771.079778671265, per step time: 84.65301261718616 +epoch: 499 step: 458, loss is 0.7974688 +epoch time: 38787.413120269775, per step time: 84.68867493508685 +epoch: 500 step: 458, loss is 0.5548882 +epoch time: 39064.8467540741, per step time: 85.29442522723602 +``` + +### Evaluation + +for evaluation , run `eval.py` with `checkpoint_path`. `checkpoint_path` is the path of [checkpoint](https://www.mindspore.cn/tutorial/en/master/use/saving_and_loading_model_parameters.html) file. + +``` +python eval.py --checkpoint_path ssd.ckpt --dataset coco +``` + +You can run ```python eval.py -h``` to get more information. + +You will get the result as following: + +``` +Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.189 +Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.341 +Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.183 +Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.040 +Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.181 +Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.326 +Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.213 +Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.348 +Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.380 +Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.124 +Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.412 +Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.588 + +======================================== + +mAP: 0.18937438355383837 +``` diff --git a/example/ssd_coco2017/eval.py b/model_zoo/ssd/eval.py similarity index 78% rename from example/ssd_coco2017/eval.py rename to model_zoo/ssd/eval.py index d5e0d86b67..9054bf6f24 100644 --- a/example/ssd_coco2017/eval.py +++ b/model_zoo/ssd/eval.py @@ -14,49 +14,51 @@ # 
============================================================================ """Evaluation for SSD""" + import os import argparse import time +import numpy as np from mindspore import context, Tensor from mindspore.train.serialization import load_checkpoint, load_param_into_net -from mindspore.model_zoo.ssd import SSD300, ssd_mobilenet_v2 -from dataset import create_ssd_dataset, data_to_mindrecord_byte_image -from config import ConfigSSD -from util import metrics +from src.ssd import SSD300, ssd_mobilenet_v2 +from src.dataset import create_ssd_dataset, data_to_mindrecord_byte_image +from src.config import config +from src.coco_eval import metrics def ssd_eval(dataset_path, ckpt_path): """SSD evaluation.""" - - ds = create_ssd_dataset(dataset_path, batch_size=1, repeat_num=1, is_training=False) - net = SSD300(ssd_mobilenet_v2(), ConfigSSD(), is_training=False) + batch_size = 1 + ds = create_ssd_dataset(dataset_path, batch_size=batch_size, repeat_num=1, is_training=False) + net = SSD300(ssd_mobilenet_v2(), config, is_training=False) print("Load Checkpoint!") param_dict = load_checkpoint(ckpt_path) net.init_parameters_data() load_param_into_net(net, param_dict) net.set_train(False) - i = 1. 
- total = ds.get_dataset_size() + i = batch_size + total = ds.get_dataset_size() * batch_size start = time.time() pred_data = [] print("\n========================================\n") print("total images num: ", total) print("Processing, please wait a moment.") for data in ds.create_dict_iterator(): + img_id = data['img_id'] img_np = data['image'] image_shape = data['image_shape'] - annotation = data['annotation'] output = net(Tensor(img_np)) for batch_idx in range(img_np.shape[0]): pred_data.append({"boxes": output[0].asnumpy()[batch_idx], "box_scores": output[1].asnumpy()[batch_idx], - "annotation": annotation, - "image_shape": image_shape}) - percent = round(i / total * 100, 2) + "img_id": int(np.squeeze(img_id[batch_idx])), + "image_shape": image_shape[batch_idx]}) + percent = round(i / total * 100., 2) print(f' {str(percent)} [{i}/{total}]', end='\r') - i += 1 + i += batch_size cost_time = int((time.time() - start) * 1000) print(f' 100% [{total}/{total}] cost {cost_time} ms') mAP = metrics(pred_data) @@ -73,22 +75,21 @@ if __name__ == '__main__': context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=args_opt.device_id) - config = ConfigSSD() prefix = "ssd_eval.mindrecord" - mindrecord_dir = config.MINDRECORD_DIR + mindrecord_dir = config.mindrecord_dir mindrecord_file = os.path.join(mindrecord_dir, prefix + "0") if not os.path.exists(mindrecord_file): if not os.path.isdir(mindrecord_dir): os.makedirs(mindrecord_dir) if args_opt.dataset == "coco": - if os.path.isdir(config.COCO_ROOT): + if os.path.isdir(config.coco_root): print("Create Mindrecord.") data_to_mindrecord_byte_image("coco", False, prefix) print("Create Mindrecord Done, at {}".format(mindrecord_dir)) else: - print("COCO_ROOT not exits.") + print("coco_root not exits.") else: - if os.path.isdir(config.IMAGE_DIR) and os.path.exists(config.ANNO_PATH): + if os.path.isdir(config.image_dir) and os.path.exists(config.anno_path): print("Create Mindrecord.") 
data_to_mindrecord_byte_image("other", False, prefix) print("Create Mindrecord Done, at {}".format(mindrecord_dir)) diff --git a/example/ssd_coco2017/run_distribute_train.sh b/model_zoo/ssd/scripts/run_distribute_train.sh similarity index 74% rename from example/ssd_coco2017/run_distribute_train.sh rename to model_zoo/ssd/scripts/run_distribute_train.sh index bd8519be41..60eccf2c40 100644 --- a/example/ssd_coco2017/run_distribute_train.sh +++ b/model_zoo/ssd/scripts/run_distribute_train.sh @@ -14,60 +14,62 @@ # limitations under the License. # ============================================================================ -echo "=================================================================================================================" +echo "==============================================================================================================" echo "Please run the scipt as: " -echo "sh run_distribute_train.sh DEVICE_NUM EPOCH_SIZE DATASET MINDSPORE_HCCL_CONFIG_PATH PRE_TRAINED PRE_TRAINED_EPOCH_SIZE" -echo "for example: sh run_distribute_train.sh 8 350 coco /data/hccl.json /opt/ssd-300.ckpt(optional) 200(optional)" +echo "sh run_distribute_train.sh DEVICE_NUM EPOCH_SIZE LR DATASET MINDSPORE_HCCL_CONFIG_PATH PRE_TRAINED PRE_TRAINED_EPOCH_SIZE" +echo "for example: sh run_distribute_train.sh 8 500 0.2 coco /data/hccl.json /opt/ssd-300.ckpt(optional) 200(optional)" echo "It is better to use absolute path." -echo "The learning rate is 0.4 as default, if you want other lr, please change the value in this script." 
echo "=================================================================================================================" -if [ $# != 4 ] && [ $# != 6 ] +if [ $# != 5 ] && [ $# != 7 ] then - echo "Usage: sh run_distribute_train.sh [DEVICE_NUM] [EPOCH_SIZE] [DATASET] \ + echo "Usage: sh run_distribute_train.sh [DEVICE_NUM] [EPOCH_SIZE] [LR] [DATASET] \ [MINDSPORE_HCCL_CONFIG_PATH] [PRE_TRAINED](optional) [PRE_TRAINED_EPOCH_SIZE](optional)" exit 1 fi # Before start distribute train, first create mindrecord files. +BASE_PATH=$(cd "`dirname $0`" || exit; pwd) +cd $BASE_PATH/../ || exit python train.py --only_create_dataset=1 echo "After running the scipt, the network runs in the background. The log will be generated in LOGx/log.txt" export RANK_SIZE=$1 EPOCH_SIZE=$2 -DATASET=$3 -PRE_TRAINED=$5 -PRE_TRAINED_EPOCH_SIZE=$6 -export MINDSPORE_HCCL_CONFIG_PATH=$4 - +LR=$3 +DATASET=$4 +PRE_TRAINED=$6 +PRE_TRAINED_EPOCH_SIZE=$7 +export MINDSPORE_HCCL_CONFIG_PATH=$5 for((i=0;i env.log - if [ $# == 4 ] + if [ $# == 5 ] then - python ../train.py \ + python train.py \ --distribute=1 \ - --lr=0.4 \ + --lr=$LR \ --dataset=$DATASET \ --device_num=$RANK_SIZE \ --device_id=$DEVICE_ID \ --epoch_size=$EPOCH_SIZE > log.txt 2>&1 & fi - if [ $# == 6 ] + if [ $# == 7 ] then - python ../train.py \ + python train.py \ --distribute=1 \ - --lr=0.4 \ + --lr=$LR \ --dataset=$DATASET \ --device_num=$RANK_SIZE \ --device_id=$DEVICE_ID \ diff --git a/model_zoo/ssd/src/__init__.py b/model_zoo/ssd/src/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/model_zoo/ssd/src/box_utils.py b/model_zoo/ssd/src/box_utils.py new file mode 100644 index 0000000000..5e75ab6a4e --- /dev/null +++ b/model_zoo/ssd/src/box_utils.py @@ -0,0 +1,165 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Bbox utils""" + +import math +import itertools as it +import numpy as np +from .config import config + + +class GeneratDefaultBoxes(): + """ + Generate Default boxes for SSD, follows the order of (W, H, archor_sizes). + `self.default_boxes` has a shape of [archor_sizes, H, W, 4], the last dimension is [y, x, h, w]. + `self.default_boxes_ltrb` has a shape as `self.default_boxes`, the last dimension is [y1, x1, y2, x2]. + """ + def __init__(self): + fk = config.img_shape[0] / np.array(config.steps) + scale_rate = (config.max_scale - config.min_scale) / (len(config.num_default) - 1) + scales = [config.min_scale + scale_rate * i for i in range(len(config.num_default))] + [1.0] + self.default_boxes = [] + for idex, feature_size in enumerate(config.feature_size): + sk1 = scales[idex] + sk2 = scales[idex + 1] + sk3 = math.sqrt(sk1 * sk2) + if idex == 0: + w, h = sk1 * math.sqrt(2), sk1 / math.sqrt(2) + all_sizes = [(0.1, 0.1), (w, h), (h, w)] + else: + all_sizes = [(sk1, sk1)] + for aspect_ratio in config.aspect_ratios[idex]: + w, h = sk1 * math.sqrt(aspect_ratio), sk1 / math.sqrt(aspect_ratio) + all_sizes.append((w, h)) + all_sizes.append((h, w)) + all_sizes.append((sk3, sk3)) + + assert len(all_sizes) == config.num_default[idex] + + for i, j in it.product(range(feature_size), repeat=2): + for w, h in all_sizes: + cx, cy = (j + 0.5) / fk[idex], (i + 0.5) / fk[idex] + self.default_boxes.append([cy, cx, h, w]) + + def to_ltrb(cy, cx, h, w): + return cy - h / 2, cx - w / 2, cy + h 
/ 2, cx + w / 2 + + # For IoU calculation + self.default_boxes_ltrb = np.array(tuple(to_ltrb(*i) for i in self.default_boxes), dtype='float32') + self.default_boxes = np.array(self.default_boxes, dtype='float32') + + +default_boxes_ltrb = GeneratDefaultBoxes().default_boxes_ltrb +default_boxes = GeneratDefaultBoxes().default_boxes +y1, x1, y2, x2 = np.split(default_boxes_ltrb[:, :4], 4, axis=-1) +vol_anchors = (x2 - x1) * (y2 - y1) +matching_threshold = config.match_thershold + + +def ssd_bboxes_encode(boxes): + """ + Labels anchors with ground truth inputs. + + Args: + boxex: ground truth with shape [N, 5], for each row, it stores [y, x, h, w, cls]. + + Returns: + gt_loc: location ground truth with shape [num_anchors, 4]. + gt_label: class ground truth with shape [num_anchors, 1]. + num_matched_boxes: number of positives in an image. + """ + + def jaccard_with_anchors(bbox): + """Compute jaccard score a box and the anchors.""" + # Intersection bbox and volume. + ymin = np.maximum(y1, bbox[0]) + xmin = np.maximum(x1, bbox[1]) + ymax = np.minimum(y2, bbox[2]) + xmax = np.minimum(x2, bbox[3]) + w = np.maximum(xmax - xmin, 0.) + h = np.maximum(ymax - ymin, 0.) + + # Volumes. + inter_vol = h * w + union_vol = vol_anchors + (bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) - inter_vol + jaccard = inter_vol / union_vol + return np.squeeze(jaccard) + + pre_scores = np.zeros((config.num_ssd_boxes), dtype=np.float32) + t_boxes = np.zeros((config.num_ssd_boxes, 4), dtype=np.float32) + t_label = np.zeros((config.num_ssd_boxes), dtype=np.int64) + for bbox in boxes: + label = int(bbox[4]) + scores = jaccard_with_anchors(bbox) + idx = np.argmax(scores) + scores[idx] = 2.0 + mask = (scores > matching_threshold) + mask = mask & (scores > pre_scores) + pre_scores = np.maximum(pre_scores, scores * mask) + t_label = mask * label + (1 - mask) * t_label + for i in range(4): + t_boxes[:, i] = mask * bbox[i] + (1 - mask) * t_boxes[:, i] + + index = np.nonzero(t_label) + + # Transform to ltrb. 
+ bboxes = np.zeros((config.num_ssd_boxes, 4), dtype=np.float32) + bboxes[:, [0, 1]] = (t_boxes[:, [0, 1]] + t_boxes[:, [2, 3]]) / 2 + bboxes[:, [2, 3]] = t_boxes[:, [2, 3]] - t_boxes[:, [0, 1]] + + # Encode features. + bboxes_t = bboxes[index] + default_boxes_t = default_boxes[index] + bboxes_t[:, :2] = (bboxes_t[:, :2] - default_boxes_t[:, :2]) / (default_boxes_t[:, 2:] * config.prior_scaling[0]) + bboxes_t[:, 2:4] = np.log(bboxes_t[:, 2:4] / default_boxes_t[:, 2:4]) / config.prior_scaling[1] + bboxes[index] = bboxes_t + + num_match = np.array([len(np.nonzero(t_label)[0])], dtype=np.int32) + return bboxes, t_label.astype(np.int32), num_match + + +def ssd_bboxes_decode(boxes): + """Decode predict boxes to [y, x, h, w]""" + boxes_t = boxes.copy() + default_boxes_t = default_boxes.copy() + boxes_t[:, :2] = boxes_t[:, :2] * config.prior_scaling[0] * default_boxes_t[:, 2:] + default_boxes_t[:, :2] + boxes_t[:, 2:4] = np.exp(boxes_t[:, 2:4] * config.prior_scaling[1]) * default_boxes_t[:, 2:4] + + bboxes = np.zeros((len(boxes_t), 4), dtype=np.float32) + + bboxes[:, [0, 1]] = boxes_t[:, [0, 1]] - boxes_t[:, [2, 3]] / 2 + bboxes[:, [2, 3]] = boxes_t[:, [0, 1]] + boxes_t[:, [2, 3]] / 2 + + return np.clip(bboxes, 0, 1) + + +def intersect(box_a, box_b): + """Compute the intersect of two sets of boxes.""" + max_yx = np.minimum(box_a[:, 2:4], box_b[2:4]) + min_yx = np.maximum(box_a[:, :2], box_b[:2]) + inter = np.clip((max_yx - min_yx), a_min=0, a_max=np.inf) + return inter[:, 0] * inter[:, 1] + + +def jaccard_numpy(box_a, box_b): + """Compute the jaccard overlap of two sets of boxes.""" + inter = intersect(box_a, box_b) + area_a = ((box_a[:, 2] - box_a[:, 0]) * + (box_a[:, 3] - box_a[:, 1])) + area_b = ((box_b[2] - box_b[0]) * + (box_b[3] - box_b[1])) + union = area_a + area_b - inter + return inter / union diff --git a/model_zoo/ssd/src/coco_eval.py b/model_zoo/ssd/src/coco_eval.py new file mode 100644 index 0000000000..eb36618089 --- /dev/null +++ 
b/model_zoo/ssd/src/coco_eval.py @@ -0,0 +1,127 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Coco metrics utils""" + +import os +import json +import numpy as np +from .config import config +from .box_utils import ssd_bboxes_decode + + +def apply_nms(all_boxes, all_scores, thres, max_boxes): + """Apply NMS to bboxes.""" + y1 = all_boxes[:, 0] + x1 = all_boxes[:, 1] + y2 = all_boxes[:, 2] + x2 = all_boxes[:, 3] + areas = (x2 - x1 + 1) * (y2 - y1 + 1) + + order = all_scores.argsort()[::-1] + keep = [] + + while order.size > 0: + i = order[0] + keep.append(i) + + if len(keep) >= max_boxes: + break + + xx1 = np.maximum(x1[i], x1[order[1:]]) + yy1 = np.maximum(y1[i], y1[order[1:]]) + xx2 = np.minimum(x2[i], x2[order[1:]]) + yy2 = np.minimum(y2[i], y2[order[1:]]) + + w = np.maximum(0.0, xx2 - xx1 + 1) + h = np.maximum(0.0, yy2 - yy1 + 1) + inter = w * h + + ovr = inter / (areas[i] + areas[order[1:]] - inter) + + inds = np.where(ovr <= thres)[0] + + order = order[inds + 1] + return keep + + +def metrics(pred_data): + """Calculate mAP of predicted bboxes.""" + from pycocotools.coco import COCO + from pycocotools.cocoeval import COCOeval + num_classes = config.num_classes + + coco_root = config.coco_root + data_type = config.val_data_type + + #Classes need to train or test. 
+ val_cls = config.coco_classes + val_cls_dict = {} + for i, cls in enumerate(val_cls): + val_cls_dict[i] = cls + + anno_json = os.path.join(coco_root, config.instances_set.format(data_type)) + coco_gt = COCO(anno_json) + classs_dict = {} + cat_ids = coco_gt.loadCats(coco_gt.getCatIds()) + for cat in cat_ids: + classs_dict[cat["name"]] = cat["id"] + + predictions = [] + img_ids = [] + + for sample in pred_data: + pred_boxes = sample['boxes'] + box_scores = sample['box_scores'] + img_id = sample['img_id'] + h, w = sample['image_shape'] + + pred_boxes = ssd_bboxes_decode(pred_boxes) + final_boxes = [] + final_label = [] + final_score = [] + img_ids.append(img_id) + + for c in range(1, num_classes): + class_box_scores = box_scores[:, c] + score_mask = class_box_scores > config.min_score + class_box_scores = class_box_scores[score_mask] + class_boxes = pred_boxes[score_mask] * [h, w, h, w] + + if score_mask.any(): + nms_index = apply_nms(class_boxes, class_box_scores, config.nms_thershold, config.max_boxes) + class_boxes = class_boxes[nms_index] + class_box_scores = class_box_scores[nms_index] + + final_boxes += class_boxes.tolist() + final_score += class_box_scores.tolist() + final_label += [classs_dict[val_cls_dict[c]]] * len(class_box_scores) + + for loc, label, score in zip(final_boxes, final_label, final_score): + res = {} + res['image_id'] = img_id + res['bbox'] = [loc[1], loc[0], loc[3] - loc[1], loc[2] - loc[0]] + res['score'] = score + res['category_id'] = label + predictions.append(res) + with open('predictions.json', 'w') as f: + json.dump(predictions, f) + + coco_dt = coco_gt.loadRes('predictions.json') + E = COCOeval(coco_gt, coco_dt, iouType='bbox') + E.params.imgIds = img_ids + E.evaluate() + E.accumulate() + E.summarize() + return E.stats[0] diff --git a/model_zoo/ssd/src/config.py b/model_zoo/ssd/src/config.py new file mode 100644 index 0000000000..683b8de31f --- /dev/null +++ b/model_zoo/ssd/src/config.py @@ -0,0 +1,78 @@ +# Copyright 2020 Huawei 
Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#" ============================================================================ + +"""Config parameters for SSD models.""" + +from easydict import EasyDict as ed + +config = ed({ + "img_shape": [300, 300], + "num_ssd_boxes": 1917, + "neg_pre_positive": 3, + "match_thershold": 0.5, + "nms_thershold": 0.6, + "min_score": 0.1, + "max_boxes": 100, + + # learing rate settings + "global_step": 0, + "lr_init": 0.001, + "lr_end_rate": 0.001, + "warmup_epochs": 2, + "momentum": 0.9, + "weight_decay": 1.5e-4, + + # network + "num_default": [3, 6, 6, 6, 6, 6], + "extras_in_channels": [256, 576, 1280, 512, 256, 256], + "extras_out_channels": [576, 1280, 512, 256, 256, 128], + "extras_srides": [1, 1, 2, 2, 2, 2], + "extras_ratio": [0.2, 0.2, 0.2, 0.25, 0.5, 0.25], + "feature_size": [19, 10, 5, 3, 2, 1], + "min_scale": 0.2, + "max_scale": 0.95, + "aspect_ratios": [(2,), (2, 3), (2, 3), (2, 3), (2, 3), (2, 3)], + "steps": (16, 32, 64, 100, 150, 300), + "prior_scaling": (0.1, 0.2), + "gamma": 2.0, + "alpha": 0.75, + + # `mindrecord_dir` and `coco_root` are better to use absolute path. 
+ "mindrecord_dir": "/data/MindRecord_COCO", + "coco_root": "/data/coco2017", + "train_data_type": "train2017", + "val_data_type": "val2017", + "instances_set": "annotations/instances_{}.json", + "coco_classes": ('background', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', + 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', + 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', + 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', + 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', + 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', + 'kite', 'baseball bat', 'baseball glove', 'skateboard', + 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', + 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', + 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', + 'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', + 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', + 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', + 'refrigerator', 'book', 'clock', 'vase', 'scissors', + 'teddy bear', 'hair drier', 'toothbrush'), + "num_classes": 81, + + # if coco used, `image_dir` and `anno_path` are useless. + "image_dir": "", + "anno_path": "", +}) diff --git a/model_zoo/ssd/src/dataset.py b/model_zoo/ssd/src/dataset.py new file mode 100644 index 0000000000..19c66fc598 --- /dev/null +++ b/model_zoo/ssd/src/dataset.py @@ -0,0 +1,289 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""SSD dataset""" + +from __future__ import division + +import os +import cv2 +import numpy as np + +import mindspore.dataset as de +import mindspore.dataset.transforms.vision.c_transforms as C +from mindspore.mindrecord import FileWriter +from .config import config +from .box_utils import jaccard_numpy, ssd_bboxes_encode + + +def _rand(a=0., b=1.): + """Generate random.""" + return np.random.rand() * (b - a) + a + + +def random_sample_crop(image, boxes): + """Random Crop the image and boxes""" + height, width, _ = image.shape + min_iou = np.random.choice([None, 0.1, 0.3, 0.5, 0.7, 0.9]) + + if min_iou is None: + return image, boxes + + # max trails (50) + for _ in range(50): + image_t = image + + w = _rand(0.3, 1.0) * width + h = _rand(0.3, 1.0) * height + + # aspect ratio constraint b/t .5 & 2 + if h / w < 0.5 or h / w > 2: + continue + + left = _rand() * (width - w) + top = _rand() * (height - h) + + rect = np.array([int(top), int(left), int(top+h), int(left+w)]) + overlap = jaccard_numpy(boxes, rect) + + # dropout some boxes + drop_mask = overlap > 0 + if not drop_mask.any(): + continue + + if overlap[drop_mask].min() < min_iou: + continue + + image_t = image_t[rect[0]:rect[2], rect[1]:rect[3], :] + + centers = (boxes[:, :2] + boxes[:, 2:4]) / 2.0 + + m1 = (rect[0] < centers[:, 0]) * (rect[1] < centers[:, 1]) + m2 = (rect[2] > centers[:, 0]) * (rect[3] > centers[:, 1]) + + # mask in that both m1 and m2 are true + mask = m1 * m2 * drop_mask + + # have any valid boxes? 
try again if not + if not mask.any(): + continue + + # take only matching gt boxes + boxes_t = boxes[mask, :].copy() + + boxes_t[:, :2] = np.maximum(boxes_t[:, :2], rect[:2]) + boxes_t[:, :2] -= rect[:2] + boxes_t[:, 2:4] = np.minimum(boxes_t[:, 2:4], rect[2:4]) + boxes_t[:, 2:4] -= rect[:2] + + return image_t, boxes_t + return image, boxes + + +def preprocess_fn(img_id, image, box, is_training): + """Preprocess function for dataset.""" + def _infer_data(image, input_shape): + img_h, img_w, _ = image.shape + input_h, input_w = input_shape + + image = cv2.resize(image, (input_w, input_h)) + + #When the channels of image is 1 + if len(image.shape) == 2: + image = np.expand_dims(image, axis=-1) + image = np.concatenate([image, image, image], axis=-1) + + return img_id, image, np.array((img_h, img_w), np.float32) + + def _data_aug(image, box, is_training, image_size=(300, 300)): + """Data augmentation function.""" + ih, iw, _ = image.shape + w, h = image_size + + if not is_training: + return _infer_data(image, image_size) + + # Random crop + box = box.astype(np.float32) + image, box = random_sample_crop(image, box) + ih, iw, _ = image.shape + + # Resize image + image = cv2.resize(image, (w, h)) + + # Flip image or not + flip = _rand() < .5 + if flip: + image = cv2.flip(image, 1, dst=None) + + # When the channels of image is 1 + if len(image.shape) == 2: + image = np.expand_dims(image, axis=-1) + image = np.concatenate([image, image, image], axis=-1) + + box[:, [0, 2]] = box[:, [0, 2]] / ih + box[:, [1, 3]] = box[:, [1, 3]] / iw + + if flip: + box[:, [1, 3]] = 1 - box[:, [3, 1]] + + box, label, num_match = ssd_bboxes_encode(box) + return image, box, label, num_match + return _data_aug(image, box, is_training, image_size=config.img_shape) + + +def create_coco_label(is_training): + """Get image path and annotation from COCO.""" + from pycocotools.coco import COCO + + coco_root = config.coco_root + data_type = config.val_data_type + if is_training: + data_type = 
config.train_data_type + + #Classes need to train or test. + train_cls = config.coco_classes + train_cls_dict = {} + for i, cls in enumerate(train_cls): + train_cls_dict[cls] = i + + anno_json = os.path.join(coco_root, config.instances_set.format(data_type)) + + coco = COCO(anno_json) + classs_dict = {} + cat_ids = coco.loadCats(coco.getCatIds()) + for cat in cat_ids: + classs_dict[cat["id"]] = cat["name"] + + image_ids = coco.getImgIds() + images = [] + image_path_dict = {} + image_anno_dict = {} + + for img_id in image_ids: + image_info = coco.loadImgs(img_id) + file_name = image_info[0]["file_name"] + anno_ids = coco.getAnnIds(imgIds=img_id, iscrowd=None) + anno = coco.loadAnns(anno_ids) + image_path = os.path.join(coco_root, data_type, file_name) + annos = [] + iscrowd = False + for label in anno: + bbox = label["bbox"] + class_name = classs_dict[label["category_id"]] + iscrowd = iscrowd or label["iscrowd"] + if class_name in train_cls: + x_min, x_max = bbox[0], bbox[0] + bbox[2] + y_min, y_max = bbox[1], bbox[1] + bbox[3] + annos.append(list(map(round, [y_min, x_min, y_max, x_max])) + [train_cls_dict[class_name]]) + + if not is_training and iscrowd: + continue + if len(annos) >= 1: + images.append(img_id) + image_path_dict[img_id] = image_path + image_anno_dict[img_id] = np.array(annos) + + return images, image_path_dict, image_anno_dict + + +def anno_parser(annos_str): + """Parse annotation from string to list.""" + annos = [] + for anno_str in annos_str: + anno = list(map(int, anno_str.strip().split(','))) + annos.append(anno) + return annos + + +def filter_valid_data(image_dir, anno_path): + """Filter valid image file, which both in image_dir and anno_path.""" + images = [] + image_path_dict = {} + image_anno_dict = {} + if not os.path.isdir(image_dir): + raise RuntimeError("Path given is not valid.") + if not os.path.isfile(anno_path): + raise RuntimeError("Annotation file is not valid.") + + with open(anno_path, "rb") as f: + lines = f.readlines() + for 
img_id, line in enumerate(lines): + line_str = line.decode("utf-8").strip() + line_split = str(line_str).split(' ') + file_name = line_split[0] + image_path = os.path.join(image_dir, file_name) + if os.path.isfile(image_path): + images.append(img_id) + image_path_dict[img_id] = image_path + image_anno_dict[img_id] = anno_parser(line_split[1:]) + + return images, image_path_dict, image_anno_dict + + +def data_to_mindrecord_byte_image(dataset="coco", is_training=True, prefix="ssd.mindrecord", file_num=8): + """Create MindRecord file.""" + mindrecord_dir = config.mindrecord_dir + mindrecord_path = os.path.join(mindrecord_dir, prefix) + writer = FileWriter(mindrecord_path, file_num) + if dataset == "coco": + images, image_path_dict, image_anno_dict = create_coco_label(is_training) + else: + images, image_path_dict, image_anno_dict = filter_valid_data(config.image_dir, config.anno_path) + + ssd_json = { + "img_id": {"type": "int32", "shape": [1]}, + "image": {"type": "bytes"}, + "annotation": {"type": "int32", "shape": [-1, 5]}, + } + writer.add_schema(ssd_json, "ssd_json") + + for img_id in images: + image_path = image_path_dict[img_id] + with open(image_path, 'rb') as f: + img = f.read() + annos = np.array(image_anno_dict[img_id], dtype=np.int32) + img_id = np.array([img_id], dtype=np.int32) + row = {"img_id": img_id, "image": img, "annotation": annos} + writer.write_raw_data([row]) + writer.commit() + + +def create_ssd_dataset(mindrecord_file, batch_size=32, repeat_num=10, device_num=1, rank=0, + is_training=True, num_parallel_workers=4): + """Creatr SSD dataset with MindDataset.""" + ds = de.MindDataset(mindrecord_file, columns_list=["img_id", "image", "annotation"], num_shards=device_num, + shard_id=rank, num_parallel_workers=num_parallel_workers, shuffle=is_training) + decode = C.Decode() + ds = ds.map(input_columns=["image"], operations=decode) + change_swap_op = C.HWC2CHW() + normalize_op = C.Normalize(mean=[0.485*255, 0.456*255, 0.406*255], std=[0.229*255, 
0.224*255, 0.225*255]) + color_adjust_op = C.RandomColorAdjust(brightness=0.4, contrast=0.4, saturation=0.4) + compose_map_func = (lambda img_id, image, annotation: preprocess_fn(img_id, image, annotation, is_training)) + if is_training: + output_columns = ["image", "box", "label", "num_match"] + trans = [color_adjust_op, normalize_op, change_swap_op] + else: + output_columns = ["img_id", "image", "image_shape"] + trans = [normalize_op, change_swap_op] + ds = ds.map(input_columns=["img_id", "image", "annotation"], + output_columns=output_columns, columns_order=output_columns, + operations=compose_map_func, python_multiprocessing=is_training, + num_parallel_workers=num_parallel_workers) + ds = ds.map(input_columns=["image"], operations=trans, python_multiprocessing=is_training, + num_parallel_workers=num_parallel_workers) + ds = ds.batch(batch_size, drop_remainder=True) + ds = ds.repeat(repeat_num) + return ds diff --git a/model_zoo/ssd/src/init_params.py b/model_zoo/ssd/src/init_params.py new file mode 100644 index 0000000000..6e1f8869b3 --- /dev/null +++ b/model_zoo/ssd/src/init_params.py @@ -0,0 +1,41 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""Parameters utils""" + +from mindspore import Tensor +from mindspore.common.initializer import initializer, TruncatedNormal + +def init_net_param(network, initialize_mode='TruncatedNormal'): + """Init the parameters in net.""" + params = network.trainable_params() + for p in params: + if isinstance(p.data, Tensor) and 'beta' not in p.name and 'gamma' not in p.name and 'bias' not in p.name: + if initialize_mode == 'TruncatedNormal': + p.set_parameter_data(initializer(TruncatedNormal(0.03), p.data.shape, p.data.dtype)) + else: + p.set_parameter_data(initialize_mode, p.data.shape, p.data.dtype) + + +def load_backbone_params(network, param_dict): + """Init the parameters from pre-train model, default is mobilenetv2.""" + for _, param in network.parameters_and_names(): + param_name = param.name.replace('network.backbone.', '') + name_split = param_name.split('.') + if 'features_1' in param_name: + param_name = param_name.replace('features_1', 'features') + if 'features_2' in param_name: + param_name = '.'.join(['features', str(int(name_split[1]) + 14)] + name_split[2:]) + if param_name in param_dict: + param.set_parameter_data(param_dict[param_name].data) diff --git a/model_zoo/ssd/src/lr_schedule.py b/model_zoo/ssd/src/lr_schedule.py new file mode 100644 index 0000000000..4df26b3905 --- /dev/null +++ b/model_zoo/ssd/src/lr_schedule.py @@ -0,0 +1,56 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Learning rate schedule""" + +import math +import numpy as np + + +def get_lr(global_step, lr_init, lr_end, lr_max, warmup_epochs, total_epochs, steps_per_epoch): + """ + generate learning rate array + + Args: + global_step(int): total steps of the training + lr_init(float): init learning rate + lr_end(float): end learning rate + lr_max(float): max learning rate + warmup_epochs(float): number of warmup epochs + total_epochs(int): total epoch of training + steps_per_epoch(int): steps of one epoch + + Returns: + np.array, learning rate array + """ + lr_each_step = [] + total_steps = steps_per_epoch * total_epochs + warmup_steps = steps_per_epoch * warmup_epochs + for i in range(total_steps): + if i < warmup_steps: + lr = lr_init + (lr_max - lr_init) * i / warmup_steps + else: + lr = lr_end + \ + (lr_max - lr_end) * \ + (1. + math.cos(math.pi * (i - warmup_steps) / (total_steps - warmup_steps))) / 2. 
+ if lr < 0.0: + lr = 0.0 + lr_each_step.append(lr) + + current_step = global_step + lr_each_step = np.array(lr_each_step).astype(np.float32) + learning_rate = lr_each_step[current_step:] + + return learning_rate diff --git a/mindspore/model_zoo/ssd.py b/model_zoo/ssd/src/ssd.py similarity index 74% rename from mindspore/model_zoo/ssd.py rename to model_zoo/ssd/src/ssd.py index b69942cd5c..d2fb64531e 100644 --- a/mindspore/model_zoo/ssd.py +++ b/model_zoo/ssd/src/ssd.py @@ -14,25 +14,17 @@ # ============================================================================ """SSD net based MobilenetV2.""" + import mindspore.common.dtype as mstype import mindspore as ms import mindspore.nn as nn -from mindspore import context +from mindspore import Parameter, context, Tensor from mindspore.parallel._auto_parallel_context import auto_parallel_context from mindspore.communication.management import get_group_size from mindspore.ops import operations as P from mindspore.ops import functional as F from mindspore.ops import composite as C from mindspore.common.initializer import initializer -from mindspore.ops.operations import TensorAdd -from mindspore import Parameter - - -def _conv2d(in_channel, out_channel, kernel_size=3, stride=1, pad_mod='same'): - weight_shape = (out_channel, in_channel, kernel_size, kernel_size) - weight = initializer('XavierUniform', shape=weight_shape, dtype=mstype.float32).to_tensor() - return nn.Conv2d(in_channel, out_channel, kernel_size=kernel_size, stride=stride, - padding=0, pad_mode=pad_mod, weight_init=weight) def _make_divisible(v, divisor, min_value=None): @@ -46,6 +38,55 @@ def _make_divisible(v, divisor, min_value=None): return new_v +def _conv2d(in_channel, out_channel, kernel_size=3, stride=1, pad_mod='same'): + return nn.Conv2d(in_channel, out_channel, kernel_size=kernel_size, stride=stride, + padding=0, pad_mode=pad_mod, has_bias=True) + + +def _bn(channel): + return nn.BatchNorm2d(channel, eps=1e-3, momentum=0.97, + gamma_init=1, 
beta_init=0, moving_mean_init=0, moving_var_init=1) + + +def _last_conv2d(in_channel, out_channel, kernel_size=3, stride=1, pad_mod='same', pad=0): + depthwise_conv = DepthwiseConv(in_channel, kernel_size, stride, pad_mode='same', pad=pad) + conv = _conv2d(in_channel, out_channel, kernel_size=1) + return nn.SequentialCell([depthwise_conv, _bn(in_channel), nn.ReLU6(), conv]) + + +class ConvBNReLU(nn.Cell): + """ + Convolution/Depthwise fused with Batchnorm and ReLU block definition. + + Args: + in_planes (int): Input channel. + out_planes (int): Output channel. + kernel_size (int): Input kernel size. + stride (int): Stride size for the first convolutional layer. Default: 1. + groups (int): channel group. Convolution is 1 while Depthiwse is input channel. Default: 1. + + Returns: + Tensor, output tensor. + + Examples: + >>> ConvBNReLU(16, 256, kernel_size=1, stride=1, groups=1) + """ + def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, groups=1): + super(ConvBNReLU, self).__init__() + padding = 0 + if groups == 1: + conv = nn.Conv2d(in_planes, out_planes, kernel_size, stride, pad_mode='same', + padding=padding) + else: + conv = DepthwiseConv(in_planes, kernel_size, stride, pad_mode='same', pad=padding) + layers = [conv, _bn(out_planes), nn.ReLU6()] + self.features = nn.SequentialCell(layers) + + def construct(self, x): + output = self.features(x) + return output + + class DepthwiseConv(nn.Cell): """ Depthwise Convolution warpper definition. @@ -64,6 +105,7 @@ class DepthwiseConv(nn.Cell): Examples: >>> DepthwiseConv(16, 3, 1, 'pad', 1, channel_multiplier=1) """ + def __init__(self, in_planes, kernel_size, stride, pad_mode, pad, channel_multiplier=1, has_bias=False): super(DepthwiseConv, self).__init__() self.has_bias = has_bias @@ -91,42 +133,9 @@ class DepthwiseConv(nn.Cell): return output -class ConvBNReLU(nn.Cell): - """ - Convolution/Depthwise fused with Batchnorm and ReLU block definition. - - Args: - in_planes (int): Input channel. 
- out_planes (int): Output channel. - kernel_size (int): Input kernel size. - stride (int): Stride size for the first convolutional layer. Default: 1. - groups (int): channel group. Convolution is 1 while Depthiwse is input channel. Default: 1. - - Returns: - Tensor, output tensor. - - Examples: - >>> ConvBNReLU(16, 256, kernel_size=1, stride=1, groups=1) - """ - def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, groups=1): - super(ConvBNReLU, self).__init__() - padding = (kernel_size - 1) // 2 - if groups == 1: - conv = nn.Conv2d(in_planes, out_planes, kernel_size, stride, pad_mode='pad', - padding=padding) - else: - conv = DepthwiseConv(in_planes, kernel_size, stride, pad_mode='pad', pad=padding) - layers = [conv, nn.BatchNorm2d(out_planes), nn.ReLU6()] - self.features = nn.SequentialCell(layers) - - def construct(self, x): - output = self.features(x) - return output - - class InvertedResidual(nn.Cell): """ - Mobilenetv2 residual block definition. + Residual block definition. Args: inp (int): Input channel. 
@@ -140,7 +149,7 @@ class InvertedResidual(nn.Cell): Examples: >>> ResidualBlock(3, 256, 1, 1) """ - def __init__(self, inp, oup, stride, expand_ratio): + def __init__(self, inp, oup, stride, expand_ratio, last_relu=False): super(InvertedResidual, self).__init__() assert stride in [1, 2] @@ -155,17 +164,21 @@ class InvertedResidual(nn.Cell): ConvBNReLU(hidden_dim, hidden_dim, stride=stride, groups=hidden_dim), # pw-linear nn.Conv2d(hidden_dim, oup, kernel_size=1, stride=1, has_bias=False), - nn.BatchNorm2d(oup), + _bn(oup), ]) self.conv = nn.SequentialCell(layers) - self.add = TensorAdd() + self.add = P.TensorAdd() self.cast = P.Cast() + self.last_relu = last_relu + self.relu = nn.ReLU6() def construct(self, x): identity = x x = self.conv(x) if self.use_res_connect: - return self.add(identity, x) + x = self.add(identity, x) + if self.last_relu: + x = self.relu(x) return x @@ -174,14 +187,14 @@ class FlattenConcat(nn.Cell): Concatenate predictions into a single tensor. Args: - config (Class): The default config of SSD. + config (dict): The default config of SSD. Returns: Tensor, flatten predictions. """ def __init__(self, config): super(FlattenConcat, self).__init__() - self.num_ssd_boxes = config.NUM_SSD_BOXES + self.num_ssd_boxes = config.num_ssd_boxes self.concat = P.Concat(axis=1) self.transpose = P.Transpose() def construct(self, inputs): @@ -199,7 +212,7 @@ class MultiBox(nn.Cell): Multibox conv layers. Each multibox layer contains class conf scores and localization predictions. Args: - config (Class): The default config of SSD. + config (dict): The default config of SSD. Returns: Tensor, localization predictions. 
@@ -207,17 +220,17 @@ class MultiBox(nn.Cell): """ def __init__(self, config): super(MultiBox, self).__init__() - num_classes = config.NUM_CLASSES - out_channels = config.EXTRAS_OUT_CHANNELS - num_default = config.NUM_DEFAULT + num_classes = config.num_classes + out_channels = config.extras_out_channels + num_default = config.num_default loc_layers = [] cls_layers = [] for k, out_channel in enumerate(out_channels): - loc_layers += [_conv2d(out_channel, 4 * num_default[k], - kernel_size=3, stride=1, pad_mod='same')] - cls_layers += [_conv2d(out_channel, num_classes * num_default[k], - kernel_size=3, stride=1, pad_mod='same')] + loc_layers += [_last_conv2d(out_channel, 4 * num_default[k], + kernel_size=3, stride=1, pad_mod='same', pad=0)] + cls_layers += [_last_conv2d(out_channel, num_classes * num_default[k], + kernel_size=3, stride=1, pad_mod='same', pad=0)] self.multi_loc_layers = nn.layer.CellList(loc_layers) self.multi_cls_layers = nn.layer.CellList(cls_layers) @@ -238,7 +251,7 @@ class SSD300(nn.Cell): Args: backbone (Cell): Backbone Network. - config (Class): The default config of SSD. + config (dict): The default config of SSD. Returns: Tensor, localization predictions. @@ -246,25 +259,26 @@ class SSD300(nn.Cell): Examples:backbone SSD300(backbone=resnet34(num_classes=None), - config=ConfigSSDResNet34()). + config=config). 
""" def __init__(self, backbone, config, is_training=True): super(SSD300, self).__init__() self.backbone = backbone - in_channels = config.EXTRAS_IN_CHANNELS - out_channels = config.EXTRAS_OUT_CHANNELS - ratios = config.EXTRAS_RATIO - strides = config.EXTRAS_STRIDES + in_channels = config.extras_in_channels + out_channels = config.extras_out_channels + ratios = config.extras_ratio + strides = config.extras_srides residual_list = [] for i in range(2, len(in_channels)): - residual = InvertedResidual(in_channels[i], out_channels[i], stride=strides[i], expand_ratio=ratios[i]) + residual = InvertedResidual(in_channels[i], out_channels[i], stride=strides[i], + expand_ratio=ratios[i], last_relu=True) residual_list.append(residual) self.multi_residual = nn.layer.CellList(residual_list) self.multi_box = MultiBox(config) self.is_training = is_training if not is_training: - self.softmax = P.Softmax() + self.activation = P.Sigmoid() def construct(self, x): layer_out_13, output = self.backbone(x) @@ -275,77 +289,42 @@ class SSD300(nn.Cell): multi_feature += (feature,) pred_loc, pred_label = self.multi_box(multi_feature) if not self.is_training: - pred_label = self.softmax(pred_label) + pred_label = self.activation(pred_label) return pred_loc, pred_label -class LocalizationLoss(nn.Cell): +class SigmoidFocalClassificationLoss(nn.Cell): """" - Computes the localization loss with SmoothL1Loss. - - Returns: - Tensor, box regression loss. 
- """ - def __init__(self): - super(LocalizationLoss, self).__init__() - self.reduce_sum = P.ReduceSum() - self.reduce_mean = P.ReduceMean() - self.loss = nn.SmoothL1Loss() - self.expand_dims = P.ExpandDims() - self.less = P.Less() - - def construct(self, pred_loc, gt_loc, gt_label, num_matched_boxes): - mask = F.cast(self.less(0, gt_label), mstype.float32) - mask = self.expand_dims(mask, -1) - smooth_l1 = self.loss(gt_loc, pred_loc) * mask - box_loss = self.reduce_sum(smooth_l1, 1) - return self.reduce_mean(box_loss / F.cast(num_matched_boxes, mstype.float32), (0, 1)) - - -class ClassificationLoss(nn.Cell): - """" - Computes the classification loss with hard example mining. + Sigmoid focal-loss for classification. Args: - config (Class): The default config of SSD. + gamma (float): Hyper-parameter to balance the easy and hard examples. Default: 2.0 + alpha (float): Hyper-parameter to balance the positive and negative example. Default: 0.25 Returns: - Tensor, classification loss. + Tensor, the focal loss. 
""" - def __init__(self, config): - super(ClassificationLoss, self).__init__() - self.num_classes = config.NUM_CLASSES - self.num_boxes = config.NUM_SSD_BOXES - self.neg_pre_positive = config.NEG_PRE_POSITIVE - self.minimum = P.Minimum() - self.less = P.Less() - self.sort = P.TopK() - self.tile = P.Tile() - self.reduce_sum = P.ReduceSum() - self.reduce_mean = P.ReduceMean() - self.expand_dims = P.ExpandDims() - self.sort_descend = P.TopK(True) - self.cross_entropy = nn.SoftmaxCrossEntropyWithLogits(sparse=True) - - def construct(self, pred_label, gt_label, num_matched_boxes): - gt_label = F.cast(gt_label, mstype.int32) - mask = F.cast(self.less(0, gt_label), mstype.float32) - gt_label_shape = F.shape(gt_label) - pred_label = F.reshape(pred_label, (-1, self.num_classes)) - gt_label = F.reshape(gt_label, (-1,)) - cross_entropy = self.cross_entropy(pred_label, gt_label) - cross_entropy = F.reshape(cross_entropy, gt_label_shape) - - # Hard example mining - num_matched_boxes = F.reshape(num_matched_boxes, (-1,)) - neg_masked_cross_entropy = F.cast(cross_entropy * (1- mask), mstype.float16) - _, loss_idx = self.sort_descend(neg_masked_cross_entropy, self.num_boxes) - _, relative_position = self.sort(F.cast(loss_idx, mstype.float16), self.num_boxes) - num_neg_boxes = self.minimum(num_matched_boxes * self.neg_pre_positive, self.num_boxes) - tile_num_neg_boxes = self.tile(self.expand_dims(num_neg_boxes, -1), (1, self.num_boxes)) - top_k_neg_mask = F.cast(self.less(relative_position, tile_num_neg_boxes), mstype.float32) - class_loss = self.reduce_sum(cross_entropy * (mask + top_k_neg_mask), 1) - return self.reduce_mean(class_loss / F.cast(num_matched_boxes, mstype.float32), 0) + def __init__(self, gamma=2.0, alpha=0.25): + super(SigmoidFocalClassificationLoss, self).__init__() + self.sigmiod_cross_entropy = P.SigmoidCrossEntropyWithLogits() + self.sigmoid = P.Sigmoid() + self.pow = P.Pow() + self.onehot = P.OneHot() + self.on_value = Tensor(1.0, mstype.float32) + 
self.off_value = Tensor(0.0, mstype.float32) + self.gamma = gamma + self.alpha = alpha + + def construct(self, logits, label): + label = self.onehot(label, F.shape(logits)[-1], self.on_value, self.off_value) + sigmiod_cross_entropy = self.sigmiod_cross_entropy(logits, label) + sigmoid = self.sigmoid(logits) + label = F.cast(label, mstype.float32) + p_t = label * sigmoid + (1 - label) * (1 - sigmoid) + modulating_factor = self.pow(1 - p_t, self.gamma) + alpha_weight_factor = label * self.alpha + (1 - label) * (1 - self.alpha) + focal_loss = modulating_factor * alpha_weight_factor * sigmiod_cross_entropy + return focal_loss class SSDWithLossCell(nn.Cell): @@ -354,7 +333,7 @@ class SSDWithLossCell(nn.Cell): Args: network (Cell): The training network. - config (Class): SSD config. + config (dict): SSD config. Returns: Tensor, the loss of the network. @@ -362,14 +341,29 @@ class SSDWithLossCell(nn.Cell): def __init__(self, network, config): super(SSDWithLossCell, self).__init__() self.network = network - self.class_loss = ClassificationLoss(config) - self.box_loss = LocalizationLoss() + self.less = P.Less() + self.tile = P.Tile() + self.reduce_sum = P.ReduceSum() + self.reduce_mean = P.ReduceMean() + self.expand_dims = P.ExpandDims() + self.class_loss = SigmoidFocalClassificationLoss(config.gamma, config.alpha) + self.loc_loss = nn.SmoothL1Loss() def construct(self, x, gt_loc, gt_label, num_matched_boxes): pred_loc, pred_label = self.network(x) - loss_cls = self.class_loss(pred_label, gt_label, num_matched_boxes) - loss_loc = self.box_loss(pred_loc, gt_loc, gt_label, num_matched_boxes) - return loss_cls + loss_loc + mask = F.cast(self.less(0, gt_label), mstype.float32) + num_matched_boxes = self.reduce_sum(F.cast(num_matched_boxes, mstype.float32)) + + # Localization Loss + mask_loc = self.tile(self.expand_dims(mask, -1), (1, 1, 4)) + smooth_l1 = self.loc_loss(pred_loc, gt_loc) * mask_loc + loss_loc = self.reduce_sum(self.reduce_mean(smooth_l1, -1), -1) + + # 
Classification Loss + loss_cls = self.class_loss(pred_label, gt_label) + loss_cls = self.reduce_sum(loss_cls, (1, 2)) + + return self.reduce_sum((loss_cls + loss_loc) / num_matched_boxes) class TrainingWrapper(nn.Cell): @@ -415,7 +409,6 @@ class TrainingWrapper(nn.Cell): return F.depend(loss, self.optimizer(grads)) - class SSDWithMobileNetV2(nn.Cell): """ MobileNetV2 architecture for SSD backbone. diff --git a/example/ssd_coco2017/train.py b/model_zoo/ssd/train.py similarity index 64% rename from example/ssd_coco2017/train.py rename to model_zoo/ssd/train.py index 9347bf61c8..27f0e7ad0f 100644 --- a/example/ssd_coco2017/train.py +++ b/model_zoo/ssd/train.py @@ -13,83 +13,38 @@ # limitations under the License. # ============================================================================ -"""train SSD and get checkpoint files.""" +"""Train SSD and get checkpoint files.""" import os -import math import argparse -import numpy as np import mindspore.nn as nn from mindspore import context, Tensor from mindspore.communication.management import init from mindspore.train.callback import CheckpointConfig, ModelCheckpoint, LossMonitor, TimeMonitor from mindspore.train import Model, ParallelMode from mindspore.train.serialization import load_checkpoint, load_param_into_net -from mindspore.common.initializer import initializer - -from mindspore.model_zoo.ssd import SSD300, SSDWithLossCell, TrainingWrapper, ssd_mobilenet_v2 -from config import ConfigSSD -from dataset import create_ssd_dataset, data_to_mindrecord_byte_image - - -def get_lr(global_step, lr_init, lr_end, lr_max, warmup_epochs, total_epochs, steps_per_epoch): - """ - generate learning rate array - - Args: - global_step(int): total steps of the training - lr_init(float): init learning rate - lr_end(float): end learning rate - lr_max(float): max learning rate - warmup_epochs(int): number of warmup epochs - total_epochs(int): total epoch of training - steps_per_epoch(int): steps of one epoch - - Returns: - np.array, 
learning rate array - """ - lr_each_step = [] - total_steps = steps_per_epoch * total_epochs - warmup_steps = steps_per_epoch * warmup_epochs - for i in range(total_steps): - if i < warmup_steps: - lr = lr_init + (lr_max - lr_init) * i / warmup_steps - else: - lr = lr_end + (lr_max - lr_end) * \ - (1. + math.cos(math.pi * (i - warmup_steps) / (total_steps - warmup_steps))) / 2. - if lr < 0.0: - lr = 0.0 - lr_each_step.append(lr) - - current_step = global_step - lr_each_step = np.array(lr_each_step).astype(np.float32) - learning_rate = lr_each_step[current_step:] - - return learning_rate - +from src.ssd import SSD300, SSDWithLossCell, TrainingWrapper, ssd_mobilenet_v2 +from src.config import config +from src.dataset import create_ssd_dataset, data_to_mindrecord_byte_image +from src.lr_schedule import get_lr +from src.init_params import init_net_param -def init_net_param(network, initialize_mode='XavierUniform'): - """Init the parameters in net.""" - params = network.trainable_params() - for p in params: - if isinstance(p.data, Tensor) and 'beta' not in p.name and 'gamma' not in p.name and 'bias' not in p.name: - p.set_parameter_data(initializer(initialize_mode, p.data.shape(), p.data.dtype())) def main(): parser = argparse.ArgumentParser(description="SSD training") parser.add_argument("--only_create_dataset", type=bool, default=False, help="If set it true, only create " - "Mindrecord, default is false.") - parser.add_argument("--distribute", type=bool, default=False, help="Run distribute, default is false.") + "Mindrecord, default is False.") + parser.add_argument("--distribute", type=bool, default=False, help="Run distribute, default is False.") parser.add_argument("--device_id", type=int, default=0, help="Device id, default is 0.") parser.add_argument("--device_num", type=int, default=1, help="Use device nums, default is 1.") - parser.add_argument("--lr", type=float, default=0.25, help="Learning rate, default is 0.25.") + parser.add_argument("--lr", type=float, 
default=0.05, help="Learning rate, default is 0.05.") parser.add_argument("--mode", type=str, default="sink", help="Run sink mode or not, default is sink.") parser.add_argument("--dataset", type=str, default="coco", help="Dataset, defalut is coco.") - parser.add_argument("--epoch_size", type=int, default=70, help="Epoch size, default is 70.") + parser.add_argument("--epoch_size", type=int, default=250, help="Epoch size, default is 250.") parser.add_argument("--batch_size", type=int, default=32, help="Batch size, default is 32.") parser.add_argument("--pre_trained", type=str, default=None, help="Pretrained Checkpoint file path.") parser.add_argument("--pre_trained_epoch_size", type=int, default=0, help="Pretrained epoch size.") - parser.add_argument("--save_checkpoint_epochs", type=int, default=5, help="Save checkpoint epochs, default is 5.") + parser.add_argument("--save_checkpoint_epochs", type=int, default=10, help="Save checkpoint epochs, default is 5.") parser.add_argument("--loss_scale", type=int, default=1024, help="Loss scale, default is 1024.") args_opt = parser.parse_args() @@ -111,27 +66,26 @@ def main(): # It will generate mindrecord file in args_opt.mindrecord_dir, # and the file name is ssd.mindrecord0, 1, ... file_num. 
- config = ConfigSSD() prefix = "ssd.mindrecord" - mindrecord_dir = config.MINDRECORD_DIR + mindrecord_dir = config.mindrecord_dir mindrecord_file = os.path.join(mindrecord_dir, prefix + "0") if not os.path.exists(mindrecord_file): if not os.path.isdir(mindrecord_dir): os.makedirs(mindrecord_dir) if args_opt.dataset == "coco": - if os.path.isdir(config.COCO_ROOT): + if os.path.isdir(config.coco_root): print("Create Mindrecord.") data_to_mindrecord_byte_image("coco", True, prefix) print("Create Mindrecord Done, at {}".format(mindrecord_dir)) else: - print("COCO_ROOT not exits.") + print("coco_root not exits.") else: - if os.path.isdir(config.IMAGE_DIR) and os.path.exists(config.ANNO_PATH): + if os.path.isdir(config.image_dir) and os.path.exists(config.anno_path): print("Create Mindrecord.") data_to_mindrecord_byte_image("other", True, prefix) print("Create Mindrecord Done, at {}".format(mindrecord_dir)) else: - print("IMAGE_DIR or ANNO_PATH not exits.") + print("image_dir or anno_path not exits.") if not args_opt.only_create_dataset: loss_scale = float(args_opt.loss_scale) @@ -143,7 +97,8 @@ def main(): dataset_size = dataset.get_dataset_size() print("Create dataset done!") - ssd = SSD300(backbone=ssd_mobilenet_v2(), config=config) + backbone = ssd_mobilenet_v2() + ssd = SSD300(backbone=backbone, config=config) net = SSDWithLossCell(ssd, config) init_net_param(net) @@ -157,12 +112,13 @@ def main(): param_dict = load_checkpoint(args_opt.pre_trained) load_param_into_net(net, param_dict) - lr = Tensor(get_lr(global_step=args_opt.pre_trained_epoch_size * dataset_size, - lr_init=0, lr_end=0, lr_max=args_opt.lr, - warmup_epochs=max(350 // 20, 1), - total_epochs=350, + lr = Tensor(get_lr(global_step=config.global_step, + lr_init=config.lr_init, lr_end=config.lr_end_rate * args_opt.lr, lr_max=args_opt.lr, + warmup_epochs=config.warmup_epochs, + total_epochs=args_opt.epoch_size, steps_per_epoch=dataset_size)) - opt = nn.Momentum(filter(lambda x: x.requires_grad, 
net.get_parameters()), lr, 0.9, 0.0001, loss_scale) + opt = nn.Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, + config.momentum, config.weight_decay, loss_scale) net = TrainingWrapper(net, opt, loss_scale) callback = [TimeMonitor(data_size=dataset_size), LossMonitor(), ckpoint_cb] diff --git a/example/vgg16_cifar10/README.md b/model_zoo/vgg16/README.md similarity index 93% rename from example/vgg16_cifar10/README.md rename to model_zoo/vgg16/README.md index 2c3de2eed9..53eb05f66d 100644 --- a/example/vgg16_cifar10/README.md +++ b/model_zoo/vgg16/README.md @@ -73,12 +73,13 @@ train_parallel1/log:epcoh: 2 step: 97, loss is 1.7133579 ### Training ``` usage: train.py [--device_target TARGET][--data_path DATA_PATH] - [--device_id DEVICE_ID] + [--device_id DEVICE_ID][--pre_trained PRE_TRAINED] parameters/options: --device_target the training backend type, default is Ascend. --data_path the storage path of dataset --device_id the device which used to train model. + --pre_trained the pretrained checkpoint file path. ``` @@ -98,7 +99,7 @@ parameters/options: ### Distribute Training ``` -Usage: sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATA_PATH] +Usage: sh script/run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATA_PATH] parameters/options: MINDSPORE_HCCL_CONFIG_PATH HCCL configuration file path. 
diff --git a/example/vgg16_cifar10/eval.py b/model_zoo/vgg16/eval.py similarity index 93% rename from example/vgg16_cifar10/eval.py rename to model_zoo/vgg16/eval.py index ec9fc607c2..8cdcc86031 100644 --- a/example/vgg16_cifar10/eval.py +++ b/model_zoo/vgg16/eval.py @@ -17,14 +17,15 @@ python eval.py --data_path=$DATA_HOME --device_id=$DEVICE_ID """ import argparse + import mindspore.nn as nn +from mindspore import context from mindspore.nn.optim.momentum import Momentum from mindspore.train.model import Model -from mindspore import context from mindspore.train.serialization import load_checkpoint, load_param_into_net -from mindspore.model_zoo.vgg import vgg16 -from config import cifar_cfg as cfg -import dataset +from src.config import cifar_cfg as cfg +from src.dataset import vgg_create_dataset +from src.vgg import vgg16 if __name__ == '__main__': parser = argparse.ArgumentParser(description='Cifar10 classification') @@ -47,6 +48,6 @@ if __name__ == '__main__': param_dict = load_checkpoint(args_opt.checkpoint_path) load_param_into_net(net, param_dict) net.set_train(False) - dataset = dataset.create_dataset(args_opt.data_path, 1, False) + dataset = vgg_create_dataset(args_opt.data_path, 1, False) res = model.eval(dataset) print("result: ", res) diff --git a/example/vgg16_cifar10/run_distribute_train.sh b/model_zoo/vgg16/scripts/run_distribute_train.sh similarity index 92% rename from example/vgg16_cifar10/run_distribute_train.sh rename to model_zoo/vgg16/scripts/run_distribute_train.sh index c9b8dfc48f..ca4c993ded 100755 --- a/example/vgg16_cifar10/run_distribute_train.sh +++ b/model_zoo/vgg16/scripts/run_distribute_train.sh @@ -15,39 +15,38 @@ # ============================================================================ if [ $# != 2 ] -then +then echo "Usage: sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATA_PATH]" exit 1 fi if [ ! -f $1 ] -then +then echo "error: MINDSPORE_HCCL_CONFIG_PATH=$1 is not a file" exit 1 -fi +fi if [ ! 
-d $2 ] -then +then echo "error: DATA_PATH=$2 is not a directory" exit 1 -fi +fi -ulimit -u unlimited export DEVICE_NUM=8 export RANK_SIZE=8 export MINDSPORE_HCCL_CONFIG_PATH=$1 -for((i=0; i<${DEVICE_NUM}; i++)) +for((i=0;i env.log python train.py --data_path=$2 --device_id=$i &> log & cd .. -done +done \ No newline at end of file diff --git a/model_zoo/vgg16/src/__init__.py b/model_zoo/vgg16/src/__init__.py new file mode 100644 index 0000000000..301ef9dcb7 --- /dev/null +++ b/model_zoo/vgg16/src/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the License); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# httpwww.apache.orglicensesLICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ diff --git a/example/vgg16_cifar10/config.py b/model_zoo/vgg16/src/config.py similarity index 100% rename from example/vgg16_cifar10/config.py rename to model_zoo/vgg16/src/config.py diff --git a/example/vgg16_cifar10/dataset.py b/model_zoo/vgg16/src/dataset.py similarity index 96% rename from example/vgg16_cifar10/dataset.py rename to model_zoo/vgg16/src/dataset.py index e8dfd777e6..b08659fb5e 100644 --- a/example/vgg16_cifar10/dataset.py +++ b/model_zoo/vgg16/src/dataset.py @@ -16,13 +16,15 @@ Data operations, will be used in train.py and eval.py """ import os + +import mindspore.common.dtype as mstype import mindspore.dataset as ds import mindspore.dataset.transforms.c_transforms as C import mindspore.dataset.transforms.vision.c_transforms as vision -import mindspore.common.dtype as mstype -from config import cifar_cfg as cfg +from .config import cifar_cfg as cfg + -def create_dataset(data_home, repeat_num=1, training=True): +def vgg_create_dataset(data_home, repeat_num=1, training=True): """Data operations.""" ds.config.set_seed(1) data_dir = os.path.join(data_home, "cifar-10-batches-bin") diff --git a/mindspore/model_zoo/vgg.py b/model_zoo/vgg16/src/vgg.py similarity index 100% rename from mindspore/model_zoo/vgg.py rename to model_zoo/vgg16/src/vgg.py diff --git a/example/vgg16_cifar10/train.py b/model_zoo/vgg16/train.py similarity index 87% rename from example/vgg16_cifar10/train.py rename to model_zoo/vgg16/train.py index 9993db706a..c582cdd679 100644 --- a/example/vgg16_cifar10/train.py +++ b/model_zoo/vgg16/train.py @@ -19,20 +19,25 @@ python train.py --data_path=$DATA_HOME --device_id=$DEVICE_ID import argparse import os import random + import numpy as np + import mindspore.nn as nn from mindspore import Tensor +from mindspore import context from mindspore.communication.management import init from mindspore.nn.optim.momentum import Momentum -from mindspore.train.model import 
Model, ParallelMode -from mindspore import context from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor -from mindspore.model_zoo.vgg import vgg16 -from dataset import create_dataset -from config import cifar_cfg as cfg +from mindspore.train.model import Model, ParallelMode +from mindspore.train.serialization import load_param_into_net, load_checkpoint +from src.config import cifar_cfg as cfg +from src.dataset import vgg_create_dataset +from src.vgg import vgg16 + random.seed(1) np.random.seed(1) + def lr_steps(global_step, lr_max=None, total_epochs=None, steps_per_epoch=None): """Set learning rate.""" lr_each_step = [] @@ -60,6 +65,7 @@ if __name__ == '__main__': help='device where the code will be implemented. (Default: Ascend)') parser.add_argument('--data_path', type=str, default='./cifar', help='path where the dataset is saved') parser.add_argument('--device_id', type=int, default=None, help='device id of GPU or Ascend. (Default: None)') + parser.add_argument('--pre_trained', type=str, default=None, help='the pretrained checkpoint file path.') args_opt = parser.parse_args() context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target) @@ -72,12 +78,17 @@ if __name__ == '__main__': mirror_mean=True) init() - dataset = create_dataset(args_opt.data_path, cfg.epoch_size) + dataset = vgg_create_dataset(args_opt.data_path, cfg.epoch_size) batch_num = dataset.get_dataset_size() net = vgg16(num_classes=cfg.num_classes) + # pre_trained + if args_opt.pre_trained: + load_param_into_net(net, load_checkpoint(args_opt.pre_trained)) + lr = lr_steps(0, lr_max=cfg.lr_init, total_epochs=cfg.epoch_size, steps_per_epoch=batch_num) - opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), Tensor(lr), cfg.momentum, weight_decay=cfg.weight_decay) + opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), Tensor(lr), cfg.momentum, + weight_decay=cfg.weight_decay) loss = 
nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean', is_grad=False) model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'}, amp_level="O2", keep_batchnorm_fp32=False, loss_scale_manager=None) diff --git a/model_zoo/wide_and_deep/README.md b/model_zoo/wide_and_deep/README.md index 48e979815e..54367ef173 100644 --- a/model_zoo/wide_and_deep/README.md +++ b/model_zoo/wide_and_deep/README.md @@ -13,24 +13,28 @@ The Criteo datasets are used for model training and evaluation. The entire code structure is as following: ``` |--- wide_and_deep/ - train_and_test.py "Entrance of Wide&Deep model training and evaluation" - test.py "Entrance of Wide&Deep model evaluation" - train.py "Entrance of Wide&Deep model training" - train_and_test_multinpu.py "Entrance of Wide&Deep model data parallel training and evaluation" - |--- src/ "entrance of training and evaluation" - config.py "parameters configuration" - dataset.py "Dataset loader class" - process_data.py "process dataset" - preprocess_data.py "pre_process dataset" - WideDeep.py "Model structure" - callbacks.py "Callback class for training and evaluation" - metrics.py "Metric class" + train_and_eval.py "Entrance of Wide&Deep model training and evaluation" + eval.py "Entrance of Wide&Deep model evaluation" + train.py "Entrance of Wide&Deep model training" + train_and_eval_multinpu.py "Entrance of Wide&Deep model data parallel training and evaluation" + train_and_eval_auto_parallel.py + |--- src/ "Entrance of training and evaluation" + config.py "Parameters configuration" + dataset.py "Dataset loader class" + process_data.py "Process dataset" + preprocess_data.py "Pre_process dataset" + wide_and_deep.py "Model structure" + callbacks.py "Callback class for training and evaluation" + metrics.py "Metric class" + |--- script/ "Run shell dir" + run_multinpu_train.sh "Run data parallel" + run_auto_parallel_train.sh "Run auto parallel" ``` ### Train and evaluate model To train and evaluate the model, command as 
follows: ``` -python train_and_test.py +python train_and_eval.py ``` Arguments: * `--data_path`: This should be set to the same directory given to the data_download's data_dir argument. @@ -42,6 +46,7 @@ Arguments: * `--emb_dim`: The dense embedding dimension of sparse feature. * `--deep_layers_dim`: The dimension of all deep layers. * `--deep_layers_act`: The activation of all deep layers. + * `--dropout_flag`: Whether do dropout. * `--keep_prob`: The rate to keep in dropout layer. * `--ckpt_path`:The location of the checkpoint file. * `--eval_file_name` : Eval output file. @@ -61,6 +66,7 @@ Arguments: * `--emb_dim`: The dense embedding dimension of sparse feature. * `--deep_layers_dim`: The dimension of all deep layers. * `--deep_layers_act`: The activation of all deep layers. + * `--dropout_flag`: Whether do dropout. * `--keep_prob`: The rate to keep in dropout layer. * `--ckpt_path`:The location of the checkpoint file. * `--eval_file_name` : Eval output file. @@ -68,13 +74,17 @@ Arguments: To train the model in distributed, command as follows: ``` -# configure environment path, RANK_TABLE_FILE, RANK_SIZE, MINDSPORE_HCCL_CONFIG_PATH before training -bash run_multinpu_train.sh +# configure environment path before training +bash run_multinpu_train.sh RANK_SIZE EPOCHS DATASET RANK_TABLE_FILE +``` +``` +# configure environment path before training +bash run_auto_parallel_train.sh RANK_SIZE EPOCHS DATASET RANK_TABLE_FILE ``` To evaluate the model, command as follows: ``` -python test.py +python eval.py ``` Arguments: * `--data_path`: This should be set to the same directory given to the data_download's data_dir argument. 
diff --git a/model_zoo/wide_and_deep/test.py b/model_zoo/wide_and_deep/eval.py similarity index 100% rename from model_zoo/wide_and_deep/test.py rename to model_zoo/wide_and_deep/eval.py diff --git a/model_zoo/wide_and_deep/run_multinpu_train.sh b/model_zoo/wide_and_deep/run_multinpu_train.sh deleted file mode 100644 index db7823eed7..0000000000 --- a/model_zoo/wide_and_deep/run_multinpu_train.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash -# bash run_multinpu_train.sh -execute_path=$(pwd) - -export RANK_TABLE_FILE=${execute_path}/rank_table_8p.json -export RANK_SIZE=8 -export MINDSPORE_HCCL_CONFIG_PATH=${execute_path}/rank_table_8p.json - -for((i=0;i<=7;i++)); -do - rm -rf ${execute_path}/device_$i/ - mkdir ${execute_path}/device_$i/ - cd ${execute_path}/device_$i/ || exit - export RANK_ID=$i - export DEVICE_ID=$i - pytest -s ${execute_path}/train_and_test_multinpu.py >train_deep$i.log 2>&1 & -done diff --git a/model_zoo/wide_and_deep/script/run_auto_parallel_train.sh b/model_zoo/wide_and_deep/script/run_auto_parallel_train.sh new file mode 100644 index 0000000000..9e9226a23a --- /dev/null +++ b/model_zoo/wide_and_deep/script/run_auto_parallel_train.sh @@ -0,0 +1,35 @@ +#!/bin/bash +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +# bash run_multinpu_train.sh +execute_path=$(pwd) +script_self=$(readlink -f "$0") +self_path=$(dirname "${script_self}") +export RANK_SIZE=$1 +export EPOCH_SIZE=$2 +export DATASET=$3 +export RANK_TABLE_FILE=$4 +export MINDSPORE_HCCL_CONFIG_PATH=$4 + +for((i=0;i<$RANK_SIZE;i++)); +do + rm -rf ${execute_path}/device_$i/ + mkdir ${execute_path}/device_$i/ + cd ${execute_path}/device_$i/ || exit + export RANK_ID=$i + export DEVICE_ID=$i + python -s ${self_path}/../train_and_eval_auto_parallel.py --data_path=$DATASET --epochs=$EPOCH_SIZE >train_deep$i.log 2>&1 & +done diff --git a/model_zoo/wide_and_deep/script/run_multigpu_train.sh b/model_zoo/wide_and_deep/script/run_multigpu_train.sh new file mode 100644 index 0000000000..987eeaa65e --- /dev/null +++ b/model_zoo/wide_and_deep/script/run_multigpu_train.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +# bash run_multigpu_train.sh +script_self=$(readlink -f "$0") +self_path=$(dirname "${script_self}") +RANK_SIZE=$1 +EPOCH_SIZE=$2 +DATASET=$3 + +mpirun --allow-run-as-root -n $RANK_SIZE \ + python -s ${self_path}/../train_and_eval_distribute.py \ + --device_target="GPU" \ + --data_path=$DATASET \ + --epochs=$EPOCH_SIZE > log.txt 2>&1 & diff --git a/model_zoo/wide_and_deep/script/run_multinpu_train.sh b/model_zoo/wide_and_deep/script/run_multinpu_train.sh new file mode 100644 index 0000000000..4b642bc196 --- /dev/null +++ b/model_zoo/wide_and_deep/script/run_multinpu_train.sh @@ -0,0 +1,35 @@ +#!/bin/bash +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +# bash run_multinpu_train.sh +execute_path=$(pwd) +script_self=$(readlink -f "$0") +self_path=$(dirname "${script_self}") +export RANK_SIZE=$1 +export EPOCH_SIZE=$2 +export DATASET=$3 +export RANK_TABLE_FILE=$4 +export MINDSPORE_HCCL_CONFIG_PATH=$4 + +for((i=0;i<$RANK_SIZE;i++)); +do + rm -rf ${execute_path}/device_$i/ + mkdir ${execute_path}/device_$i/ + cd ${execute_path}/device_$i/ || exit + export RANK_ID=$i + export DEVICE_ID=$i + python -s ${self_path}/../train_and_eval_distribute.py --data_path=$DATASET --epochs=$EPOCH_SIZE >train_deep$i.log 2>&1 & +done diff --git a/model_zoo/wide_and_deep/src/callbacks.py b/model_zoo/wide_and_deep/src/callbacks.py index 6e3bb75aae..4c2f9c700e 100644 --- a/model_zoo/wide_and_deep/src/callbacks.py +++ b/model_zoo/wide_and_deep/src/callbacks.py @@ -17,6 +17,7 @@ callbacks import time from mindspore.train.callback import Callback from mindspore import context +from mindspore.train import ParallelMode def add_write(file_path, out_str): """ @@ -85,14 +86,17 @@ class EvalCallBack(Callback): self.aucMetric = auc_metric self.aucMetric.clear() self.eval_file_name = config.eval_file_name + self.eval_values = [] - def epoch_name(self, run_context): + def epoch_end(self, run_context): """ - epoch name + epoch end """ self.aucMetric.clear() - context.set_auto_parallel_context(strategy_ckpt_save_file="", - strategy_ckpt_load_file="./strategy_train.ckpt") + parallel_mode = context.get_auto_parallel_context("parallel_mode") + if parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL): + context.set_auto_parallel_context(strategy_ckpt_save_file="", + strategy_ckpt_load_file="./strategy_train.ckpt") start_time = time.time() out = self.model.eval(self.eval_dataset) end_time = time.time() @@ -101,4 +105,5 @@ class EvalCallBack(Callback): time_str = time.strftime("%Y-%m-%d %H:%M%S", time.localtime()) out_str = "{}==== EvalCallBack 
model.eval(): {}; eval_time: {}s".format(time_str, out.values(), eval_time) print(out_str) + self.eval_values = out.values() add_write(self.eval_file_name, out_str) diff --git a/model_zoo/wide_and_deep/src/config.py b/model_zoo/wide_and_deep/src/config.py index 3559e8bf23..f8a2c84743 100644 --- a/model_zoo/wide_and_deep/src/config.py +++ b/model_zoo/wide_and_deep/src/config.py @@ -20,17 +20,20 @@ def argparse_init(): argparse_init """ parser = argparse.ArgumentParser(description='WideDeep') + parser.add_argument("--device_target", type=str, default="Ascend", choices=["Ascend", "GPU"], + help="device where the code will be implemented. (Default: Ascend)") parser.add_argument("--data_path", type=str, default="./test_raw_data/") parser.add_argument("--epochs", type=int, default=15) + parser.add_argument("--full_batch", type=bool, default=False) parser.add_argument("--batch_size", type=int, default=16000) parser.add_argument("--eval_batch_size", type=int, default=16000) parser.add_argument("--field_size", type=int, default=39) - parser.add_argument("--vocab_size", type=int, default=184965) + parser.add_argument("--vocab_size", type=int, default=200000) parser.add_argument("--emb_dim", type=int, default=80) parser.add_argument("--deep_layer_dim", type=int, nargs='+', default=[1024, 512, 256, 128]) parser.add_argument("--deep_layer_act", type=str, default='relu') parser.add_argument("--keep_prob", type=float, default=1.0) - + parser.add_argument("--dropout_flag", type=int, default=0) parser.add_argument("--output_path", type=str, default="./output/") parser.add_argument("--ckpt_path", type=str, default="./checkpoints/") parser.add_argument("--eval_file_name", type=str, default="eval.log") @@ -43,12 +46,14 @@ class WideDeepConfig(): WideDeepConfig """ def __init__(self): + self.device_target = "Ascend" self.data_path = "./test_raw_data/" + self.full_batch = False self.epochs = 15 self.batch_size = 16000 self.eval_batch_size = 16000 self.field_size = 39 - self.vocab_size = 
184965 + self.vocab_size = 200000 self.emb_dim = 80 self.deep_layer_dim = [1024, 512, 256, 128] self.deep_layer_act = 'relu' @@ -70,8 +75,10 @@ class WideDeepConfig(): """ parser = argparse_init() args, _ = parser.parse_known_args() + self.device_target = args.device_target self.data_path = args.data_path self.epochs = args.epochs + self.full_batch = args.full_batch self.batch_size = args.batch_size self.eval_batch_size = args.eval_batch_size self.field_size = args.field_size @@ -83,7 +90,7 @@ class WideDeepConfig(): self.weight_bias_init = ['normal', 'normal'] self.emb_init = 'normal' self.init_args = [-0.01, 0.01] - self.dropout_flag = False + self.dropout_flag = bool(args.dropout_flag) self.l2_coef = 8e-5 self.output_path = args.output_path diff --git a/model_zoo/wide_and_deep/src/datasets.py b/model_zoo/wide_and_deep/src/datasets.py index 775dd7ca54..0ec4f327dd 100644 --- a/model_zoo/wide_and_deep/src/datasets.py +++ b/model_zoo/wide_and_deep/src/datasets.py @@ -17,11 +17,20 @@ import os import math +from enum import Enum import numpy as np import pandas as pd import mindspore.dataset.engine as de import mindspore.common.dtype as mstype +class DataType(Enum): + """ + Enumerate supported dataset format. + """ + MINDRECORD = 1 + TFRECORD = 2 + H5 = 3 + class H5Dataset(): """ @@ -193,15 +202,60 @@ def _get_tf_dataset(data_dir, train_mode=True, epochs=1, batch_size=1000, ds = ds.repeat(epochs) return ds +def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=1000, + line_per_sample=1000, rank_size=None, rank_id=None): + """ + Get dataset with mindrecord format. + + Args: + directory (str): Dataset directory. + train_mode (bool): Whether dataset is use for train or eval (default=True). + epochs (int): Dataset epoch size (default=1). + batch_size (int): Dataset batch size (default=1000). + line_per_sample (int): The number of sample per line (default=1000). + rank_size (int): The number of device, not necessary for single device (default=None). 
+ rank_id (int): Id of device, not necessary for single device (default=None). + + Returns: + Dataset. + """ + file_prefix_name = 'train_input_part.mindrecord' if train_mode else 'test_input_part.mindrecord' + file_suffix_name = '00' if train_mode else '0' + shuffle = train_mode + + if rank_size is not None and rank_id is not None: + ds = de.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), + columns_list=['feat_ids', 'feat_vals', 'label'], + num_shards=rank_size, shard_id=rank_id, shuffle=shuffle, + num_parallel_workers=8) + else: + ds = de.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), + columns_list=['feat_ids', 'feat_vals', 'label'], + shuffle=shuffle, num_parallel_workers=8) + ds = ds.batch(int(batch_size / line_per_sample), drop_remainder=True) + ds = ds.map(operations=(lambda x, y, z: (np.array(x).flatten().reshape(batch_size, 39), + np.array(y).flatten().reshape(batch_size, 39), + np.array(z).flatten().reshape(batch_size, 1))), + input_columns=['feat_ids', 'feat_vals', 'label'], + columns_order=['feat_ids', 'feat_vals', 'label'], + num_parallel_workers=8) + ds = ds.repeat(epochs) + return ds + def create_dataset(data_dir, train_mode=True, epochs=1, batch_size=1000, - is_tf_dataset=True, line_per_sample=1000, rank_size=None, rank_id=None): + data_type=DataType.TFRECORD, line_per_sample=1000, rank_size=None, rank_id=None): """ create_dataset """ - if is_tf_dataset: + if data_type == DataType.TFRECORD: return _get_tf_dataset(data_dir, train_mode, epochs, batch_size, line_per_sample, rank_size=rank_size, rank_id=rank_id) + if data_type == DataType.MINDRECORD: + return _get_mindrecord_dataset(data_dir, train_mode, epochs, + batch_size, line_per_sample, + rank_size, rank_id) + if rank_size > 1: raise RuntimeError("please use tfrecord dataset.") return _get_h5_dataset(data_dir, train_mode, epochs, batch_size) diff --git a/model_zoo/wide_and_deep/src/metrics.py b/model_zoo/wide_and_deep/src/metrics.py index 
277d6744dc..c89e948405 100644 --- a/model_zoo/wide_and_deep/src/metrics.py +++ b/model_zoo/wide_and_deep/src/metrics.py @@ -17,8 +17,10 @@ Area under cure metric """ -from mindspore.nn.metrics import Metric from sklearn.metrics import roc_auc_score +from mindspore import context +from mindspore.nn.metrics import Metric +from mindspore.communication.management import get_rank, get_group_size class AUCMetric(Metric): """ @@ -28,6 +30,7 @@ class AUCMetric(Metric): def __init__(self): super(AUCMetric, self).__init__() self.clear() + self.full_batch = context.get_auto_parallel_context("full_batch") def clear(self): """Clear the internal evaluation result.""" @@ -35,10 +38,17 @@ class AUCMetric(Metric): self.pred_probs = [] def update(self, *inputs): # inputs - all_predict = inputs[1].asnumpy() # predict - all_label = inputs[2].asnumpy() # label - self.true_labels.extend(all_label.flatten().tolist()) - self.pred_probs.extend(all_predict.flatten().tolist()) + """Update list of predicts and labels.""" + all_predict = inputs[1].asnumpy().flatten().tolist() # predict + all_label = inputs[2].asnumpy().flatten().tolist() # label + self.pred_probs.extend(all_predict) + if self.full_batch: + rank_id = get_rank() + group_size = get_group_size() + gap = len(all_label) // group_size + self.true_labels.extend(all_label[rank_id*gap: (rank_id+1)*gap]) + else: + self.true_labels.extend(all_label) def eval(self): if len(self.true_labels) != len(self.pred_probs): diff --git a/model_zoo/wide_and_deep/src/process_data.py b/model_zoo/wide_and_deep/src/process_data.py index 37b38b0bbb..acf618297f 100644 --- a/model_zoo/wide_and_deep/src/process_data.py +++ b/model_zoo/wide_and_deep/src/process_data.py @@ -248,8 +248,8 @@ def random_split_trans2h5(in_file_path, output_path, criteo_stats, part_rows=200 if __name__ == "__main__": parser = argparse.ArgumentParser(description="Get and Process datasets") - parser.add_argument("--raw_data_path", default="/opt/npu/data/origin_criteo_data/", 
help="The path to save dataset") - parser.add_argument("--output_path", default="/opt/npu/data/origin_criteo_data/h5_data/", + parser.add_argument("--raw_data_path", default="./raw_data", help="The path to save dataset") + parser.add_argument("--output_path", default="./output", help="The path to save dataset") args, _ = parser.parse_known_args() base_path = args.raw_data_path diff --git a/model_zoo/wide_and_deep/src/wide_and_deep.py b/model_zoo/wide_and_deep/src/wide_and_deep.py index 7772431ab3..16102039a8 100644 --- a/model_zoo/wide_and_deep/src/wide_and_deep.py +++ b/model_zoo/wide_and_deep/src/wide_and_deep.py @@ -14,16 +14,20 @@ # ============================================================================ """wide and deep model""" from mindspore import nn -from mindspore import Tensor, Parameter, ParameterTuple +from mindspore import Parameter, ParameterTuple import mindspore.common.dtype as mstype from mindspore.ops import functional as F from mindspore.ops import composite as C from mindspore.ops import operations as P -# from mindspore.nn import Dropout +from mindspore.nn import Dropout from mindspore.nn.optim import Adam, FTRL # from mindspore.nn.metrics import Metric from mindspore.common.initializer import Uniform, initializer # from mindspore.train.callback import ModelCheckpoint, CheckpointConfig +from mindspore.parallel._utils import _get_device_num, _get_parallel_mode, _get_mirror_mean +from mindspore.train.parallel_utils import ParallelMode +from mindspore.nn.wrap.grad_reducer import DistributedGradReducer +from mindspore.communication.management import get_group_size import numpy as np np_type = np.float32 @@ -42,8 +46,7 @@ def init_method(method, shape, name, max_val=1.0): elif method == 'zero': params = Parameter(initializer("zeros", shape, ms_type), name=name) elif method == "normal": - params = Parameter(Tensor(np.random.normal( - loc=0.0, scale=0.01, size=shape).astype(dtype=np_type)), name=name) + params = Parameter(initializer("normal", 
shape, ms_type), name=name) return params @@ -66,8 +69,8 @@ def init_var_dict(init_args, in_vars): var_map[key] = Parameter(initializer( "zeros", shape, ms_type), name=key) elif method == 'normal': - var_map[key] = Parameter(Tensor(np.random.normal( - loc=0.0, scale=0.01, size=shape).astype(dtype=np_type)), name=key) + var_map[key] = Parameter(initializer( + "normal", shape, ms_type), name=key) return var_map @@ -79,7 +82,7 @@ class DenseLayer(nn.Cell): """ def __init__(self, input_dim, output_dim, weight_bias_init, act_str, - keep_prob=0.7, scale_coef=1.0, convert_dtype=True): + keep_prob=0.7, use_activation=True, convert_dtype=True, drop_out=False): super(DenseLayer, self).__init__() weight_init, bias_init = weight_bias_init self.weight = init_method( @@ -89,11 +92,10 @@ class DenseLayer(nn.Cell): self.matmul = P.MatMul(transpose_b=False) self.bias_add = P.BiasAdd() self.cast = P.Cast() - #self.dropout = Dropout(keep_prob=keep_prob) - self.mul = P.Mul() - self.realDiv = P.RealDiv() - self.scale_coef = scale_coef + self.dropout = Dropout(keep_prob=keep_prob) + self.use_activation = use_activation self.convert_dtype = convert_dtype + self.drop_out = drop_out def _init_activation(self, act_str): act_str = act_str.lower() @@ -106,20 +108,23 @@ class DenseLayer(nn.Cell): return act_func def construct(self, x): - x = self.act_func(x) - # if self.training: - # x = self.dropout(x) - x = self.mul(x, self.scale_coef) + if self.training and self.drop_out: + x = self.dropout(x) if self.convert_dtype: x = self.cast(x, mstype.float16) weight = self.cast(self.weight, mstype.float16) + bias = self.cast(self.bias, mstype.float16) wx = self.matmul(x, weight) + wx = self.bias_add(wx, bias) + if self.use_activation: + wx = self.act_func(wx) wx = self.cast(wx, mstype.float32) else: wx = self.matmul(x, self.weight) - wx = self.realDiv(wx, self.scale_coef) - output = self.bias_add(wx, self.bias) - return output + wx = self.bias_add(wx, self.bias) + if self.use_activation: + wx = 
self.act_func(wx) + return wx class WideDeepModel(nn.Cell): @@ -132,6 +137,9 @@ class WideDeepModel(nn.Cell): def __init__(self, config): super(WideDeepModel, self).__init__() self.batch_size = config.batch_size + parallel_mode = _get_parallel_mode() + if parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL): + self.batch_size = self.batch_size * get_group_size() self.field_size = config.field_size self.vocab_size = config.vocab_size self.emb_dim = config.emb_dim @@ -157,23 +165,28 @@ class WideDeepModel(nn.Cell): self.dense_layer_1 = DenseLayer(self.all_dim_list[0], self.all_dim_list[1], self.weight_bias_init, - self.deep_layer_act, convert_dtype=True) + self.deep_layer_act, + convert_dtype=True, drop_out=config.dropout_flag) self.dense_layer_2 = DenseLayer(self.all_dim_list[1], self.all_dim_list[2], self.weight_bias_init, - self.deep_layer_act, convert_dtype=True) + self.deep_layer_act, + convert_dtype=True, drop_out=config.dropout_flag) self.dense_layer_3 = DenseLayer(self.all_dim_list[2], self.all_dim_list[3], self.weight_bias_init, - self.deep_layer_act, convert_dtype=True) + self.deep_layer_act, + convert_dtype=True, drop_out=config.dropout_flag) self.dense_layer_4 = DenseLayer(self.all_dim_list[3], self.all_dim_list[4], self.weight_bias_init, - self.deep_layer_act, convert_dtype=True) + self.deep_layer_act, + convert_dtype=True, drop_out=config.dropout_flag) self.dense_layer_5 = DenseLayer(self.all_dim_list[4], self.all_dim_list[5], self.weight_bias_init, - self.deep_layer_act, convert_dtype=True) + self.deep_layer_act, + use_activation=False, convert_dtype=True, drop_out=config.dropout_flag) self.gather_v2 = P.GatherV2() self.mul = P.Mul() @@ -258,7 +271,7 @@ class TrainStepWrap(nn.Cell): sens (Number): The adjust parameter. 
Default: 1000.0 """ - def __init__(self, network, sens=1000.0): + def __init__(self, network, sens=1024.0): super(TrainStepWrap, self).__init__() self.network = network self.network.set_train() @@ -285,6 +298,18 @@ class TrainStepWrap(nn.Cell): self.loss_net_w = IthOutputCell(network, output_index=0) self.loss_net_d = IthOutputCell(network, output_index=1) + self.reducer_flag = False + self.grad_reducer_w = None + self.grad_reducer_d = None + parallel_mode = _get_parallel_mode() + self.reducer_flag = parallel_mode in (ParallelMode.DATA_PARALLEL, + ParallelMode.HYBRID_PARALLEL) + if self.reducer_flag: + mean = _get_mirror_mean() + degree = _get_device_num() + self.grad_reducer_w = DistributedGradReducer(self.optimizer_w.parameters, mean, degree) + self.grad_reducer_d = DistributedGradReducer(self.optimizer_d.parameters, mean, degree) + def construct(self, batch_ids, batch_wts, label): weights_w = self.weights_w weights_d = self.weights_d @@ -295,6 +320,9 @@ class TrainStepWrap(nn.Cell): label, sens_w) grads_d = self.grad_d(self.loss_net_d, weights_d)(batch_ids, batch_wts, label, sens_d) + if self.reducer_flag: + grads_w = self.grad_reducer_w(grads_w) + grads_d = self.grad_reducer_d(grads_d) return F.depend(loss_w, self.optimizer_w(grads_w)), F.depend(loss_d, self.optimizer_d(grads_d)) diff --git a/model_zoo/wide_and_deep/train.py b/model_zoo/wide_and_deep/train.py index b3996e01cb..ac9750c547 100644 --- a/model_zoo/wide_and_deep/train.py +++ b/model_zoo/wide_and_deep/train.py @@ -14,7 +14,7 @@ """ test_training """ import os from mindspore import Model, context -from mindspore.train.callback import ModelCheckpoint, CheckpointConfig +from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor from src.wide_and_deep import PredictWithSigmoid, TrainStepWrap, NetWithLossClass, WideDeepModel from src.callbacks import LossCallBack @@ -75,7 +75,7 @@ def test_train(configure): ckptconfig = CheckpointConfig(save_checkpoint_steps=1, 
keep_checkpoint_max=5) ckpoint_cb = ModelCheckpoint(prefix='widedeep_train', directory=configure.ckpt_path, config=ckptconfig) - model.train(epochs, ds_train, callbacks=[callback, ckpoint_cb]) + model.train(epochs, ds_train, callbacks=[TimeMonitor(ds_train.get_dataset_size()), callback, ckpoint_cb]) if __name__ == "__main__": diff --git a/model_zoo/wide_and_deep/train_and_test.py b/model_zoo/wide_and_deep/train_and_eval.py similarity index 100% rename from model_zoo/wide_and_deep/train_and_test.py rename to model_zoo/wide_and_deep/train_and_eval.py diff --git a/model_zoo/wide_and_deep/train_and_eval_auto_parallel.py b/model_zoo/wide_and_deep/train_and_eval_auto_parallel.py new file mode 100644 index 0000000000..780c95540c --- /dev/null +++ b/model_zoo/wide_and_deep/train_and_eval_auto_parallel.py @@ -0,0 +1,119 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""train_multinpu.""" + + +import os +import sys +import mindspore.dataset.engine as de +from mindspore import Model, context +from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor +from mindspore.train import ParallelMode +from mindspore.communication.management import get_rank, get_group_size, init +from mindspore.parallel import _cost_model_context as cost_model_context +from mindspore.nn.wrap.cell_wrapper import VirtualDatasetCellTriple + +from src.wide_and_deep import PredictWithSigmoid, TrainStepWrap, NetWithLossClass, WideDeepModel +from src.callbacks import LossCallBack, EvalCallBack +from src.datasets import create_dataset +from src.metrics import AUCMetric +from src.config import WideDeepConfig + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=True) +context.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL, mirror_mean=True) +cost_model_context.set_cost_model_context(multi_subgraphs=True) +init() + + + +def get_WideDeep_net(config): + WideDeep_net = WideDeepModel(config) + loss_net = NetWithLossClass(WideDeep_net, config) + loss_net = VirtualDatasetCellTriple(loss_net) + train_net = TrainStepWrap(loss_net) + eval_net = PredictWithSigmoid(WideDeep_net) + eval_net = VirtualDatasetCellTriple(eval_net) + return train_net, eval_net + + +class ModelBuilder(): + """ + ModelBuilder + """ + def __init__(self): + pass + + def get_hook(self): + pass + + def get_train_hook(self): + hooks = [] + callback = LossCallBack() + hooks.append(callback) + if int(os.getenv('DEVICE_ID')) == 0: + pass + return hooks + + def get_net(self, config): + return get_WideDeep_net(config) + + +def train_and_eval(config): + """ + test_train_eval + """ + data_path = config.data_path + batch_size = config.batch_size + epochs = config.epochs + 
print("epochs is {}".format(epochs)) + if config.full_batch: + context.set_auto_parallel_context(full_batch=True) + de.config.set_seed(1) + ds_train = create_dataset(data_path, train_mode=True, epochs=epochs, + batch_size=batch_size*get_group_size()) + ds_eval = create_dataset(data_path, train_mode=False, epochs=epochs + 1, + batch_size=batch_size*get_group_size()) + else: + ds_train = create_dataset(data_path, train_mode=True, epochs=epochs, + batch_size=batch_size, rank_id=get_rank(), rank_size=get_group_size()) + ds_eval = create_dataset(data_path, train_mode=False, epochs=epochs + 1, + batch_size=batch_size, rank_id=get_rank(), rank_size=get_group_size()) + print("ds_train.size: {}".format(ds_train.get_dataset_size())) + print("ds_eval.size: {}".format(ds_eval.get_dataset_size())) + + net_builder = ModelBuilder() + + train_net, eval_net = net_builder.get_net(config) + train_net.set_train() + auc_metric = AUCMetric() + + model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric}) + + eval_callback = EvalCallBack(model, ds_eval, auc_metric, config) + + callback = LossCallBack(config=config) + ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size(), keep_checkpoint_max=5) + ckpoint_cb = ModelCheckpoint(prefix='widedeep_train', + directory=config.ckpt_path, config=ckptconfig) + context.set_auto_parallel_context(strategy_ckpt_save_file="./strategy_train.ckpt") + model.train(epochs, ds_train, + callbacks=[TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback, ckpoint_cb]) + + +if __name__ == "__main__": + wide_deep_config = WideDeepConfig() + wide_deep_config.argparse_init() + train_and_eval(wide_deep_config) diff --git a/model_zoo/wide_and_deep/train_and_eval_distribute.py b/model_zoo/wide_and_deep/train_and_eval_distribute.py new file mode 100644 index 0000000000..db98bacfec --- /dev/null +++ b/model_zoo/wide_and_deep/train_and_eval_distribute.py @@ -0,0 +1,113 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# 
+# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""train_multinpu.""" + + +import os +import sys +import numpy as np +from mindspore import Model, context +from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor +from mindspore.train import ParallelMode +from mindspore.communication.management import get_rank, get_group_size, init + +from src.wide_and_deep import PredictWithSigmoid, TrainStepWrap, NetWithLossClass, WideDeepModel +from src.callbacks import LossCallBack, EvalCallBack +from src.datasets import create_dataset +from src.metrics import AUCMetric +from src.config import WideDeepConfig + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + + +def get_WideDeep_net(config): + WideDeep_net = WideDeepModel(config) + loss_net = NetWithLossClass(WideDeep_net, config) + train_net = TrainStepWrap(loss_net) + eval_net = PredictWithSigmoid(WideDeep_net) + return train_net, eval_net + + +class ModelBuilder(): + """ + ModelBuilder + """ + def __init__(self): + pass + + def get_hook(self): + pass + + def get_train_hook(self): + hooks = [] + callback = LossCallBack() + hooks.append(callback) + if int(os.getenv('DEVICE_ID')) == 0: + pass + return hooks + + def get_net(self, config): + return get_WideDeep_net(config) + + +def train_and_eval(config): + """ + test_train_eval + """ + np.random.seed(1000) + data_path = config.data_path + batch_size 
= config.batch_size + epochs = config.epochs + print("epochs is {}".format(epochs)) + ds_train = create_dataset(data_path, train_mode=True, epochs=epochs, + batch_size=batch_size, rank_id=get_rank(), rank_size=get_group_size()) + ds_eval = create_dataset(data_path, train_mode=False, epochs=epochs + 1, + batch_size=batch_size, rank_id=get_rank(), rank_size=get_group_size()) + print("ds_train.size: {}".format(ds_train.get_dataset_size())) + print("ds_eval.size: {}".format(ds_eval.get_dataset_size())) + + net_builder = ModelBuilder() + + train_net, eval_net = net_builder.get_net(config) + train_net.set_train() + auc_metric = AUCMetric() + + model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric}) + + eval_callback = EvalCallBack(model, ds_eval, auc_metric, config) + + callback = LossCallBack(config=config) + ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size(), keep_checkpoint_max=5) + ckpoint_cb = ModelCheckpoint(prefix='widedeep_train', + directory=config.ckpt_path, config=ckptconfig) + out = model.eval(ds_eval) + print("=====" * 5 + "model.eval() initialized: {}".format(out)) + model.train(epochs, ds_train, + callbacks=[TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback, ckpoint_cb]) + + +if __name__ == "__main__": + wide_deep_config = WideDeepConfig() + wide_deep_config.argparse_init() + + context.set_context(mode=context.GRAPH_MODE, device_target=wide_deep_config.device_target, save_graphs=True) + if wide_deep_config.device_target == "Ascend": + init("hccl") + elif wide_deep_config.device_target == "GPU": + init("nccl") + context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, mirror_mean=True, + device_num=get_group_size()) + + train_and_eval(wide_deep_config) diff --git a/example/yolov3_coco2017/README.md b/model_zoo/yolov3/README.md similarity index 100% rename from example/yolov3_coco2017/README.md rename to model_zoo/yolov3/README.md diff --git a/example/yolov3_coco2017/eval.py 
b/model_zoo/yolov3/eval.py similarity index 92% rename from example/yolov3_coco2017/eval.py rename to model_zoo/yolov3/eval.py index 6e6d358248..65dc408a15 100644 --- a/example/yolov3_coco2017/eval.py +++ b/model_zoo/yolov3/eval.py @@ -19,10 +19,10 @@ import argparse import time from mindspore import context, Tensor from mindspore.train.serialization import load_checkpoint, load_param_into_net -from mindspore.model_zoo.yolov3 import yolov3_resnet18, YoloWithEval -from dataset import create_yolo_dataset, data_to_mindrecord_byte_image -from config import ConfigYOLOV3ResNet18 -from util import metrics +from src.yolov3 import yolov3_resnet18, YoloWithEval +from src.dataset import create_yolo_dataset, data_to_mindrecord_byte_image +from src.config import ConfigYOLOV3ResNet18 +from src.utils import metrics def yolo_eval(dataset_path, ckpt_path): """Yolov3 evaluation.""" @@ -88,15 +88,15 @@ if __name__ == '__main__': if not os.path.isdir(args_opt.mindrecord_dir): os.makedirs(args_opt.mindrecord_dir) - prefix = "yolo.mindrecord" - mindrecord_file = os.path.join(args_opt.mindrecord_dir, prefix + "0") + yolo_prefix = "yolo.mindrecord" + mindrecord_file = os.path.join(args_opt.mindrecord_dir, yolo_prefix + "0") if not os.path.exists(mindrecord_file): if os.path.isdir(args_opt.image_dir) and os.path.exists(args_opt.anno_path): print("Create Mindrecord") data_to_mindrecord_byte_image(args_opt.image_dir, args_opt.anno_path, args_opt.mindrecord_dir, - prefix=prefix, + prefix=yolo_prefix, file_num=8) print("Create Mindrecord Done, at {}".format(args_opt.mindrecord_dir)) else: diff --git a/example/yolov3_coco2017/run_distribute_train.sh b/model_zoo/yolov3/scripts/run_distribute_train.sh similarity index 94% rename from example/yolov3_coco2017/run_distribute_train.sh rename to model_zoo/yolov3/scripts/run_distribute_train.sh index 0b764419d2..eeda5077e9 100644 --- a/example/yolov3_coco2017/run_distribute_train.sh +++ b/model_zoo/yolov3/scripts/run_distribute_train.sh @@ -45,6 +45,9 
@@ echo "After running the scipt, the network runs in the background. The log will export MINDSPORE_HCCL_CONFIG_PATH=$6 export RANK_SIZE=$1 +BASE_PATH=$(cd "`dirname $0`" || exit; pwd) +cd $BASE_PATH/../ || exit + for((i=0;i= 1.17.0 +numpy >= 1.17.0, <= 1.17.5 protobuf >= 3.8.0 asttokens >= 1.1.13 pillow >= 6.2.0 @@ -10,4 +10,6 @@ wheel >= 0.32.0 decorator >= 4.4.0 setuptools >= 40.8.0 matplotlib >= 3.1.3 # for ut test -opencv-python >= 4.2.0.32 # for ut test +opencv-python >= 4.1.2.30 # for ut test +sklearn >= 0.0 # for st test +pandas >= 1.0.2 # for ut test \ No newline at end of file diff --git a/serving/CMakeLists.txt b/serving/CMakeLists.txt new file mode 100644 index 0000000000..3c1c08ece0 --- /dev/null +++ b/serving/CMakeLists.txt @@ -0,0 +1,69 @@ +find_package(Threads REQUIRED) + +# This branch assumes that gRPC and all its dependencies are already installed +# on this system, so they can be located by find_package(). + +# Find Protobuf installation +# Looks for protobuf-config.cmake file installed by Protobuf's cmake installation. + +#set(protobuf_MODULE_COMPATIBLE TRUE) +#find_package(Protobuf CONFIG REQUIRED) +#message(STATUS "Using protobuf ${protobuf_VERSION}") +add_library(protobuf::libprotobuf ALIAS protobuf::protobuf) +add_executable(protobuf::libprotoc ALIAS protobuf::protoc) + +set(_PROTOBUF_LIBPROTOBUF protobuf::libprotobuf) +set(_REFLECTION gRPC::grpc++_reflection) +if(CMAKE_CROSSCOMPILING) + find_program(_PROTOBUF_PROTOC protoc) +else() + set(_PROTOBUF_PROTOC $) +endif() + +# Find gRPC installation +# Looks for gRPCConfig.cmake file installed by gRPC's cmake installation. 
+find_package(gRPC CONFIG REQUIRED) +message(STATUS "Using gRPC ${gRPC_VERSION}") + +set(_GRPC_GRPCPP gRPC::grpc++) +if(CMAKE_CROSSCOMPILING) + find_program(_GRPC_CPP_PLUGIN_EXECUTABLE grpc_cpp_plugin) +else() + set(_GRPC_CPP_PLUGIN_EXECUTABLE $) +endif() + +# Proto file +get_filename_component(hw_proto "ms_service.proto" ABSOLUTE) +get_filename_component(hw_proto_path "${hw_proto}" PATH) + +# Generated sources +set(hw_proto_srcs "${CMAKE_CURRENT_BINARY_DIR}/ms_service.pb.cc") +set(hw_proto_hdrs "${CMAKE_CURRENT_BINARY_DIR}/ms_service.pb.h") +set(hw_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/ms_service.grpc.pb.cc") +set(hw_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/ms_service.grpc.pb.h") +add_custom_command( + OUTPUT "${hw_proto_srcs}" "${hw_proto_hdrs}" "${hw_grpc_srcs}" "${hw_grpc_hdrs}" + COMMAND ${_PROTOBUF_PROTOC} + ARGS --grpc_out "${CMAKE_CURRENT_BINARY_DIR}" + --cpp_out "${CMAKE_CURRENT_BINARY_DIR}" + -I "${hw_proto_path}" + --plugin=protoc-gen-grpc="${_GRPC_CPP_PLUGIN_EXECUTABLE}" + "${hw_proto}" + DEPENDS "${hw_proto}") + +# Include generated *.pb.h files +include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}/core" + "${PROJECT_SOURCE_DIR}/mindspore/ccsrc") +file(GLOB_RECURSE CORE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} + "core/*.cc" "core/util/*.cc" "core/version_control/*.cc") + +list(APPEND SERVING_SRC "main.cc" ${hw_proto_srcs} ${hw_grpc_srcs} ${CORE_SRC_LIST}) + +include_directories(${CMAKE_BINARY_DIR}) +add_executable(ms_serving ${SERVING_SRC}) +target_link_libraries(ms_serving inference mindspore_gvar) +target_link_libraries(ms_serving ${_REFLECTION} ${_GRPC_GRPCPP} ${_PROTOBUF_LIBPROTOBUF} pthread) +if (ENABLE_D) + add_compile_definitions(ENABLE_D) + target_link_libraries(ms_serving ${RUNTIME_LIB}) +endif() diff --git a/serving/README.en.md b/serving/README.en.md new file mode 100644 index 0000000000..830b94537a --- /dev/null +++ b/serving/README.en.md @@ -0,0 +1,36 @@ +# serving + +#### 
Description +A flexible, high-performance serving system for deep learning models + +#### Software Architecture +Software architecture description + +#### Installation + +1. xxxx +2. xxxx +3. xxxx + +#### Instructions + +1. xxxx +2. xxxx +3. xxxx + +#### Contribution + +1. Fork the repository +2. Create Feat_xxx branch +3. Commit your code +4. Create Pull Request + + +#### Gitee Feature + +1. You can use Readme\_XXX.md to support different languages, such as Readme\_en.md, Readme\_zh.md +2. Gitee blog [blog.gitee.com](https://blog.gitee.com) +3. Explore open source project [https://gitee.com/explore](https://gitee.com/explore) +4. The most valuable open source project [GVP](https://gitee.com/gvp) +5. The manual of Gitee [https://gitee.com/help](https://gitee.com/help) +6. The most popular members [https://gitee.com/gitee-stars/](https://gitee.com/gitee-stars/) diff --git a/serving/README.md b/serving/README.md new file mode 100644 index 0000000000..b26b9a6887 --- /dev/null +++ b/serving/README.md @@ -0,0 +1,37 @@ +# serving + +#### 介绍 +A flexible, high-performance serving system for deep learning models + +#### 软件架构 +软件架构说明 + + +#### 安装教程 + +1. xxxx +2. xxxx +3. xxxx + +#### 使用说明 + +1. xxxx +2. xxxx +3. xxxx + +#### 参与贡献 + +1. Fork 本仓库 +2. 新建 Feat_xxx 分支 +3. 提交代码 +4. 新建 Pull Request + + +#### 码云特技 + +1. 使用 Readme\_XXX.md 来支持不同的语言,例如 Readme\_en.md, Readme\_zh.md +2. 码云官方博客 [blog.gitee.com](https://blog.gitee.com) +3. 你可以 [https://gitee.com/explore](https://gitee.com/explore) 这个地址来了解码云上的优秀开源项目 +4. [GVP](https://gitee.com/gvp) 全称是码云最有价值开源项目,是码云综合评定出的优秀开源项目 +5. 码云官方提供的使用手册 [https://gitee.com/help](https://gitee.com/help) +6. 
码云封面人物是一档用来展示码云会员风采的栏目 [https://gitee.com/gitee-stars/](https://gitee.com/gitee-stars/) diff --git a/serving/core/server.cc b/serving/core/server.cc new file mode 100644 index 0000000000..add9d16bee --- /dev/null +++ b/serving/core/server.cc @@ -0,0 +1,277 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "core/server.h" +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mindspore/ccsrc/utils/log_adapter.h" +#include "serving/ms_service.grpc.pb.h" +#include "core/util/option_parser.h" +#include "core/version_control/version_controller.h" +#include "mindspore/ccsrc/utils/context/ms_context.h" +#include "core/util/file_system_operation.h" +#include "graphengine/third_party/fwkacllib/inc/runtime/context.h" + +using ms_serving::MSService; +using ms_serving::PredictReply; +using ms_serving::PredictRequest; + +namespace mindspore { +namespace serving { +using MSTensorPtr = std::shared_ptr; + +Status Session::CreatDeviceSession(const std::string &device, uint32_t device_id) { + session_ = inference::MSSession::CreateSession(device + "Inference", device_id); + if (session_ == nullptr) { + MS_LOG(ERROR) << "Creat Session Failed"; + return FAILED; + } + device_type_ = device; + return SUCCESS; +} + +Session &Session::Instance() { + static Session instance; + return instance; +} + +Status Session::Predict(const std::vector &inputs, inference::MultiTensor 
*outputs) { + if (last_graph_ == nullptr) { + MS_LOG(ERROR) << "the model has not loaded"; + return FAILED; + } + if (session_ == nullptr) { + MS_LOG(ERROR) << "the inference session has not be initialized"; + return FAILED; + } + std::lock_guard lock(mutex_); + MS_LOG(INFO) << "run Predict"; + + *outputs = session_->RunGraph(graph_id_, inputs); + return SUCCESS; +} + +Status Session::Warmup(const MindSporeModelPtr model) { + if (session_ == nullptr) { + MS_LOG(ERROR) << "The CreatDeviceSession should be called, before warmup"; + return FAILED; + } + std::lock_guard lock(mutex_); + size_t size = 0; + std::string file_name = model->GetModelPath() + '/' + model->GetModelName(); + char *graphBuf = ReadFile(file_name.c_str(), &size); + if (graphBuf == nullptr) { + MS_LOG(ERROR) << "Load graph model failed, file name is " << file_name.c_str(); + return FAILED; + } + last_graph_ = inference::LoadModel(graphBuf, size, device_type_); + graph_id_ = session_->CompileGraph(last_graph_); + MS_LOG(INFO) << "Session Warmup"; + return SUCCESS; +} + +Status Session::Clear() { + session_ = nullptr; + return SUCCESS; +} + +namespace { +const std::map type2id_map{ + {ms_serving::MS_UNKNOWN, TypeId::kNumberTypeBegin}, {ms_serving::MS_BOOL, TypeId::kNumberTypeBool}, + {ms_serving::MS_INT8, TypeId::kNumberTypeInt8}, {ms_serving::MS_UINT8, TypeId::kNumberTypeUInt8}, + {ms_serving::MS_INT16, TypeId::kNumberTypeInt16}, {ms_serving::MS_UINT16, TypeId::kNumberTypeUInt16}, + {ms_serving::MS_INT32, TypeId::kNumberTypeInt32}, {ms_serving::MS_UINT32, TypeId::kNumberTypeUInt32}, + {ms_serving::MS_INT64, TypeId::kNumberTypeInt64}, {ms_serving::MS_UINT64, TypeId::kNumberTypeUInt64}, + {ms_serving::MS_FLOAT16, TypeId::kNumberTypeFloat16}, {ms_serving::MS_FLOAT32, TypeId::kNumberTypeFloat32}, + {ms_serving::MS_FLOAT64, TypeId::kNumberTypeFloat64}, +}; + +const std::map id2type_map{ + {TypeId::kNumberTypeBegin, ms_serving::MS_UNKNOWN}, {TypeId::kNumberTypeBool, ms_serving::MS_BOOL}, + 
{TypeId::kNumberTypeInt8, ms_serving::MS_INT8}, {TypeId::kNumberTypeUInt8, ms_serving::MS_UINT8}, + {TypeId::kNumberTypeInt16, ms_serving::MS_INT16}, {TypeId::kNumberTypeUInt16, ms_serving::MS_UINT16}, + {TypeId::kNumberTypeInt32, ms_serving::MS_INT32}, {TypeId::kNumberTypeUInt32, ms_serving::MS_UINT32}, + {TypeId::kNumberTypeInt64, ms_serving::MS_INT64}, {TypeId::kNumberTypeUInt64, ms_serving::MS_UINT64}, + {TypeId::kNumberTypeFloat16, ms_serving::MS_FLOAT16}, {TypeId::kNumberTypeFloat32, ms_serving::MS_FLOAT32}, + {TypeId::kNumberTypeFloat64, ms_serving::MS_FLOAT64}, +}; +const std::map length_map{ + {ms_serving::MS_UNKNOWN, 0}, + {ms_serving::MS_BOOL, sizeof(bool)}, + {ms_serving::MS_INT8, sizeof(int8_t)}, + {ms_serving::MS_UINT8, sizeof(uint8_t)}, + {ms_serving::MS_INT16, sizeof(int16_t)}, + {ms_serving::MS_UINT16, sizeof(uint16_t)}, + {ms_serving::MS_INT32, sizeof(int32_t)}, + {ms_serving::MS_UINT32, sizeof(uint32_t)}, + {ms_serving::MS_INT64, sizeof(int64_t)}, + {ms_serving::MS_UINT64, sizeof(uint64_t)}, + {ms_serving::MS_FLOAT16, 2}, + {ms_serving::MS_FLOAT32, 4}, + {ms_serving::MS_FLOAT64, 8}, +}; +MSTensorPtr ServingTensor2MSTensor(const ms_serving::Tensor &tensor) { + std::vector shape; + for (auto dim : tensor.tensor_shape().dims()) { + shape.push_back(static_cast(dim)); + } + auto iter = type2id_map.find(tensor.tensor_type()); + if (iter == type2id_map.end()) { + MS_LOG(ERROR) << "input tensor type is wrong, type is " << tensor.tensor_type(); + return nullptr; + } + TypeId type = iter->second; + auto ms_tensor = std::shared_ptr(inference::MSTensor::CreateTensor(type, shape)); + memcpy_s(ms_tensor->MutableData(), tensor.data().size(), tensor.data().data(), tensor.data().size()); + return ms_tensor; +} + +ms_serving::Tensor MSTensor2ServingTensor(MSTensorPtr ms_tensor) { + ms_serving::Tensor tensor; + ms_serving::TensorShape shape; + for (auto dim : ms_tensor->shape()) { + shape.add_dims(dim); + } + *tensor.mutable_tensor_shape() = shape; + auto iter = 
id2type_map.find(ms_tensor->data_type()); + if (iter == id2type_map.end()) { + MS_LOG(ERROR) << "input tensor type is wrong, type is " << tensor.tensor_type(); + return tensor; + } + tensor.set_tensor_type(iter->second); + tensor.set_data(ms_tensor->MutableData(), ms_tensor->Size()); + return tensor; +} + +void ClearEnv() { + Session::Instance().Clear(); + inference::ExitInference(); +} +void HandleSignal(int sig) { + ClearEnv(); + exit(0); +} + +#ifdef ENABLE_D +static rtContext_t g_ctx = nullptr; +#endif +} // namespace + +// Service Implement +class MSServiceImpl final : public MSService::Service { + grpc::Status Predict(grpc::ServerContext *context, const PredictRequest *request, PredictReply *reply) override { + std::lock_guard lock(mutex_); +#ifdef ENABLE_D + if (g_ctx == nullptr) { + MS_LOG(ERROR) << "rtCtx is nullptr"; + return grpc::Status::CANCELLED; + } + rtError_t rt_ret = rtCtxSetCurrent(g_ctx); + if (rt_ret != RT_ERROR_NONE) { + MS_LOG(ERROR) << "set Ascend rtCtx failed"; + } +#endif + std::vector inputs; + inference::MultiTensor outputs; + for (int i = 0; i < request->data_size(); i++) { + auto input = ServingTensor2MSTensor(request->data(i)); + if (input == nullptr) { + MS_LOG(ERROR) << "Tensor convert failed"; + return grpc::Status::CANCELLED; + } + inputs.push_back(input); + } + auto res = Session::Instance().Predict(inputs, &outputs); + if (res != SUCCESS) { + return grpc::Status::CANCELLED; + } + for (const auto &tensor : outputs) { + *reply->add_result() = MSTensor2ServingTensor(tensor); + } + MS_LOG(INFO) << "Finish call service Eval"; + return grpc::Status::OK; + } + + grpc::Status Test(grpc::ServerContext *context, const PredictRequest *request, PredictReply *reply) override { + MS_LOG(INFO) << "TestService call"; + return grpc::Status::OK; + } + std::mutex mutex_; +}; + +Status Server::BuildAndStart() { + // handle exit signal + signal(SIGINT, HandleSignal); + Status res; + auto option_args = Options::Instance().GetArgs(); + std::string 
server_address = "0.0.0.0:" + std::to_string(option_args->grpc_port); + std::string model_path = option_args->model_path; + std::string model_name = option_args->model_name; + std::string device_type = option_args->device_type; + auto device_id = option_args->device_id; + res = Session::Instance().CreatDeviceSession(device_type, device_id); + if (res != SUCCESS) { + MS_LOG(ERROR) << "creat session failed"; + ClearEnv(); + return res; + } + VersionController version_controller(option_args->poll_model_wait_seconds, model_path, model_name); + res = version_controller.Run(); + if (res != SUCCESS) { + MS_LOG(ERROR) << "load model failed"; + ClearEnv(); + return res; + } +#ifdef ENABLE_D + // set d context + rtContext_t ctx = nullptr; + rtError_t rt_ret = rtCtxGetCurrent(&ctx); + if (rt_ret != RT_ERROR_NONE || ctx == nullptr) { + MS_LOG(ERROR) << "the ascend device context is null"; + return FAILED; + } + g_ctx = ctx; +#endif + MSServiceImpl service; + grpc::EnableDefaultHealthCheckService(true); + grpc::reflection::InitProtoReflectionServerBuilderPlugin(); + // Set the port is not reuseable + auto option = grpc::MakeChannelArgumentOption(GRPC_ARG_ALLOW_REUSEPORT, 0); + grpc::ServerBuilder builder; + builder.SetOption(std::move(option)); + // Listen on the given address without any authentication mechanism. + builder.AddListeningPort(server_address, grpc::InsecureServerCredentials()); + // Register "service" as the instance through which we'll communicate with + // clients. In this case it corresponds to an *synchronous* service. + builder.RegisterService(&service); + // Finally assemble the server. + std::unique_ptr server(builder.BuildAndStart()); + MS_LOG(INFO) << "Server listening on " << server_address << std::endl; + + // Wait for the server to shutdown. Note that some other thread must be + // responsible for shutting down the server for this call to ever return. 
+ server->Wait(); + return SUCCESS; +} + +} // namespace serving +} // namespace mindspore diff --git a/serving/core/server.h b/serving/core/server.h new file mode 100644 index 0000000000..f1927e9946 --- /dev/null +++ b/serving/core/server.h @@ -0,0 +1,56 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_SERVER_H +#define MINDSPORE_SERVER_H + +#include +#include +#include +#include +#include "util/status.h" +#include "version_control/model.h" +#include "include/inference.h" +#include "mindspore/ccsrc/debug/info.h" +namespace mindspore { +namespace serving { +class Session { + public: + static Session &Instance(); + Status CreatDeviceSession(const std::string &device, uint32_t device_id); + Status Predict(const std::vector> &inputs, inference::MultiTensor *output); + Status Warmup(const MindSporeModelPtr model); + Status Clear(); + + private: + Session() = default; + ~Session() = default; + int sesseion_id_{0}; + std::shared_ptr session_{nullptr}; + FuncGraphPtr last_graph_{nullptr}; + uint32_t graph_id_{0}; + std::mutex mutex_; + std::string device_type_; +}; + +class Server { + public: + Server() = default; + ~Server() = default; + Status BuildAndStart(); +}; +} // namespace serving +} // namespace mindspore +#endif // MINDSPORE_SERVER_H diff --git a/serving/core/util/file_system_operation.cc b/serving/core/util/file_system_operation.cc new file mode 100644 index 
0000000000..a5143995de --- /dev/null +++ b/serving/core/util/file_system_operation.cc @@ -0,0 +1,102 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "core/util/file_system_operation.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "mindspore/ccsrc/utils/log_adapter.h" + +namespace mindspore { +namespace serving { + +char *ReadFile(const char *file, size_t *size) { + if (file == nullptr) { + MS_LOG(ERROR) << "file is nullptr"; + return nullptr; + } + MS_ASSERT(size != nullptr); + std::string realPath = file; + std::ifstream ifs(realPath); + if (!ifs.good()) { + MS_LOG(ERROR) << "file: " << realPath << " is not exist"; + return nullptr; + } + + if (!ifs.is_open()) { + MS_LOG(ERROR) << "file: " << realPath << "open failed"; + return nullptr; + } + + ifs.seekg(0, std::ios::end); + *size = ifs.tellg(); + std::unique_ptr buf(new (std::nothrow) char[*size]); + if (buf == nullptr) { + MS_LOG(ERROR) << "malloc buf failed, file: " << realPath; + ifs.close(); + return nullptr; + } + + ifs.seekg(0, std::ios::beg); + ifs.read(buf.get(), *size); + ifs.close(); + + return buf.release(); +} + +bool DirOrFileExist(const std::string &file_path) { + int ret = access(file_path.c_str(), 0); + return (ret == -1) ? 
false : true; +} + +std::vector GetAllSubDirs(const std::string &dir_path) { + DIR *dir; + struct dirent *ptr; + std::vector SubDirs; + + if ((dir = opendir(dir_path.c_str())) == NULL) { + MS_LOG(ERROR) << "Open " << dir_path << " error!"; + return std::vector(); + } + + while ((ptr = readdir(dir)) != NULL) { + std::string name = ptr->d_name; + if (name == "." || name == "..") { + continue; + } + if (ptr->d_type == DT_DIR) { + SubDirs.push_back(dir_path + "/" + name); + } + } + closedir(dir); + std::sort(SubDirs.begin(), SubDirs.end()); + return SubDirs; +} + +time_t GetModifyTime(const std::string &file_path) { + struct stat info; + (void)stat(file_path.c_str(), &info); + return info.st_mtime; +} +} // namespace serving +} // namespace mindspore diff --git a/serving/core/util/file_system_operation.h b/serving/core/util/file_system_operation.h new file mode 100644 index 0000000000..e03883b812 --- /dev/null +++ b/serving/core/util/file_system_operation.h @@ -0,0 +1,32 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_SERVING_FILE_SYSTEM_OPERATION_H_ +#define MINDSPORE_SERVING_FILE_SYSTEM_OPERATION_H_ + +#include +#include +#include + +namespace mindspore { +namespace serving { +char *ReadFile(const char *file, size_t *size); +bool DirOrFileExist(const std::string &file_path); +std::vector GetAllSubDirs(const std::string &dir_path); +time_t GetModifyTime(const std::string &file_path); +} // namespace serving +} // namespace mindspore + +#endif // !MINDSPORE_SERVING_FILE_SYSTEM_OPERATION_H_ diff --git a/serving/core/util/option_parser.cc b/serving/core/util/option_parser.cc new file mode 100644 index 0000000000..9cbd7eaee8 --- /dev/null +++ b/serving/core/util/option_parser.cc @@ -0,0 +1,243 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "core/util/option_parser.h" +#include +#include +#include +#include +#include +#include "mindspore/ccsrc/utils/log_adapter.h" + +namespace mindspore { +namespace serving { +bool StartWith(const std::string &str, const std::string &expected) { + return expected.empty() || + (str.size() >= expected.size() && memcmp(str.data(), expected.data(), expected.size()) == 0); +} + +bool RemovePrefix(std::string *str, const std::string &prefix) { + if (!StartWith(*str, prefix)) return false; + str->replace(str->begin(), str->begin() + prefix.size(), ""); + return true; +} + +bool Option::ParseInt32(std::string *arg) { + if (RemovePrefix(arg, "--") && RemovePrefix(arg, name_) && RemovePrefix(arg, "=")) { + char extra; + int32_t parsed_value; + if (sscanf(arg->data(), "%d%c", &parsed_value, &extra) != 1) { + std::cout << "Parse " << name_ << "Error for option " << *arg << std::endl; + return false; + } else { + *int32_default_ = parsed_value; + } + return true; + } + + return false; +} + +bool Option::ParseBool(std::string *arg) { + if (RemovePrefix(arg, "--") && RemovePrefix(arg, name_) && RemovePrefix(arg, "=")) { + if (*arg == "true") { + *bool_default_ = true; + } else if (*arg == "false") { + *bool_default_ = false; + } else { + std::cout << "Parse " << name_ << " Error for option " << *arg << std::endl; + return false; + } + return true; + } + + return false; +} + +bool Option::ParseString(std::string *arg) { + if (RemovePrefix(arg, "--") && RemovePrefix(arg, name_) && RemovePrefix(arg, "=")) { + *string_default_ = *arg; + return true; + } + return false; +} + +bool Option::ParseFloat(std::string *arg) { + if (RemovePrefix(arg, "--") && RemovePrefix(arg, name_) && RemovePrefix(arg, "=")) { + char extra; + float parsed_value; + if (sscanf(arg->data(), "%f%c", &parsed_value, &extra) != 1) { + std::cout << "Parse " << name_ << "Error for option " << *arg << std::endl; + return false; + } else { + *float_default_ = parsed_value; + } + return true; + } + + return 
false; +} + +Option::Option(const std::string &name, int32_t *default_point, const std::string &usage) + : name_(name), + type_(MS_TYPE_INT32), + int32_default_(default_point), + bool_default_(nullptr), + string_default_(nullptr), + float_default_(nullptr), + usage_(usage) {} + +Option::Option(const std::string &name, bool *default_point, const std::string &usage) + : name_(name), + type_(MS_TYPE_BOOL), + int32_default_(nullptr), + bool_default_(default_point), + string_default_(nullptr), + float_default_(nullptr), + usage_(usage) {} + +Option::Option(const std::string &name, std::string *default_point, const std::string &usage) + : name_(name), + type_(MS_TYPE_STRING), + int32_default_(nullptr), + bool_default_(nullptr), + string_default_(default_point), + float_default_(nullptr), + usage_(usage) {} + +Option::Option(const std::string &name, float *default_point, const std::string &usage) + : name_(name), + type_(MS_TYPE_FLOAT), + int32_default_(nullptr), + bool_default_(nullptr), + string_default_(nullptr), + float_default_(default_point), + usage_(usage) {} + +bool Option::Parse(std::string *arg) { + bool result = false; + switch (type_) { + case MS_TYPE_BOOL: + result = ParseBool(arg); + break; + case MS_TYPE_FLOAT: + result = ParseFloat(arg); + break; + case MS_TYPE_INT32: + result = ParseInt32(arg); + break; + case MS_TYPE_STRING: + result = ParseString(arg); + break; + default: + break; + } + return result; +} + +std::shared_ptr Options::inst_ = nullptr; + +Options &Options::Instance() { + static Options instance; + return instance; +} + +Options::Options() : args_(nullptr) { CreateOptions(); } + +void Options::CreateOptions() { + args_ = std::make_shared(); + std::vector