From 92dc6de049735cd0232c9a38bd9e0d86faf52095 Mon Sep 17 00:00:00 2001
From: simson <526422051@qq.com>
Date: Wed, 1 Apr 2020 11:55:29 +0800
Subject: [PATCH 01/13] modify graphengine

---
 .gitmodules | 3 ---
 graphengine | 1 -
 2 files changed, 4 deletions(-)
 delete mode 160000 graphengine

diff --git a/.gitmodules b/.gitmodules
index a241b6d69b..1f5fbad2b9 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -10,6 +10,3 @@
 [submodule "third_party/protobuf"]
 	path = third_party/protobuf
 	url = https://github.com/protocolbuffers/protobuf.git
-[submodule "graphengine"]
-	path = graphengine
-	url = https://gitee.com/mindspore/graphengine.git
diff --git a/graphengine b/graphengine
deleted file mode 160000
index 5f763679fa..0000000000
--- a/graphengine
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 5f763679fa33de1608d07f7651c6f16012b953ea

From f338eb3a606efc4c36bd49690a629b6ab186643f Mon Sep 17 00:00:00 2001
From: simson <526422051@qq.com>
Date: Wed, 1 Apr 2020 11:57:09 +0800
Subject: [PATCH 02/13] add graphengine

---
 .gitmodules | 3 +++
 graphengine | 1 +
 2 files changed, 4 insertions(+)
 create mode 160000 graphengine

diff --git a/.gitmodules b/.gitmodules
index 1f5fbad2b9..a024019b14 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -10,3 +10,6 @@
 [submodule "third_party/protobuf"]
 	path = third_party/protobuf
 	url = https://github.com/protocolbuffers/protobuf.git
+[submodule "graphengine"]
+	path = graphengine
+	url = https://gitee.com/ms-incubator/graphengine.git
diff --git a/graphengine b/graphengine
new file mode 160000
index 0000000000..21d3700f66
--- /dev/null
+++ b/graphengine
@@ -0,0 +1 @@
+Subproject commit 21d3700f661576edc37607a3bc961874ee5189a7

From 6f2b7abe04a97b4b8fb3b6de51124eed95cef4e9 Mon Sep 17 00:00:00 2001
From: yanghaoran <yanghaoran2@huawei.com>
Date: Thu, 2 Apr 2020 17:15:41 +0800
Subject: [PATCH 03/13] modify reduceminD and reducemaxD IR

---
 graphengine                                 |   2 +-
 mindspore/ccsrc/pipeline/pipeline.cc        |   2 +-
 mindspore/ccsrc/transform/convert.cc        |  16 +-
 mindspore/ccsrc/transform/graph_runner.cc   |   7 +
 mindspore/ccsrc/transform/op_declare.cc     | 159 +++++++++-----------
 mindspore/ccsrc/transform/op_declare.h      |  14 +-
 mindspore/ccsrc/transform/util.cc           |  11 +-
 mindspore/ccsrc/utils/context/ms_context.cc |   4 +-
 mindspore/ops/operations/__init__.py        |   3 +-
 mindspore/ops/operations/nn_ops.py          |  75 ++++++++-
 tests/ut/python/ops/test_ops.py             |   5 +
 11 files changed, 188 insertions(+), 110 deletions(-)

diff --git a/graphengine b/graphengine
index 21d3700f66..092c7a1f65 160000
--- a/graphengine
+++ b/graphengine
@@ -1 +1 @@
-Subproject commit 21d3700f661576edc37607a3bc961874ee5189a7
+Subproject commit 092c7a1f6548cac7d40e677af3498c3c49ea2bfd
diff --git a/mindspore/ccsrc/pipeline/pipeline.cc b/mindspore/ccsrc/pipeline/pipeline.cc
index 35336e975b..70ef9a5407 100644
--- a/mindspore/ccsrc/pipeline/pipeline.cc
+++ b/mindspore/ccsrc/pipeline/pipeline.cc
@@ -1071,7 +1071,7 @@ bool ExecutorPy::AddDFGraph(const py::dict& init_params, const std::string& phas
   }
   std::string init_graph = "init_subgraph." + net_id;
   std::string checkpoint_name = "save." + net_id;
-  if (phase == "train") {
+  if (phase.find("train") != std::string::npos) {
     (void)DfGraphManager::GetInstance().AddGraph(phase, convertor.GetComputeGraph(), {{"ge.exec.variable_acc", "1"}});
   } else {
     (void)DfGraphManager::GetInstance().AddGraph(phase, convertor.GetComputeGraph());
diff --git a/mindspore/ccsrc/transform/convert.cc b/mindspore/ccsrc/transform/convert.cc
index 74b0695cff..87bfc8f6d8 100755
--- a/mindspore/ccsrc/transform/convert.cc
+++ b/mindspore/ccsrc/transform/convert.cc
@@ -171,6 +171,7 @@ const char kNameAbsGrad[] = "AbsGrad";
 const char kNameBinaryCrossEntropy[] = "BinaryCrossEntropy";
 const char kNameBinaryCrossEntropyGrad[] = "BinaryCrossEntropyGrad";
 const char kNameSparseApplyAdagrad[] = "SparseApplyAdagrad";
+const char kNameSparseApplyFtrlD[] = "SparseApplyFtrlD";
 const char kNameSpaceToDepth[] = "SpaceToDepth";
 const char kNameDepthToSpace[] = "DepthToSpace";
 const char kNameSign[] = "Sign";
@@ -189,7 +190,7 @@ std::unordered_map<std::string, OpAdapterDescPtr> &DfGraphConvertor::get_adpt_ma
     {string(kNameApplyMomentum), ADPT_DESC(ApplyMomentum)},
     {string(kNameMaxPool), ADPT_DESC(MaxPool)},
     {string(kNameAvgPool), ADPT_DESC(AvgPool)},
-    {string(kNameTopK), ADPT_DESC(TopKV2)},
+    {string(kNameTopK), ADPT_DESC(TopK)},
     {string(kNamePack), ADPT_DESC(Pack)},
     {string(kNameSplitD), ADPT_DESC(SplitD)},
     {string(kNameAllReduce), ADPT_DESC(HcomAllReduce)},
@@ -310,7 +311,7 @@ std::unordered_map<std::string, OpAdapterDescPtr> &DfGraphConvertor::get_adpt_ma
     {prim::kPrimMinimum->name(), ADPT_DESC(Minimum)},
     {prim::kPrimSelect->name(), ADPT_DESC(Select)},
     {string(kNameLessEqual), ADPT_DESC(LessEqual)},
-    {prim::kPrimLogSoftmax->name(), ADPT_DESC(LogSoftmax)},
+    {prim::kPrimLogSoftmax->name(), ADPT_DESC(LogSoftmaxV2)},
     {string(kNameTruncatedNormal), ADPT_DESC(TruncatedNormal)},
     {string(kNameStridedSliceGrad), ADPT_DESC(StridedSliceGrad)},
     {prim::kPrimGelu->name(), ADPT_DESC(Gelu)},
@@ -343,7 +344,7 @@ std::unordered_map<std::string, OpAdapterDescPtr> &DfGraphConvertor::get_adpt_ma
     {prim::kPrimMatMul->name(), ADPT_DESC(MatMul)},
 
     {string(kNameConst), ADPT_DESC(Constant, Const)},
-    {string(kNameSoftmax), ADPT_DESC(Softmax)},
+    {string(kNameSoftmax), ADPT_DESC(SoftmaxV2)},
     {string(kNameSoftmaxGrad), ADPT_DESC(SoftmaxGrad)},
     {string(kNameParam), ADPT_DESC(Data)},
     {string(kNameROIAlign), ADPT_DESC(ROIAlign)},
@@ -353,6 +354,7 @@ std::unordered_map<std::string, OpAdapterDescPtr> &DfGraphConvertor::get_adpt_ma
     {string(kNameBinaryCrossEntropy), ADPT_DESC(BinaryCrossEntropy)},
     {string(kNameBinaryCrossEntropyGrad), ADPT_DESC(BinaryCrossEntropyGrad)},
     {string(kNameSparseApplyAdagrad), ADPT_DESC(SparseApplyAdagradD)},
+    {string(kNameSparseApplyFtrlD), ADPT_DESC(SparseApplyFtrlD)},
     {string(kNameSpaceToDepth), ADPT_DESC(SpaceToDepth)},
     {string(kNameDepthToSpace), ADPT_DESC(DepthToSpace)},
     {string(kNameSign), ADPT_DESC(Sign)},
@@ -1017,8 +1019,8 @@ DfGraphConvertor &DfGraphConvertor::BuildGraph() {
     }
   }
 
-  // set up dependices
-  MS_LOG(DEBUG) << "set up dependices";
+  // set up dependencies
+  MS_LOG(DEBUG) << "set up dependencies";
   std::vector<AnfNodePtr> nodes = ::mindspore::TopoSort(anf_graph_->get_return());
   for (auto &it : nodes) {
     SetNodeInput(it);
@@ -1115,8 +1117,8 @@ void DfGraphConvertor::UpdateDataOpDesc(const AnfNodePtr &it, const OperatorPtr
   if (desc == nullptr) {
     MS_LOG(ERROR) << "Update data op descriptor failed! TensorDesc is null.";
   } else {
-    (void)std::static_pointer_cast<Data>(op)->update_input_desc_data(*desc);
-    (void)std::static_pointer_cast<Data>(op)->update_output_desc_out(*desc);
+    (void)std::static_pointer_cast<Data>(op)->update_input_desc_x(*desc);
+    (void)std::static_pointer_cast<Data>(op)->update_output_desc_y(*desc);
   }
 }
 
diff --git a/mindspore/ccsrc/transform/graph_runner.cc b/mindspore/ccsrc/transform/graph_runner.cc
index e77b1bcd73..2bff1a740c 100644
--- a/mindspore/ccsrc/transform/graph_runner.cc
+++ b/mindspore/ccsrc/transform/graph_runner.cc
@@ -135,6 +135,13 @@ Status GraphRunner::RunGraph(const RunOptions& options, const std::vector<GeTens
     return Status::FAILED;
   }
 
+  // The information of some nodes may change after fusion in some cases,
+  // so the graph has to be rebuilt whenever the session reports that it needs a rebuild
+  if (sess_->IsGraphNeedRebuild(wrap_ptr->id_)) {
+    sess_->RemoveGraph(wrap_ptr->id_);
+    sess_->AddGraph(wrap_ptr->id_, *(wrap_ptr->graph_ptr_), wrap_ptr->options_);
+  }
+
   ge::Status ret = sess_->RunGraph(wrap_ptr->id_, ge_inputs, ge_outputs);
   if (ret != ge::GRAPH_SUCCESS) {
     MS_LOG(ERROR) << "Call GE RunGraph Failed, ret is: " << ret;
diff --git a/mindspore/ccsrc/transform/op_declare.cc b/mindspore/ccsrc/transform/op_declare.cc
index 78b949c525..07c5e9f5fe 100755
--- a/mindspore/ccsrc/transform/op_declare.cc
+++ b/mindspore/ccsrc/transform/op_declare.cc
@@ -138,11 +138,10 @@ OUTPUT_MAP(ApplyMomentum) = {{0, OUTPUT_DESC(var)}};
 INPUT_MAP(Summary) = {{2, INPUT_DESC(x)}};
 ATTR_MAP(Summary) = EMPTY_ATTR_MAP;
 
-// data
+// Data
 INPUT_MAP(Data) = EMPTY_INPUT_MAP;
 ATTR_MAP(Data) = EMPTY_ATTR_MAP;
 
-// resnet ops in ge
 // BatchNorm
 INPUT_MAP(BatchNorm) = {{1, INPUT_DESC(x)},
                         {2, INPUT_DESC(scale)},
@@ -194,9 +193,9 @@ OUTPUT_MAP(PRelu) = {{0, OUTPUT_DESC(y)}};
 
 // PReluGrad
 INPUT_MAP(PReluGrad) = {
-  {1, INPUT_DESC(input_gradients)}, {2, INPUT_DESC(input_features)}, {3, INPUT_DESC(input_weights)}};
+  {1, INPUT_DESC(grads)}, {2, INPUT_DESC(features)}, {3, INPUT_DESC(weights)}};
 ATTR_MAP(PReluGrad) = EMPTY_ATTR_MAP;
-OUTPUT_MAP(PReluGrad) = {{0, OUTPUT_DESC(output_backprops_dx)}, {1, OUTPUT_DESC(output_backprops_da)}};
+OUTPUT_MAP(PReluGrad) = {{0, OUTPUT_DESC(dx)}, {1, OUTPUT_DESC(da)}};
 
 // Sigmoid
 INPUT_MAP(Sigmoid) = {{1, INPUT_DESC(x)}};
@@ -241,12 +240,12 @@ ATTR_MAP(CumsumD) = {{"exclusive", ATTR_DESC(exclusive, AnyTraits<bool>())},
                      {"reverse", ATTR_DESC(reverse, AnyTraits<bool>())}};
 OUTPUT_MAP(CumsumD) = {{0, OUTPUT_DESC(y)}};
 
-// softmax
-INPUT_MAP(Softmax) = {{1, INPUT_DESC(x)}};
-ATTR_MAP(Softmax) = {
-  {"axis", ATTR_DESC(axis, AnyTraits<std::vector<int64_t>>(), AnyTraits<std::vector<int64_t>>())},
+// SoftmaxV2
+INPUT_MAP(SoftmaxV2) = {{1, INPUT_DESC(x)}};
+ATTR_MAP(SoftmaxV2) = {
+  {"axis", ATTR_DESC(axes, AnyTraits<std::vector<int64_t>>(), AnyTraits<std::vector<int64_t>>())},
 };
-OUTPUT_MAP(Softmax) = {{0, OUTPUT_DESC(y)}};
+OUTPUT_MAP(SoftmaxV2) = {{0, OUTPUT_DESC(y)}};
 
 // SoftmaxGrad
 INPUT_MAP(SoftmaxGrad) = {{1, INPUT_DESC(softmax)}, {2, INPUT_DESC(grad_softmax)}};
@@ -269,21 +268,21 @@ ATTR_MAP(GatherV2) = EMPTY_ATTR_MAP;
 OUTPUT_MAP(GatherV2) = {{0, OUTPUT_DESC(y)}};
 
 // ReduceSum
-INPUT_MAP(ReduceSum) = {{1, INPUT_DESC(x)}, {2, INPUT_DESC(axis)}};
+INPUT_MAP(ReduceSum) = {{1, INPUT_DESC(x)}, {2, INPUT_DESC(axes)}};
 ATTR_MAP(ReduceSum) = {{"keep_dims", ATTR_DESC(keep_dims, AnyTraits<bool>())}};
 OUTPUT_MAP(ReduceSum) = {{0, OUTPUT_DESC(y)}};
 
 // ReduceSumD
 INPUT_MAP(ReduceSumD) = {{1, INPUT_DESC(x)}};
 INPUT_ATTR_MAP(ReduceSumD) = {
-  {2, ATTR_DESC(axis, AnyTraits<std::vector<int64_t>>(), AnyTraits<std::vector<int64_t>>())}};
+  {2, ATTR_DESC(axes, AnyTraits<std::vector<int64_t>>(), AnyTraits<std::vector<int64_t>>())}};
 ATTR_MAP(ReduceSumD) = {{"keep_dims", ATTR_DESC(keep_dims, AnyTraits<bool>())}};
 OUTPUT_MAP(ReduceSumD) = {{0, OUTPUT_DESC(y)}};
 
 // ReduceProdD
 INPUT_MAP(ReduceProdD) = {{1, INPUT_DESC(x)}};
 INPUT_ATTR_MAP(ReduceProdD) = {
-  {2, ATTR_DESC(axis, AnyTraits<std::vector<int64_t>>(), AnyTraits<std::vector<int64_t>>())}};
+  {2, ATTR_DESC(axes, AnyTraits<std::vector<int64_t>>(), AnyTraits<std::vector<int64_t>>())}};
 ATTR_MAP(ReduceProdD) = {{"keep_dims", ATTR_DESC(keep_dims, AnyTraits<bool>())}};
 OUTPUT_MAP(ReduceProdD) = {{0, OUTPUT_DESC(y)}};
 
@@ -294,7 +293,7 @@ ATTR_MAP(CumprodD) = {{"exclusive", ATTR_DESC(exclusive, AnyTraits<bool>())},
                       {"reverse", ATTR_DESC(reverse, AnyTraits<bool>())}};
 OUTPUT_MAP(CumprodD) = {{0, OUTPUT_DESC(y)}};
 
-// SoftmaxCrossEntropyWithLogits/
+// SoftmaxCrossEntropyWithLogits
 INPUT_MAP(SoftmaxCrossEntropyWithLogits) = {{1, INPUT_DESC(features)}, {2, INPUT_DESC(labels)}};
 ATTR_MAP(SoftmaxCrossEntropyWithLogits) = EMPTY_ATTR_MAP;
 OUTPUT_MAP(SoftmaxCrossEntropyWithLogits) = {{0, OUTPUT_DESC(loss)}, {1, OUTPUT_DESC(backprop)}};
@@ -306,7 +305,7 @@ INPUT_ATTR_MAP(MeanGrad) = {{2, ATTR_DESC(mean_grad_output_shape_value, kOpForma
 ATTR_MAP(MeanGrad) = {{"mode", ATTR_DESC(mode, AnyTraits<int64_t>())}};
 
 INPUT_MAP(SliceD) = {{1, INPUT_DESC(x)}};
-INPUT_ATTR_MAP(SliceD) = {{2, ATTR_DESC(begin, AnyTraits<int>(), AnyTraits<std::vector<int64_t>>())},
+INPUT_ATTR_MAP(SliceD) = {{2, ATTR_DESC(offsets, AnyTraits<int>(), AnyTraits<std::vector<int64_t>>())},
                           {3, ATTR_DESC(size, AnyTraits<int>(), AnyTraits<std::vector<int64_t>>())}};
 ATTR_MAP(SliceD) = EMPTY_ATTR_MAP;
 OUTPUT_MAP(SliceD) = {{0, OUTPUT_DESC(y)}};
@@ -401,42 +400,10 @@ ATTR_MAP(BoundingBoxDecode) = {
 };
 OUTPUT_MAP(BoundingBoxDecode) = {{0, OUTPUT_DESC(bboxes)}};
 
-#ifdef VALID_CODE
-
-// Less
-INPUT_MAP(Less) = {{1, INPUT_DESC(x)}, {2, INPUT_DESC(y)}};
-ATTR_MAP(Less) = EMPTY_ATTR_MAP;
-OUTPUT_MAP(Less) = {{0, OUTPUT_DESC(z)}};
-
-// Cast
-INPUT_MAP(Cast) = {{1, INPUT_DESC(x)}};
-INPUT_ATTR_MAP(Cast) = {{2, ATTR_DESC(dst_type, AnyTraits<GEType>())}};
-ATTR_MAP(Cast) = {{"Truncate", ATTR_DESC(truncate, AnyTraits<bool>())}};
-OUTPUT_MAP(Cast) = {{0, OUTPUT_DESC(y)}};
-
-// Minimum
-INPUT_MAP(Minimum) = {{1, INPUT_DESC(x)}, {2, INPUT_DESC(y)}};
-ATTR_MAP(Minimum) = {{"alpha", ATTR_DESC(alpha, AnyTraits<float>())}, {"beta", ATTR_DESC(beta, AnyTraits<float>())}};
-OUTPUT_MAP(Minimum) = {{0, OUTPUT_DESC(z)}};
-
-// Sub
-INPUT_MAP(Sub) = {{1, INPUT_DESC(x1)}, {2, INPUT_DESC(x2)}};
-ATTR_MAP(Sub) = {{"alpha", ATTR_DESC(alpha, AnyTraits<float>())}, {"beta", ATTR_DESC(beta, AnyTraits<float>())}};
-
-#endif
-
-// TopKV2
-INPUT_MAP(TopKV2) = {
-  {1, INPUT_DESC(input)},
-  {2, INPUT_DESC(k)},
-};
-
-ATTR_MAP(TopKV2) = {{"T", ATTR_DESC(T, AnyTraits<GEType>())}, {"sorted", ATTR_DESC(sorted, AnyTraits<bool>())}};
-
-OUTPUT_MAP(TopKV2) = {
-  {0, OUTPUT_DESC(values)},
-  {1, OUTPUT_DESC(indices)},
-};
+// TopK
+INPUT_MAP(TopK) = {{1, INPUT_DESC(x)}, {2, INPUT_DESC(k)}};
+ATTR_MAP(TopK) = {{"sorted", ATTR_DESC(sorted, AnyTraits<bool>())}};
+OUTPUT_MAP(TopK) = {{0, OUTPUT_DESC(values)}, {1, OUTPUT_DESC(indices)}};
 
 // Multiply
 INPUT_MAP(Multiply) = {{1, INPUT_DESC(x)}, {2, INPUT_DESC(y)}};
@@ -476,7 +443,7 @@ ATTR_MAP(Iou) = {{"mode", ATTR_DESC(mode, AnyTraits<std::string>())}};
 OUTPUT_MAP(Iou) = {{0, OUTPUT_DESC(overlap)}};
 
 // ResizeNearestNeighborD
-INPUT_MAP(ResizeNearestNeighborD) = {{1, INPUT_DESC(images)}};
+INPUT_MAP(ResizeNearestNeighborD) = {{1, INPUT_DESC(x)}};
 ATTR_MAP(ResizeNearestNeighborD) = {
   {"size", ATTR_DESC(size, AnyTraits<std::vector<int64_t>>(), AnyTraits<std::vector<int64_t>>())},
   {"align_corners", ATTR_DESC(align_corners, AnyTraits<bool>())}};
@@ -506,17 +473,17 @@ ATTR_MAP(Relu6) = EMPTY_ATTR_MAP;
 OUTPUT_MAP(Relu6) = {{0, OUTPUT_DESC(activations)}};
 
 // Relu6Grad
-INPUT_MAP(Relu6Grad) = {{1, INPUT_DESC(dy)}, {2, INPUT_DESC(y)}};
+INPUT_MAP(Relu6Grad) = {{1, INPUT_DESC(features)}, {2, INPUT_DESC(gradients)}};
 ATTR_MAP(Relu6Grad) = EMPTY_ATTR_MAP;
-OUTPUT_MAP(Relu6Grad) = {{0, OUTPUT_DESC(z)}};
+OUTPUT_MAP(Relu6Grad) = {{0, OUTPUT_DESC(backprops)}};
 
 // ResizeBilinearGrad
 INPUT_MAP(ResizeBilinearGrad) = {{1, INPUT_DESC(grads)}, {2, INPUT_DESC(original_image)}};
 ATTR_MAP(ResizeBilinearGrad) = {{"align_corners", ATTR_DESC(align_corners, AnyTraits<bool>())}};
 OUTPUT_MAP(ResizeBilinearGrad) = {{0, OUTPUT_DESC(y)}};
 
-// ResizeBilinear
-INPUT_MAP(ResizeBilinearD) = {{1, INPUT_DESC(images)}};
+// ResizeBilinearD
+INPUT_MAP(ResizeBilinearD) = {{1, INPUT_DESC(x)}};
 ATTR_MAP(ResizeBilinearD) = {
   {"size", ATTR_DESC(size, AnyTraits<std::vector<int64_t>>(), AnyTraits<std::vector<int64_t>>())},
   {"align_corners", ATTR_DESC(align_corners, AnyTraits<bool>())}};
@@ -539,9 +506,9 @@ OUTPUT_MAP(NMSWithMask) = {
   {0, OUTPUT_DESC(selected_boxes)}, {1, OUTPUT_DESC(selected_idx)}, {2, OUTPUT_DESC(selected_mask)}};
 
 // Unpack
-INPUT_MAP(Unpack) = {{1, INPUT_DESC(value)}};
+INPUT_MAP(Unpack) = {{1, INPUT_DESC(x)}};
 ATTR_MAP(Unpack) = {{"axis", ATTR_DESC(axis, AnyTraits<int>())}, {"num", ATTR_DESC(num, AnyTraits<int>())}};
-DYN_OUTPUT_MAP(Unpack) = {{0, DYN_OUTPUT_DESC(output)}};
+DYN_OUTPUT_MAP(Unpack) = {{0, DYN_OUTPUT_DESC(y)}};
 
 // ScatterNdUpdate
 INPUT_MAP(ScatterNdUpdate) = {{1, INPUT_DESC(var)}, {2, INPUT_DESC(indices)}, {3, INPUT_DESC(updates)}};
@@ -574,8 +541,8 @@ INPUT_MAP(SigmoidCrossEntropyWithLogitsGrad) = {
 ATTR_MAP(SigmoidCrossEntropyWithLogitsGrad) = EMPTY_ATTR_MAP;
 OUTPUT_MAP(SigmoidCrossEntropyWithLogitsGrad) = {{0, OUTPUT_DESC(gradient)}};
 
-// ScatterNd
-INPUT_MAP(ScatterNdD) = {{1, INPUT_DESC(indices)}, {2, INPUT_DESC(updates)}};
+// ScatterNdD
+INPUT_MAP(ScatterNdD) = {{1, INPUT_DESC(indices)}, {2, INPUT_DESC(x)}};
 INPUT_ATTR_MAP(ScatterNdD) = {
   {3, ATTR_DESC(shape, AnyTraits<std::vector<int64_t>>(), AnyTraits<std::vector<int64_t>>())}};
 ATTR_MAP(ScatterNdD) = EMPTY_ATTR_MAP;
@@ -587,7 +554,7 @@ ATTR_MAP(PadD) = {{"paddings", ATTR_DESC(paddings, AnyTraits<std::vector<std::ve
 OUTPUT_MAP(PadD) = {{0, OUTPUT_DESC(y)}};
 
 // GatherNd
-INPUT_MAP(GatherNd) = {{1, INPUT_DESC(x1)}, {2, INPUT_DESC(x2)}};
+INPUT_MAP(GatherNd) = {{1, INPUT_DESC(x)}, {2, INPUT_DESC(indices)}};
 ATTR_MAP(GatherNd) = EMPTY_ATTR_MAP;
 OUTPUT_MAP(GatherNd) = {{0, OUTPUT_DESC(y)}};
 
@@ -612,13 +579,13 @@ ATTR_MAP(ROIAlignGrad) = {
 // ArgMaxD
 INPUT_MAP(ArgMaxD) = {{1, INPUT_DESC(x)}};
 ATTR_MAP(ArgMaxD) = {{"axis", ATTR_DESC(dimension, AnyTraits<int>())},
-                     {"output_type", ATTR_DESC(output_type, AnyTraits<GEType>())}};
+                     {"output_type", ATTR_DESC(dtype, AnyTraits<GEType>())}};
 OUTPUT_MAP(ArgMaxD) = {{0, OUTPUT_DESC(y)}};
 
 // ArgMinD
 INPUT_MAP(ArgMinD) = {{1, INPUT_DESC(x)}};
 ATTR_MAP(ArgMinD) = {{"axis", ATTR_DESC(dimension, AnyTraits<int>())},
-                     {"output_type", ATTR_DESC(output_type, AnyTraits<GEType>())}};
+                     {"output_type", ATTR_DESC(dtype, AnyTraits<GEType>())}};
 OUTPUT_MAP(ArgMinD) = {{0, OUTPUT_DESC(y)}};
 
 // ArgMaxWithValue
@@ -634,14 +601,14 @@ ATTR_MAP(ArgMinWithValue) = {{"axis", ATTR_DESC(dimension, AnyTraits<int>())},
 OUTPUT_MAP(ArgMinWithValue) = {{0, OUTPUT_DESC(indice)}, {1, OUTPUT_DESC(values)}};
 
 // ReduceAll
-INPUT_MAP(ReduceAll) = {{1, INPUT_DESC(x)}, {2, INPUT_DESC(axis)}};
+INPUT_MAP(ReduceAll) = {{1, INPUT_DESC(x)}, {2, INPUT_DESC(axes)}};
 ATTR_MAP(ReduceAll) = {{"keep_dims", ATTR_DESC(keep_dims, AnyTraits<bool>())}};
 OUTPUT_MAP(ReduceAll) = {{0, OUTPUT_DESC(y)}};
 
 // ReduceMeanD
 INPUT_MAP(ReduceMeanD) = {{1, INPUT_DESC(x)}};
 INPUT_ATTR_MAP(ReduceMeanD) = {
-  {2, ATTR_DESC(axis, AnyTraits<std::vector<int64_t>>(), AnyTraits<std::vector<int64_t>>())}};
+  {2, ATTR_DESC(axes, AnyTraits<std::vector<int64_t>>(), AnyTraits<std::vector<int64_t>>())}};
 ATTR_MAP(ReduceMeanD) = {{"keep_dims", ATTR_DESC(keep_dims, AnyTraits<bool>())}};
 OUTPUT_MAP(ReduceMeanD) = {{0, OUTPUT_DESC(y)}};
 
@@ -708,11 +675,12 @@ INPUT_MAP(BiasAddGrad) = {{1, INPUT_DESC(x)}};
 ATTR_MAP(BiasAddGrad) = {{"data_format", ATTR_DESC(data_format, AnyTraits<std::string>())}};
 OUTPUT_MAP(BiasAddGrad) = {{0, OUTPUT_DESC(y)}};
 
-// maxpoolgrad
+// MaxPoolGrad
 INPUT_MAP(MaxPoolGrad) = {{1, INPUT_DESC(x1)}, {2, INPUT_DESC(x2)}, {3, INPUT_DESC(grad)}};
 ATTR_MAP(MaxPoolGrad) = {{"ksize", ATTR_DESC(ksize, AnyTraits<int>(), AnyTraits<std::vector<int64_t>>())},
                          {"strides", ATTR_DESC(strides, AnyTraits<int>(), AnyTraits<std::vector<int64_t>>())},
-                         {"padding", ATTR_DESC(padding, AnyTraits<std::string>())}};
+                         {"padding", ATTR_DESC(padding, AnyTraits<std::string>())},
+                         {"data_format", ATTR_DESC(data_format, AnyTraits<std::string>())}};
 OUTPUT_MAP(MaxPoolGrad) = {{0, OUTPUT_DESC(y)}};
 
 // avgpoolgrad
@@ -739,28 +707,34 @@ ATTR_MAP(Conv2D) = {
   {"stride", ATTR_DESC(strides, "pad", AnyTraits<std::vector<int64_t>>())},
   {"pad_list", ATTR_DESC(pads, AnyTraits<std::vector<int64_t>>(), AnyTraits<std::vector<int64_t>>())},
   {"dilation", ATTR_DESC(dilations, "pad", AnyTraits<std::vector<int64_t>>())},
+  {"data_format", ATTR_DESC(data_format, AnyTraits<std::string>())},
+  {"group", ATTR_DESC(groups, AnyTraits<int>())}
 };
 OUTPUT_MAP(Conv2D) = {{0, OUTPUT_DESC(y)}};
 
 // Conv2DBackpropInputD
-INPUT_MAP(Conv2DBackpropInputD) = {{1, INPUT_DESC(out_backprop)}, {2, INPUT_DESC(filters)}};
+INPUT_MAP(Conv2DBackpropInputD) = {{1, INPUT_DESC(out_backprop)}, {2, INPUT_DESC(filter)}};
 INPUT_ATTR_MAP(Conv2DBackpropInputD) = {
-  {3, ATTR_DESC(input_sizes, AnyTraits<std::vector<int64_t>>(), AnyTraits<std::vector<int64_t>>())}};
+  {3, ATTR_DESC(input_size, AnyTraits<std::vector<int64_t>>(), AnyTraits<std::vector<int64_t>>())}};
 ATTR_MAP(Conv2DBackpropInputD) = {
   {"pad_list", ATTR_DESC(pads, AnyTraits<std::vector<int64_t>>(), AnyTraits<std::vector<int64_t>>())},
-  {"stride", ATTR_DESC(strides, "strides", AnyTraits<std::vector<int64_t>>())},
+  {"stride", ATTR_DESC(strides, "pad", AnyTraits<std::vector<int64_t>>())},
   {"dilation", ATTR_DESC(dilations, "pad", AnyTraits<std::vector<int64_t>>())},
+  {"data_format", ATTR_DESC(data_format, AnyTraits<std::string>())},
+  {"group", ATTR_DESC(groups, AnyTraits<int>())}
 };
 OUTPUT_MAP(Conv2DBackpropInputD) = {{0, OUTPUT_DESC(y)}};
 
 // Conv2DBackpropFilterD
 INPUT_MAP(Conv2DBackpropFilterD) = {{1, INPUT_DESC(out_backprop)}, {2, INPUT_DESC(x)}};
 INPUT_ATTR_MAP(Conv2DBackpropFilterD) = {
-  {3, ATTR_DESC(filter_sizes, AnyTraits<std::vector<int64_t>>(), AnyTraits<std::vector<int64_t>>())}};
+  {3, ATTR_DESC(filter_size, AnyTraits<std::vector<int64_t>>(), AnyTraits<std::vector<int64_t>>())}};
 ATTR_MAP(Conv2DBackpropFilterD) = {
   {"pad_list", ATTR_DESC(pads, AnyTraits<std::vector<int64_t>>(), AnyTraits<std::vector<int64_t>>())},
-  {"stride", ATTR_DESC(strides, "strides", AnyTraits<std::vector<int64_t>>())},
+  {"stride", ATTR_DESC(strides, "pad", AnyTraits<std::vector<int64_t>>())},
   {"dilation", ATTR_DESC(dilations, "pad", AnyTraits<std::vector<int64_t>>())},
+  {"data_format", ATTR_DESC(data_format, AnyTraits<std::string>())},
+  {"group", ATTR_DESC(groups, AnyTraits<int>())}
 };
 OUTPUT_MAP(Conv2DBackpropFilterD) = {{0, OUTPUT_DESC(y)}};
 
@@ -798,8 +772,8 @@ OUTPUT_MAP(DepthwiseConv2DBackpropFilterD) = {{0, OUTPUT_DESC(filter_grad)}};
 
 // MatMul
 INPUT_MAP(MatMul) = {{1, INPUT_DESC(x1)}, {2, INPUT_DESC(x2)}};
-ATTR_MAP(MatMul) = {{"transpose_a", ATTR_DESC(transpose_a, AnyTraits<bool>())},
-                    {"transpose_b", ATTR_DESC(transpose_b, AnyTraits<bool>())}};
+ATTR_MAP(MatMul) = {{"transpose_a", ATTR_DESC(transpose_x1, AnyTraits<bool>())},
+                    {"transpose_b", ATTR_DESC(transpose_x2, AnyTraits<bool>())}};
 OUTPUT_MAP(MatMul) = {{0, OUTPUT_DESC(y)}};
 
 // Merge
@@ -846,10 +820,10 @@ ATTR_MAP(Sub) = EMPTY_ATTR_MAP;
 OUTPUT_MAP(Sub) = {{0, OUTPUT_DESC(y)}};
 
 // SplitD
-INPUT_MAP(SplitD) = {{1, INPUT_DESC(value)}};
+INPUT_MAP(SplitD) = {{1, INPUT_DESC(x)}};
 ATTR_MAP(SplitD) = {{"axis", ATTR_DESC(split_dim, AnyTraits<int>())},
                     {"output_num", ATTR_DESC(num_split, AnyTraits<int>())}};
-DYN_OUTPUT_MAP(SplitD) = {{0, DYN_OUTPUT_DESC(output)}};
+DYN_OUTPUT_MAP(SplitD) = {{0, DYN_OUTPUT_DESC(y)}};
 
 // Neg
 INPUT_MAP(Neg) = {{1, INPUT_DESC(x)}};
@@ -876,12 +850,12 @@ OUTPUT_MAP(Pack) = {{0, OUTPUT_DESC(y)}};
 
 // ConcatD
 INPUT_MAP(ConcatD) = EMPTY_INPUT_MAP;
-DYN_INPUT_MAP(ConcatD) = {{1, DYN_INPUT_DESC(input_values)}};
+DYN_INPUT_MAP(ConcatD) = {{1, DYN_INPUT_DESC(x)}};
 ATTR_MAP(ConcatD) = {
   {"axis", ATTR_DESC(concat_dim, AnyTraits<int>())},
   {"inputNums", ATTR_DESC(N, AnyTraits<int>())},
 };
-OUTPUT_MAP(ConcatD) = {{0, OUTPUT_DESC(output_data)}};
+OUTPUT_MAP(ConcatD) = {{0, OUTPUT_DESC(y)}};
 
 // Less
 INPUT_MAP(Less) = {{1, INPUT_DESC(x1)}, {2, INPUT_DESC(x2)}};
@@ -916,14 +890,14 @@ OUTPUT_MAP(TanhGrad) = {{0, OUTPUT_DESC(z)}};
 // ReduceMinD
 INPUT_MAP(ReduceMinD) = {{1, INPUT_DESC(x)}};
 INPUT_ATTR_MAP(ReduceMinD) = {
-  {2, ATTR_DESC(axis, AnyTraits<std::vector<int64_t>>(), AnyTraits<std::vector<int64_t>>())}};
+  {2, ATTR_DESC(axes, AnyTraits<std::vector<int64_t>>(), AnyTraits<std::vector<int64_t>>())}};
 ATTR_MAP(ReduceMinD) = {{"keep_dims", ATTR_DESC(keep_dims, AnyTraits<bool>())}};
 OUTPUT_MAP(ReduceMinD) = {{0, OUTPUT_DESC(y)}};
 
 // ReduceMaxD
 INPUT_MAP(ReduceMaxD) = {{1, INPUT_DESC(x)}};
 INPUT_ATTR_MAP(ReduceMaxD) = {
-  {2, ATTR_DESC(axis, AnyTraits<std::vector<int64_t>>(), AnyTraits<std::vector<int64_t>>())}};
+  {2, ATTR_DESC(axes, AnyTraits<std::vector<int64_t>>(), AnyTraits<std::vector<int64_t>>())}};
 ATTR_MAP(ReduceMaxD) = {{"keep_dims", ATTR_DESC(keep_dims, AnyTraits<bool>())}};
 OUTPUT_MAP(ReduceMaxD) = {{0, OUTPUT_DESC(y)}};
 
@@ -1008,11 +982,11 @@ INPUT_MAP(LessEqual) = {{1, INPUT_DESC(x1)}, {2, INPUT_DESC(x2)}};
 ATTR_MAP(LessEqual) = EMPTY_ATTR_MAP;
 OUTPUT_MAP(LessEqual) = {{0, OUTPUT_DESC(y)}};
 
-// LogSoftmax
-INPUT_MAP(LogSoftmax) = {{1, INPUT_DESC(logits)}};
-ATTR_MAP(LogSoftmax) = {
-  {"axis", ATTR_DESC(axis, AnyTraits<std::vector<int64_t>>(), AnyTraits<std::vector<int64_t>>())}};
-OUTPUT_MAP(LogSoftmax) = {{0, OUTPUT_DESC(logsoftmax)}};
+// LogSoftmaxV2
+INPUT_MAP(LogSoftmaxV2) = {{1, INPUT_DESC(logits)}};
+ATTR_MAP(LogSoftmaxV2) = {
+  {"axis", ATTR_DESC(axes, AnyTraits<std::vector<int64_t>>(), AnyTraits<std::vector<int64_t>>())}};
+OUTPUT_MAP(LogSoftmaxV2) = {{0, OUTPUT_DESC(logsoftmax)}};
 
 // RandomChoiceWithMask
 INPUT_MAP(RandomChoiceWithMask) = {{1, INPUT_DESC(x)}};
@@ -1094,8 +1068,8 @@ OUTPUT_MAP(LayerNormGrad) = {{0, OUTPUT_DESC(pd_x)}, {1, OUTPUT_DESC(pd_gamma)},
 
 // BatchMatMul
 INPUT_MAP(BatchMatMul) = {{1, INPUT_DESC(x1)}, {2, INPUT_DESC(x2)}};
-ATTR_MAP(BatchMatMul) = {{"transpose_x1", ATTR_DESC(adj_x, AnyTraits<bool>())},
-                         {"transpose_x2", ATTR_DESC(adj_y, AnyTraits<bool>())}};
+ATTR_MAP(BatchMatMul) = {{"transpose_x1", ATTR_DESC(adj_x1, AnyTraits<bool>())},
+                         {"transpose_x2", ATTR_DESC(adj_x2, AnyTraits<bool>())}};
 OUTPUT_MAP(BatchMatMul) = {{0, OUTPUT_DESC(y)}};
 
 // DropoutDoMask
@@ -1146,6 +1120,19 @@ ATTR_MAP(SparseApplyAdagradD) = {{"lr", ATTR_DESC(lr, AnyTraits<float>())},
                                  {"use_locking", ATTR_DESC(use_locking, AnyTraits<bool>())}};
 OUTPUT_MAP(SparseApplyAdagradD) = {{0, OUTPUT_DESC(var)}};
 
+// SparseApplyFtrlD
+INPUT_MAP(SparseApplyFtrlD) = {{1, INPUT_DESC(var)},
+                               {2, INPUT_DESC(accum)},
+                               {3, INPUT_DESC(linear)},
+                               {4, INPUT_DESC(grad)},
+                               {5, INPUT_DESC(indices)}};
+ATTR_MAP(SparseApplyFtrlD) = {{"use_locking", ATTR_DESC(use_locking, AnyTraits<bool>())},
+                              {"lr", ATTR_DESC(lr, AnyTraits<float>())},
+                              {"l1", ATTR_DESC(l1, AnyTraits<float>())},
+                              {"l2", ATTR_DESC(l2, AnyTraits<float>())},
+                              {"lr_power", ATTR_DESC(lr_power, AnyTraits<float>())}};
+OUTPUT_MAP(SparseApplyFtrlD) = {{0, OUTPUT_DESC(var)}};
+
 // SpaceToDepth
 INPUT_MAP(SpaceToDepth) = {{1, INPUT_DESC(x)}};
 ATTR_MAP(SpaceToDepth) = {{"block_size", ATTR_DESC(block_size, AnyTraits<int64_t>())}};
diff --git a/mindspore/ccsrc/transform/op_declare.h b/mindspore/ccsrc/transform/op_declare.h
index 03463b978f..9e4f407ebb 100755
--- a/mindspore/ccsrc/transform/op_declare.h
+++ b/mindspore/ccsrc/transform/op_declare.h
@@ -209,8 +209,8 @@ DECLARE_OP_USE_OUTPUT(Merge)
 DECLARE_OP_ADAPTER(Switch)
 DECLARE_OP_USE_OUTPUT(Switch)
 
-DECLARE_OP_ADAPTER(TopKV2)
-DECLARE_OP_USE_OUTPUT(TopKV2)
+DECLARE_OP_ADAPTER(TopK)
+DECLARE_OP_USE_OUTPUT(TopK)
 
 DECLARE_OP_ADAPTER(RealDiv)
 DECLARE_OP_USE_OUTPUT(RealDiv)
@@ -260,8 +260,8 @@ DECLARE_OP_ADAPTER(Select)
 DECLARE_OP_USE_OUTPUT(Select)
 DECLARE_OP_ADAPTER(LessEqual)
 DECLARE_OP_USE_OUTPUT(LessEqual)
-DECLARE_OP_ADAPTER(LogSoftmax)
-DECLARE_OP_USE_OUTPUT(LogSoftmax)
+DECLARE_OP_ADAPTER(LogSoftmaxV2)
+DECLARE_OP_USE_OUTPUT(LogSoftmaxV2)
 DECLARE_OP_ADAPTER(TruncatedNormal)
 DECLARE_OP_USE_OUTPUT(TruncatedNormal)
 DECLARE_OP_ADAPTER(StridedSliceGrad)
@@ -391,8 +391,8 @@ DECLARE_OP_ADAPTER(Sigmoid)
 DECLARE_OP_USE_OUTPUT(Sigmoid)
 DECLARE_OP_ADAPTER(SigmoidGrad)
 DECLARE_OP_USE_OUTPUT(SigmoidGrad)
-DECLARE_OP_ADAPTER(Softmax)
-DECLARE_OP_USE_OUTPUT(Softmax)
+DECLARE_OP_ADAPTER(SoftmaxV2)
+DECLARE_OP_USE_OUTPUT(SoftmaxV2)
 DECLARE_OP_ADAPTER(SoftmaxGrad)
 DECLARE_OP_USE_OUTPUT(SoftmaxGrad)
 DECLARE_OP_ADAPTER(Greater)
@@ -435,6 +435,8 @@ DECLARE_OP_ADAPTER(Round)
 DECLARE_OP_USE_OUTPUT(Round)
 DECLARE_OP_ADAPTER(ApplyFtrl)
 DECLARE_OP_USE_OUTPUT(ApplyFtrl)
+DECLARE_OP_ADAPTER(SparseApplyFtrlD)
+DECLARE_OP_USE_OUTPUT(SparseApplyFtrlD)
 #ifdef ENABLE_GE
 DECLARE_OP_ADAPTER(Print)
 DECLARE_OP_USE_DYN_INPUT(Print)
diff --git a/mindspore/ccsrc/transform/util.cc b/mindspore/ccsrc/transform/util.cc
index a106a20ad8..0a18763d12 100644
--- a/mindspore/ccsrc/transform/util.cc
+++ b/mindspore/ccsrc/transform/util.cc
@@ -361,12 +361,11 @@ MeTensorPtr TransformUtil::GenerateMeTensor(const GeTensorPtr& ge_tensor, const
     MS_LOG(ERROR) << "GE tensor data size is zero!";
     return nullptr;
   }
-  errno_t ret = memcpy_s(me_data_ptr, me_data_size, ge_tensor->GetData(), ge_tensor->GetSize());
-  if (ret != EOK) {
-    MS_LOG(INFO) << "GE tensor data size is " << ge_tensor->GetSize() << " bytes";
-    MS_LOG(ERROR) << "Copy GE tensor data to me tensor failed";
-    return nullptr;
-  }
+
+  // Use memcpy rather than memcpy_s because the ge_tensor size may exceed 2GB, the size limit of memcpy_s.
+  // NOTE(review): this copy is no longer bounded by me_data_size; confirm it is >= ge_tensor->GetSize().
+  memcpy(me_data_ptr, ge_tensor->GetData(), ge_tensor->GetSize());
+
   return make_shared<MeTensor>(me_tensor);
 }
 
diff --git a/mindspore/ccsrc/utils/context/ms_context.cc b/mindspore/ccsrc/utils/context/ms_context.cc
index bf05af9858..e9b4586b21 100644
--- a/mindspore/ccsrc/utils/context/ms_context.cc
+++ b/mindspore/ccsrc/utils/context/ms_context.cc
@@ -355,7 +355,9 @@ void MsContext::GetGeOptions(std::map<std::string, std::string>* ge_options) con
     MS_LOG(ERROR) << "Set proto lib path failed!";
   }
 
-  // Disbale the global variable acc, only enable it whlie adding training graph in pipeline
+  // Enable auto mixed precision according to the context options
+  (*ge_options)["ge.exec.auto_mix_precision"] = std::to_string(auto_mixed_precision_flag_);
+  // Disable the global variable acc, only enable it while adding training graph in pipeline
   (*ge_options)["ge.exec.variable_acc"] = "0";
 #endif
 }
diff --git a/mindspore/ops/operations/__init__.py b/mindspore/ops/operations/__init__.py
index a75b078df8..77bb6d0ff3 100644
--- a/mindspore/ops/operations/__init__.py
+++ b/mindspore/ops/operations/__init__.py
@@ -65,7 +65,7 @@ from .nn_ops import (LSTM, SGD, Adam, ApplyMomentum, BatchNorm,
                      SmoothL1Loss, Softmax,
                      SoftmaxCrossEntropyWithLogits, ROIAlign,
                      SparseSoftmaxCrossEntropyWithLogits, Tanh,
-                     TopK, BinaryCrossEntropy, SparseApplyAdagrad, LARSUpdate, ApplyFtrl)
+                     TopK, BinaryCrossEntropy, SparseApplyAdagrad, LARSUpdate, ApplyFtrl, SparseApplyFtrlD)
 from .other_ops import Assign, IOU, BoundingBoxDecode, BoundingBoxEncode, CheckValid, MakeRefKey
 
 
@@ -217,6 +217,7 @@ __all__ = [
     "Abs",
     "BinaryCrossEntropy",
     "SparseApplyAdagrad",
+    "SparseApplyFtrlD",
     "SpaceToDepth",
     "DepthToSpace",
     "Conv2DBackpropInput",
diff --git a/mindspore/ops/operations/nn_ops.py b/mindspore/ops/operations/nn_ops.py
index afa4c7dfe3..57e409b44f 100644
--- a/mindspore/ops/operations/nn_ops.py
+++ b/mindspore/ops/operations/nn_ops.py
@@ -2141,6 +2141,79 @@ class SparseApplyAdagrad(PrimitiveWithInfer):
         return var_type
 
 
+class SparseApplyFtrlD(PrimitiveWithInfer):
+    r"""
+    Updates parameters according to the FTRL optimization algorithm.
+
+    .. math ::
+            \text{accum} = \text{grad} * \text{grad}
+
+    .. math ::
+            \text{linear} += \text{grad} + (\text{accum} ^ {\text{-lr_power}} -
+            \frac{\text{accum} ^ \text{-lr_power}}{\text{lr}} * \text{var})
+
+    .. math ::
+            \text{quadratic} = \frac{1.0}{\text{accum}^{\text{lr_power}} * \text{lr}} + 2*\text{l2}
+
+    .. math ::
+            \text{var} = \frac{\text{sign}(\text{linear}) * \text{l1} - \text{linear}}{\text{quadratic}}
+            \ \text{if} \ \vert \text{linear} \vert > \text{l1} \ \text{else} \ 0.0
+
+    Args:
+        lr (float): Learning rate.
+        l1 (float): L1 regularization strength.
+        l2 (float): L2 regularization strength.
+        lr_power (float): Exponent applied to `accum` when scaling the learning rate.
+        use_locking (bool): If true, updating the var and accum tensors will be protected. Default: False.
+
+    Inputs:
+       - **var** (Tensor) - Variable to be updated. The type must be float32.
+       - **accum** (Tensor) - Accum to be updated. The shape must be the same as `var`'s shape,
+         the type must be float32.
+       - **linear** (Tensor) - Linear to be updated. The shape must be the same as `var`'s shape,
+         the type must be float32.
+       - **grad** (Tensor) - Gradient. The shape must be the same as `var`'s shape,
+         the type must be float32.
+       - **indices** (Tensor) - A vector of indices into the first dimension of 'var' and 'accum',
+         the shape of `indices` must be the same as `grad` in first dimension, the type must be int32.
+
+    Output:
+        Tensor, has the same shape and type as `var`.
+
+    """
+
+    @prim_attr_register
+    def __init__(self, lr, l1, l2, lr_power, use_locking=False):
+        """init SparseApplyFtrlD"""
+        self.lr = validator.check_type("lr", lr, [float])
+        self.l1 = validator.check_type("l1", l1, [float])
+        self.l2 = validator.check_type("l2", l2, [float])
+        self.lr_power = validator.check_type("lr_power", lr_power, [float])
+        self.use_locking = validator.check_type("use_locking", use_locking, [bool])
+
+    def infer_shape(self, var_shape, accum_shape, linear_shape, grad_shape, indices_shape):
+        validator.check_param_equal('var shape', var_shape, 'accum shape', accum_shape)
+        validator.check_param_equal('len of var shape', len(var_shape), 'len of grad shape', len(grad_shape))
+        validator.check_param_equal('len of var shape', len(var_shape), 'len of linear shape', len(linear_shape))
+        if len(var_shape) > 1:
+            validator.check_param_equal('var_shape', var_shape[1:], 'grad_shape', grad_shape[1:])
+            validator.check_param_equal('var_shape', var_shape[1:], 'linear_shape', linear_shape[1:])
+        validator.check_integer("len of indices shape", len(indices_shape), 1, Rel.EQ)
+        validator.check('the first dimension of grad', grad_shape[0],
+                        'the shape of indices', indices_shape[0], Rel.EQ)
+
+        return var_shape
+
+    def infer_dtype(self, var_type, accum_type, linear_type, grad_type, indices_type):
+        validator.check_subclass("var_type", var_type, mstype.tensor)
+        validator.check_subclass("accum_type", accum_type, mstype.tensor)
+        validator.check_subclass("linear_type", linear_type, mstype.tensor)
+        validator.check_subclass("grad_type", grad_type, mstype.tensor)
+        validator.check_subclass("indices_type", indices_type, mstype.tensor)
+
+        return var_type
+
+
 class LARSUpdate(PrimitiveWithInfer):
     """
     Conduct lars (layer-wise adaptive rate scaling) update on the square sum of gradient.
@@ -2244,4 +2317,4 @@ class ApplyFtrl(PrimitiveWithInfer):
         validator.check_typename("l1", l1_type,[mstype.float16, mstype.float32])
         validator.check_typename("l2", l2_type,[mstype.float16, mstype.float32])
         validator.check_typename("lr_power", lr_power_type,[mstype.float16, mstype.float32])
-        return var_type
\ No newline at end of file
+        return var_type
diff --git a/tests/ut/python/ops/test_ops.py b/tests/ut/python/ops/test_ops.py
index bfe8075972..8d7dd95072 100755
--- a/tests/ut/python/ops/test_ops.py
+++ b/tests/ut/python/ops/test_ops.py
@@ -749,6 +749,11 @@ test_case_nn_ops = [
         'desc_inputs': [[3, 3], [3, 3], [3, 3], Tensor(np.ones((3,), np.int32))],
         'desc_bprop': [3, 3],
         'skip': ['backward']}),
+    ('SparseApplyFtrlD', {
+        'block': P.SparseApplyFtrlD(0.1, 0.1, 0.1, -0.1),
+        'desc_inputs': [[3, 3], [3, 3], [3, 3], [3, 3], Tensor(2*np.ones((3,), np.int32))],
+        'desc_bprop': [3, 3],
+        'skip': ['backward']}),
     ('Flatten_1', {
         'block': NetForFlatten(),
         'desc_inputs': [Tensor(np.ones([2, 3, 4]).astype(np.int32)), Tensor(np.ones([2, 12]).astype(np.int32))],

From 0a977aa19dc216b43ddc8adc11490bb00d001c3e Mon Sep 17 00:00:00 2001
From: simson <526422051@qq.com>
Date: Wed, 6 May 2020 18:48:30 +0800
Subject: [PATCH 04/13] revert the limitation of end learning rate

---
 mindspore/nn/optim/lamb.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mindspore/nn/optim/lamb.py b/mindspore/nn/optim/lamb.py
index e026b1c560..97a81a590b 100755
--- a/mindspore/nn/optim/lamb.py
+++ b/mindspore/nn/optim/lamb.py
@@ -114,7 +114,7 @@ def _check_param_value(decay_steps, warmup_steps, start_learning_rate,
     _ = warmup_steps
     validator.check_float_positive('start_learning_rate', start_learning_rate, prim_name)
     validator.check_float_legal_value('start_learning_rate', start_learning_rate, prim_name)
-    validator.check_float_positive('end_learning_rate', end_learning_rate, prim_name)
+    validator.check_value_type("end_learning_rate", end_learning_rate, [float], prim_name)
     validator.check_float_legal_value('end_learning_rate', end_learning_rate, prim_name)
     validator.check_float_positive('power', power, prim_name)
     validator.check_float_legal_value('power', power, prim_name)

From 187568a833b1c7478c7526058588f2b9e755b76a Mon Sep 17 00:00:00 2001
From: zhaozhenlong <zhaozhenlong1@huawei.com>
Date: Wed, 6 May 2020 21:20:32 +0800
Subject: [PATCH 05/13] adapt assign assignAdd relu6

adapt ResizeNearestNeighbourV2 with grad and ApplyAdam
---
 mindspore/ccsrc/kernel/tbe/tbe_adapter.cc |  6 ++---
 mindspore/ops/_op_impl/tbe/assign.py      | 30 ++++++++++++++++++++---
 mindspore/ops/_op_impl/tbe/assign_add.py  | 12 +++++++++
 mindspore/ops/_op_impl/tbe/relu6.py       |  4 +--
 4 files changed, 43 insertions(+), 9 deletions(-)

diff --git a/mindspore/ccsrc/kernel/tbe/tbe_adapter.cc b/mindspore/ccsrc/kernel/tbe/tbe_adapter.cc
index 8ce5504b8e..005c290aba 100644
--- a/mindspore/ccsrc/kernel/tbe/tbe_adapter.cc
+++ b/mindspore/ccsrc/kernel/tbe/tbe_adapter.cc
@@ -72,10 +72,10 @@ static std::map<string, string> tbe_func_adapter_map = {
   {"lamb_next_mv_with_decay_v1", "lamb_next_m_v_with_decay_v1"},
   {"lamb_next_mv", "lamb_next_m_v"},
   {"split", "split_d"},
-  {"resize_nearest_neighbor", "resize_nearest_neighbor_d"},
-  {"resize_nearest_neighbor_grad", "resize_nearest_neighbor_grad_d"},
+  {"resize_nearest_neighbor", "resize_nearest_neighbor_v2_d"},
+  {"resize_nearest_neighbor_grad", "resize_nearest_neighbor_v2_grad_d"},
   {"pad", "pad_d"},
-  {"adam", "apply_adam"}};
+  {"adam", "apply_adam_d"}};
 
 void TbeAdapter::NormalizeFuncName(std::string *func_name) {
   if (func_name == nullptr) {
diff --git a/mindspore/ops/_op_impl/tbe/assign.py b/mindspore/ops/_op_impl/tbe/assign.py
index 2fbd152c78..ff673a03c4 100644
--- a/mindspore/ops/_op_impl/tbe/assign.py
+++ b/mindspore/ops/_op_impl/tbe/assign.py
@@ -23,31 +23,53 @@ assign_op_info = TBERegOp("Assign") \
     .compute_cost(10) \
     .kernel_name("assign") \
     .partial_flag(True) \
-    .input(0, "resource", False, "required", "all") \
+    .input(0, "ref", False, "required", "all") \
     .input(1, "value", False, "required", "all") \
-    .output(0, "y", False, "required", "all") \
-    .dtype_format(DataType.I8_Default, DataType.I8_Default, DataType.I8_Default) \
+    .output(0, "ref", False, "required", "all") \
     .dtype_format(DataType.BOOL_Default, DataType.BOOL_Default, DataType.BOOL_Default) \
+    .dtype_format(DataType.BOOL_5HD, DataType.BOOL_5HD, DataType.BOOL_5HD) \
+    .dtype_format(DataType.BOOL_C1HWNCoC0, DataType.BOOL_C1HWNCoC0, DataType.BOOL_C1HWNCoC0) \
+    .dtype_format(DataType.BOOL_FracZ, DataType.BOOL_FracZ, DataType.BOOL_FracZ) \
+    .dtype_format(DataType.I8_Default, DataType.I8_Default, DataType.I8_Default) \
     .dtype_format(DataType.I8_5HD, DataType.I8_5HD, DataType.I8_5HD) \
+    .dtype_format(DataType.I8_C1HWNCoC0, DataType.I8_C1HWNCoC0, DataType.I8_C1HWNCoC0) \
+    .dtype_format(DataType.I8_FracZ, DataType.I8_FracZ, DataType.I8_FracZ) \
     .dtype_format(DataType.U8_Default, DataType.U8_Default, DataType.U8_Default) \
     .dtype_format(DataType.U8_5HD, DataType.U8_5HD, DataType.U8_5HD) \
+    .dtype_format(DataType.U8_C1HWNCoC0, DataType.U8_C1HWNCoC0, DataType.U8_C1HWNCoC0) \
+    .dtype_format(DataType.U8_FracZ, DataType.U8_FracZ, DataType.U8_FracZ) \
     .dtype_format(DataType.I16_Default, DataType.I16_Default, DataType.I16_Default) \
     .dtype_format(DataType.I16_5HD, DataType.I16_5HD, DataType.I16_5HD) \
+    .dtype_format(DataType.I16_C1HWNCoC0, DataType.I16_C1HWNCoC0, DataType.I16_C1HWNCoC0) \
+    .dtype_format(DataType.I16_FracZ, DataType.I16_FracZ, DataType.I16_FracZ) \
     .dtype_format(DataType.U16_Default, DataType.U16_Default, DataType.U16_Default) \
     .dtype_format(DataType.U16_5HD, DataType.U16_5HD, DataType.U16_5HD) \
+    .dtype_format(DataType.U16_C1HWNCoC0, DataType.U16_C1HWNCoC0, DataType.U16_C1HWNCoC0) \
+    .dtype_format(DataType.U16_FracZ, DataType.U16_FracZ, DataType.U16_FracZ) \
     .dtype_format(DataType.I32_Default, DataType.I32_Default, DataType.I32_Default) \
     .dtype_format(DataType.I32_5HD, DataType.I32_5HD, DataType.I32_5HD) \
+    .dtype_format(DataType.I32_C1HWNCoC0, DataType.I32_C1HWNCoC0, DataType.I32_C1HWNCoC0) \
+    .dtype_format(DataType.I32_FracZ, DataType.I32_FracZ, DataType.I32_FracZ) \
     .dtype_format(DataType.U32_Default, DataType.U32_Default, DataType.U32_Default) \
     .dtype_format(DataType.U32_5HD, DataType.U32_5HD, DataType.U32_5HD) \
+    .dtype_format(DataType.U32_C1HWNCoC0, DataType.U32_C1HWNCoC0, DataType.U32_C1HWNCoC0) \
+    .dtype_format(DataType.U32_FracZ, DataType.U32_FracZ, DataType.U32_FracZ) \
     .dtype_format(DataType.I64_Default, DataType.I64_Default, DataType.I64_Default) \
     .dtype_format(DataType.I64_5HD, DataType.I64_5HD, DataType.I64_5HD) \
+    .dtype_format(DataType.I64_C1HWNCoC0, DataType.I64_C1HWNCoC0, DataType.I64_C1HWNCoC0) \
+    .dtype_format(DataType.I64_FracZ, DataType.I64_FracZ, DataType.I64_FracZ) \
     .dtype_format(DataType.U64_Default, DataType.U64_Default, DataType.U64_Default) \
     .dtype_format(DataType.U64_5HD, DataType.U64_5HD, DataType.U64_5HD) \
+    .dtype_format(DataType.U64_C1HWNCoC0, DataType.U64_C1HWNCoC0, DataType.U64_C1HWNCoC0) \
+    .dtype_format(DataType.U64_FracZ, DataType.U64_FracZ, DataType.U64_FracZ) \
     .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.F16_Default) \
     .dtype_format(DataType.F16_5HD, DataType.F16_5HD, DataType.F16_5HD) \
+    .dtype_format(DataType.F16_C1HWNCoC0, DataType.F16_C1HWNCoC0, DataType.F16_C1HWNCoC0) \
+    .dtype_format(DataType.F16_FracZ, DataType.F16_FracZ, DataType.F16_FracZ) \
     .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default) \
     .dtype_format(DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD) \
-    .dtype_format(DataType.F32_FracNZ, DataType.F32_FracNZ, DataType.F32_FracNZ) \
+    .dtype_format(DataType.F32_C1HWNCoC0, DataType.F32_C1HWNCoC0, DataType.F32_C1HWNCoC0) \
+    .dtype_format(DataType.F32_FracZ, DataType.F32_FracZ, DataType.F32_FracZ) \
     .get_op_info()
 
 
diff --git a/mindspore/ops/_op_impl/tbe/assign_add.py b/mindspore/ops/_op_impl/tbe/assign_add.py
index 2b20a7781d..7ad23ff3bc 100644
--- a/mindspore/ops/_op_impl/tbe/assign_add.py
+++ b/mindspore/ops/_op_impl/tbe/assign_add.py
@@ -28,16 +28,28 @@ assign_add_op_info = TBERegOp("AssignAdd") \
     .output(0, "ref", False, "required", "all") \
     .dtype_format(DataType.I8_Default, DataType.I8_Default, DataType.I8_Default) \
     .dtype_format(DataType.I8_5HD, DataType.I8_5HD, DataType.I8_5HD) \
+    .dtype_format(DataType.I8_C1HWNCoC0, DataType.I8_C1HWNCoC0, DataType.I8_C1HWNCoC0) \
+    .dtype_format(DataType.I8_FracZ, DataType.I8_FracZ, DataType.I8_FracZ) \
     .dtype_format(DataType.U8_Default, DataType.U8_Default, DataType.U8_Default) \
     .dtype_format(DataType.U8_5HD, DataType.U8_5HD, DataType.U8_5HD) \
+    .dtype_format(DataType.U8_C1HWNCoC0, DataType.U8_C1HWNCoC0, DataType.U8_C1HWNCoC0) \
+    .dtype_format(DataType.U8_FracZ, DataType.U8_FracZ, DataType.U8_FracZ) \
     .dtype_format(DataType.I32_Default, DataType.I32_Default, DataType.I32_Default) \
     .dtype_format(DataType.I32_5HD, DataType.I32_5HD, DataType.I32_5HD) \
+    .dtype_format(DataType.I32_C1HWNCoC0, DataType.I32_C1HWNCoC0, DataType.I32_C1HWNCoC0) \
+    .dtype_format(DataType.I32_FracZ, DataType.I32_FracZ, DataType.I32_FracZ) \
     .dtype_format(DataType.I64_Default, DataType.I64_Default, DataType.I64_Default) \
     .dtype_format(DataType.I64_5HD, DataType.I64_5HD, DataType.I64_5HD) \
+    .dtype_format(DataType.I64_C1HWNCoC0, DataType.I64_C1HWNCoC0, DataType.I64_C1HWNCoC0) \
+    .dtype_format(DataType.I64_FracZ, DataType.I64_FracZ, DataType.I64_FracZ) \
     .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.F16_Default) \
     .dtype_format(DataType.F16_5HD, DataType.F16_5HD, DataType.F16_5HD) \
+    .dtype_format(DataType.F16_C1HWNCoC0, DataType.F16_C1HWNCoC0, DataType.F16_C1HWNCoC0) \
+    .dtype_format(DataType.F16_FracZ, DataType.F16_FracZ, DataType.F16_FracZ) \
     .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default) \
     .dtype_format(DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD) \
+    .dtype_format(DataType.F32_C1HWNCoC0, DataType.F32_C1HWNCoC0, DataType.F32_C1HWNCoC0) \
+    .dtype_format(DataType.F32_FracZ, DataType.F32_FracZ, DataType.F32_FracZ) \
     .get_op_info()
 
 
diff --git a/mindspore/ops/_op_impl/tbe/relu6.py b/mindspore/ops/_op_impl/tbe/relu6.py
index bbedfdeb0f..d9bd7f9f8e 100644
--- a/mindspore/ops/_op_impl/tbe/relu6.py
+++ b/mindspore/ops/_op_impl/tbe/relu6.py
@@ -23,8 +23,8 @@ relu6_op_info = TBERegOp("ReLU6") \
     .compute_cost(10) \
     .kernel_name("relu6") \
     .partial_flag(True) \
-    .input(0, "features", False, "required", "all") \
-    .output(0, "activations", False, "required", "all") \
+    .input(0, "x", False, "required", "all") \
+    .output(0, "y", False, "required", "all") \
     .dtype_format(DataType.F16_Default, DataType.F16_Default) \
     .dtype_format(DataType.F16_5HD, DataType.F16_5HD) \
     .dtype_format(DataType.F32_Default, DataType.F32_Default) \

From 263d82edc5660ca24d370cf7e575b69f64f49f2e Mon Sep 17 00:00:00 2001
From: zhoufeng <zhoufeng54@huawei.com>
Date: Thu, 7 May 2020 14:46:03 +0800
Subject: [PATCH 06/13] me-ge link hccl

Signed-off-by: zhoufeng <zhoufeng54@huawei.com>
---
 mindspore/ccsrc/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mindspore/ccsrc/CMakeLists.txt b/mindspore/ccsrc/CMakeLists.txt
index 4c6ceb38e1..8d3818a777 100644
--- a/mindspore/ccsrc/CMakeLists.txt
+++ b/mindspore/ccsrc/CMakeLists.txt
@@ -125,7 +125,7 @@ endif()
 
 if (ENABLE_GE)
     if(ENABLE_TRAIN)
-        target_link_libraries(mindspore ge_client_train)
+        target_link_libraries(mindspore ge_client_train hccl)
     else ()
         target_link_libraries(mindspore ge_client)
     endif ()

From e97d33f7720229a0504fe1fdb206e93c01e67f70 Mon Sep 17 00:00:00 2001
From: liuxiao <liuxiao93@huawei.com>
Date: Wed, 6 May 2020 19:36:00 +0800
Subject: [PATCH 07/13] add ops for VM

---
 mindspore/ops/_grad/grad_nn_ops.py     |  3 +-
 mindspore/ops/_op_impl/tbe/__init__.py |  2 ++
 mindspore/ops/_op_impl/tbe/elu.py      | 40 ++++++++++++++++++++++++
 mindspore/ops/_op_impl/tbe/elu_grad.py | 43 ++++++++++++++++++++++++++
 mindspore/ops/operations/nn_ops.py     |  3 +-
 tests/ut/python/ops/test_ops.py        |  2 +-
 6 files changed, 89 insertions(+), 4 deletions(-)
 create mode 100644 mindspore/ops/_op_impl/tbe/elu.py
 create mode 100644 mindspore/ops/_op_impl/tbe/elu_grad.py

diff --git a/mindspore/ops/_grad/grad_nn_ops.py b/mindspore/ops/_grad/grad_nn_ops.py
index 153abc0fb6..362bda7368 100755
--- a/mindspore/ops/_grad/grad_nn_ops.py
+++ b/mindspore/ops/_grad/grad_nn_ops.py
@@ -600,7 +600,6 @@ def get_bprop_roi_align(self):
     sample_num = self.sample_num
 
     def bprop(inputs, rois, out, dout):
-        rois_shape = shape_op(rois)
         inputs_shape = shape_op(inputs)
         dx = G.ROIAlignGrad(inputs_shape,
                             pooled_height,
@@ -608,7 +607,7 @@ def get_bprop_roi_align(self):
                             spatial_scale,
                             sample_num,
                             )(dout, rois)
-        return dx, zeros_like(rois_shape)
+        return dx, zeros_like(rois)
 
     return bprop
 
diff --git a/mindspore/ops/_op_impl/tbe/__init__.py b/mindspore/ops/_op_impl/tbe/__init__.py
index 9dbe53049b..c6a08e8ff4 100644
--- a/mindspore/ops/_op_impl/tbe/__init__.py
+++ b/mindspore/ops/_op_impl/tbe/__init__.py
@@ -73,6 +73,8 @@ from .strideslice_d import _strided_slice_d_tbe
 from .strideslicegrad_d import _strided_slice_grad_d_tbe
 from .split_d import _split_d_tbe
 from .exp import _exp_tbe
+from .elu import _elu_tbe
+from .elu_grad import _elu_grad_tbe
 from .div import _div_tbe
 from .log import _log_tbe
 from .floor_div import _floor_div_tbe
diff --git a/mindspore/ops/_op_impl/tbe/elu.py b/mindspore/ops/_op_impl/tbe/elu.py
new file mode 100644
index 0000000000..9125d14727
--- /dev/null
+++ b/mindspore/ops/_op_impl/tbe/elu.py
@@ -0,0 +1,40 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+"""Elu op"""
+from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType
+
+elu_op_info = TBERegOp("Elu") \
+    .fusion_type("ELEMWISE") \
+    .async_flag(False) \
+    .binfile_name("elu.so") \
+    .compute_cost(10) \
+    .kernel_name("elu") \
+    .partial_flag(True) \
+    .op_pattern("formatAgnostic") \
+    .attr("alpha", "optional", "float", "all", "1.0") \
+    .input(0, "x", False, "required", "all") \
+    .output(0, "y", False, "required", "all") \
+    .dtype_format(DataType.F16_Default, DataType.F16_Default) \
+    .dtype_format(DataType.F16_5HD, DataType.F16_5HD) \
+    .dtype_format(DataType.F32_Default, DataType.F32_Default) \
+    .dtype_format(DataType.F32_5HD, DataType.F32_5HD) \
+    .get_op_info()
+
+
+@op_info_register(elu_op_info)
+def _elu_tbe():
+    """Elu TBE register"""
+    return
diff --git a/mindspore/ops/_op_impl/tbe/elu_grad.py b/mindspore/ops/_op_impl/tbe/elu_grad.py
new file mode 100644
index 0000000000..c3486dd024
--- /dev/null
+++ b/mindspore/ops/_op_impl/tbe/elu_grad.py
@@ -0,0 +1,43 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+"""EluGrad op"""
+from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType
+
+elu_grad_op_info = TBERegOp("EluGrad") \
+    .fusion_type("ELEMWISE") \
+    .async_flag(False) \
+    .binfile_name("elu_grad.so") \
+    .compute_cost(10) \
+    .kernel_name("elu_grad") \
+    .partial_flag(True) \
+    .input(0, "grads", False, "required", "all") \
+    .input(1, "activations", False, "required", "all") \
+    .output(0, "y", False, "required", "all") \
+    .dtype_format(DataType.F16_5HD, DataType.F16_5HD, DataType.F16_5HD) \
+    .dtype_format(DataType.F16_FracZ, DataType.F16_FracZ, DataType.F16_FracZ) \
+    .dtype_format(DataType.F16_C1HWNCoC0, DataType.F16_C1HWNCoC0, DataType.F16_C1HWNCoC0) \
+    .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.F16_Default) \
+    .dtype_format(DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD) \
+    .dtype_format(DataType.F32_FracZ, DataType.F32_FracZ, DataType.F32_FracZ) \
+    .dtype_format(DataType.F32_C1HWNCoC0, DataType.F32_C1HWNCoC0, DataType.F32_C1HWNCoC0) \
+    .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default) \
+    .get_op_info()
+
+
+@op_info_register(elu_grad_op_info)
+def _elu_grad_tbe():
+    """EluGrad TBE register"""
+    return
diff --git a/mindspore/ops/operations/nn_ops.py b/mindspore/ops/operations/nn_ops.py
index 2a2dbe08a8..7ba341fd56 100644
--- a/mindspore/ops/operations/nn_ops.py
+++ b/mindspore/ops/operations/nn_ops.py
@@ -1527,7 +1527,8 @@ class L2Loss(PrimitiveWithInfer):
 
     def infer_dtype(self, x_type):
         validator.check_subclass("x_type", x_type, mstype.tensor, self.name)
-        validator.check_tensor_type_same({'x_type': x_type}, [mstype.double, mstype.float_, mstype.float16], self.name)
+        valid_types = [mstype.float16, mstype.float32, mstype.double]
+        validator.check_tensor_type_same({'x_type': x_type}, valid_types, self.name)
         return x_type
 
 
diff --git a/tests/ut/python/ops/test_ops.py b/tests/ut/python/ops/test_ops.py
index 9d7e8c898a..7a3d7d967f 100755
--- a/tests/ut/python/ops/test_ops.py
+++ b/tests/ut/python/ops/test_ops.py
@@ -874,7 +874,7 @@ test_case_nn_ops = [
         'skip': ['backward']}),
     ('L2Loss_1', {
         'block': P.L2Loss(),
-        'desc_inputs': [Tensor(np.array([1, 2, 3, 4]), mstype.float16)],
+        'desc_inputs': [Tensor(np.array([1, 2, 3, 4]), mstype.float32)],
         'desc_bprop': []}),
     ('L2Loss_2', {
         'block': P.L2Loss(),

From d6520f499650782f7717aefef9a429b8e7489828 Mon Sep 17 00:00:00 2001
From: gengdongjie <gengdongjie@huawei.com>
Date: Thu, 7 May 2020 23:44:23 +0800
Subject: [PATCH 08/13] add mix precision option

---
 mindspore/ccsrc/utils/context/ms_context.cc | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/mindspore/ccsrc/utils/context/ms_context.cc b/mindspore/ccsrc/utils/context/ms_context.cc
index 6da1de9cdb..d728feae82 100644
--- a/mindspore/ccsrc/utils/context/ms_context.cc
+++ b/mindspore/ccsrc/utils/context/ms_context.cc
@@ -359,7 +359,11 @@ void MsContext::GetGeOptions(std::map<std::string, std::string> *ge_options) con
   }
 
   // Enable auto mixed precision according to the context options
-  (*ge_options)["ge.exec.auto_mix_precision"] = std::to_string(auto_mixed_precision_flag_);
+  if (auto_mixed_precision_flag_) {
+    (*ge_options)["ge.exec.precision_mode"] = "allow_mix_precision";
+  } else {
+    (*ge_options)["ge.exec.precision_mode"] = "must_keep_origin_dtype";
+  }
   // Disable the global variable acc, only enable it whlie adding training graph in pipeline
   (*ge_options)["ge.exec.variable_acc"] = "0";
 #endif

From 460a1e25c82131d70a5d2bb076f262645e504843 Mon Sep 17 00:00:00 2001
From: gengdongjie <gengdongjie@huawei.com>
Date: Fri, 8 May 2020 19:28:31 +0800
Subject: [PATCH 09/13] reset auto mix precision default off option

---
 mindspore/ccsrc/utils/context/ms_context.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mindspore/ccsrc/utils/context/ms_context.cc b/mindspore/ccsrc/utils/context/ms_context.cc
index d728feae82..b8b4b3d8a1 100644
--- a/mindspore/ccsrc/utils/context/ms_context.cc
+++ b/mindspore/ccsrc/utils/context/ms_context.cc
@@ -362,7 +362,7 @@ void MsContext::GetGeOptions(std::map<std::string, std::string> *ge_options) con
   if (auto_mixed_precision_flag_) {
     (*ge_options)["ge.exec.precision_mode"] = "allow_mix_precision";
   } else {
-    (*ge_options)["ge.exec.precision_mode"] = "must_keep_origin_dtype";
+    (*ge_options)["ge.exec.precision_mode"] = "allow_fp32_to_fp16";
   }
   // Disable the global variable acc, only enable it whlie adding training graph in pipeline
   (*ge_options)["ge.exec.variable_acc"] = "0";

From 0b8cea801862a7fe7dae41d61bee6a9e94bb60ad Mon Sep 17 00:00:00 2001
From: guohongzilong <2713219276@qq.com>
Date: Thu, 23 Apr 2020 17:39:24 +0800
Subject: [PATCH 10/13] learning rate and weight decay support group mode

---
 mindspore/nn/optim/adam.py                    |  65 ++++--
 mindspore/nn/optim/ftrl.py                    |   3 +-
 mindspore/nn/optim/lamb.py                    |   2 +
 mindspore/nn/optim/momentum.py                |  50 +++-
 mindspore/nn/optim/optimizer.py               | 215 ++++++++++++++----
 mindspore/nn/optim/rmsprop.py                 |  71 ++++--
 mindspore/nn/optim/sgd.py                     |  45 +++-
 mindspore/nn/wrap/cell_wrapper.py             |   2 +-
 tests/ut/python/nn/optim/test_adam.py         |   4 +-
 tests/ut/python/nn/optim/test_optimizer.py    |   8 +-
 .../test_optimize_with_parameter_groups.py    | 210 +++++++++++++++++
 11 files changed, 570 insertions(+), 105 deletions(-)
 create mode 100644 tests/ut/python/optimizer/test_optimize_with_parameter_groups.py

diff --git a/mindspore/nn/optim/adam.py b/mindspore/nn/optim/adam.py
index 1a386556d9..9893a81923 100755
--- a/mindspore/nn/optim/adam.py
+++ b/mindspore/nn/optim/adam.py
@@ -103,9 +103,9 @@ def _check_learning_rate_value(learning_rate, end_learning_rate, decay_steps, po
     validator.check_integer('decay_steps', decay_steps, 0, Rel.GT, prim_name)
 
 
-@adam_opt.register("Function", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Number", "Tensor", "Tensor", "Tensor",
+@adam_opt.register("Function", "Tensor", "Tensor", "Tensor", "Tensor", "Number", "Tensor", "Tensor", "Tensor", "Tensor",
                    "Tensor")
-def _run_opt_with_one_number(opt, lr, beta1_power, beta2_power, beta1, beta2, eps, gradient, params, moment1,
+def _run_opt_with_one_number(opt, beta1_power, beta2_power, beta1, beta2, eps, lr, gradient, params, moment1,
                              moment2):
     """Apply adam optimizer to the weight parameter using Tensor."""
     success = True
@@ -136,9 +136,27 @@ class Adam(Optimizer):
     `beta1_power` and `beta2_power`, :math:`\alpha` represents `learning_rate`, :math:`w` represents `params`,
     :math:`\epsilon` represents `eps`.
 
+    Note:
+        The Adam optimizer supports separating parameter groups. Different parameter groups can set different
+        `learning_rate` and `weight_decay`.
+
+        When separating parameter groups, the weight decay in each group will be applied on the parameters if the
+        value of weight_decay > 0. When not separating parameter groups, the `weight_decay` in the API will be
+        applied on the parameters if `weight_decay` > 0 and the 'beta' and 'gamma' are not in the name of parameters.
+
     Args:
-        params (list[Parameter]): A list of parameter, which will be updated. The element in `params`
-                                  should be class mindspore.Parameter.
+        params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated,
+            the element in `params` should be class `Parameter`. When the `params` is a list of `dict`, the "params",
+            "lr" and "weight_decay" are the keys that can be parsed.
+
+            - params: Required. The value should be a list of `Parameter`.
+
+            - lr: Optional. If "lr" in the keys, the value of corresponding learning rate will be used.
+              If not, the `learning_rate` in the API will be used.
+
+            - weight_decay: Optional. If "weight_decay" in the keys, the value of corresponding weight decay
+              will be used. If not, the `weight_decay` in the API will be used.
+
         learning_rate (Union[float, Tensor, Iterable]): A value for the learning rate. When the learning_rate is
                                                         Iterable or a Tensor and the dims of the Tensor is 1,
                                                         use dynamic learning rate, then the i-th step will
@@ -161,8 +179,6 @@ class Adam(Optimizer):
         weight_decay (float): Weight decay (L2 penalty). Default: 0.0.
         loss_scale (float): A floating point value for the loss scale. Should be equal to or greater than 1. Default:
                             1.0.
-        decay_filter (Function): A function to determine whether to apply weight decay on parameters. Default:
-                                 lambda x: 'LayerNorm' not in x.name and 'bias' not in x.name.
 
     Inputs:
         - **gradients** (tuple[Tensor]) - The gradients of `params`, the shape is the same as `params`.
@@ -172,15 +188,26 @@ class Adam(Optimizer):
 
     Examples:
         >>> net = Net()
-        >>> loss = nn.SoftmaxCrossEntropyWithLogits()
+        >>> #1) All parameters use the same learning rate and weight decay
         >>> optim = nn.Adam(params=net.trainable_params())
-        >>> model = Model(net, loss_fn=loss, optimizer=optim, metrics=None)
+        >>>
+        >>> #2) Use parameter groups and set different values
+        >>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
+        >>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
+        >>> group_params = [{'params': conv_params, 'weight_decay': 0.01, 'lr': 0.01},
+        >>>                 {'params': no_conv_params}]
+        >>> opt = nn.Adam(group_params, learning_rate=0.1, weight_decay=0.0)
+        >>> # the conv_params's parameters will use a learning rate of 0.01 and a weight decay of 0.01
+        >>> # the no_conv_params's parameters don't set learning rate and weight decay. So they will use a
+        >>> # learning rate of 0.1 and a weight decay of 0.0.
+        >>>
+        >>> loss = nn.SoftmaxCrossEntropyWithLogits()
+        >>> model = Model(net, loss_fn=loss, optimizer=optim)
     """
 
     def __init__(self, params, learning_rate=1e-3, beta1=0.9, beta2=0.999, eps=1e-8, use_locking=False,
-                 use_nesterov=False, weight_decay=0.0, loss_scale=1.0,
-                 decay_filter=lambda x: 'beta' not in x.name and 'gamma' not in x.name):
-        super(Adam, self).__init__(learning_rate, params, weight_decay, loss_scale, decay_filter)
+                 use_nesterov=False, weight_decay=0.0, loss_scale=1.0):
+        super(Adam, self).__init__(learning_rate, params, weight_decay, loss_scale)
         _check_param_value(beta1, beta2, eps, weight_decay, self.cls_name)
         validator.check_value_type("use_locking", use_locking, [bool], self.cls_name)
         validator.check_value_type("use_nesterov", use_nesterov, [bool], self.cls_name)
@@ -216,10 +243,14 @@ class Adam(Optimizer):
         self.beta1_power = beta1_power
         beta2_power = self.beta2_power * self.beta2
         self.beta2_power = beta2_power
-        success = self.hyper_map(F.partial(adam_opt, self.opt, lr, beta1_power, beta2_power, self.beta1,
-                                           self.beta2, self.eps),
-                                 gradients, params, moment1, moment2)
-
+        if self.is_group:
+            success = self.hyper_map(F.partial(adam_opt, self.opt, beta1_power, beta2_power, self.beta1,
+                                               self.beta2, self.eps),
+                                     lr, gradients, params, moment1, moment2)
+        else:
+            success = self.hyper_map(F.partial(adam_opt, self.opt, beta1_power, beta2_power, self.beta1,
+                                               self.beta2, self.eps, lr),
+                                     gradients, params, moment1, moment2)
         return success
 
 
@@ -262,6 +293,8 @@ class AdamWeightDecay(Optimizer):
     def __init__(self, params, learning_rate=1e-3, beta1=0.9, beta2=0.999, eps=1e-6, weight_decay=0.0,
                  decay_filter=lambda x: 'beta' not in x.name and 'gamma' not in x.name):
         super(AdamWeightDecay, self).__init__(learning_rate, params)
+        if self.is_group:
+            raise RuntimeError(f"The {self.cls_name} optimizer cannot support group setting.")
         _check_param_value(beta1, beta2, eps, weight_decay, self.cls_name)
         self.beta1 = Tensor(np.array([beta1]).astype(np.float32))
         self.beta2 = Tensor(np.array([beta2]).astype(np.float32))
@@ -329,6 +362,8 @@ class AdamWeightDecayDynamicLR(Optimizer):
                  weight_decay=0.0,
                  decay_filter=lambda x: 'beta' not in x.name and 'gamma' not in x.name):
         super(AdamWeightDecayDynamicLR, self).__init__(learning_rate, params)
+        if self.is_group:
+            raise RuntimeError(f"The {self.cls_name} optimizer cannot support group setting.")
         _check_param_value(beta1, beta2, eps, weight_decay, self.cls_name)
         _check_learning_rate_value(learning_rate, end_learning_rate, decay_steps, power, self.cls_name)
         # turn them to scalar when me support scalar/tensor mix operations
diff --git a/mindspore/nn/optim/ftrl.py b/mindspore/nn/optim/ftrl.py
index ccc1b3f10b..33edafa4e2 100644
--- a/mindspore/nn/optim/ftrl.py
+++ b/mindspore/nn/optim/ftrl.py
@@ -96,7 +96,8 @@ class FTRL(Optimizer):
     def __init__(self, params, initial_accum=0.1, learning_rate=0.001, lr_power=-0.5, l1=0.0, l2=0.0,
                  use_locking=False, loss_scale=1.0, weight_decay=0.0):
         super(FTRL, self).__init__(learning_rate, params)
-
+        if self.is_group:
+            raise RuntimeError(f"The {self.cls_name} optimizer cannot support group setting.")
         _check_param(initial_accum, learning_rate, lr_power, l1, l2, use_locking, loss_scale, weight_decay,
                      self.cls_name)
         self.moments = self.parameters.clone(prefix="moments", init=initial_accum)
diff --git a/mindspore/nn/optim/lamb.py b/mindspore/nn/optim/lamb.py
index 97a81a590b..b4d478f52a 100755
--- a/mindspore/nn/optim/lamb.py
+++ b/mindspore/nn/optim/lamb.py
@@ -183,6 +183,8 @@ class Lamb(Optimizer):
                  decay_filter=lambda x: 'LayerNorm' not in x.name and 'bias' not in x.name):
 
         super(Lamb, self).__init__(start_learning_rate, params)
+        if self.is_group:
+            raise RuntimeError(f"The {self.cls_name} optimizer cannot support group setting.")
         _check_param_value(decay_steps, warmup_steps, start_learning_rate, end_learning_rate,
                            power, beta1, beta2, eps, weight_decay, self.cls_name)
 
diff --git a/mindspore/nn/optim/momentum.py b/mindspore/nn/optim/momentum.py
index 67de590c5f..7cfbf11183 100755
--- a/mindspore/nn/optim/momentum.py
+++ b/mindspore/nn/optim/momentum.py
@@ -23,7 +23,7 @@ momentum_opt = C.MultitypeFuncGraph("momentum_opt")
 
 
 @momentum_opt.register("Function", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor")
-def _tensor_run_opt_ext(opt, learning_rate, momentum, gradient, weight, moment):
+def _tensor_run_opt_ext(opt, momentum, learning_rate, gradient, weight, moment):
     """Apply momentum optimizer to the weight parameter using Tensor."""
     success = True
     success = F.depend(success, opt(weight, moment, learning_rate, gradient, momentum))
@@ -36,9 +36,27 @@ class Momentum(Optimizer):
 
     Refer to the paper on the importance of initialization and momentum in deep learning for more details.
 
+    Note:
+        The Momentum optimizer supports separating parameter groups. Different parameter groups can set different
+        `learning_rate` and `weight_decay`.
+
+        When separating parameter groups, the weight decay in each group will be applied on the parameters if the
+        value of weight_decay > 0. When not separating parameter groups, the `weight_decay` in the API will be
+        applied on the parameters if `weight_decay` > 0 and the 'beta' and 'gamma' are not in the name of parameters.
+
     Args:
-        params (list[Parameter]): A list of parameter, which will be updated. The element in `parameters`
-                                  should be class mindspore.Parameter.
+        params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated,
+            the element in `params` should be class `Parameter`. When the `params` is a list of `dict`, the "params",
+            "lr" and "weight_decay" are the keys that can be parsed.
+
+            - params: Required. The value should be a list of `Parameter`.
+
+            - lr: Optional. If "lr" in the keys, the value of corresponding learning rate will be used.
+              If not, the `learning_rate` in the API will be used.
+
+            - weight_decay: Optional. If "weight_decay" in the keys, the value of corresponding weight decay
+              will be used. If not, the `weight_decay` in the API will be used.
+
         learning_rate (Union[float, Tensor, Iterable]): A value for the learning rate. When the learning_rate is
                                                         Iterable or a Tensor and the dims of the Tensor is 1,
                                                         use dynamic learning rate, then the i-th step will
@@ -49,8 +67,6 @@ class Momentum(Optimizer):
         momentum (float): Hyperparameter of type float, means momentum for the moving average.
         weight_decay (float): Weight decay (L2 penalty). Default: 0.0.
         loss_scale (float): A floating point value for the loss scale. Default: 1.0.
-        decay_filter (Function): A function to determine whether to apply weight decay on parameters. Default:
-                                 lambda x: 'beta' not in x.name and 'gamma' not in x.name.
 
     Inputs:
         - **gradients** (tuple[Tensor]) - The gradients of `params`, the shape is the same as `params`.
@@ -63,13 +79,24 @@ class Momentum(Optimizer):
 
     Examples:
         >>> net = Net()
-        >>> loss = nn.SoftmaxCrossEntropyWithLogits()
+        >>> #1) All parameters use the same learning rate and weight decay
         >>> optim = nn.Momentum(params=net.trainable_params(), learning_rate=0.1, momentum=0.9)
+        >>>
+        >>> #2) Use parameter groups and set different values
+        >>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
+        >>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
+        >>> group_params = [{'params': conv_params, 'weight_decay': 0.01, 'lr': 0.01},
+        >>>                 {'params': no_conv_params}]
+        >>> opt = nn.Momentum(group_params, learning_rate=0.1, momentum=0.9, weight_decay=0.0)
+        >>> # The parameters in conv_params will use a learning rate of 0.01 and a weight decay of 0.01.
+        >>> # The parameters in no_conv_params don't set a learning rate or weight decay, so they will use
+        >>> # the optimizer-level learning rate of 0.1 and weight decay of 0.0.
+        >>>
+        >>> loss = nn.SoftmaxCrossEntropyWithLogits()
         >>> model = Model(net, loss_fn=loss, optimizer=optim, metrics=None)
     """
-    def __init__(self, params, learning_rate, momentum, weight_decay=0.0, loss_scale=1.0,
-                 decay_filter=lambda x: 'beta' not in x.name and 'gamma' not in x.name):
-        super(Momentum, self).__init__(learning_rate, params, weight_decay, loss_scale, decay_filter)
+    def __init__(self, params, learning_rate, momentum, weight_decay=0.0, loss_scale=1.0):
+        super(Momentum, self).__init__(learning_rate, params, weight_decay, loss_scale)
         if isinstance(momentum, float) and momentum < 0.0:
             raise ValueError("momentum should be at least 0.0, but got momentum {}".format(momentum))
         self.momentum = Parameter(Tensor(momentum, mstype.float32), name="momentum")
@@ -84,5 +111,8 @@ class Momentum(Optimizer):
         gradients = self.decay_weight(gradients)
         gradients = self.scale_grad(gradients)
         lr = self.get_lr()
-        success = self.hyper_map(F.partial(momentum_opt, self.opt, lr, self.momentum), gradients, params, moments)
+        if self.is_group:
+            success = self.hyper_map(F.partial(momentum_opt, self.opt, self.momentum), lr, gradients, params, moments)
+        else:
+            success = self.hyper_map(F.partial(momentum_opt, self.opt, self.momentum, lr), gradients, params, moments)
         return success
diff --git a/mindspore/nn/optim/optimizer.py b/mindspore/nn/optim/optimizer.py
index 34abc2b1c2..671e92de3a 100755
--- a/mindspore/nn/optim/optimizer.py
+++ b/mindspore/nn/optim/optimizer.py
@@ -28,7 +28,6 @@ from mindspore._checkparam import Rel
 from mindspore.common.tensor import Tensor
 from mindspore import log as logger
 
-
 __all__ = ['Optimizer']
 
 
@@ -42,68 +41,96 @@ class Optimizer(Cell):
         This class defines the API to add Ops to train a model. Never use
         this class directly, but instead instantiate one of its subclasses.
 
+        Some optimizers support separating parameter groups. Different parameter groups can set different
+        `learning_rate` and `weight_decay`.
+
+        When separating parameter groups, the weight decay in each group will be applied on the parameters if the
+        value of weight_decay > 0. When not separating parameter groups, the `weight_decay` in the API will be
+        applied on the parameters if `weight_decay` > 0 and the 'beta' and 'gamma' are not in the name of parameters.
+
     Args:
         learning_rate (float): A floating point value for the learning rate. Should be greater than 0.
-        parameters (list): A list of parameter, which will be updated. The element in `parameters`
-            should be class mindspore.Parameter.
+        parameters (Union[list[Parameter], list[dict]]): When the `parameters` is a list of `Parameter` which will be
+            updated, the element in `parameters` should be class `Parameter`. When the `parameters` is a list of `dict`,
+            the "params", "lr" and "weight_decay" are the keys that can be parsed.
+
+            - params: Required. The value should be a list of `Parameter`.
+
+            - lr: Optional. If "lr" in the keys, the value of corresponding learning rate will be used.
+              If not, the `learning_rate` in the API will be used.
+
+            - weight_decay: Optional. If "weight_decay" in the keys, the value of corresponding weight decay
+              will be used. If not, the `weight_decay` in the API will be used.
+
         weight_decay (float): A floating point value for the weight decay. It should be equal to or greater than 0.
-            If the type of `weight_decay` input is int, it will be convertd to float. Default: 0.0.
+            If the type of `weight_decay` input is int, it will be converted to float. Default: 0.0.
         loss_scale (float): A floating point value for the loss scale. It should be greater than 0. If the
-            type of `loss_scale` input is int, it will be convertd to float. Default: 1.0.
-        decay_filter (Function): A function to determine whether to apply weight decay on parameters. Default: lambda
-            x: 'beta' not in x.name and 'gamma' not in x.name.
+            type of `loss_scale` input is int, it will be converted to float. Default: 1.0.
 
     Raises:
         ValueError: If the learning_rate is a Tensor, but the dims of tensor is greater than 1.
         TypeError: If the learning_rate is not any of the three types: float, Tensor, Iterable.
     """
 
-    def __init__(self, learning_rate, parameters, weight_decay=0.0, loss_scale=1.0,
-                 decay_filter=lambda x: 'beta' not in x.name and 'gamma' not in x.name):
+    def __init__(self, learning_rate, parameters, weight_decay=0.0, loss_scale=1.0):
         super(Optimizer, self).__init__(auto_prefix=False)
+        if parameters and not isinstance(parameters, list):
+            parameters = list(parameters)
+
+        if not parameters:
+            raise ValueError("Optimizer got an empty parameter list.")
+
+        if not isinstance(parameters[0], (dict, Parameter)):
+            raise ValueError("Only a list of Parameter or dict can be supported.")
+
+        if isinstance(loss_scale, int):
+            loss_scale = float(loss_scale)
+        validator.check_value_type("loss_scale", loss_scale, [float], None)
+        validator.check_number_range("loss_scale", loss_scale, 0.0, float("inf"), Rel.INC_NEITHER, None)
+
+        if isinstance(weight_decay, int):
+            weight_decay = float(weight_decay)
+        validator.check_value_type("weight_decay", weight_decay, [float], None)
+        validator.check_number_range("weight_decay", weight_decay, 0.0, float("inf"), Rel.INC_LEFT, None)
+
+        self.is_group = False
+        self.loss_scale = loss_scale
         if isinstance(learning_rate, float):
             self.dynamic_lr = False
             self.gather = None
             self.assignadd = None
             self.global_step = None
-            validator.check_number_range("learning rate", learning_rate, 0.0, float("inf"), Rel.INC_LEFT, self.cls_name)
-            learning_rate = Tensor(learning_rate, mstype.float32)
+            self.scalar_lr = learning_rate
         else:
             self.dynamic_lr = True
             self.gather = P.GatherV2()
             self.assignadd = P.AssignAdd()
             self.global_step = Parameter(initializer(0, [1], mindspore.int32), name='global_step')
-            if isinstance(learning_rate, Iterable):
-                learning_rate = Tensor(np.array(list(learning_rate)).astype(np.float32))
-            elif isinstance(learning_rate, Tensor):
-                if learning_rate.dim() > 1:
-                    raise ValueError("Learning rate should be a 0 or 1 dim `Tensor`,"
-                                     f"but got {learning_rate.dim()}.")
-                if learning_rate.dim() == 1 and learning_rate.size() < 2:
-                    logger.warning("If want to use the dynamic learning rate, please make sure that the number "
-                                   "of elements in the list, tuple or tensor passed is greater than 1.")
-            else:
-                raise TypeError("Learning rate should be float, Tensor or Iterable.")
-
-        if isinstance(weight_decay, int):
-            weight_decay = float(weight_decay)
-        validator.check_value_type("weight_decay", weight_decay, [float], None)
-        validator.check_number_range("weight_decay", weight_decay, 0.0, float("inf"), Rel.INC_LEFT, None)
-
-        if isinstance(loss_scale, int):
-            loss_scale = float(loss_scale)
-        validator.check_value_type("loss_scale", loss_scale, [float], None)
-        validator.check_number_range("loss_scale", loss_scale, 0.0, float("inf"), Rel.INC_NEITHER, None)
-
-        self.loss_scale = loss_scale
-        self.learning_rate = Parameter(learning_rate, name="learning_rate")
-        self.parameters = ParameterTuple(parameters)
+            self.scalar_lr = None
+
+        learning_rate = self._get_single_lr(learning_rate)
+        if isinstance(parameters[0], dict):
+            self.is_group = True
+            self.params = []
+            self.group_lr = []
+            self.group_weight_decay = []
+            self._init_group_params(parameters, learning_rate, weight_decay)
+
+        if self.is_group:
+            self.learning_rate = ParameterTuple(self.group_lr)
+            self.parameters = ParameterTuple(self.params)
+            self.weight_decay = tuple(self.group_weight_decay)
+            decay_filter = lambda x: x > 0
+            self.decay_flags = tuple(decay_filter(x) for x in self.weight_decay)
+        else:
+            self.learning_rate = Parameter(learning_rate, name="learning_rate")
+            self.parameters = ParameterTuple(parameters)
+            self.weight_decay = weight_decay * loss_scale
+            decay_filter = lambda x: 'beta' not in x.name and 'gamma' not in x.name
+            self.decay_flags = tuple(decay_filter(x) for x in self.parameters)
         self.reciprocal_scale = 1.0 / loss_scale
-        self.weight_decay = weight_decay * loss_scale
-        self.decay_flags = tuple(decay_filter(x) for x in self.parameters)
-
-        if not self.parameters:
-            raise ValueError("optimizer got an empty parameter list.")
+        self.exec_weight_decay = any(self.decay_flags)
+        self.param_length = len(self.parameters)
 
     def decay_weight(self, gradients):
         """
@@ -118,9 +145,15 @@ class Optimizer(Cell):
         Returns:
             tuple[Tensor], The gradients after weight decay.
         """
-        if self.weight_decay > 0:
-            params = self.parameters
-            gradients = self.hyper_map(F.partial(apply_decay, self.weight_decay), self.decay_flags, params, gradients)
+        params = self.parameters
+        if self.is_group:
+            if self.exec_weight_decay:
+                gradients = self.hyper_map(F.partial(apply_decay), self.weight_decay, self.decay_flags,
+                                           params, gradients)
+        else:
+            if self.weight_decay > 0:
+                gradients = self.hyper_map(F.partial(apply_decay, self.weight_decay), self.decay_flags,
+                                           params, gradients)
 
         return gradients
 
@@ -144,6 +177,83 @@ class Optimizer(Cell):
 
         return gradients
 
+    def _get_single_lr(self, learning_rate):
+        """Get learning rate in Tensor type."""
+        if isinstance(learning_rate, float):
+            validator.check_number_range("learning rate", learning_rate, 0.0, float("inf"), Rel.INC_LEFT, self.cls_name)
+            lr = Tensor(learning_rate, mstype.float32)
+        elif isinstance(learning_rate, Iterable):
+            lr = Tensor(np.array(list(learning_rate)).astype(np.float32))
+        elif isinstance(learning_rate, Tensor):
+            if learning_rate.dim() > 1:
+                raise ValueError("Learning rate should be a 0 or 1 dim `Tensor`,"
+                                 f"but got {learning_rate.dim()}.")
+            if learning_rate.dim() == 1 and learning_rate.size() < 2:
+                logger.warning("If want to use the dynamic learning rate, please make sure that the number "
+                               "of elements in the list, tuple or tensor passed is greater than 1.")
+            lr = learning_rate
+        else:
+            raise TypeError("Learning rate should be float, Tensor or Iterable.")
+        return lr
+
+    def _init_group_params(self, parameters, learning_rate, weight_decay):
+        """Init learning rate or weight decay in group params."""
+        origin_dynamic_lr = self.dynamic_lr
+        if self.dynamic_lr:
+            dynamic_lr_length = learning_rate.size()
+        else:
+            dynamic_lr_length = 0
+
+        for group_param in parameters:
+            lr_length = dynamic_lr_length
+            if 'lr' in group_param.keys():
+                self._get_single_lr(group_param['lr'])
+                if isinstance(group_param['lr'], Iterable):
+                    lr_length = len(group_param['lr'])
+                    self.dynamic_lr = True
+                elif isinstance(group_param['lr'], Tensor):
+                    lr_length = group_param['lr'].size()
+                    self.dynamic_lr = True
+            if dynamic_lr_length not in (lr_length, 0):
+                raise ValueError("The dynamic learning rate in group should be the same size.")
+            dynamic_lr_length = lr_length
+
+        if self.dynamic_lr and not origin_dynamic_lr:
+            self.gather = P.GatherV2()
+            self.assignadd = P.AssignAdd()
+            self.global_step = Parameter(initializer(0, [1], mindspore.int32), name='global_step')
+
+        params_store = []
+        for group_param in parameters:
+            self.params += group_param['params']
+            if 'lr' in group_param.keys():
+                params_dynamic_lr = isinstance(group_param['lr'], (Iterable, Tensor))
+
+                if self.dynamic_lr and not params_dynamic_lr:
+                    lr = Tensor(np.array([group_param['lr']] * dynamic_lr_length).astype(np.float32))
+                else:
+                    lr = self._get_single_lr(group_param['lr'])
+            else:
+                if self.dynamic_lr and not origin_dynamic_lr:
+                    lr = Tensor(np.array([self.scalar_lr] * dynamic_lr_length).astype(np.float32))
+                else:
+                    lr = learning_rate
+
+            if 'weight_decay' in group_param.keys():
+                validator.check_float_legal_value('weight_decay', group_param['weight_decay'], None)
+                validator.check_number_range('weight_decay', group_param['weight_decay'], 0.0, float("inf"),
+                                             Rel.INC_LEFT, self.cls_name)
+                weight_decay_ = group_param['weight_decay'] * self.loss_scale
+            else:
+                weight_decay_ = weight_decay * self.loss_scale
+
+            for param in group_param['params']:
+                if param in params_store:
+                    raise RuntimeError(f"The {param.name} parameter has appeared in parameter groups.")
+                params_store.append(param)
+                self.group_lr.append(Parameter(lr, name="lr_" + param.name))
+                self.group_weight_decay.append(weight_decay_)
+
     def get_lr(self):
         """
         Get the learning rate of current step.
@@ -151,11 +261,20 @@ class Optimizer(Cell):
         Returns:
             float, the learning rate of current step.
         """
-        lr = self.learning_rate
-        if self.dynamic_lr:
-            lr = self.gather(self.learning_rate, self.global_step, 0)
-            F.control_depend(lr, self.assignadd(self.global_step, 1))
+        if self.is_group:
+            lr = self.learning_rate
+            if self.dynamic_lr:
+                lr = ()
+                for i in range(self.param_length):
+                    current_dynamic_lr = self.gather(self.learning_rate[i], self.global_step, 0)
+                    lr += (current_dynamic_lr,)
+                F.control_depend(lr, self.assignadd(self.global_step, 1))
 
+        else:
+            lr = self.learning_rate
+            if self.dynamic_lr:
+                lr = self.gather(self.learning_rate, self.global_step, 0)
+                F.control_depend(lr, self.assignadd(self.global_step, 1))
         return lr
 
     def construct(self, *hyper_params):
diff --git a/mindspore/nn/optim/rmsprop.py b/mindspore/nn/optim/rmsprop.py
index b1271587b4..b96d9499b2 100644
--- a/mindspore/nn/optim/rmsprop.py
+++ b/mindspore/nn/optim/rmsprop.py
@@ -22,17 +22,17 @@ rmsprop_opt = C.MultitypeFuncGraph("rmsprop_opt")
 centered_rmsprop_opt = C.MultitypeFuncGraph("rmsprop_opt")
 
 
-@rmsprop_opt.register("Function", "Tensor", "Number", "Number", "Number", "Tensor", "Tensor", "Tensor", "Tensor")
-def _rmsprop_opt(opt, learning_rate, decay, epsilon, momentum, weight, ms, mom, grad):
+@rmsprop_opt.register("Function", "Number", "Number", "Number", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor")
+def _rmsprop_opt(opt, decay, epsilon, momentum, learning_rate, weight, ms, mom, grad):
     """Apply rmsprop optimizer to the weight parameter using dynamic learning rate."""
     success = True
     success = F.depend(success, opt(weight, ms, mom, grad, learning_rate, decay, momentum, epsilon))
     return success
 
 
-@centered_rmsprop_opt.register("Function", "Tensor", "Number", "Number", "Number", "Tensor", "Tensor", "Tensor",
+@centered_rmsprop_opt.register("Function", "Number", "Number", "Number", "Tensor", "Tensor", "Tensor", "Tensor",
                                "Tensor", "Tensor")
-def _centered_rmsprop_opt(opt, learning_rate, decay, epsilon, momentum, weight, mg, ms, mom, grad):
+def _centered_rmsprop_opt(opt, decay, epsilon, momentum, learning_rate, weight, mg, ms, mom, grad):
     """Apply centered rmsprop optimizer to the weight parameter using dynamic learning rate."""
     success = True
     success = F.depend(success, opt(weight, mg, ms, mom, grad, learning_rate, decay, momentum, epsilon))
@@ -44,6 +44,13 @@ class RMSProp(Optimizer):
     Implements Root Mean Squared Propagation (RMSProp) algorithm.
 
     Note:
+        The RMSProp optimizer supports separating parameter groups. Different parameter groups can set different
+        `learning_rate` and `weight_decay`.
+
+        When separating parameter groups, the weight decay in each group will be applied on the parameters if the
+        value of weight_decay > 0. When not separating parameter groups, the `weight_decay` in the API will be
+        applied on the parameters if `weight_decay` > 0 and the 'beta' and 'gamma' are not in the name of parameters.
+
         Update `params` according to the RMSProp algorithm.
 
         The equation is as follows:
@@ -84,8 +91,18 @@ class RMSProp(Optimizer):
         represents `gradients`.
 
     Args:
-        params (list[Parameter]): A list of parameter, which will be updated. The element in `parameters`
-                                  should be class mindspore.Parameter.
+        params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated,
+            the element in `params` should be class `Parameter`. When the `params` is a list of `dict`, the "params",
+            "lr" and "weight_decay" are the keys that can be parsed.
+
+            - params: Required. The value should be a list of `Parameter`.
+
+            - lr: Optional. If "lr" in the keys, the value of corresponding learning rate will be used.
+              If not, the `learning_rate` in the API will be used.
+
+            - weight_decay: Optional. If "weight_decay" in the keys, the value of corresponding weight decay
+              will be used. If not, the `weight_decay` in the API will be used.
+
         learning_rate (Union[float, Tensor, Iterable]): A value for the learning rate. When the learning_rate is
                                                         Iterable or a Tensor and the dims of the Tensor is 1,
                                                         use dynamic learning rate, then the i-th step will
@@ -95,15 +112,13 @@ class RMSProp(Optimizer):
                                                         Other cases are not supported. Default: 0.1.
         decay (float): Decay rate. Should be equal to or greater than 0. Default: 0.9.
         momentum (float): Hyperparameter of type float, means momentum for the moving average. Should be equal to or
-                          greater than 0.Default: 0.0.
+                          greater than 0. Default: 0.0.
         epsilon (float): Term added to the denominator to improve numerical stability. Should be greater than
                          0. Default: 1e-10.
         use_locking (bool): Enable a lock to protect the update of variable and accumlation tensors. Default: False.
         centered (bool): If True, gradients are normalized by the estimated variance of the gradient. Default: False.
         loss_scale (float): A floating point value for the loss scale. Should be greater than 0. Default: 1.0.
         weight_decay (float): Weight decay (L2 penalty). Should be equal to or greater than 0. Default: 0.0.
-        decay_filter (Function): A function to determine whether to apply weight decay on parameters. Default:
-                                 lambda x: 'beta' not in x.name and 'gamma' not in x.name.
 
     Inputs:
         - **gradients** (tuple[Tensor]) - The gradients of `params`, the shape is the same as `params`.
@@ -113,14 +128,25 @@ class RMSProp(Optimizer):
 
     Examples:
         >>> net = Net()
+        >>> #1) All parameters use the same learning rate and weight decay
+        >>> optim = nn.RMSProp(params=net.trainable_params(), learning_rate=lr)
+        >>>
+        >>> #2) Use parameter groups and set different values
+        >>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
+        >>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
+        >>> group_params = [{'params': conv_params, 'weight_decay': 0.01, 'lr': 0.01},
+        >>>                 {'params': no_conv_params}]
+        >>> opt = nn.RMSProp(group_params, learning_rate=0.1, weight_decay=0.0)
+        >>> # The parameters in conv_params will use a learning rate of 0.01 and a weight decay of 0.01.
+        >>> # The parameters in no_conv_params don't set a learning rate or weight decay, so they will use
+        >>> # the optimizer-level learning rate of 0.1 and weight decay of 0.0.
+        >>>
         >>> loss = nn.SoftmaxCrossEntropyWithLogits()
-        >>> opt = nn.RMSProp(params=net.trainable_params(), learning_rate=lr)
-        >>> model = Model(net, loss, opt)
+        >>> model = Model(net, loss_fn=loss, optimizer=optim)
     """
     def __init__(self, params, learning_rate=0.1, decay=0.9, momentum=0.0, epsilon=1e-10,
-                 use_locking=False, centered=False, loss_scale=1.0, weight_decay=0.0,
-                 decay_filter=lambda x: 'beta' not in x.name and 'gamma' not in x.name):
-        super(RMSProp, self).__init__(learning_rate, params, weight_decay, loss_scale, decay_filter)
+                 use_locking=False, centered=False, loss_scale=1.0, weight_decay=0.0):
+        super(RMSProp, self).__init__(learning_rate, params, weight_decay, loss_scale)
         validator.check_value_type("decay", decay, [float], self.cls_name)
         validator.check_number_range("decay", decay, 0.0, float("inf"), Rel.INC_LEFT, self.cls_name)
         validator.check_value_type("momentum", momentum, [float], self.cls_name)
@@ -150,9 +176,18 @@ class RMSProp(Optimizer):
         gradients = self.scale_grad(gradients)
         lr = self.get_lr()
         if self.centered:
-            success = self.hyper_map(F.partial(centered_rmsprop_opt, self.opt, lr, self.decay, self.epsilon,
-                                               self.momentum), params, self.mg, self.ms, self.moment, gradients)
+            if self.is_group:
+                success = self.hyper_map(F.partial(centered_rmsprop_opt, self.opt, self.decay, self.epsilon,
+                                                   self.momentum), lr, params, self.mg, self.ms, self.moment, gradients)
+            else:
+                success = self.hyper_map(F.partial(centered_rmsprop_opt, self.opt, self.decay, self.epsilon,
+                                                   self.momentum, lr), params, self.mg, self.ms, self.moment, gradients)
+
         else:
-            success = self.hyper_map(F.partial(rmsprop_opt, self.opt, lr, self.decay, self.epsilon,
-                                               self.momentum), params, self.ms, self.moment, gradients)
+            if self.is_group:
+                success = self.hyper_map(F.partial(rmsprop_opt, self.opt, self.decay, self.epsilon,
+                                                   self.momentum), lr, params, self.ms, self.moment, gradients)
+            else:
+                success = self.hyper_map(F.partial(rmsprop_opt, self.opt, self.decay, self.epsilon,
+                                                   self.momentum, lr), params, self.ms, self.moment, gradients)
         return success
diff --git a/mindspore/nn/optim/sgd.py b/mindspore/nn/optim/sgd.py
index 388fe5db47..0db58af855 100755
--- a/mindspore/nn/optim/sgd.py
+++ b/mindspore/nn/optim/sgd.py
@@ -24,7 +24,7 @@ sgd_opt = C.MultitypeFuncGraph("sgd_opt")
 
 
 @sgd_opt.register("Function", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor")
-def _tensor_run_opt_ext(opt, learning_rate, momentum, gradient, weight, accum, stat):
+def _tensor_run_opt_ext(opt, momentum, learning_rate, gradient, weight, accum, stat):
     """Apply sgd optimizer to the weight parameter using Tensor."""
     success = True
     success = F.depend(success, opt(weight, gradient, learning_rate, accum, momentum, stat))
@@ -39,9 +39,27 @@ class SGD(Optimizer):
     Nesterov momentum is based on the formula from paper `On the importance of initialization and
     momentum in deep learning <http://proceedings.mlr.press/v28/sutskever13.html>`_.
 
+    Note:
+        The SGD optimizer supports separating parameter groups. Different parameter groups can set different
+        `learning_rate` and `weight_decay`.
+
+        When separating parameter groups, the weight decay in each group will be applied on the parameters if the
+        value of weight_decay > 0. When not separating parameter groups, the `weight_decay` in the API will be
+        applied on the parameters if `weight_decay` > 0 and the 'beta' and 'gamma' are not in the name of parameters.
+
     Args:
-        params (list[Parameter]): A list of parameter, which will be updated. The element in `params`
-                                  should be class mindspore.Parameter.
+        params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated,
+            the element in `params` should be class `Parameter`. When the `params` is a list of `dict`, the "params",
+            "lr" and "weight_decay" are the keys that can be parsed.
+
+            - params: Required. The value should be a list of `Parameter`.
+
+            - lr: Optional. If "lr" is in the keys, the value of corresponding learning rate will be used.
+              If not, the `learning_rate` in the API will be used.
+
+            - weight_decay: Optional. If "weight_decay" is in the keys, the value of corresponding weight decay
+              will be used. If not, the `weight_decay` in the API will be used.
+
         learning_rate (Union[float, Tensor, Iterable]): A value for the learning rate. When the learning_rate is
                                                         Iterable or a Tensor and the dims of the Tensor is 1,
                                                         use dynamic learning rate, then the i-th step will
@@ -67,9 +85,21 @@ class SGD(Optimizer):
 
     Examples:
         >>> net = Net()
-        >>> loss = nn.SoftmaxCrossEntropyWithLogits()
+        >>> #1) All parameters use the same learning rate and weight decay
         >>> optim = nn.SGD(params=net.trainable_params())
-        >>> model = Model(net, loss_fn=loss, optimizer=optim, metrics=None)
+        >>>
+        >>> #2) Use parameter groups and set different values
+        >>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
+        >>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
+        >>> group_params = [{'params': conv_params, 'weight_decay': 0.01, 'lr': 0.01},
+        >>>                 {'params': no_conv_params}]
+        >>> opt = nn.SGD(group_params, learning_rate=0.1, weight_decay=0.0)
+        >>> # the conv_params's parameters will use a learning rate of 0.01 and a weight decay of 0.01
+        >>> # the no_conv_params's parameters don't set learning rate and weight decay. So they will use a
+        >>> # learning rate of 0.1 and a weight decay of 0.0.
+        >>>
+        >>> loss = nn.SoftmaxCrossEntropyWithLogits()
+        >>> model = Model(net, loss_fn=loss, optimizer=optim)
     """
     def __init__(self, params, learning_rate=0.1, momentum=0.0, dampening=0.0, weight_decay=0.0, nesterov=False,
                  loss_scale=1.0):
@@ -109,5 +139,8 @@ class SGD(Optimizer):
         gradients = self.decay_weight(gradients)
         gradients = self.scale_grad(gradients)
         lr = self.get_lr()
-        success = self.hyper_map(F.partial(sgd_opt, self.opt, lr, self.momentum), gradients, params, accum, stat)
+        if self.is_group:
+            success = self.hyper_map(F.partial(sgd_opt, self.opt, self.momentum), lr, gradients, params, accum, stat)
+        else:
+            success = self.hyper_map(F.partial(sgd_opt, self.opt, self.momentum, lr), gradients, params, accum, stat)
         return success
diff --git a/mindspore/nn/wrap/cell_wrapper.py b/mindspore/nn/wrap/cell_wrapper.py
index 60718ec2b1..499d85b34b 100644
--- a/mindspore/nn/wrap/cell_wrapper.py
+++ b/mindspore/nn/wrap/cell_wrapper.py
@@ -167,7 +167,7 @@ class TrainOneStepCell(Cell):
         super(TrainOneStepCell, self).__init__(auto_prefix=False)
         self.network = network
         self.network.add_flags(defer_inline=True)
-        self.weights = ParameterTuple(network.trainable_params())
+        self.weights = optimizer.parameters
         self.optimizer = optimizer
         self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True)
         self.sens = sens
diff --git a/tests/ut/python/nn/optim/test_adam.py b/tests/ut/python/nn/optim/test_adam.py
index d9321b1d26..269f276376 100644
--- a/tests/ut/python/nn/optim/test_adam.py
+++ b/tests/ut/python/nn/optim/test_adam.py
@@ -50,7 +50,7 @@ class NetWithoutWeight(nn.Cell):
 def test_adamwithoutparam():
     net = NetWithoutWeight()
     net.set_train()
-    with pytest.raises(ValueError, match=r"optimizer got an empty parameter list"):
+    with pytest.raises(ValueError, match=r"Optimizer got an empty parameter list"):
         AdamWeightDecay(net.trainable_params(), learning_rate=0.1)
 
 
@@ -104,5 +104,5 @@ def test_AdamWeightDecayDynamicLR():
 
 def test_adam_mindspore_flatten():
     net = nn.Flatten()
-    with pytest.raises(ValueError, match=r"optimizer got an empty parameter list"):
+    with pytest.raises(ValueError, match=r"Optimizer got an empty parameter list"):
         AdamWeightDecay(net.get_parameters())
diff --git a/tests/ut/python/nn/optim/test_optimizer.py b/tests/ut/python/nn/optim/test_optimizer.py
index 89fb1d812b..9f1ec9a36f 100644
--- a/tests/ut/python/nn/optim/test_optimizer.py
+++ b/tests/ut/python/nn/optim/test_optimizer.py
@@ -69,19 +69,19 @@ class TestSGD():
 class TestNullParam():
     """ TestNullParam definition """
     def test_optim_init(self):
-        with pytest.raises(TypeError):
+        with pytest.raises(ValueError):
             Optimizer(0.1, None)
 
     def test_AdamWightDecay_init(self):
-        with pytest.raises(TypeError):
+        with pytest.raises(ValueError):
             AdamWeightDecay(None)
 
     def test_AdamWeightDecayDynamicLR_init(self):
-        with pytest.raises(TypeError):
+        with pytest.raises(ValueError):
             AdamWeightDecayDynamicLR(None, 10)
 
     def test_Sgd_init(self):
-        with pytest.raises(TypeError):
+        with pytest.raises(ValueError):
             SGD(None)
 
 class TestUnsupportParam():
diff --git a/tests/ut/python/optimizer/test_optimize_with_parameter_groups.py b/tests/ut/python/optimizer/test_optimize_with_parameter_groups.py
new file mode 100644
index 0000000000..8dd98990fa
--- /dev/null
+++ b/tests/ut/python/optimizer/test_optimize_with_parameter_groups.py
@@ -0,0 +1,210 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+import numpy as np
+import pytest
+import mindspore.common.dtype as mstype
+import mindspore.nn as nn
+from mindspore.nn.optim import Momentum, SGD, RMSProp, Adam
+from mindspore import context
+from mindspore.common.api import _executor
+from mindspore.common.tensor import Tensor
+from mindspore.ops import operations as P
+from mindspore.nn import TrainOneStepCell, WithLossCell
+
+context.set_context(mode=context.GRAPH_MODE)
+
+
+class LeNet5(nn.Cell):
+    """ LeNet5 definition """
+    def __init__(self):
+        super(LeNet5, self).__init__()
+        self.conv1 = nn.Conv2d(1, 6, 5, pad_mode='valid')
+        self.conv2 = nn.Conv2d(6, 16, 5, pad_mode='valid')
+        self.fc1 = nn.Dense(16 * 5 * 5, 120)
+        self.fc2 = nn.Dense(120, 84)
+        self.fc3 = nn.Dense(84, 10)
+        self.relu = nn.ReLU()
+        self.max_pool2d = nn.MaxPool2d(kernel_size=2, stride=2)
+        self.flatten = P.Flatten()
+
+    def construct(self, x):
+        x = self.max_pool2d(self.relu(self.conv1(x)))
+        x = self.max_pool2d(self.relu(self.conv2(x)))
+        x = self.flatten(x)
+        x = self.relu(self.fc1(x))
+        x = self.relu(self.fc2(x))
+        x = self.fc3(x)
+        return x
+
+
+def test_group_lr():
+    inputs = Tensor(np.ones([1, 1, 32, 32]).astype(np.float32) * 0.01)
+    label = Tensor(np.ones([1, 10]).astype(np.float32))
+
+    net = LeNet5()
+    conv_lr = 0.8
+    default_lr = 0.1
+    conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
+    no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
+    group_params = [{'params': conv_params, 'lr': conv_lr},
+                    {'params': no_conv_params}]
+    net.set_train()
+    loss = nn.SoftmaxCrossEntropyWithLogits()
+
+    opt = Momentum(group_params, learning_rate=default_lr, momentum=0.9)
+    assert opt.is_group is True
+    assert opt.dynamic_lr is False
+    for lr, param in zip(opt.learning_rate, opt.parameters):
+        if param in conv_params:
+            assert lr.data == Tensor(conv_lr, mstype.float32)
+        else:
+            assert lr.data == Tensor(default_lr, mstype.float32)
+
+    net_with_loss = WithLossCell(net, loss)
+    train_network = TrainOneStepCell(net_with_loss, opt)
+    _executor.compile(train_network, inputs, label)
+
+
+def test_group_dynamic_1():
+    inputs = Tensor(np.ones([1, 1, 32, 32]).astype(np.float32) * 0.01)
+    label = Tensor(np.ones([1, 10]).astype(np.float32))
+
+    net = LeNet5()
+    conv_lr = 0.8
+    default_lr = (0.1, 0.2, 0.3)
+    conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
+    no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
+    group_params = [{'params': conv_params, 'lr': conv_lr},
+                    {'params': no_conv_params}]
+    net.set_train()
+    loss = nn.SoftmaxCrossEntropyWithLogits()
+
+    opt = Momentum(group_params, learning_rate=default_lr, momentum=0.9)
+    assert opt.is_group is True
+    assert opt.dynamic_lr is True
+    for lr, param in zip(opt.learning_rate, opt.parameters):
+        if param in conv_params:
+            assert lr.data == Tensor(np.array([conv_lr] * 3).astype(np.float32))
+        else:
+            assert lr.data == Tensor(np.array(list(default_lr)).astype(np.float32))
+
+    net_with_loss = WithLossCell(net, loss)
+    train_network = TrainOneStepCell(net_with_loss, opt)
+    _executor.compile(train_network, inputs, label)
+
+
+def test_group_dynamic_2():
+    inputs = Tensor(np.ones([1, 1, 32, 32]).astype(np.float32) * 0.01)
+    label = Tensor(np.ones([1, 10]).astype(np.float32))
+
+    net = LeNet5()
+    conv_lr = (0.1, 0.2, 0.3)
+    default_lr = 0.8
+    conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
+    no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
+    group_params = [{'params': conv_params, 'lr': conv_lr},
+                    {'params': no_conv_params}]
+    net.set_train()
+    loss = nn.SoftmaxCrossEntropyWithLogits()
+
+    opt = RMSProp(group_params, learning_rate=default_lr)
+    assert opt.is_group is True
+    assert opt.dynamic_lr is True
+    for lr, param in zip(opt.learning_rate, opt.parameters):
+        if param in conv_params:
+            assert lr.data == Tensor(np.array(list(conv_lr)).astype(np.float32))
+        else:
+            assert lr.data == Tensor(np.array([default_lr] * 3).astype(np.float32))
+
+    net_with_loss = WithLossCell(net, loss)
+    train_network = TrainOneStepCell(net_with_loss, opt)
+    _executor.compile(train_network, inputs, label)
+
+
+def test_group_dynamic_no_same_size():
+    net = LeNet5()
+    conv_lr = (0.1, 0.2, 0.3)
+    default_lr = (0.1, 0.2)
+    conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
+    no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
+    group_params = [{'params': conv_params, 'lr': conv_lr},
+                    {'params': no_conv_params}]
+    with pytest.raises(ValueError):
+        Momentum(group_params, learning_rate=default_lr, momentum=0.9)
+
+
+def test_group_not_float_lr():
+    net = LeNet5()
+    conv_lr = 1
+    default_lr = 0.3
+    conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
+    no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
+    group_params = [{'params': conv_params, 'lr': conv_lr},
+                    {'params': no_conv_params}]
+    with pytest.raises(TypeError):
+        Momentum(group_params, learning_rate=default_lr, momentum=0.9)
+
+
+def test_group_not_float_weight_decay():
+    net = LeNet5()
+    conv_weight_decay = 1
+    conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
+    no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
+    group_params = [{'params': conv_params, 'weight_decay': conv_weight_decay},
+                    {'params': no_conv_params}]
+    with pytest.raises(TypeError):
+        Momentum(group_params, learning_rate=0.1, momentum=0.9)
+
+
+def test_weight_decay():
+    inputs = Tensor(np.ones([1, 1, 32, 32]).astype(np.float32) * 0.01)
+    label = Tensor(np.ones([1, 10]).astype(np.float32))
+
+    net = LeNet5()
+    conv_weight_decay = 0.8
+    default_weight_decay = 0.0
+    conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
+    no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
+    group_params = [{'params': conv_params, 'weight_decay': conv_weight_decay},
+                    {'params': no_conv_params}]
+    net.set_train()
+    loss = nn.SoftmaxCrossEntropyWithLogits()
+
+    opt = SGD(group_params, learning_rate=0.1, weight_decay=default_weight_decay)
+    assert opt.is_group is True
+    for weight_decay, decay_flags, param in zip(opt.weight_decay, opt.decay_flags, opt.parameters):
+        if param in conv_params:
+            assert weight_decay == conv_weight_decay
+            assert decay_flags is True
+        else:
+            assert weight_decay == default_weight_decay
+            assert decay_flags is False
+
+    net_with_loss = WithLossCell(net, loss)
+    train_network = TrainOneStepCell(net_with_loss, opt)
+    _executor.compile(train_network, inputs, label)
+
+
+def test_group_repeat_param():
+    net = LeNet5()
+    conv_lr = 0.1
+    default_lr = 0.3
+    conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
+    no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
+    group_params = [{'params': conv_params, 'lr': conv_lr},
+                    {'params': conv_params, 'lr': default_lr},
+                    {'params': no_conv_params}]
+    with pytest.raises(RuntimeError):
+        Adam(group_params, learning_rate=default_lr)

From 5a259eb67e7105e40b5994a3978ea906ab5f79bc Mon Sep 17 00:00:00 2001
From: guohongzilong <2713219276@qq.com>
Date: Fri, 15 May 2020 09:54:28 +0800
Subject: [PATCH 11/13] make optimizer parameter same as gradient

---
 mindspore/nn/wrap/grad_reducer.py | 2 +-
 mindspore/nn/wrap/loss_scale.py   | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/mindspore/nn/wrap/grad_reducer.py b/mindspore/nn/wrap/grad_reducer.py
index ee57297fe0..8383910a60 100644
--- a/mindspore/nn/wrap/grad_reducer.py
+++ b/mindspore/nn/wrap/grad_reducer.py
@@ -141,7 +141,7 @@ class DistributedGradReducer(Cell):
         >>>         super(TrainingWrapper, self).__init__(auto_prefix=False)
         >>>         self.network = network
         >>>         self.network.add_flags(defer_inline=True)
-        >>>         self.weights = ParameterTuple(network.trainable_params())
+        >>>         self.weights = optimizer.parameters
         >>>         self.optimizer = optimizer
         >>>         self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True)
         >>>         self.sens = sens
diff --git a/mindspore/nn/wrap/loss_scale.py b/mindspore/nn/wrap/loss_scale.py
index 65d66f0150..ae76cb055f 100644
--- a/mindspore/nn/wrap/loss_scale.py
+++ b/mindspore/nn/wrap/loss_scale.py
@@ -18,7 +18,7 @@ from mindspore.nn.wrap.grad_reducer import DistributedGradReducer
 from mindspore.train.parallel_utils import ParallelMode
 from mindspore.parallel._utils import _get_device_num, _get_parallel_mode, _get_mirror_mean
 from ..cell import Cell
-from ...common import Tensor, ParameterTuple
+from ...common import Tensor
 from ...common.parameter import Parameter
 from ...ops import functional as F
 from ...ops import composite as C
@@ -201,7 +201,7 @@ class TrainOneStepWithLossScaleCell(Cell):
         super(TrainOneStepWithLossScaleCell, self).__init__(auto_prefix=False)
         self.network = network
         self.network.add_flags(defer_inline=True)
-        self.weights = ParameterTuple(network.trainable_params())
+        self.weights = optimizer.parameters
         self.optimizer = optimizer
         self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True)
         self.hyper_map = C.HyperMap()

From 22866fbe2577bbb1bcfe2befe15a8acdcb8f3c7a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=A2=81=E6=88=90=E8=BE=89?=
 <liangchenghui@liangchenghuideMacBook-Pro.local>
Date: Sat, 16 May 2020 12:17:06 +0800
Subject: [PATCH 12/13] Adapt to TBE Cast operator latest interface

---
 mindspore/ccsrc/transform/op_declare.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mindspore/ccsrc/transform/op_declare.cc b/mindspore/ccsrc/transform/op_declare.cc
index 27c1d306aa..5cae6c77f7 100644
--- a/mindspore/ccsrc/transform/op_declare.cc
+++ b/mindspore/ccsrc/transform/op_declare.cc
@@ -823,7 +823,7 @@ OUTPUT_MAP(RealDiv) = {{0, OUTPUT_DESC(y)}};
 // Cast
 INPUT_MAP(Cast) = {{1, INPUT_DESC(x)}};
 INPUT_ATTR_MAP(Cast) = {{2, ATTR_DESC(dst_type, AnyTraits<GEType>())}};
-ATTR_MAP(Cast) = {{"Truncate", ATTR_DESC(truncate, AnyTraits<bool>())}};
+ATTR_MAP(Cast) = EMPTY_ATTR_MAP;
 OUTPUT_MAP(Cast) = {{0, OUTPUT_DESC(y)}};
 
 // Reciprocal

From 11089e6077154b337d7e0bc3cc7affc44c361b21 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=A2=81=E6=88=90=E8=BE=89?=
 <liangchenghui@liangchenghuideMacBook-Pro.local>
Date: Tue, 19 May 2020 10:56:22 +0800
Subject: [PATCH 13/13] Adapt to ge lib name change from ge_client_train to
 ge_runner.

---
 mindspore/ccsrc/CMakeLists.txt | 2 +-
 tests/ut/cpp/CMakeLists.txt    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/mindspore/ccsrc/CMakeLists.txt b/mindspore/ccsrc/CMakeLists.txt
index 8d3818a777..37842820a2 100644
--- a/mindspore/ccsrc/CMakeLists.txt
+++ b/mindspore/ccsrc/CMakeLists.txt
@@ -125,7 +125,7 @@ endif()
 
 if (ENABLE_GE)
     if(ENABLE_TRAIN)
-        target_link_libraries(mindspore ge_client_train hccl)
+        target_link_libraries(mindspore ge_runner hccl)
     else ()
         target_link_libraries(mindspore ge_client)
     endif ()
diff --git a/tests/ut/cpp/CMakeLists.txt b/tests/ut/cpp/CMakeLists.txt
index f5bc07ff69..8176c4fd37 100644
--- a/tests/ut/cpp/CMakeLists.txt
+++ b/tests/ut/cpp/CMakeLists.txt
@@ -128,7 +128,7 @@ add_executable(ut_tests ${UT_SRCS} ${MINDSPORE_SRC_LIST} ${UT_SUTB_SRC_LIST})
 
 if (ENABLE_GE)
     if(ENABLE_TRAIN)
-        target_link_libraries(ut_tests PRIVATE graph ge_client_train)
+        target_link_libraries(ut_tests PRIVATE graph ge_runner)
     else()
         target_link_libraries(ut_tests PRIVATE graph ge_client)
     endif()