From 92dc6de049735cd0232c9a38bd9e0d86faf52095 Mon Sep 17 00:00:00 2001 From: simson <526422051@qq.com> Date: Wed, 1 Apr 2020 11:55:29 +0800 Subject: [PATCH 01/36] modify graphengine --- .gitmodules | 3 --- graphengine | 1 - 2 files changed, 4 deletions(-) delete mode 160000 graphengine diff --git a/.gitmodules b/.gitmodules index a241b6d69b..1f5fbad2b9 100644 --- a/.gitmodules +++ b/.gitmodules @@ -10,6 +10,3 @@ [submodule "third_party/protobuf"] path = third_party/protobuf url = https://github.com/protocolbuffers/protobuf.git -[submodule "graphengine"] - path = graphengine - url = https://gitee.com/mindspore/graphengine.git diff --git a/graphengine b/graphengine deleted file mode 160000 index 5f763679fa..0000000000 --- a/graphengine +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 5f763679fa33de1608d07f7651c6f16012b953ea From f338eb3a606efc4c36bd49690a629b6ab186643f Mon Sep 17 00:00:00 2001 From: simson <526422051@qq.com> Date: Wed, 1 Apr 2020 11:57:09 +0800 Subject: [PATCH 02/36] add graphengine --- .gitmodules | 3 +++ graphengine | 1 + 2 files changed, 4 insertions(+) create mode 160000 graphengine diff --git a/.gitmodules b/.gitmodules index 1f5fbad2b9..a024019b14 100644 --- a/.gitmodules +++ b/.gitmodules @@ -10,3 +10,6 @@ [submodule "third_party/protobuf"] path = third_party/protobuf url = https://github.com/protocolbuffers/protobuf.git +[submodule "graphengine"] + path = graphengine + url = https://gitee.com/ms-incubator/graphengine.git diff --git a/graphengine b/graphengine new file mode 160000 index 0000000000..21d3700f66 --- /dev/null +++ b/graphengine @@ -0,0 +1 @@ +Subproject commit 21d3700f661576edc37607a3bc961874ee5189a7 From 6f2b7abe04a97b4b8fb3b6de51124eed95cef4e9 Mon Sep 17 00:00:00 2001 From: yanghaoran Date: Thu, 2 Apr 2020 17:15:41 +0800 Subject: [PATCH 03/36] modify reduceminD and reducemaxD IR --- graphengine | 2 +- mindspore/ccsrc/pipeline/pipeline.cc | 2 +- mindspore/ccsrc/transform/convert.cc | 16 +- mindspore/ccsrc/transform/graph_runner.cc | 7 + mindspore/ccsrc/transform/op_declare.cc | 159 +++++++++----------- mindspore/ccsrc/transform/op_declare.h | 14 +- mindspore/ccsrc/transform/util.cc | 11 +- mindspore/ccsrc/utils/context/ms_context.cc | 4 +- mindspore/ops/operations/__init__.py | 3 +- mindspore/ops/operations/nn_ops.py | 75 ++++++++- tests/ut/python/ops/test_ops.py | 5 + 11 files changed, 188 insertions(+), 110 deletions(-) diff --git a/graphengine b/graphengine index 21d3700f66..092c7a1f65 160000 --- a/graphengine +++ b/graphengine @@ -1 +1 @@ -Subproject commit 21d3700f661576edc37607a3bc961874ee5189a7 +Subproject commit 092c7a1f6548cac7d40e677af3498c3c49ea2bfd diff --git a/mindspore/ccsrc/pipeline/pipeline.cc b/mindspore/ccsrc/pipeline/pipeline.cc index 35336e975b..70ef9a5407 100644 --- a/mindspore/ccsrc/pipeline/pipeline.cc +++ b/mindspore/ccsrc/pipeline/pipeline.cc @@ -1071,7 +1071,7 @@ bool ExecutorPy::AddDFGraph(const py::dict& init_params, const std::string& phas } std::string init_graph = "init_subgraph." + net_id; std::string checkpoint_name = "save." 
+ net_id; - if (phase == "train") { + if (phase.find("train") != std::string::npos) { (void)DfGraphManager::GetInstance().AddGraph(phase, convertor.GetComputeGraph(), {{"ge.exec.variable_acc", "1"}}); } else { (void)DfGraphManager::GetInstance().AddGraph(phase, convertor.GetComputeGraph()); diff --git a/mindspore/ccsrc/transform/convert.cc b/mindspore/ccsrc/transform/convert.cc index 74b0695cff..87bfc8f6d8 100755 --- a/mindspore/ccsrc/transform/convert.cc +++ b/mindspore/ccsrc/transform/convert.cc @@ -171,6 +171,7 @@ const char kNameAbsGrad[] = "AbsGrad"; const char kNameBinaryCrossEntropy[] = "BinaryCrossEntropy"; const char kNameBinaryCrossEntropyGrad[] = "BinaryCrossEntropyGrad"; const char kNameSparseApplyAdagrad[] = "SparseApplyAdagrad"; +const char kNameSparseApplyFtrlD[] = "SparseApplyFtrlD"; const char kNameSpaceToDepth[] = "SpaceToDepth"; const char kNameDepthToSpace[] = "DepthToSpace"; const char kNameSign[] = "Sign"; @@ -189,7 +190,7 @@ std::unordered_map &DfGraphConvertor::get_adpt_ma {string(kNameApplyMomentum), ADPT_DESC(ApplyMomentum)}, {string(kNameMaxPool), ADPT_DESC(MaxPool)}, {string(kNameAvgPool), ADPT_DESC(AvgPool)}, - {string(kNameTopK), ADPT_DESC(TopKV2)}, + {string(kNameTopK), ADPT_DESC(TopK)}, {string(kNamePack), ADPT_DESC(Pack)}, {string(kNameSplitD), ADPT_DESC(SplitD)}, {string(kNameAllReduce), ADPT_DESC(HcomAllReduce)}, @@ -310,7 +311,7 @@ std::unordered_map &DfGraphConvertor::get_adpt_ma {prim::kPrimMinimum->name(), ADPT_DESC(Minimum)}, {prim::kPrimSelect->name(), ADPT_DESC(Select)}, {string(kNameLessEqual), ADPT_DESC(LessEqual)}, - {prim::kPrimLogSoftmax->name(), ADPT_DESC(LogSoftmax)}, + {prim::kPrimLogSoftmax->name(), ADPT_DESC(LogSoftmaxV2)}, {string(kNameTruncatedNormal), ADPT_DESC(TruncatedNormal)}, {string(kNameStridedSliceGrad), ADPT_DESC(StridedSliceGrad)}, {prim::kPrimGelu->name(), ADPT_DESC(Gelu)}, @@ -343,7 +344,7 @@ std::unordered_map &DfGraphConvertor::get_adpt_ma {prim::kPrimMatMul->name(), ADPT_DESC(MatMul)}, {string(kNameConst), ADPT_DESC(Constant, Const)}, - {string(kNameSoftmax), ADPT_DESC(Softmax)}, + {string(kNameSoftmax), ADPT_DESC(SoftmaxV2)}, {string(kNameSoftmaxGrad), ADPT_DESC(SoftmaxGrad)}, {string(kNameParam), ADPT_DESC(Data)}, {string(kNameROIAlign), ADPT_DESC(ROIAlign)}, @@ -353,6 +354,7 @@ std::unordered_map &DfGraphConvertor::get_adpt_ma {string(kNameBinaryCrossEntropy), ADPT_DESC(BinaryCrossEntropy)}, {string(kNameBinaryCrossEntropyGrad), ADPT_DESC(BinaryCrossEntropyGrad)}, {string(kNameSparseApplyAdagrad), ADPT_DESC(SparseApplyAdagradD)}, + {string(kNameSparseApplyFtrlD), ADPT_DESC(SparseApplyFtrlD)}, {string(kNameSpaceToDepth), ADPT_DESC(SpaceToDepth)}, {string(kNameDepthToSpace), ADPT_DESC(DepthToSpace)}, {string(kNameSign), ADPT_DESC(Sign)}, @@ -1017,8 +1019,8 @@ DfGraphConvertor &DfGraphConvertor::BuildGraph() { } } - // set up dependices - MS_LOG(DEBUG) << "set up dependices"; + // set up dependencies + MS_LOG(DEBUG) << "set up dependencies"; std::vector nodes = ::mindspore::TopoSort(anf_graph_->get_return()); for (auto &it : nodes) { SetNodeInput(it); @@ -1115,8 +1117,8 @@ void DfGraphConvertor::UpdateDataOpDesc(const AnfNodePtr &it, const OperatorPtr if (desc == nullptr) { MS_LOG(ERROR) << "Update data op descriptor failed! 
TensorDesc is null."; } else { - (void)std::static_pointer_cast(op)->update_input_desc_data(*desc); - (void)std::static_pointer_cast(op)->update_output_desc_out(*desc); + (void)std::static_pointer_cast(op)->update_input_desc_x(*desc); + (void)std::static_pointer_cast(op)->update_output_desc_y(*desc); } } diff --git a/mindspore/ccsrc/transform/graph_runner.cc b/mindspore/ccsrc/transform/graph_runner.cc index e77b1bcd73..2bff1a740c 100644 --- a/mindspore/ccsrc/transform/graph_runner.cc +++ b/mindspore/ccsrc/transform/graph_runner.cc @@ -135,6 +135,13 @@ Status GraphRunner::RunGraph(const RunOptions& options, const std::vectorIsGraphNeedRebuild(wrap_ptr->id_)) { + sess_->RemoveGraph(wrap_ptr->id_); + sess_->AddGraph(wrap_ptr->id_, *(wrap_ptr->graph_ptr_), wrap_ptr->options_); + } + ge::Status ret = sess_->RunGraph(wrap_ptr->id_, ge_inputs, ge_outputs); if (ret != ge::GRAPH_SUCCESS) { MS_LOG(ERROR) << "Call GE RunGraph Failed, ret is: " << ret; diff --git a/mindspore/ccsrc/transform/op_declare.cc b/mindspore/ccsrc/transform/op_declare.cc index 78b949c525..07c5e9f5fe 100755 --- a/mindspore/ccsrc/transform/op_declare.cc +++ b/mindspore/ccsrc/transform/op_declare.cc @@ -138,11 +138,10 @@ OUTPUT_MAP(ApplyMomentum) = {{0, OUTPUT_DESC(var)}}; INPUT_MAP(Summary) = {{2, INPUT_DESC(x)}}; ATTR_MAP(Summary) = EMPTY_ATTR_MAP; -// data +// Data INPUT_MAP(Data) = EMPTY_INPUT_MAP; ATTR_MAP(Data) = EMPTY_ATTR_MAP; -// resnet ops in ge // BatchNorm INPUT_MAP(BatchNorm) = {{1, INPUT_DESC(x)}, {2, INPUT_DESC(scale)}, @@ -194,9 +193,9 @@ OUTPUT_MAP(PRelu) = {{0, OUTPUT_DESC(y)}}; // PReluGrad INPUT_MAP(PReluGrad) = { - {1, INPUT_DESC(input_gradients)}, {2, INPUT_DESC(input_features)}, {3, INPUT_DESC(input_weights)}}; + {1, INPUT_DESC(grads)}, {2, INPUT_DESC(features)}, {3, INPUT_DESC(weights)}}; ATTR_MAP(PReluGrad) = EMPTY_ATTR_MAP; -OUTPUT_MAP(PReluGrad) = {{0, OUTPUT_DESC(output_backprops_dx)}, {1, OUTPUT_DESC(output_backprops_da)}}; +OUTPUT_MAP(PReluGrad) = {{0, OUTPUT_DESC(dx)}, {1, OUTPUT_DESC(da)}}; // Sigmoid INPUT_MAP(Sigmoid) = {{1, INPUT_DESC(x)}}; @@ -241,12 +240,12 @@ ATTR_MAP(CumsumD) = {{"exclusive", ATTR_DESC(exclusive, AnyTraits())}, {"reverse", ATTR_DESC(reverse, AnyTraits())}}; OUTPUT_MAP(CumsumD) = {{0, OUTPUT_DESC(y)}}; -// softmax -INPUT_MAP(Softmax) = {{1, INPUT_DESC(x)}}; -ATTR_MAP(Softmax) = { - {"axis", ATTR_DESC(axis, AnyTraits>(), AnyTraits>())}, +// SoftmaxV2 +INPUT_MAP(SoftmaxV2) = {{1, INPUT_DESC(x)}}; +ATTR_MAP(SoftmaxV2) = { + {"axis", ATTR_DESC(axes, AnyTraits>(), AnyTraits>())}, }; -OUTPUT_MAP(Softmax) = {{0, OUTPUT_DESC(y)}}; +OUTPUT_MAP(SoftmaxV2) = {{0, OUTPUT_DESC(y)}}; // SoftmaxGrad INPUT_MAP(SoftmaxGrad) = {{1, INPUT_DESC(softmax)}, {2, INPUT_DESC(grad_softmax)}}; @@ -269,21 +268,21 @@ ATTR_MAP(GatherV2) = EMPTY_ATTR_MAP; OUTPUT_MAP(GatherV2) = {{0, OUTPUT_DESC(y)}}; // ReduceSum -INPUT_MAP(ReduceSum) = {{1, INPUT_DESC(x)}, {2, INPUT_DESC(axis)}}; +INPUT_MAP(ReduceSum) = {{1, INPUT_DESC(x)}, {2, INPUT_DESC(axes)}}; ATTR_MAP(ReduceSum) = {{"keep_dims", ATTR_DESC(keep_dims, AnyTraits())}}; OUTPUT_MAP(ReduceSum) = {{0, OUTPUT_DESC(y)}}; // ReduceSumD INPUT_MAP(ReduceSumD) = {{1, INPUT_DESC(x)}}; INPUT_ATTR_MAP(ReduceSumD) = { - {2, ATTR_DESC(axis, AnyTraits>(), AnyTraits>())}}; + {2, ATTR_DESC(axes, AnyTraits>(), AnyTraits>())}}; ATTR_MAP(ReduceSumD) = {{"keep_dims", ATTR_DESC(keep_dims, AnyTraits())}}; OUTPUT_MAP(ReduceSumD) = {{0, OUTPUT_DESC(y)}}; // ReduceProdD INPUT_MAP(ReduceProdD) = {{1, INPUT_DESC(x)}}; INPUT_ATTR_MAP(ReduceProdD) = { - {2, ATTR_DESC(axis, 
AnyTraits>(), AnyTraits>())}}; + {2, ATTR_DESC(axes, AnyTraits>(), AnyTraits>())}}; ATTR_MAP(ReduceProdD) = {{"keep_dims", ATTR_DESC(keep_dims, AnyTraits())}}; OUTPUT_MAP(ReduceProdD) = {{0, OUTPUT_DESC(y)}}; @@ -294,7 +293,7 @@ ATTR_MAP(CumprodD) = {{"exclusive", ATTR_DESC(exclusive, AnyTraits())}, {"reverse", ATTR_DESC(reverse, AnyTraits())}}; OUTPUT_MAP(CumprodD) = {{0, OUTPUT_DESC(y)}}; -// SoftmaxCrossEntropyWithLogits/ +// SoftmaxCrossEntropyWithLogits INPUT_MAP(SoftmaxCrossEntropyWithLogits) = {{1, INPUT_DESC(features)}, {2, INPUT_DESC(labels)}}; ATTR_MAP(SoftmaxCrossEntropyWithLogits) = EMPTY_ATTR_MAP; OUTPUT_MAP(SoftmaxCrossEntropyWithLogits) = {{0, OUTPUT_DESC(loss)}, {1, OUTPUT_DESC(backprop)}}; @@ -306,7 +305,7 @@ INPUT_ATTR_MAP(MeanGrad) = {{2, ATTR_DESC(mean_grad_output_shape_value, kOpForma ATTR_MAP(MeanGrad) = {{"mode", ATTR_DESC(mode, AnyTraits())}}; INPUT_MAP(SliceD) = {{1, INPUT_DESC(x)}}; -INPUT_ATTR_MAP(SliceD) = {{2, ATTR_DESC(begin, AnyTraits(), AnyTraits>())}, +INPUT_ATTR_MAP(SliceD) = {{2, ATTR_DESC(offsets, AnyTraits(), AnyTraits>())}, {3, ATTR_DESC(size, AnyTraits(), AnyTraits>())}}; ATTR_MAP(SliceD) = EMPTY_ATTR_MAP; OUTPUT_MAP(SliceD) = {{0, OUTPUT_DESC(y)}}; @@ -401,42 +400,10 @@ ATTR_MAP(BoundingBoxDecode) = { }; OUTPUT_MAP(BoundingBoxDecode) = {{0, OUTPUT_DESC(bboxes)}}; -#ifdef VALID_CODE - -// Less -INPUT_MAP(Less) = {{1, INPUT_DESC(x)}, {2, INPUT_DESC(y)}}; -ATTR_MAP(Less) = EMPTY_ATTR_MAP; -OUTPUT_MAP(Less) = {{0, OUTPUT_DESC(z)}}; - -// Cast -INPUT_MAP(Cast) = {{1, INPUT_DESC(x)}}; -INPUT_ATTR_MAP(Cast) = {{2, ATTR_DESC(dst_type, AnyTraits())}}; -ATTR_MAP(Cast) = {{"Truncate", ATTR_DESC(truncate, AnyTraits())}}; -OUTPUT_MAP(Cast) = {{0, OUTPUT_DESC(y)}}; - -// Minimum -INPUT_MAP(Minimum) = {{1, INPUT_DESC(x)}, {2, INPUT_DESC(y)}}; -ATTR_MAP(Minimum) = {{"alpha", ATTR_DESC(alpha, AnyTraits())}, {"beta", ATTR_DESC(beta, AnyTraits())}}; -OUTPUT_MAP(Minimum) = {{0, OUTPUT_DESC(z)}}; - -// Sub -INPUT_MAP(Sub) = {{1, INPUT_DESC(x1)}, {2, INPUT_DESC(x2)}}; -ATTR_MAP(Sub) = {{"alpha", ATTR_DESC(alpha, AnyTraits())}, {"beta", ATTR_DESC(beta, AnyTraits())}}; - -#endif - -// TopKV2 -INPUT_MAP(TopKV2) = { - {1, INPUT_DESC(input)}, - {2, INPUT_DESC(k)}, -}; - -ATTR_MAP(TopKV2) = {{"T", ATTR_DESC(T, AnyTraits())}, {"sorted", ATTR_DESC(sorted, AnyTraits())}}; - -OUTPUT_MAP(TopKV2) = { - {0, OUTPUT_DESC(values)}, - {1, OUTPUT_DESC(indices)}, -}; +// TopK +INPUT_MAP(TopK) = {{1, INPUT_DESC(x)}, {2, INPUT_DESC(k)}}; +ATTR_MAP(TopK) = {{"sorted", ATTR_DESC(sorted, AnyTraits())}}; +OUTPUT_MAP(TopK) = {{0, OUTPUT_DESC(values)}, {1, OUTPUT_DESC(indices)}}; // Multiply INPUT_MAP(Multiply) = {{1, INPUT_DESC(x)}, {2, INPUT_DESC(y)}}; @@ -476,7 +443,7 @@ ATTR_MAP(Iou) = {{"mode", ATTR_DESC(mode, AnyTraits())}}; OUTPUT_MAP(Iou) = {{0, OUTPUT_DESC(overlap)}}; // ResizeNearestNeighborD -INPUT_MAP(ResizeNearestNeighborD) = {{1, INPUT_DESC(images)}}; +INPUT_MAP(ResizeNearestNeighborD) = {{1, INPUT_DESC(x)}}; ATTR_MAP(ResizeNearestNeighborD) = { {"size", ATTR_DESC(size, AnyTraits>(), AnyTraits>())}, {"align_corners", ATTR_DESC(align_corners, AnyTraits())}}; @@ -506,17 +473,17 @@ ATTR_MAP(Relu6) = EMPTY_ATTR_MAP; OUTPUT_MAP(Relu6) = {{0, OUTPUT_DESC(activations)}}; // Relu6Grad -INPUT_MAP(Relu6Grad) = {{1, INPUT_DESC(dy)}, {2, INPUT_DESC(y)}}; +INPUT_MAP(Relu6Grad) = {{1, INPUT_DESC(features)}, {2, INPUT_DESC(gradients)}}; ATTR_MAP(Relu6Grad) = EMPTY_ATTR_MAP; -OUTPUT_MAP(Relu6Grad) = {{0, OUTPUT_DESC(z)}}; +OUTPUT_MAP(Relu6Grad) = {{0, OUTPUT_DESC(backprops)}}; // ResizeBilinearGrad 
INPUT_MAP(ResizeBilinearGrad) = {{1, INPUT_DESC(grads)}, {2, INPUT_DESC(original_image)}}; ATTR_MAP(ResizeBilinearGrad) = {{"align_corners", ATTR_DESC(align_corners, AnyTraits())}}; OUTPUT_MAP(ResizeBilinearGrad) = {{0, OUTPUT_DESC(y)}}; -// ResizeBilinear -INPUT_MAP(ResizeBilinearD) = {{1, INPUT_DESC(images)}}; +// ResizeBilinearD +INPUT_MAP(ResizeBilinearD) = {{1, INPUT_DESC(x)}}; ATTR_MAP(ResizeBilinearD) = { {"size", ATTR_DESC(size, AnyTraits>(), AnyTraits>())}, {"align_corners", ATTR_DESC(align_corners, AnyTraits())}}; @@ -539,9 +506,9 @@ OUTPUT_MAP(NMSWithMask) = { {0, OUTPUT_DESC(selected_boxes)}, {1, OUTPUT_DESC(selected_idx)}, {2, OUTPUT_DESC(selected_mask)}}; // Unpack -INPUT_MAP(Unpack) = {{1, INPUT_DESC(value)}}; +INPUT_MAP(Unpack) = {{1, INPUT_DESC(x)}}; ATTR_MAP(Unpack) = {{"axis", ATTR_DESC(axis, AnyTraits())}, {"num", ATTR_DESC(num, AnyTraits())}}; -DYN_OUTPUT_MAP(Unpack) = {{0, DYN_OUTPUT_DESC(output)}}; +DYN_OUTPUT_MAP(Unpack) = {{0, DYN_OUTPUT_DESC(y)}}; // ScatterNdUpdate INPUT_MAP(ScatterNdUpdate) = {{1, INPUT_DESC(var)}, {2, INPUT_DESC(indices)}, {3, INPUT_DESC(updates)}}; @@ -574,8 +541,8 @@ INPUT_MAP(SigmoidCrossEntropyWithLogitsGrad) = { ATTR_MAP(SigmoidCrossEntropyWithLogitsGrad) = EMPTY_ATTR_MAP; OUTPUT_MAP(SigmoidCrossEntropyWithLogitsGrad) = {{0, OUTPUT_DESC(gradient)}}; -// ScatterNd -INPUT_MAP(ScatterNdD) = {{1, INPUT_DESC(indices)}, {2, INPUT_DESC(updates)}}; +// ScatterNdD +INPUT_MAP(ScatterNdD) = {{1, INPUT_DESC(indices)}, {2, INPUT_DESC(x)}}; INPUT_ATTR_MAP(ScatterNdD) = { {3, ATTR_DESC(shape, AnyTraits>(), AnyTraits>())}}; ATTR_MAP(ScatterNdD) = EMPTY_ATTR_MAP; @@ -587,7 +554,7 @@ ATTR_MAP(PadD) = {{"paddings", ATTR_DESC(paddings, AnyTraits())}, - {"output_type", ATTR_DESC(output_type, AnyTraits())}}; + {"output_type", ATTR_DESC(dtype, AnyTraits())}}; OUTPUT_MAP(ArgMaxD) = {{0, OUTPUT_DESC(y)}}; // ArgMinD INPUT_MAP(ArgMinD) = {{1, INPUT_DESC(x)}}; ATTR_MAP(ArgMinD) = {{"axis", ATTR_DESC(dimension, AnyTraits())}, - {"output_type", ATTR_DESC(output_type, AnyTraits())}}; + {"output_type", ATTR_DESC(dtype, AnyTraits())}}; OUTPUT_MAP(ArgMinD) = {{0, OUTPUT_DESC(y)}}; // ArgMaxWithValue @@ -634,14 +601,14 @@ ATTR_MAP(ArgMinWithValue) = {{"axis", ATTR_DESC(dimension, AnyTraits())}, OUTPUT_MAP(ArgMinWithValue) = {{0, OUTPUT_DESC(indice)}, {1, OUTPUT_DESC(values)}}; // ReduceAll -INPUT_MAP(ReduceAll) = {{1, INPUT_DESC(x)}, {2, INPUT_DESC(axis)}}; +INPUT_MAP(ReduceAll) = {{1, INPUT_DESC(x)}, {2, INPUT_DESC(axes)}}; ATTR_MAP(ReduceAll) = {{"keep_dims", ATTR_DESC(keep_dims, AnyTraits())}}; OUTPUT_MAP(ReduceAll) = {{0, OUTPUT_DESC(y)}}; // ReduceMeanD INPUT_MAP(ReduceMeanD) = {{1, INPUT_DESC(x)}}; INPUT_ATTR_MAP(ReduceMeanD) = { - {2, ATTR_DESC(axis, AnyTraits>(), AnyTraits>())}}; + {2, ATTR_DESC(axes, AnyTraits>(), AnyTraits>())}}; ATTR_MAP(ReduceMeanD) = {{"keep_dims", ATTR_DESC(keep_dims, AnyTraits())}}; OUTPUT_MAP(ReduceMeanD) = {{0, OUTPUT_DESC(y)}}; @@ -708,11 +675,12 @@ INPUT_MAP(BiasAddGrad) = {{1, INPUT_DESC(x)}}; ATTR_MAP(BiasAddGrad) = {{"data_format", ATTR_DESC(data_format, AnyTraits())}}; OUTPUT_MAP(BiasAddGrad) = {{0, OUTPUT_DESC(y)}}; -// maxpoolgrad +// MaxPoolGrad INPUT_MAP(MaxPoolGrad) = {{1, INPUT_DESC(x1)}, {2, INPUT_DESC(x2)}, {3, INPUT_DESC(grad)}}; ATTR_MAP(MaxPoolGrad) = {{"ksize", ATTR_DESC(ksize, AnyTraits(), AnyTraits>())}, {"strides", ATTR_DESC(strides, AnyTraits(), AnyTraits>())}, - {"padding", ATTR_DESC(padding, AnyTraits())}}; + {"padding", ATTR_DESC(padding, AnyTraits())}, + {"data_format", ATTR_DESC(data_format, AnyTraits())}}; 
OUTPUT_MAP(MaxPoolGrad) = {{0, OUTPUT_DESC(y)}}; // avgpoolgrad @@ -739,28 +707,34 @@ ATTR_MAP(Conv2D) = { {"stride", ATTR_DESC(strides, "pad", AnyTraits>())}, {"pad_list", ATTR_DESC(pads, AnyTraits>(), AnyTraits>())}, {"dilation", ATTR_DESC(dilations, "pad", AnyTraits>())}, + {"data_format", ATTR_DESC(data_format, AnyTraits())}, + {"group", ATTR_DESC(groups, AnyTraits())} }; OUTPUT_MAP(Conv2D) = {{0, OUTPUT_DESC(y)}}; // Conv2DBackpropInputD -INPUT_MAP(Conv2DBackpropInputD) = {{1, INPUT_DESC(out_backprop)}, {2, INPUT_DESC(filters)}}; +INPUT_MAP(Conv2DBackpropInputD) = {{1, INPUT_DESC(out_backprop)}, {2, INPUT_DESC(filter)}}; INPUT_ATTR_MAP(Conv2DBackpropInputD) = { - {3, ATTR_DESC(input_sizes, AnyTraits>(), AnyTraits>())}}; + {3, ATTR_DESC(input_size, AnyTraits>(), AnyTraits>())}}; ATTR_MAP(Conv2DBackpropInputD) = { {"pad_list", ATTR_DESC(pads, AnyTraits>(), AnyTraits>())}, - {"stride", ATTR_DESC(strides, "strides", AnyTraits>())}, + {"stride", ATTR_DESC(strides, "pad", AnyTraits>())}, {"dilation", ATTR_DESC(dilations, "pad", AnyTraits>())}, + {"data_format", ATTR_DESC(data_format, AnyTraits())}, + {"group", ATTR_DESC(groups, AnyTraits())} }; OUTPUT_MAP(Conv2DBackpropInputD) = {{0, OUTPUT_DESC(y)}}; // Conv2DBackpropFilterD INPUT_MAP(Conv2DBackpropFilterD) = {{1, INPUT_DESC(out_backprop)}, {2, INPUT_DESC(x)}}; INPUT_ATTR_MAP(Conv2DBackpropFilterD) = { - {3, ATTR_DESC(filter_sizes, AnyTraits>(), AnyTraits>())}}; + {3, ATTR_DESC(filter_size, AnyTraits>(), AnyTraits>())}}; ATTR_MAP(Conv2DBackpropFilterD) = { {"pad_list", ATTR_DESC(pads, AnyTraits>(), AnyTraits>())}, - {"stride", ATTR_DESC(strides, "strides", AnyTraits>())}, + {"stride", ATTR_DESC(strides, "pad", AnyTraits>())}, {"dilation", ATTR_DESC(dilations, "pad", AnyTraits>())}, + {"data_format", ATTR_DESC(data_format, AnyTraits())}, + {"group", ATTR_DESC(groups, AnyTraits())} }; OUTPUT_MAP(Conv2DBackpropFilterD) = {{0, OUTPUT_DESC(y)}}; @@ -798,8 +772,8 @@ OUTPUT_MAP(DepthwiseConv2DBackpropFilterD) = {{0, OUTPUT_DESC(filter_grad)}}; // MatMul INPUT_MAP(MatMul) = {{1, INPUT_DESC(x1)}, {2, INPUT_DESC(x2)}}; -ATTR_MAP(MatMul) = {{"transpose_a", ATTR_DESC(transpose_a, AnyTraits())}, - {"transpose_b", ATTR_DESC(transpose_b, AnyTraits())}}; +ATTR_MAP(MatMul) = {{"transpose_a", ATTR_DESC(transpose_x1, AnyTraits())}, + {"transpose_b", ATTR_DESC(transpose_x2, AnyTraits())}}; OUTPUT_MAP(MatMul) = {{0, OUTPUT_DESC(y)}}; // Merge @@ -846,10 +820,10 @@ ATTR_MAP(Sub) = EMPTY_ATTR_MAP; OUTPUT_MAP(Sub) = {{0, OUTPUT_DESC(y)}}; // SplitD -INPUT_MAP(SplitD) = {{1, INPUT_DESC(value)}}; +INPUT_MAP(SplitD) = {{1, INPUT_DESC(x)}}; ATTR_MAP(SplitD) = {{"axis", ATTR_DESC(split_dim, AnyTraits())}, {"output_num", ATTR_DESC(num_split, AnyTraits())}}; -DYN_OUTPUT_MAP(SplitD) = {{0, DYN_OUTPUT_DESC(output)}}; +DYN_OUTPUT_MAP(SplitD) = {{0, DYN_OUTPUT_DESC(y)}}; // Neg INPUT_MAP(Neg) = {{1, INPUT_DESC(x)}}; @@ -876,12 +850,12 @@ OUTPUT_MAP(Pack) = {{0, OUTPUT_DESC(y)}}; // ConcatD INPUT_MAP(ConcatD) = EMPTY_INPUT_MAP; -DYN_INPUT_MAP(ConcatD) = {{1, DYN_INPUT_DESC(input_values)}}; +DYN_INPUT_MAP(ConcatD) = {{1, DYN_INPUT_DESC(x)}}; ATTR_MAP(ConcatD) = { {"axis", ATTR_DESC(concat_dim, AnyTraits())}, {"inputNums", ATTR_DESC(N, AnyTraits())}, }; -OUTPUT_MAP(ConcatD) = {{0, OUTPUT_DESC(output_data)}}; +OUTPUT_MAP(ConcatD) = {{0, OUTPUT_DESC(y)}}; // Less INPUT_MAP(Less) = {{1, INPUT_DESC(x1)}, {2, INPUT_DESC(x2)}}; @@ -916,14 +890,14 @@ OUTPUT_MAP(TanhGrad) = {{0, OUTPUT_DESC(z)}}; // ReduceMinD INPUT_MAP(ReduceMinD) = {{1, INPUT_DESC(x)}}; INPUT_ATTR_MAP(ReduceMinD) = 
{ - {2, ATTR_DESC(axis, AnyTraits>(), AnyTraits>())}}; + {2, ATTR_DESC(axes, AnyTraits>(), AnyTraits>())}}; ATTR_MAP(ReduceMinD) = {{"keep_dims", ATTR_DESC(keep_dims, AnyTraits())}}; OUTPUT_MAP(ReduceMinD) = {{0, OUTPUT_DESC(y)}}; // ReduceMaxD INPUT_MAP(ReduceMaxD) = {{1, INPUT_DESC(x)}}; INPUT_ATTR_MAP(ReduceMaxD) = { - {2, ATTR_DESC(axis, AnyTraits>(), AnyTraits>())}}; + {2, ATTR_DESC(axes, AnyTraits>(), AnyTraits>())}}; ATTR_MAP(ReduceMaxD) = {{"keep_dims", ATTR_DESC(keep_dims, AnyTraits())}}; OUTPUT_MAP(ReduceMaxD) = {{0, OUTPUT_DESC(y)}}; @@ -1008,11 +982,11 @@ INPUT_MAP(LessEqual) = {{1, INPUT_DESC(x1)}, {2, INPUT_DESC(x2)}}; ATTR_MAP(LessEqual) = EMPTY_ATTR_MAP; OUTPUT_MAP(LessEqual) = {{0, OUTPUT_DESC(y)}}; -// LogSoftmax -INPUT_MAP(LogSoftmax) = {{1, INPUT_DESC(logits)}}; -ATTR_MAP(LogSoftmax) = { - {"axis", ATTR_DESC(axis, AnyTraits>(), AnyTraits>())}}; -OUTPUT_MAP(LogSoftmax) = {{0, OUTPUT_DESC(logsoftmax)}}; +// LogSoftmaxV2 +INPUT_MAP(LogSoftmaxV2) = {{1, INPUT_DESC(logits)}}; +ATTR_MAP(LogSoftmaxV2) = { + {"axis", ATTR_DESC(axes, AnyTraits>(), AnyTraits>())}}; +OUTPUT_MAP(LogSoftmaxV2) = {{0, OUTPUT_DESC(logsoftmax)}}; // RandomChoiceWithMask INPUT_MAP(RandomChoiceWithMask) = {{1, INPUT_DESC(x)}}; @@ -1094,8 +1068,8 @@ OUTPUT_MAP(LayerNormGrad) = {{0, OUTPUT_DESC(pd_x)}, {1, OUTPUT_DESC(pd_gamma)}, // BatchMatMul INPUT_MAP(BatchMatMul) = {{1, INPUT_DESC(x1)}, {2, INPUT_DESC(x2)}}; -ATTR_MAP(BatchMatMul) = {{"transpose_x1", ATTR_DESC(adj_x, AnyTraits())}, - {"transpose_x2", ATTR_DESC(adj_y, AnyTraits())}}; +ATTR_MAP(BatchMatMul) = {{"transpose_x1", ATTR_DESC(adj_x1, AnyTraits())}, + {"transpose_x2", ATTR_DESC(adj_x2, AnyTraits())}}; OUTPUT_MAP(BatchMatMul) = {{0, OUTPUT_DESC(y)}}; // DropoutDoMask @@ -1146,6 +1120,19 @@ ATTR_MAP(SparseApplyAdagradD) = {{"lr", ATTR_DESC(lr, AnyTraits())}, {"use_locking", ATTR_DESC(use_locking, AnyTraits())}}; OUTPUT_MAP(SparseApplyAdagradD) = {{0, OUTPUT_DESC(var)}}; +// SparseApplyFtrlD +INPUT_MAP(SparseApplyFtrlD) = {{1, INPUT_DESC(var)}, + {2, INPUT_DESC(accum)}, + {3, INPUT_DESC(linear)}, + {4, INPUT_DESC(grad)}, + {5, INPUT_DESC(indices)}}; +ATTR_MAP(SparseApplyFtrlD) = {{"use_locking", ATTR_DESC(use_locking, AnyTraits())}, + {"lr", ATTR_DESC(lr, AnyTraits())}, + {"l1", ATTR_DESC(l1, AnyTraits())}, + {"l2", ATTR_DESC(l2, AnyTraits())}, + {"lr_power", ATTR_DESC(lr_power, AnyTraits())}}; +OUTPUT_MAP(SparseApplyFtrlD) = {{0, OUTPUT_DESC(var)}}; + // SpaceToDepth INPUT_MAP(SpaceToDepth) = {{1, INPUT_DESC(x)}}; ATTR_MAP(SpaceToDepth) = {{"block_size", ATTR_DESC(block_size, AnyTraits())}}; diff --git a/mindspore/ccsrc/transform/op_declare.h b/mindspore/ccsrc/transform/op_declare.h index 03463b978f..9e4f407ebb 100755 --- a/mindspore/ccsrc/transform/op_declare.h +++ b/mindspore/ccsrc/transform/op_declare.h @@ -209,8 +209,8 @@ DECLARE_OP_USE_OUTPUT(Merge) DECLARE_OP_ADAPTER(Switch) DECLARE_OP_USE_OUTPUT(Switch) -DECLARE_OP_ADAPTER(TopKV2) -DECLARE_OP_USE_OUTPUT(TopKV2) +DECLARE_OP_ADAPTER(TopK) +DECLARE_OP_USE_OUTPUT(TopK) DECLARE_OP_ADAPTER(RealDiv) DECLARE_OP_USE_OUTPUT(RealDiv) @@ -260,8 +260,8 @@ DECLARE_OP_ADAPTER(Select) DECLARE_OP_USE_OUTPUT(Select) DECLARE_OP_ADAPTER(LessEqual) DECLARE_OP_USE_OUTPUT(LessEqual) -DECLARE_OP_ADAPTER(LogSoftmax) -DECLARE_OP_USE_OUTPUT(LogSoftmax) +DECLARE_OP_ADAPTER(LogSoftmaxV2) +DECLARE_OP_USE_OUTPUT(LogSoftmaxV2) DECLARE_OP_ADAPTER(TruncatedNormal) DECLARE_OP_USE_OUTPUT(TruncatedNormal) DECLARE_OP_ADAPTER(StridedSliceGrad) @@ -391,8 +391,8 @@ DECLARE_OP_ADAPTER(Sigmoid) DECLARE_OP_USE_OUTPUT(Sigmoid) 
DECLARE_OP_ADAPTER(SigmoidGrad) DECLARE_OP_USE_OUTPUT(SigmoidGrad) -DECLARE_OP_ADAPTER(Softmax) -DECLARE_OP_USE_OUTPUT(Softmax) +DECLARE_OP_ADAPTER(SoftmaxV2) +DECLARE_OP_USE_OUTPUT(SoftmaxV2) DECLARE_OP_ADAPTER(SoftmaxGrad) DECLARE_OP_USE_OUTPUT(SoftmaxGrad) DECLARE_OP_ADAPTER(Greater) @@ -435,6 +435,8 @@ DECLARE_OP_ADAPTER(Round) DECLARE_OP_USE_OUTPUT(Round) DECLARE_OP_ADAPTER(ApplyFtrl) DECLARE_OP_USE_OUTPUT(ApplyFtrl) +DECLARE_OP_ADAPTER(SparseApplyFtrlD) +DECLARE_OP_USE_OUTPUT(SparseApplyFtrlD) #ifdef ENABLE_GE DECLARE_OP_ADAPTER(Print) DECLARE_OP_USE_DYN_INPUT(Print) diff --git a/mindspore/ccsrc/transform/util.cc b/mindspore/ccsrc/transform/util.cc index a106a20ad8..0a18763d12 100644 --- a/mindspore/ccsrc/transform/util.cc +++ b/mindspore/ccsrc/transform/util.cc @@ -361,12 +361,11 @@ MeTensorPtr TransformUtil::GenerateMeTensor(const GeTensorPtr& ge_tensor, const MS_LOG(ERROR) << "GE tensor data size is zero!"; return nullptr; } - errno_t ret = memcpy_s(me_data_ptr, me_data_size, ge_tensor->GetData(), ge_tensor->GetSize()); - if (ret != EOK) { - MS_LOG(INFO) << "GE tensor data size is " << ge_tensor->GetSize() << " bytes"; - MS_LOG(ERROR) << "Copy GE tensor data to me tensor failed"; - return nullptr; - } + + // Use memcpy here, not memcpy_s, just because the size of ge_tensor may be bigger than 2GB + // which is the size limit of memcpy_s + memcpy(me_data_ptr, ge_tensor->GetData(), ge_tensor->GetSize()); + return make_shared(me_tensor); } diff --git a/mindspore/ccsrc/utils/context/ms_context.cc b/mindspore/ccsrc/utils/context/ms_context.cc index bf05af9858..e9b4586b21 100644 --- a/mindspore/ccsrc/utils/context/ms_context.cc +++ b/mindspore/ccsrc/utils/context/ms_context.cc @@ -355,7 +355,9 @@ void MsContext::GetGeOptions(std::map* ge_options) con MS_LOG(ERROR) << "Set proto lib path failed!"; } - // Disbale the global variable acc, only enable it whlie adding training graph in pipeline + // Enable auto mixed precision according to the context options + (*ge_options)["ge.exec.auto_mix_precision"] = std::to_string(auto_mixed_precision_flag_); + // Disable the global variable acc, only enable it whlie adding training graph in pipeline (*ge_options)["ge.exec.variable_acc"] = "0"; #endif } diff --git a/mindspore/ops/operations/__init__.py b/mindspore/ops/operations/__init__.py index a75b078df8..77bb6d0ff3 100644 --- a/mindspore/ops/operations/__init__.py +++ b/mindspore/ops/operations/__init__.py @@ -65,7 +65,7 @@ from .nn_ops import (LSTM, SGD, Adam, ApplyMomentum, BatchNorm, SmoothL1Loss, Softmax, SoftmaxCrossEntropyWithLogits, ROIAlign, SparseSoftmaxCrossEntropyWithLogits, Tanh, - TopK, BinaryCrossEntropy, SparseApplyAdagrad, LARSUpdate, ApplyFtrl) + TopK, BinaryCrossEntropy, SparseApplyAdagrad, LARSUpdate, ApplyFtrl, SparseApplyFtrlD) from .other_ops import Assign, IOU, BoundingBoxDecode, BoundingBoxEncode, CheckValid, MakeRefKey @@ -217,6 +217,7 @@ __all__ = [ "Abs", "BinaryCrossEntropy", "SparseApplyAdagrad", + "SparseApplyFtrlD", "SpaceToDepth", "DepthToSpace", "Conv2DBackpropInput", diff --git a/mindspore/ops/operations/nn_ops.py b/mindspore/ops/operations/nn_ops.py index afa4c7dfe3..57e409b44f 100644 --- a/mindspore/ops/operations/nn_ops.py +++ b/mindspore/ops/operations/nn_ops.py @@ -2141,6 +2141,79 @@ class SparseApplyAdagrad(PrimitiveWithInfer): return var_type +class SparseApplyFtrlD(PrimitiveWithInfer): + r""" + Conduct experiment on updating on parameters related to FTRL optimization algorithm. + + .. math :: + \text{accum} = \text{grad} * \text{grad} + + .. 
math :: + \text{linear} += \text{grad} + (\text{accum} ^ {\text{-lr_power}} - + \frac{\text{accum} ^ \text{-lr_power}}{\text{lr}} * \text{var}) + + .. math :: + \text{quadratic} = {\text{1.0}/({\text{accum}^\text{lr_power} * \text{lr}}) + 2*\text{l2} + + .. math :: + \text{var} = {\text{sign}({linear}) * \text{l1} - \text{linear}})/{ quadratic } + if \vert linear \vert > l1 \ else \ 0.0 + + Args: + lr (float): Learning rate. + l1 (float): temp value NO.1. + l2 (float): temp value No.2. + lr_power (float): temp value used as power number. + use_locking (bool): If true, updating the var and accum tensors will be protected. Default: False. + + Inputs: + - **var** (Tensor) - Variable to be update. The type must be float32. + - **accum** (Tensor) - Accum to be update. The shape must be the same as `var`'s shape, + the type must be float32. + - **linear** (Tensor) - Linear to be update. The shape must be the same as `var`'s shape, + the type must be float32. + - **grad** (Tensor) - Gradient. The shape must be the same as `var`'s shape, + the type must be float32. + - **indices** (Tensor) - A vector of indices into the first dimension of 'var' and 'accum', + the shape of `indices` must be the same as `grad` in first dimension, the type must be int32. + + Output: + Tensors, has the same shape and type as `var`. + + """ + + @prim_attr_register + def __init__(self, lr, l1, l2, lr_power, use_locking=False): + """init SparseApplyFtrlD""" + self.lr = validator.check_type("lr", lr, [float]) + self.l1 = validator.check_type("l1", l1, [float]) + self.l2 = validator.check_type("l2", l2, [float]) + self.lr_power = validator.check_type("lr_power", lr_power, [float]) + self.use_locking = validator.check_type("use_locking", use_locking, [bool]) + + def infer_shape(self, var_shape, accum_shape, linear_shape, grad_shape, indices_shape): + validator.check_param_equal('var shape', var_shape, 'accum shape', accum_shape) + validator.check_param_equal('len of var shape', len(var_shape), 'len of grad shape', len(grad_shape)) + validator.check_param_equal('len of var shape', len(var_shape), 'len of linear shape', len(linear_shape)) + if len(var_shape) > 1: + validator.check_param_equal('var_shape', var_shape[1:], 'grad_shape', grad_shape[1:]) + validator.check_param_equal('var_shape', var_shape[1:], 'linear_shape', linear_shape[1:]) + validator.check_integer("len of indices shape", len(indices_shape), 1, Rel.EQ) + validator.check('the first dimension of grad', grad_shape[0], + 'the shape of indices', indices_shape[0], Rel.EQ) + + return var_shape + + def infer_dtype(self, var_type, accum_type, linear_type, grad_type, indices_type): + validator.check_subclass("var_type", var_type, mstype.tensor) + validator.check_subclass("accum_type", accum_type, mstype.tensor) + validator.check_subclass("linear_type", linear_type, mstype.tensor) + validator.check_subclass("grad_type", grad_type, mstype.tensor) + validator.check_subclass("indices_type", indices_type, mstype.tensor) + + return var_type + + class LARSUpdate(PrimitiveWithInfer): """ Conduct lars (layer-wise adaptive rate scaling) update on the square sum of gradient. 
@@ -2244,4 +2317,4 @@ class ApplyFtrl(PrimitiveWithInfer): validator.check_typename("l1", l1_type,[mstype.float16, mstype.float32]) validator.check_typename("l2", l2_type,[mstype.float16, mstype.float32]) validator.check_typename("lr_power", lr_power_type,[mstype.float16, mstype.float32]) - return var_type \ No newline at end of file + return var_type diff --git a/tests/ut/python/ops/test_ops.py b/tests/ut/python/ops/test_ops.py index bfe8075972..8d7dd95072 100755 --- a/tests/ut/python/ops/test_ops.py +++ b/tests/ut/python/ops/test_ops.py @@ -749,6 +749,11 @@ test_case_nn_ops = [ 'desc_inputs': [[3, 3], [3, 3], [3, 3], Tensor(np.ones((3,), np.int32))], 'desc_bprop': [3, 3], 'skip': ['backward']}), + ('SparseApplyFtrlD', { + 'block': P.SparseApplyFtrlD(0.1, 0.1, 0.1, -0.1), + 'desc_inputs': [[3, 3], [3, 3], [3, 3], [3, 3], Tensor(2*np.ones((3,), np.int32))], + 'desc_bprop': [3, 3], + 'skip': ['backward']}), ('Flatten_1', { 'block': NetForFlatten(), 'desc_inputs': [Tensor(np.ones([2, 3, 4]).astype(np.int32)), Tensor(np.ones([2, 12]).astype(np.int32))], From 0a977aa19dc216b43ddc8adc11490bb00d001c3e Mon Sep 17 00:00:00 2001 From: simson <526422051@qq.com> Date: Wed, 6 May 2020 18:48:30 +0800 Subject: [PATCH 04/36] revert the limitation of end learning rate --- mindspore/nn/optim/lamb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mindspore/nn/optim/lamb.py b/mindspore/nn/optim/lamb.py index e026b1c560..97a81a590b 100755 --- a/mindspore/nn/optim/lamb.py +++ b/mindspore/nn/optim/lamb.py @@ -114,7 +114,7 @@ def _check_param_value(decay_steps, warmup_steps, start_learning_rate, _ = warmup_steps validator.check_float_positive('start_learning_rate', start_learning_rate, prim_name) validator.check_float_legal_value('start_learning_rate', start_learning_rate, prim_name) - validator.check_float_positive('end_learning_rate', end_learning_rate, prim_name) + validator.check_value_type("end_learning_rate", end_learning_rate, [float], prim_name) validator.check_float_legal_value('end_learning_rate', end_learning_rate, prim_name) validator.check_float_positive('power', power, prim_name) validator.check_float_legal_value('power', power, prim_name) From 187568a833b1c7478c7526058588f2b9e755b76a Mon Sep 17 00:00:00 2001 From: zhaozhenlong Date: Wed, 6 May 2020 21:20:32 +0800 Subject: [PATCH 05/36] adapt assign assignAdd relu6 adapt ResizeNearestNeighbourV2 with grad and ApplyAdam --- mindspore/ccsrc/kernel/tbe/tbe_adapter.cc | 6 ++--- mindspore/ops/_op_impl/tbe/assign.py | 30 ++++++++++++++++++++--- mindspore/ops/_op_impl/tbe/assign_add.py | 12 +++++++++ mindspore/ops/_op_impl/tbe/relu6.py | 4 +-- 4 files changed, 43 insertions(+), 9 deletions(-) diff --git a/mindspore/ccsrc/kernel/tbe/tbe_adapter.cc b/mindspore/ccsrc/kernel/tbe/tbe_adapter.cc index 8ce5504b8e..005c290aba 100644 --- a/mindspore/ccsrc/kernel/tbe/tbe_adapter.cc +++ b/mindspore/ccsrc/kernel/tbe/tbe_adapter.cc @@ -72,10 +72,10 @@ static std::map tbe_func_adapter_map = { {"lamb_next_mv_with_decay_v1", "lamb_next_m_v_with_decay_v1"}, {"lamb_next_mv", "lamb_next_m_v"}, {"split", "split_d"}, - {"resize_nearest_neighbor", "resize_nearest_neighbor_d"}, - {"resize_nearest_neighbor_grad", "resize_nearest_neighbor_grad_d"}, + {"resize_nearest_neighbor", "resize_nearest_neighbor_v2_d"}, + {"resize_nearest_neighbor_grad", "resize_nearest_neighbor_v2_grad_d"}, {"pad", "pad_d"}, - {"adam", "apply_adam"}}; + {"adam", "apply_adam_d"}}; void TbeAdapter::NormalizeFuncName(std::string *func_name) { if (func_name == nullptr) { diff --git 
a/mindspore/ops/_op_impl/tbe/assign.py b/mindspore/ops/_op_impl/tbe/assign.py index 2fbd152c78..ff673a03c4 100644 --- a/mindspore/ops/_op_impl/tbe/assign.py +++ b/mindspore/ops/_op_impl/tbe/assign.py @@ -23,31 +23,53 @@ assign_op_info = TBERegOp("Assign") \ .compute_cost(10) \ .kernel_name("assign") \ .partial_flag(True) \ - .input(0, "resource", False, "required", "all") \ + .input(0, "ref", False, "required", "all") \ .input(1, "value", False, "required", "all") \ - .output(0, "y", False, "required", "all") \ - .dtype_format(DataType.I8_Default, DataType.I8_Default, DataType.I8_Default) \ + .output(0, "ref", False, "required", "all") \ .dtype_format(DataType.BOOL_Default, DataType.BOOL_Default, DataType.BOOL_Default) \ + .dtype_format(DataType.BOOL_5HD, DataType.BOOL_5HD, DataType.BOOL_5HD) \ + .dtype_format(DataType.BOOL_C1HWNCoC0, DataType.BOOL_C1HWNCoC0, DataType.BOOL_C1HWNCoC0) \ + .dtype_format(DataType.BOOL_FracZ, DataType.BOOL_FracZ, DataType.BOOL_FracZ) \ + .dtype_format(DataType.I8_Default, DataType.I8_Default, DataType.I8_Default) \ .dtype_format(DataType.I8_5HD, DataType.I8_5HD, DataType.I8_5HD) \ + .dtype_format(DataType.I8_C1HWNCoC0, DataType.I8_C1HWNCoC0, DataType.I8_C1HWNCoC0) \ + .dtype_format(DataType.I8_FracZ, DataType.I8_FracZ, DataType.I8_FracZ) \ .dtype_format(DataType.U8_Default, DataType.U8_Default, DataType.U8_Default) \ .dtype_format(DataType.U8_5HD, DataType.U8_5HD, DataType.U8_5HD) \ + .dtype_format(DataType.U8_C1HWNCoC0, DataType.U8_C1HWNCoC0, DataType.U8_C1HWNCoC0) \ + .dtype_format(DataType.U8_FracZ, DataType.U8_FracZ, DataType.U8_FracZ) \ .dtype_format(DataType.I16_Default, DataType.I16_Default, DataType.I16_Default) \ .dtype_format(DataType.I16_5HD, DataType.I16_5HD, DataType.I16_5HD) \ + .dtype_format(DataType.I16_C1HWNCoC0, DataType.I16_C1HWNCoC0, DataType.I16_C1HWNCoC0) \ + .dtype_format(DataType.I16_FracZ, DataType.I16_FracZ, DataType.I16_FracZ) \ .dtype_format(DataType.U16_Default, DataType.U16_Default, DataType.U16_Default) \ .dtype_format(DataType.U16_5HD, DataType.U16_5HD, DataType.U16_5HD) \ + .dtype_format(DataType.U16_C1HWNCoC0, DataType.U16_C1HWNCoC0, DataType.U16_C1HWNCoC0) \ + .dtype_format(DataType.U16_FracZ, DataType.U16_FracZ, DataType.U16_FracZ) \ .dtype_format(DataType.I32_Default, DataType.I32_Default, DataType.I32_Default) \ .dtype_format(DataType.I32_5HD, DataType.I32_5HD, DataType.I32_5HD) \ + .dtype_format(DataType.I32_C1HWNCoC0, DataType.I32_C1HWNCoC0, DataType.I32_C1HWNCoC0) \ + .dtype_format(DataType.I32_FracZ, DataType.I32_FracZ, DataType.I32_FracZ) \ .dtype_format(DataType.U32_Default, DataType.U32_Default, DataType.U32_Default) \ .dtype_format(DataType.U32_5HD, DataType.U32_5HD, DataType.U32_5HD) \ + .dtype_format(DataType.U32_C1HWNCoC0, DataType.U32_C1HWNCoC0, DataType.U32_C1HWNCoC0) \ + .dtype_format(DataType.U32_FracZ, DataType.U32_FracZ, DataType.U32_FracZ) \ .dtype_format(DataType.I64_Default, DataType.I64_Default, DataType.I64_Default) \ .dtype_format(DataType.I64_5HD, DataType.I64_5HD, DataType.I64_5HD) \ + .dtype_format(DataType.I64_C1HWNCoC0, DataType.I64_C1HWNCoC0, DataType.I64_C1HWNCoC0) \ + .dtype_format(DataType.I64_FracZ, DataType.I64_FracZ, DataType.I64_FracZ) \ .dtype_format(DataType.U64_Default, DataType.U64_Default, DataType.U64_Default) \ .dtype_format(DataType.U64_5HD, DataType.U64_5HD, DataType.U64_5HD) \ + .dtype_format(DataType.U64_C1HWNCoC0, DataType.U64_C1HWNCoC0, DataType.U64_C1HWNCoC0) \ + .dtype_format(DataType.U64_FracZ, DataType.U64_FracZ, DataType.U64_FracZ) \ .dtype_format(DataType.F16_Default, 
DataType.F16_Default, DataType.F16_Default) \ .dtype_format(DataType.F16_5HD, DataType.F16_5HD, DataType.F16_5HD) \ + .dtype_format(DataType.F16_C1HWNCoC0, DataType.F16_C1HWNCoC0, DataType.F16_C1HWNCoC0) \ + .dtype_format(DataType.F16_FracZ, DataType.F16_FracZ, DataType.F16_FracZ) \ .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default) \ .dtype_format(DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD) \ - .dtype_format(DataType.F32_FracNZ, DataType.F32_FracNZ, DataType.F32_FracNZ) \ + .dtype_format(DataType.F32_C1HWNCoC0, DataType.F32_C1HWNCoC0, DataType.F32_C1HWNCoC0) \ + .dtype_format(DataType.F32_FracZ, DataType.F32_FracZ, DataType.F32_FracZ) \ .get_op_info() diff --git a/mindspore/ops/_op_impl/tbe/assign_add.py b/mindspore/ops/_op_impl/tbe/assign_add.py index 2b20a7781d..7ad23ff3bc 100644 --- a/mindspore/ops/_op_impl/tbe/assign_add.py +++ b/mindspore/ops/_op_impl/tbe/assign_add.py @@ -28,16 +28,28 @@ assign_add_op_info = TBERegOp("AssignAdd") \ .output(0, "ref", False, "required", "all") \ .dtype_format(DataType.I8_Default, DataType.I8_Default, DataType.I8_Default) \ .dtype_format(DataType.I8_5HD, DataType.I8_5HD, DataType.I8_5HD) \ + .dtype_format(DataType.I8_C1HWNCoC0, DataType.I8_C1HWNCoC0, DataType.I8_C1HWNCoC0) \ + .dtype_format(DataType.I8_FracZ, DataType.I8_FracZ, DataType.I8_FracZ) \ .dtype_format(DataType.U8_Default, DataType.U8_Default, DataType.U8_Default) \ .dtype_format(DataType.U8_5HD, DataType.U8_5HD, DataType.U8_5HD) \ + .dtype_format(DataType.U8_C1HWNCoC0, DataType.U8_C1HWNCoC0, DataType.U8_C1HWNCoC0) \ + .dtype_format(DataType.U8_FracZ, DataType.U8_FracZ, DataType.U8_FracZ) \ .dtype_format(DataType.I32_Default, DataType.I32_Default, DataType.I32_Default) \ .dtype_format(DataType.I32_5HD, DataType.I32_5HD, DataType.I32_5HD) \ + .dtype_format(DataType.I32_C1HWNCoC0, DataType.I32_C1HWNCoC0, DataType.I32_C1HWNCoC0) \ + .dtype_format(DataType.I32_FracZ, DataType.I32_FracZ, DataType.I32_FracZ) \ .dtype_format(DataType.I64_Default, DataType.I64_Default, DataType.I64_Default) \ .dtype_format(DataType.I64_5HD, DataType.I64_5HD, DataType.I64_5HD) \ + .dtype_format(DataType.I64_C1HWNCoC0, DataType.I64_C1HWNCoC0, DataType.I64_C1HWNCoC0) \ + .dtype_format(DataType.I64_FracZ, DataType.I64_FracZ, DataType.I64_FracZ) \ .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.F16_Default) \ .dtype_format(DataType.F16_5HD, DataType.F16_5HD, DataType.F16_5HD) \ + .dtype_format(DataType.F16_C1HWNCoC0, DataType.F16_C1HWNCoC0, DataType.F16_C1HWNCoC0) \ + .dtype_format(DataType.F16_FracZ, DataType.F16_FracZ, DataType.F16_FracZ) \ .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default) \ .dtype_format(DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD) \ + .dtype_format(DataType.F32_C1HWNCoC0, DataType.F32_C1HWNCoC0, DataType.F32_C1HWNCoC0) \ + .dtype_format(DataType.F32_FracZ, DataType.F32_FracZ, DataType.F32_FracZ) \ .get_op_info() diff --git a/mindspore/ops/_op_impl/tbe/relu6.py b/mindspore/ops/_op_impl/tbe/relu6.py index bbedfdeb0f..d9bd7f9f8e 100644 --- a/mindspore/ops/_op_impl/tbe/relu6.py +++ b/mindspore/ops/_op_impl/tbe/relu6.py @@ -23,8 +23,8 @@ relu6_op_info = TBERegOp("ReLU6") \ .compute_cost(10) \ .kernel_name("relu6") \ .partial_flag(True) \ - .input(0, "features", False, "required", "all") \ - .output(0, "activations", False, "required", "all") \ + .input(0, "x", False, "required", "all") \ + .output(0, "y", False, "required", "all") \ .dtype_format(DataType.F16_Default, DataType.F16_Default) \ 
.dtype_format(DataType.F16_5HD, DataType.F16_5HD) \ .dtype_format(DataType.F32_Default, DataType.F32_Default) \ From 263d82edc5660ca24d370cf7e575b69f64f49f2e Mon Sep 17 00:00:00 2001 From: zhoufeng Date: Thu, 7 May 2020 14:46:03 +0800 Subject: [PATCH 06/36] me-ge link hccl Signed-off-by: zhoufeng --- mindspore/ccsrc/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mindspore/ccsrc/CMakeLists.txt b/mindspore/ccsrc/CMakeLists.txt index 4c6ceb38e1..8d3818a777 100644 --- a/mindspore/ccsrc/CMakeLists.txt +++ b/mindspore/ccsrc/CMakeLists.txt @@ -125,7 +125,7 @@ endif() if (ENABLE_GE) if(ENABLE_TRAIN) - target_link_libraries(mindspore ge_client_train) + target_link_libraries(mindspore ge_client_train hccl) else () target_link_libraries(mindspore ge_client) endif () From e97d33f7720229a0504fe1fdb206e93c01e67f70 Mon Sep 17 00:00:00 2001 From: liuxiao Date: Wed, 6 May 2020 19:36:00 +0800 Subject: [PATCH 07/36] add ops for VM --- mindspore/ops/_grad/grad_nn_ops.py | 3 +- mindspore/ops/_op_impl/tbe/__init__.py | 2 ++ mindspore/ops/_op_impl/tbe/elu.py | 40 ++++++++++++++++++++++++ mindspore/ops/_op_impl/tbe/elu_grad.py | 43 ++++++++++++++++++++++++++ mindspore/ops/operations/nn_ops.py | 3 +- tests/ut/python/ops/test_ops.py | 2 +- 6 files changed, 89 insertions(+), 4 deletions(-) create mode 100644 mindspore/ops/_op_impl/tbe/elu.py create mode 100644 mindspore/ops/_op_impl/tbe/elu_grad.py diff --git a/mindspore/ops/_grad/grad_nn_ops.py b/mindspore/ops/_grad/grad_nn_ops.py index 153abc0fb6..362bda7368 100755 --- a/mindspore/ops/_grad/grad_nn_ops.py +++ b/mindspore/ops/_grad/grad_nn_ops.py @@ -600,7 +600,6 @@ def get_bprop_roi_align(self): sample_num = self.sample_num def bprop(inputs, rois, out, dout): - rois_shape = shape_op(rois) inputs_shape = shape_op(inputs) dx = G.ROIAlignGrad(inputs_shape, pooled_height, @@ -608,7 +607,7 @@ def get_bprop_roi_align(self): spatial_scale, sample_num, )(dout, rois) - return dx, zeros_like(rois_shape) + return dx, zeros_like(rois) return bprop diff --git a/mindspore/ops/_op_impl/tbe/__init__.py b/mindspore/ops/_op_impl/tbe/__init__.py index 9dbe53049b..c6a08e8ff4 100644 --- a/mindspore/ops/_op_impl/tbe/__init__.py +++ b/mindspore/ops/_op_impl/tbe/__init__.py @@ -73,6 +73,8 @@ from .strideslice_d import _strided_slice_d_tbe from .strideslicegrad_d import _strided_slice_grad_d_tbe from .split_d import _split_d_tbe from .exp import _exp_tbe +from .elu import _elu_tbe +from .elu_grad import _elu_grad_tbe from .div import _div_tbe from .log import _log_tbe from .floor_div import _floor_div_tbe diff --git a/mindspore/ops/_op_impl/tbe/elu.py b/mindspore/ops/_op_impl/tbe/elu.py new file mode 100644 index 0000000000..9125d14727 --- /dev/null +++ b/mindspore/ops/_op_impl/tbe/elu.py @@ -0,0 +1,40 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""Elu op""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +elu_op_info = TBERegOp("Elu") \ + .fusion_type("ELEMWISE") \ + .async_flag(False) \ + .binfile_name("elu.so") \ + .compute_cost(10) \ + .kernel_name("elu") \ + .partial_flag(True) \ + .op_pattern("formatAgnostic") \ + .attr("alpha", "optional", "float", "all", "1.0") \ + .input(0, "x", False, "required", "all") \ + .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.F16_Default, DataType.F16_Default) \ + .dtype_format(DataType.F16_5HD, DataType.F16_5HD) \ + .dtype_format(DataType.F32_Default, DataType.F32_Default) \ + .dtype_format(DataType.F32_5HD, DataType.F32_5HD) \ + .get_op_info() + + +@op_info_register(elu_op_info) +def _elu_tbe(): + """Elu TBE register""" + return diff --git a/mindspore/ops/_op_impl/tbe/elu_grad.py b/mindspore/ops/_op_impl/tbe/elu_grad.py new file mode 100644 index 0000000000..c3486dd024 --- /dev/null +++ b/mindspore/ops/_op_impl/tbe/elu_grad.py @@ -0,0 +1,43 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""EluGrad op""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +elu_grad_op_info = TBERegOp("EluGrad") \ + .fusion_type("ELEMWISE") \ + .async_flag(False) \ + .binfile_name("elu_grad.so") \ + .compute_cost(10) \ + .kernel_name("elu_grad") \ + .partial_flag(True) \ + .input(0, "grads", False, "required", "all") \ + .input(1, "activations", False, "required", "all") \ + .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.F16_5HD, DataType.F16_5HD, DataType.F16_5HD) \ + .dtype_format(DataType.F16_FracZ, DataType.F16_FracZ, DataType.F16_FracZ) \ + .dtype_format(DataType.F16_C1HWNCoC0, DataType.F16_C1HWNCoC0, DataType.F16_C1HWNCoC0) \ + .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.F16_Default) \ + .dtype_format(DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD) \ + .dtype_format(DataType.F32_FracZ, DataType.F32_FracZ, DataType.F32_FracZ) \ + .dtype_format(DataType.F32_C1HWNCoC0, DataType.F32_C1HWNCoC0, DataType.F32_C1HWNCoC0) \ + .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default) \ + .get_op_info() + + +@op_info_register(elu_grad_op_info) +def _elu_grad_tbe(): + """EluGrad TBE register""" + return diff --git a/mindspore/ops/operations/nn_ops.py b/mindspore/ops/operations/nn_ops.py index 2a2dbe08a8..7ba341fd56 100644 --- a/mindspore/ops/operations/nn_ops.py +++ b/mindspore/ops/operations/nn_ops.py @@ -1527,7 +1527,8 @@ class L2Loss(PrimitiveWithInfer): def infer_dtype(self, x_type): validator.check_subclass("x_type", x_type, mstype.tensor, self.name) - validator.check_tensor_type_same({'x_type': x_type}, [mstype.double, mstype.float_, mstype.float16], self.name) + valid_types = [mstype.float16, mstype.float32, mstype.double] + 
validator.check_tensor_type_same({'x_type': x_type}, valid_types, self.name) return x_type diff --git a/tests/ut/python/ops/test_ops.py b/tests/ut/python/ops/test_ops.py index 9d7e8c898a..7a3d7d967f 100755 --- a/tests/ut/python/ops/test_ops.py +++ b/tests/ut/python/ops/test_ops.py @@ -874,7 +874,7 @@ test_case_nn_ops = [ 'skip': ['backward']}), ('L2Loss_1', { 'block': P.L2Loss(), - 'desc_inputs': [Tensor(np.array([1, 2, 3, 4]), mstype.float16)], + 'desc_inputs': [Tensor(np.array([1, 2, 3, 4]), mstype.float32)], 'desc_bprop': []}), ('L2Loss_2', { 'block': P.L2Loss(), From d6520f499650782f7717aefef9a429b8e7489828 Mon Sep 17 00:00:00 2001 From: gengdongjie Date: Thu, 7 May 2020 23:44:23 +0800 Subject: [PATCH 08/36] add mix precision option --- mindspore/ccsrc/utils/context/ms_context.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mindspore/ccsrc/utils/context/ms_context.cc b/mindspore/ccsrc/utils/context/ms_context.cc index 6da1de9cdb..d728feae82 100644 --- a/mindspore/ccsrc/utils/context/ms_context.cc +++ b/mindspore/ccsrc/utils/context/ms_context.cc @@ -359,7 +359,11 @@ void MsContext::GetGeOptions(std::map *ge_options) con } // Enable auto mixed precision according to the context options - (*ge_options)["ge.exec.auto_mix_precision"] = std::to_string(auto_mixed_precision_flag_); + if (auto_mixed_precision_flag_) { + (*ge_options)["ge.exec.precision_mode"] = "allow_mix_precision"; + } else { + (*ge_options)["ge.exec.precision_mode"] = "must_keep_origin_dtype"; + } // Disable the global variable acc, only enable it whlie adding training graph in pipeline (*ge_options)["ge.exec.variable_acc"] = "0"; #endif From 460a1e25c82131d70a5d2bb076f262645e504843 Mon Sep 17 00:00:00 2001 From: gengdongjie Date: Fri, 8 May 2020 19:28:31 +0800 Subject: [PATCH 09/36] reset auto mix precision default off option --- mindspore/ccsrc/utils/context/ms_context.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mindspore/ccsrc/utils/context/ms_context.cc b/mindspore/ccsrc/utils/context/ms_context.cc index d728feae82..b8b4b3d8a1 100644 --- a/mindspore/ccsrc/utils/context/ms_context.cc +++ b/mindspore/ccsrc/utils/context/ms_context.cc @@ -362,7 +362,7 @@ void MsContext::GetGeOptions(std::map *ge_options) con if (auto_mixed_precision_flag_) { (*ge_options)["ge.exec.precision_mode"] = "allow_mix_precision"; } else { - (*ge_options)["ge.exec.precision_mode"] = "must_keep_origin_dtype"; + (*ge_options)["ge.exec.precision_mode"] = "allow_fp32_to_fp16"; } // Disable the global variable acc, only enable it whlie adding training graph in pipeline (*ge_options)["ge.exec.variable_acc"] = "0"; From 0b8cea801862a7fe7dae41d61bee6a9e94bb60ad Mon Sep 17 00:00:00 2001 From: guohongzilong <2713219276@qq.com> Date: Thu, 23 Apr 2020 17:39:24 +0800 Subject: [PATCH 10/36] learning rate and weight decay support group mode --- mindspore/nn/optim/adam.py | 65 ++++-- mindspore/nn/optim/ftrl.py | 3 +- mindspore/nn/optim/lamb.py | 2 + mindspore/nn/optim/momentum.py | 50 +++- mindspore/nn/optim/optimizer.py | 215 ++++++++++++++---- mindspore/nn/optim/rmsprop.py | 71 ++++-- mindspore/nn/optim/sgd.py | 45 +++- mindspore/nn/wrap/cell_wrapper.py | 2 +- tests/ut/python/nn/optim/test_adam.py | 4 +- tests/ut/python/nn/optim/test_optimizer.py | 8 +- .../test_optimize_with_parameter_groups.py | 210 +++++++++++++++++ 11 files changed, 570 insertions(+), 105 deletions(-) create mode 100644 tests/ut/python/optimizer/test_optimize_with_parameter_groups.py diff --git a/mindspore/nn/optim/adam.py 
b/mindspore/nn/optim/adam.py index 1a386556d9..9893a81923 100755 --- a/mindspore/nn/optim/adam.py +++ b/mindspore/nn/optim/adam.py @@ -103,9 +103,9 @@ def _check_learning_rate_value(learning_rate, end_learning_rate, decay_steps, po validator.check_integer('decay_steps', decay_steps, 0, Rel.GT, prim_name) -@adam_opt.register("Function", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Number", "Tensor", "Tensor", "Tensor", +@adam_opt.register("Function", "Tensor", "Tensor", "Tensor", "Tensor", "Number", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor") -def _run_opt_with_one_number(opt, lr, beta1_power, beta2_power, beta1, beta2, eps, gradient, params, moment1, +def _run_opt_with_one_number(opt, beta1_power, beta2_power, beta1, beta2, eps, lr, gradient, params, moment1, moment2): """Apply adam optimizer to the weight parameter using Tensor.""" success = True @@ -136,9 +136,27 @@ class Adam(Optimizer): `beta1_power` and `beta2_power`, :math:`\alpha` represents `learning_rate`, :math:`w` represents `params`, :math:`\epsilon` represents `eps`. + Note: + The Adam optimizer supports separating parameter groups. Different parameter groups can set different + `learning_rate` and `weight_decay`. + + When separating parameter groups, the weight decay in each group will be applied on the parameters if the + value of weight_decay > 0. When not separating parameter groups, the `weight_decay` in the API will be + applied on the parameters if `weight_decay` > 0 and the 'beta' and 'gamma' are not in the name of parameters. + Args: - params (list[Parameter]): A list of parameter, which will be updated. The element in `params` - should be class mindspore.Parameter. + params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated, + the element in `params` should be class `Parameter`. When the `params` is a list of `dict`, the "params", + "lr" and "weight_decay" are the keys can be parsed. + + - params: Required. The value should be a list of `Parameter`. + + - lr: Optional. If "lr" in the keys, the value of corresponding learning rate will be used. + If not, the `learning_rate` in the API will be used. + + - weight_decay: Optional. If "weight_decay" in the keys, the value of corresponding weight decay + will be used. If not, the `weight_decay` in the API will be used. + learning_rate (Union[float, Tensor, Iterable]): A value for the learning rate. When the learning_rate is Iterable or a Tensor and the dims of the Tensor is 1, use dynamic learning rate, then the i-th step will @@ -161,8 +179,6 @@ class Adam(Optimizer): weight_decay (float): Weight decay (L2 penalty). Default: 0.0. loss_scale (float): A floating point value for the loss scale. Should be equal to or greater than 1. Default: 1.0. - decay_filter (Function): A function to determine whether to apply weight decay on parameters. Default: - lambda x: 'LayerNorm' not in x.name and 'bias' not in x.name. Inputs: - **gradients** (tuple[Tensor]) - The gradients of `params`, the shape is the same as `params`. 
@@ -172,15 +188,26 @@ class Adam(Optimizer): Examples: >>> net = Net() - >>> loss = nn.SoftmaxCrossEntropyWithLogits() + >>> #1) All parameters use the same learning rate and weight decay >>> optim = nn.Adam(params=net.trainable_params()) - >>> model = Model(net, loss_fn=loss, optimizer=optim, metrics=None) + >>> + >>> #2) Use parameter groups and set different values + >>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params())) + >>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params())) + >>> group_params = [{'params': conv_params, 'weight_decay': 0.01, 'lr': 0.01}, + >>> {'params': no_conv_params}] + >>> opt = nn.Adam(group_params, learning_rate=0.1, weight_decay=0.0) + >>> # the conv_params's parameters will use a learning rate of 0.01 and a weight decay of 0.01 + >>> # the no_cov_params's parameters don't set learning and weight decay. So they will use a + >>> # learning rate of 0.1 and a weight decay of 0.0. + >>> + >>> loss = nn.SoftmaxCrossEntropyWithLogits() + >>> model = Model(net, loss_fn=loss, optimizer=optim) """ def __init__(self, params, learning_rate=1e-3, beta1=0.9, beta2=0.999, eps=1e-8, use_locking=False, - use_nesterov=False, weight_decay=0.0, loss_scale=1.0, - decay_filter=lambda x: 'beta' not in x.name and 'gamma' not in x.name): - super(Adam, self).__init__(learning_rate, params, weight_decay, loss_scale, decay_filter) + use_nesterov=False, weight_decay=0.0, loss_scale=1.0): + super(Adam, self).__init__(learning_rate, params, weight_decay, loss_scale) _check_param_value(beta1, beta2, eps, weight_decay, self.cls_name) validator.check_value_type("use_locking", use_locking, [bool], self.cls_name) validator.check_value_type("use_nesterov", use_nesterov, [bool], self.cls_name) @@ -216,10 +243,14 @@ class Adam(Optimizer): self.beta1_power = beta1_power beta2_power = self.beta2_power * self.beta2 self.beta2_power = beta2_power - success = self.hyper_map(F.partial(adam_opt, self.opt, lr, beta1_power, beta2_power, self.beta1, - self.beta2, self.eps), - gradients, params, moment1, moment2) - + if self.is_group: + success = self.hyper_map(F.partial(adam_opt, self.opt, beta1_power, beta2_power, self.beta1, + self.beta2, self.eps), + lr, gradients, params, moment1, moment2) + else: + success = self.hyper_map(F.partial(adam_opt, self.opt, beta1_power, beta2_power, self.beta1, + self.beta2, self.eps, lr), + gradients, params, moment1, moment2) return success @@ -262,6 +293,8 @@ class AdamWeightDecay(Optimizer): def __init__(self, params, learning_rate=1e-3, beta1=0.9, beta2=0.999, eps=1e-6, weight_decay=0.0, decay_filter=lambda x: 'beta' not in x.name and 'gamma' not in x.name): super(AdamWeightDecay, self).__init__(learning_rate, params) + if self.is_group: + raise RuntimeError(f"The {self.cls_name} optimizer cannot support group setting.") _check_param_value(beta1, beta2, eps, weight_decay, self.cls_name) self.beta1 = Tensor(np.array([beta1]).astype(np.float32)) self.beta2 = Tensor(np.array([beta2]).astype(np.float32)) @@ -329,6 +362,8 @@ class AdamWeightDecayDynamicLR(Optimizer): weight_decay=0.0, decay_filter=lambda x: 'beta' not in x.name and 'gamma' not in x.name): super(AdamWeightDecayDynamicLR, self).__init__(learning_rate, params) + if self.is_group: + raise RuntimeError(f"The {self.cls_name} optimizer cannot support group setting.") _check_param_value(beta1, beta2, eps, weight_decay, self.cls_name) _check_learning_rate_value(learning_rate, end_learning_rate, decay_steps, power, self.cls_name) # turn them to scalar when 
me support scalar/tensor mix operations diff --git a/mindspore/nn/optim/ftrl.py b/mindspore/nn/optim/ftrl.py index ccc1b3f10b..33edafa4e2 100644 --- a/mindspore/nn/optim/ftrl.py +++ b/mindspore/nn/optim/ftrl.py @@ -96,7 +96,8 @@ class FTRL(Optimizer): def __init__(self, params, initial_accum=0.1, learning_rate=0.001, lr_power=-0.5, l1=0.0, l2=0.0, use_locking=False, loss_scale=1.0, weight_decay=0.0): super(FTRL, self).__init__(learning_rate, params) - + if self.is_group: + raise RuntimeError(f"The {self.cls_name} optimizer cannot support group setting.") _check_param(initial_accum, learning_rate, lr_power, l1, l2, use_locking, loss_scale, weight_decay, self.cls_name) self.moments = self.parameters.clone(prefix="moments", init=initial_accum) diff --git a/mindspore/nn/optim/lamb.py b/mindspore/nn/optim/lamb.py index 97a81a590b..b4d478f52a 100755 --- a/mindspore/nn/optim/lamb.py +++ b/mindspore/nn/optim/lamb.py @@ -183,6 +183,8 @@ class Lamb(Optimizer): decay_filter=lambda x: 'LayerNorm' not in x.name and 'bias' not in x.name): super(Lamb, self).__init__(start_learning_rate, params) + if self.is_group: + raise RuntimeError(f"The {self.cls_name} optimizer cannot support group setting.") _check_param_value(decay_steps, warmup_steps, start_learning_rate, end_learning_rate, power, beta1, beta2, eps, weight_decay, self.cls_name) diff --git a/mindspore/nn/optim/momentum.py b/mindspore/nn/optim/momentum.py index 67de590c5f..7cfbf11183 100755 --- a/mindspore/nn/optim/momentum.py +++ b/mindspore/nn/optim/momentum.py @@ -23,7 +23,7 @@ momentum_opt = C.MultitypeFuncGraph("momentum_opt") @momentum_opt.register("Function", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor") -def _tensor_run_opt_ext(opt, learning_rate, momentum, gradient, weight, moment): +def _tensor_run_opt_ext(opt, momentum, learning_rate, gradient, weight, moment): """Apply momentum optimizer to the weight parameter using Tensor.""" success = True success = F.depend(success, opt(weight, moment, learning_rate, gradient, momentum)) @@ -36,9 +36,27 @@ class Momentum(Optimizer): Refer to the paper on the importance of initialization and momentum in deep learning for more details. + Note: + The Momentum optimizer supports separating parameter groups. Different parameter groups can set different + `learning_rate` and `weight_decay`. + + When separating parameter groups, the weight decay in each group will be applied on the parameters if the + value of weight_decay > 0. When not separating parameter groups, the `weight_decay` in the API will be + applied on the parameters if `weight_decay` > 0 and the 'beta' and 'gamma' are not in the name of parameters. + Args: - params (list[Parameter]): A list of parameter, which will be updated. The element in `parameters` - should be class mindspore.Parameter. + params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated, + the element in `params` should be class `Parameter`. When the `params` is a list of `dict`, the "params", + "lr" and "weight_decay" are the keys can be parsed. + + - params: Required. The value should be a list of `Parameter`. + + - lr: Optional. If "lr" in the keys, the value of corresponding learning rate will be used. + If not, the `learning_rate` in the API will be used. + + - weight_decay: Optional. If "weight_decay" in the keys, the value of corresponding weight decay + will be used. If not, the `weight_decay` in the API will be used. + learning_rate (Union[float, Tensor, Iterable]): A value for the learning rate. 
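
The reordered registrations above (momentum before learning_rate) exist so that, in group mode, the per-parameter learning rates can travel alongside the gradients instead of being bound up front. Below is a plain-Python analogy using functools.partial and zip in place of F.partial and HyperMap; it is illustrative only, not the actual graph machinery, and the update rule is simplified.

    # Plain-Python analogy: shared hyperparameters are bound into the partial, while
    # per-parameter sequences (including the group learning rates) are mapped in lockstep.
    from functools import partial

    def momentum_update(momentum, lr, grad, weight, moment):
        moment = momentum * moment + grad          # simplified, illustrative update rule
        return weight - lr * moment

    weights = [1.0, 2.0, 3.0]
    moments = [0.0, 0.0, 0.0]
    grads = [0.1, 0.1, 0.1]

    # non-group mode: one learning rate for everything, bound into the partial
    step = partial(momentum_update, 0.9, 0.01)
    out = [step(g, w, m) for g, w, m in zip(grads, weights, moments)]

    # group mode: the learning rate is one more per-parameter sequence, mapped like gradients
    group_lr = [0.01, 0.01, 0.1]
    step = partial(momentum_update, 0.9)
    out = [step(lr, g, w, m) for lr, g, w, m in zip(group_lr, grads, weights, moments)]
    print(out)
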
When the learning_rate is Iterable or a Tensor and the dims of the Tensor is 1, use dynamic learning rate, then the i-th step will @@ -49,8 +67,6 @@ class Momentum(Optimizer): momentum (float): Hyperparameter of type float, means momentum for the moving average. weight_decay (float): Weight decay (L2 penalty). Default: 0.0. loss_scale (float): A floating point value for the loss scale. Default: 1.0. - decay_filter (Function): A function to determine whether to apply weight decay on parameters. Default: - lambda x: 'beta' not in x.name and 'gamma' not in x.name. Inputs: - **gradients** (tuple[Tensor]) - The gradients of `params`, the shape is the same as `params`. @@ -63,13 +79,24 @@ class Momentum(Optimizer): Examples: >>> net = Net() - >>> loss = nn.SoftmaxCrossEntropyWithLogits() + >>> #1) All parameters use the same learning rate and weight decay >>> optim = nn.Momentum(params=net.trainable_params(), learning_rate=0.1, momentum=0.9) + >>> + >>> #2) Use parameter groups and set different values + >>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params())) + >>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params())) + >>> group_params = [{'params': conv_params, 'weight_decay': 0.01, 'lr': 0.01}, + >>> {'params': no_conv_params}] + >>> opt = nn.Momentum(group_params, learning_rate=0.1, momentum=0.9, weight_decay=0.0) + >>> # the conv_params's parameters will use a learning rate of 0.01 and a weight decay of 0.01 + >>> # the no_cov_params's parameters don't set learning and weight decay. So they will use a + >>> # learning rate of 0.1 and a weight decay of 0.0. + >>> + >>> loss = nn.SoftmaxCrossEntropyWithLogits() >>> model = Model(net, loss_fn=loss, optimizer=optim, metrics=None) """ - def __init__(self, params, learning_rate, momentum, weight_decay=0.0, loss_scale=1.0, - decay_filter=lambda x: 'beta' not in x.name and 'gamma' not in x.name): - super(Momentum, self).__init__(learning_rate, params, weight_decay, loss_scale, decay_filter) + def __init__(self, params, learning_rate, momentum, weight_decay=0.0, loss_scale=1.0): + super(Momentum, self).__init__(learning_rate, params, weight_decay, loss_scale) if isinstance(momentum, float) and momentum < 0.0: raise ValueError("momentum should be at least 0.0, but got momentum {}".format(momentum)) self.momentum = Parameter(Tensor(momentum, mstype.float32), name="momentum") @@ -84,5 +111,8 @@ class Momentum(Optimizer): gradients = self.decay_weight(gradients) gradients = self.scale_grad(gradients) lr = self.get_lr() - success = self.hyper_map(F.partial(momentum_opt, self.opt, lr, self.momentum), gradients, params, moments) + if self.is_group: + success = self.hyper_map(F.partial(momentum_opt, self.opt, self.momentum), lr, gradients, params, moments) + else: + success = self.hyper_map(F.partial(momentum_opt, self.opt, self.momentum, lr), gradients, params, moments) return success diff --git a/mindspore/nn/optim/optimizer.py b/mindspore/nn/optim/optimizer.py index 34abc2b1c2..671e92de3a 100755 --- a/mindspore/nn/optim/optimizer.py +++ b/mindspore/nn/optim/optimizer.py @@ -28,7 +28,6 @@ from mindspore._checkparam import Rel from mindspore.common.tensor import Tensor from mindspore import log as logger - __all__ = ['Optimizer'] @@ -42,68 +41,96 @@ class Optimizer(Cell): This class defines the API to add Ops to train a model. Never use this class directly, but instead instantiate one of its subclasses. + Some optimizers support separating parameter groups. 
Different parameter groups can set different + `learning_rate` and `weight_decay`. + + When separating parameter groups, the weight decay in each group will be applied on the parameters if the + value of weight_decay > 0. When not separating parameter groups, the `weight_decay` in the API will be + applied on the parameters if `weight_decay` > 0 and the 'beta' and 'gamma' are not in the name of parameters. + Args: learning_rate (float): A floating point value for the learning rate. Should be greater than 0. - parameters (list): A list of parameter, which will be updated. The element in `parameters` - should be class mindspore.Parameter. + parameters (Union[list[Parameter], list[dict]]): When the `parameters` is a list of `Parameter` which will be + updated, the element in `parameters` should be class `Parameter`. When the `parameters` is a list of `dict`, + the "params", "lr" and "weight_decay" are the keys can be parsed. + + - params: Required. The value should be a list of `Parameter`. + + - lr: Optional. If "lr" in the keys, the value of corresponding learning rate will be used. + If not, the `learning_rate` in the API will be used. + + - weight_decay: Optional. If "weight_decay" in the keys, the value of corresponding weight decay + will be used. If not, the `weight_decay` in the API will be used. + weight_decay (float): A floating point value for the weight decay. It should be equal to or greater than 0. - If the type of `weight_decay` input is int, it will be convertd to float. Default: 0.0. + If the type of `weight_decay` input is int, it will be converted to float. Default: 0.0. loss_scale (float): A floating point value for the loss scale. It should be greater than 0. If the - type of `loss_scale` input is int, it will be convertd to float. Default: 1.0. - decay_filter (Function): A function to determine whether to apply weight decay on parameters. Default: lambda - x: 'beta' not in x.name and 'gamma' not in x.name. + type of `loss_scale` input is int, it will be converted to float. Default: 1.0. Raises: ValueError: If the learning_rate is a Tensor, but the dims of tensor is greater than 1. TypeError: If the learning_rate is not any of the three types: float, Tensor, Iterable. 
""" - def __init__(self, learning_rate, parameters, weight_decay=0.0, loss_scale=1.0, - decay_filter=lambda x: 'beta' not in x.name and 'gamma' not in x.name): + def __init__(self, learning_rate, parameters, weight_decay=0.0, loss_scale=1.0): super(Optimizer, self).__init__(auto_prefix=False) + if parameters and not isinstance(parameters, list): + parameters = list(parameters) + + if not parameters: + raise ValueError("Optimizer got an empty parameter list.") + + if not isinstance(parameters[0], (dict, Parameter)): + raise ValueError("Only a list of Parameter or dict can be supported.") + + if isinstance(loss_scale, int): + loss_scale = float(loss_scale) + validator.check_value_type("loss_scale", loss_scale, [float], None) + validator.check_number_range("loss_scale", loss_scale, 0.0, float("inf"), Rel.INC_NEITHER, None) + + if isinstance(weight_decay, int): + weight_decay = float(weight_decay) + validator.check_value_type("weight_decay", weight_decay, [float], None) + validator.check_number_range("weight_decay", weight_decay, 0.0, float("inf"), Rel.INC_LEFT, None) + + self.is_group = False + self.loss_scale = loss_scale if isinstance(learning_rate, float): self.dynamic_lr = False self.gather = None self.assignadd = None self.global_step = None - validator.check_number_range("learning rate", learning_rate, 0.0, float("inf"), Rel.INC_LEFT, self.cls_name) - learning_rate = Tensor(learning_rate, mstype.float32) + self.scalar_lr = learning_rate else: self.dynamic_lr = True self.gather = P.GatherV2() self.assignadd = P.AssignAdd() self.global_step = Parameter(initializer(0, [1], mindspore.int32), name='global_step') - if isinstance(learning_rate, Iterable): - learning_rate = Tensor(np.array(list(learning_rate)).astype(np.float32)) - elif isinstance(learning_rate, Tensor): - if learning_rate.dim() > 1: - raise ValueError("Learning rate should be a 0 or 1 dim `Tensor`," - f"but got {learning_rate.dim()}.") - if learning_rate.dim() == 1 and learning_rate.size() < 2: - logger.warning("If want to use the dynamic learning rate, please make sure that the number " - "of elements in the list, tuple or tensor passed is greater than 1.") - else: - raise TypeError("Learning rate should be float, Tensor or Iterable.") - - if isinstance(weight_decay, int): - weight_decay = float(weight_decay) - validator.check_value_type("weight_decay", weight_decay, [float], None) - validator.check_number_range("weight_decay", weight_decay, 0.0, float("inf"), Rel.INC_LEFT, None) - - if isinstance(loss_scale, int): - loss_scale = float(loss_scale) - validator.check_value_type("loss_scale", loss_scale, [float], None) - validator.check_number_range("loss_scale", loss_scale, 0.0, float("inf"), Rel.INC_NEITHER, None) - - self.loss_scale = loss_scale - self.learning_rate = Parameter(learning_rate, name="learning_rate") - self.parameters = ParameterTuple(parameters) + self.scalar_lr = None + + learning_rate = self._get_single_lr(learning_rate) + if isinstance(parameters[0], dict): + self.is_group = True + self.params = [] + self.group_lr = [] + self.group_weight_decay = [] + self._init_group_params(parameters, learning_rate, weight_decay) + + if self.is_group: + self.learning_rate = ParameterTuple(self.group_lr) + self.parameters = ParameterTuple(self.params) + self.weight_decay = tuple(self.group_weight_decay) + decay_filter = lambda x: x > 0 + self.decay_flags = tuple(decay_filter(x) for x in self.weight_decay) + else: + self.learning_rate = Parameter(learning_rate, name="learning_rate") + self.parameters = 
ParameterTuple(parameters) + self.weight_decay = weight_decay * loss_scale + decay_filter = lambda x: 'beta' not in x.name and 'gamma' not in x.name + self.decay_flags = tuple(decay_filter(x) for x in self.parameters) self.reciprocal_scale = 1.0 / loss_scale - self.weight_decay = weight_decay * loss_scale - self.decay_flags = tuple(decay_filter(x) for x in self.parameters) - - if not self.parameters: - raise ValueError("optimizer got an empty parameter list.") + self.exec_weight_decay = any(self.decay_flags) + self.param_length = len(self.parameters) def decay_weight(self, gradients): """ @@ -118,9 +145,15 @@ class Optimizer(Cell): Returns: tuple[Tensor], The gradients after weight decay. """ - if self.weight_decay > 0: - params = self.parameters - gradients = self.hyper_map(F.partial(apply_decay, self.weight_decay), self.decay_flags, params, gradients) + params = self.parameters + if self.is_group: + if self.exec_weight_decay: + gradients = self.hyper_map(F.partial(apply_decay), self.weight_decay, self.decay_flags, + params, gradients) + else: + if self.weight_decay > 0: + gradients = self.hyper_map(F.partial(apply_decay, self.weight_decay), self.decay_flags, + params, gradients) return gradients @@ -144,6 +177,83 @@ class Optimizer(Cell): return gradients + def _get_single_lr(self, learning_rate): + """Get learning rate in Tensor type.""" + if isinstance(learning_rate, float): + validator.check_number_range("learning rate", learning_rate, 0.0, float("inf"), Rel.INC_LEFT, self.cls_name) + lr = Tensor(learning_rate, mstype.float32) + elif isinstance(learning_rate, Iterable): + lr = Tensor(np.array(list(learning_rate)).astype(np.float32)) + elif isinstance(learning_rate, Tensor): + if learning_rate.dim() > 1: + raise ValueError("Learning rate should be a 0 or 1 dim `Tensor`," + f"but got {learning_rate.dim()}.") + if learning_rate.dim() == 1 and learning_rate.size() < 2: + logger.warning("If want to use the dynamic learning rate, please make sure that the number " + "of elements in the list, tuple or tensor passed is greater than 1.") + lr = learning_rate + else: + raise TypeError("Learning rate should be float, Tensor or Iterable.") + return lr + + def _init_group_params(self, parameters, learning_rate, weight_decay): + """Init learning rate or weight decay in group params.""" + origin_dynamic_lr = self.dynamic_lr + if self.dynamic_lr: + dynamic_lr_length = learning_rate.size() + else: + dynamic_lr_length = 0 + + for group_param in parameters: + lr_length = dynamic_lr_length + if 'lr' in group_param.keys(): + self._get_single_lr(group_param['lr']) + if isinstance(group_param['lr'], Iterable): + lr_length = len(group_param['lr']) + self.dynamic_lr = True + elif isinstance(group_param['lr'], Tensor): + lr_length = group_param['lr'].size() + self.dynamic_lr = True + if dynamic_lr_length not in (lr_length, 0): + raise ValueError("The dynamic learning rate in group should be the same size.") + dynamic_lr_length = lr_length + + if self.dynamic_lr and not origin_dynamic_lr: + self.gather = P.GatherV2() + self.assignadd = P.AssignAdd() + self.global_step = Parameter(initializer(0, [1], mindspore.int32), name='global_step') + + params_store = [] + for group_param in parameters: + self.params += group_param['params'] + if 'lr' in group_param.keys(): + params_dynamic_lr = isinstance(group_param['lr'], (Iterable, Tensor)) + + if self.dynamic_lr and not params_dynamic_lr: + lr = Tensor(np.array([group_param['lr']] * dynamic_lr_length).astype(np.float32)) + else: + lr = 
self._get_single_lr(group_param['lr']) + else: + if self.dynamic_lr and not origin_dynamic_lr: + lr = Tensor(np.array([self.scalar_lr] * dynamic_lr_length).astype(np.float32)) + else: + lr = learning_rate + + if 'weight_decay' in group_param.keys(): + validator.check_float_legal_value('weight_decay', group_param['weight_decay'], None) + validator.check_number_range('weight_decay', group_param['weight_decay'], 0.0, float("inf"), + Rel.INC_LEFT, self.cls_name) + weight_decay_ = group_param['weight_decay'] * self.loss_scale + else: + weight_decay_ = weight_decay * self.loss_scale + + for param in group_param['params']: + if param in params_store: + raise RuntimeError(f"The {param.name} parameter has appeared in parameter groups.") + params_store.append(param) + self.group_lr.append(Parameter(lr, name="lr_" + param.name)) + self.group_weight_decay.append(weight_decay_) + def get_lr(self): """ Get the learning rate of current step. @@ -151,11 +261,20 @@ class Optimizer(Cell): Returns: float, the learning rate of current step. """ - lr = self.learning_rate - if self.dynamic_lr: - lr = self.gather(self.learning_rate, self.global_step, 0) - F.control_depend(lr, self.assignadd(self.global_step, 1)) + if self.is_group: + lr = self.learning_rate + if self.dynamic_lr: + lr = () + for i in range(self.param_length): + current_dynamic_lr = self.gather(self.learning_rate[i], self.global_step, 0) + lr += (current_dynamic_lr,) + F.control_depend(lr, self.assignadd(self.global_step, 1)) + else: + lr = self.learning_rate + if self.dynamic_lr: + lr = self.gather(self.learning_rate, self.global_step, 0) + F.control_depend(lr, self.assignadd(self.global_step, 1)) return lr def construct(self, *hyper_params): diff --git a/mindspore/nn/optim/rmsprop.py b/mindspore/nn/optim/rmsprop.py index b1271587b4..b96d9499b2 100644 --- a/mindspore/nn/optim/rmsprop.py +++ b/mindspore/nn/optim/rmsprop.py @@ -22,17 +22,17 @@ rmsprop_opt = C.MultitypeFuncGraph("rmsprop_opt") centered_rmsprop_opt = C.MultitypeFuncGraph("rmsprop_opt") -@rmsprop_opt.register("Function", "Tensor", "Number", "Number", "Number", "Tensor", "Tensor", "Tensor", "Tensor") -def _rmsprop_opt(opt, learning_rate, decay, epsilon, momentum, weight, ms, mom, grad): +@rmsprop_opt.register("Function", "Number", "Number", "Number", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor") +def _rmsprop_opt(opt, decay, epsilon, momentum, learning_rate, weight, ms, mom, grad): """Apply rmsprop optimizer to the weight parameter using dynamic learning rate.""" success = True success = F.depend(success, opt(weight, ms, mom, grad, learning_rate, decay, momentum, epsilon)) return success -@centered_rmsprop_opt.register("Function", "Tensor", "Number", "Number", "Number", "Tensor", "Tensor", "Tensor", +@centered_rmsprop_opt.register("Function", "Number", "Number", "Number", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor") -def _centered_rmsprop_opt(opt, learning_rate, decay, epsilon, momentum, weight, mg, ms, mom, grad): +def _centered_rmsprop_opt(opt, decay, epsilon, momentum, learning_rate, weight, mg, ms, mom, grad): """Apply centered rmsprop optimizer to the weight parameter using dynamic learning rate.""" success = True success = F.depend(success, opt(weight, mg, ms, mom, grad, learning_rate, decay, momentum, epsilon)) @@ -44,6 +44,13 @@ class RMSProp(Optimizer): Implements Root Mean Squared Propagation (RMSProp) algorithm. Note: + The RMSProp optimizer supports separating parameter groups. 
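
The group bookkeeping above can be condensed into plain Python for reference; numbers stand in for Parameter objects and all names are illustrative. Each group contributes one learning rate and one weight-decay value per parameter, scalar learning rates are broadcast to the dynamic-schedule length, and mismatched schedule lengths are rejected.

    # Condensed sketch of the per-group bookkeeping (illustrative only; the real code
    # wraps each learning rate in a Parameter and validates types along the way).
    import numpy as np

    default_lr, default_wd, loss_scale = 0.1, 0.0, 1.0
    groups = [
        {'params': ['conv1.weight', 'conv2.weight'], 'lr': (0.01, 0.02, 0.03), 'weight_decay': 0.01},
        {'params': ['fc1.weight', 'fc1.bias']},          # falls back to the defaults
    ]

    # dynamic learning rates must have the same length across groups
    dyn_len = max((len(g['lr']) for g in groups if isinstance(g.get('lr'), (tuple, list))), default=0)

    group_lr, group_wd, params = [], [], []
    for g in groups:
        lr = g.get('lr', default_lr)
        if dyn_len and not isinstance(lr, (tuple, list)):
            lr = [lr] * dyn_len                          # broadcast a scalar to the schedule length
        elif dyn_len and len(lr) != dyn_len:
            raise ValueError("The dynamic learning rate in group should be the same size.")
        wd = g.get('weight_decay', default_wd) * loss_scale
        for p in g['params']:
            params.append(p)
            group_lr.append(np.array(lr, np.float32))
            group_wd.append(wd)

    decay_flags = tuple(wd > 0 for wd in group_wd)       # per-parameter decay switch
    print(params, group_wd, decay_flags)
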
Different parameter groups can set different + `learning_rate` and `weight_decay`. + + When separating parameter groups, the weight decay in each group will be applied on the parameters if the + value of weight_decay > 0. When not separating parameter groups, the `weight_decay` in the API will be + applied on the parameters if `weight_decay` > 0 and the 'beta' and 'gamma' are not in the name of parameters. + Update `params` according to the RMSProp algorithm. The equation is as follows: @@ -84,8 +91,18 @@ class RMSProp(Optimizer): represents `gradients`. Args: - params (list[Parameter]): A list of parameter, which will be updated. The element in `parameters` - should be class mindspore.Parameter. + params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated, + the element in `params` should be class `Parameter`. When the `params` is a list of `dict`, the "params", + "lr" and "weight_decay" are the keys can be parsed. + + - params: Required. The value should be a list of `Parameter`. + + - lr: Optional. If "lr" in the keys, the value of corresponding learning rate will be used. + If not, the `learning_rate` in the API will be used. + + - weight_decay: Optional. If "weight_decay" in the keys, the value of corresponding weight decay + will be used. If not, the `weight_decay` in the API will be used. + learning_rate (Union[float, Tensor, Iterable]): A value for the learning rate. When the learning_rate is Iterable or a Tensor and the dims of the Tensor is 1, use dynamic learning rate, then the i-th step will @@ -95,15 +112,13 @@ class RMSProp(Optimizer): Other cases are not supported. Default: 0.1. decay (float): Decay rate. Should be equal to or greater than 0. Default: 0.9. momentum (float): Hyperparameter of type float, means momentum for the moving average. Should be equal to or - greater than 0.Default: 0.0. + greater than 0. Default: 0.0. epsilon (float): Term added to the denominator to improve numerical stability. Should be greater than 0. Default: 1e-10. use_locking (bool): Enable a lock to protect the update of variable and accumlation tensors. Default: False. centered (bool): If True, gradients are normalized by the estimated variance of the gradient. Default: False. loss_scale (float): A floating point value for the loss scale. Should be greater than 0. Default: 1.0. weight_decay (float): Weight decay (L2 penalty). Should be equal to or greater than 0. Default: 0.0. - decay_filter (Function): A function to determine whether to apply weight decay on parameters. Default: - lambda x: 'beta' not in x.name and 'gamma' not in x.name. Inputs: - **gradients** (tuple[Tensor]) - The gradients of `params`, the shape is the same as `params`. @@ -113,14 +128,25 @@ class RMSProp(Optimizer): Examples: >>> net = Net() + >>> #1) All parameters use the same learning rate and weight decay + >>> optim = nn.RMSProp(params=net.trainable_params(), learning_rate=lr) + >>> + >>> #2) Use parameter groups and set different values + >>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params())) + >>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params())) + >>> group_params = [{'params': conv_params, 'weight_decay': 0.01, 'lr': 0.01}, + >>> {'params': no_conv_params}] + >>> opt = nn.RMSProp(group_params, learning_rate=0.1, weight_decay=0.0) + >>> # the conv_params's parameters will use a learning rate of 0.01 and a weight decay of 0.01 + >>> # the no_cov_params's parameters don't set learning and weight decay. 
So they will use a + >>> # learning rate of 0.1 and a weight decay of 0.0. + >>> >>> loss = nn.SoftmaxCrossEntropyWithLogits() - >>> opt = nn.RMSProp(params=net.trainable_params(), learning_rate=lr) - >>> model = Model(net, loss, opt) + >>> model = Model(net, loss_fn=loss, optimizer=optim) """ def __init__(self, params, learning_rate=0.1, decay=0.9, momentum=0.0, epsilon=1e-10, - use_locking=False, centered=False, loss_scale=1.0, weight_decay=0.0, - decay_filter=lambda x: 'beta' not in x.name and 'gamma' not in x.name): - super(RMSProp, self).__init__(learning_rate, params, weight_decay, loss_scale, decay_filter) + use_locking=False, centered=False, loss_scale=1.0, weight_decay=0.0): + super(RMSProp, self).__init__(learning_rate, params, weight_decay, loss_scale) validator.check_value_type("decay", decay, [float], self.cls_name) validator.check_number_range("decay", decay, 0.0, float("inf"), Rel.INC_LEFT, self.cls_name) validator.check_value_type("momentum", momentum, [float], self.cls_name) @@ -150,9 +176,18 @@ class RMSProp(Optimizer): gradients = self.scale_grad(gradients) lr = self.get_lr() if self.centered: - success = self.hyper_map(F.partial(centered_rmsprop_opt, self.opt, lr, self.decay, self.epsilon, - self.momentum), params, self.mg, self.ms, self.moment, gradients) + if self.is_group: + success = self.hyper_map(F.partial(centered_rmsprop_opt, self.opt, self.decay, self.epsilon, + self.momentum), lr, params, self.mg, self.ms, self.moment, gradients) + else: + success = self.hyper_map(F.partial(centered_rmsprop_opt, self.opt, self.decay, self.epsilon, + self.momentum, lr), params, self.mg, self.ms, self.moment, gradients) + else: - success = self.hyper_map(F.partial(rmsprop_opt, self.opt, lr, self.decay, self.epsilon, - self.momentum), params, self.ms, self.moment, gradients) + if self.is_group: + success = self.hyper_map(F.partial(rmsprop_opt, self.opt, self.decay, self.epsilon, + self.momentum), lr, params, self.ms, self.moment, gradients) + else: + success = self.hyper_map(F.partial(rmsprop_opt, self.opt, self.decay, self.epsilon, + self.momentum, lr), params, self.ms, self.moment, gradients) return success diff --git a/mindspore/nn/optim/sgd.py b/mindspore/nn/optim/sgd.py index 388fe5db47..0db58af855 100755 --- a/mindspore/nn/optim/sgd.py +++ b/mindspore/nn/optim/sgd.py @@ -24,7 +24,7 @@ sgd_opt = C.MultitypeFuncGraph("sgd_opt") @sgd_opt.register("Function", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor") -def _tensor_run_opt_ext(opt, learning_rate, momentum, gradient, weight, accum, stat): +def _tensor_run_opt_ext(opt, momentum, learning_rate, gradient, weight, accum, stat): """Apply sgd optimizer to the weight parameter using Tensor.""" success = True success = F.depend(success, opt(weight, gradient, learning_rate, accum, momentum, stat)) @@ -39,9 +39,27 @@ class SGD(Optimizer): Nesterov momentum is based on the formula from paper `On the importance of initialization and momentum in deep learning `_. + Note: + The SGD optimizer supports separating parameter groups. Different parameter groups can set different + `learning_rate` and `weight_decay`. + + When separating parameter groups, the weight decay in each group will be applied on the parameters if the + value of weight_decay > 0. When not separating parameter groups, the `weight_decay` in the API will be + applied on the parameters if `weight_decay` > 0 and the 'beta' and 'gamma' are not in the name of parameters. + Args: - params (list[Parameter]): A list of parameter, which will be updated. 
The element in `params` - should be class mindspore.Parameter. + params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated, + the element in `params` should be class `Parameter`. When the `params` is a list of `dict`, the "params", + "lr" and "weight_decay" are the keys can be parsed. + + - params: Required. The value should be a list of `Parameter`. + + - lr: Optional. If "lr" in the keys, the value of corresponding learning rate will be used. + If not, the `learning_rate` in the API will be used. + + - weight_decay: Optional. If "weight_decay" in the keys, the value of corresponding weight decay + will be used. If not, the `weight_decay` in the API will be used. + learning_rate (Union[float, Tensor, Iterable]): A value for the learning rate. When the learning_rate is Iterable or a Tensor and the dims of the Tensor is 1, use dynamic learning rate, then the i-th step will @@ -67,9 +85,21 @@ class SGD(Optimizer): Examples: >>> net = Net() - >>> loss = nn.SoftmaxCrossEntropyWithLogits() + >>> #1) All parameters use the same learning rate and weight decay >>> optim = nn.SGD(params=net.trainable_params()) - >>> model = Model(net, loss_fn=loss, optimizer=optim, metrics=None) + >>> + >>> #2) Use parameter groups and set different values + >>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params())) + >>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params())) + >>> group_params = [{'params': conv_params, 'weight_decay': 0.01, 'lr': 0.01}, + >>> {'params': no_conv_params}] + >>> opt = nn.SGD(group_params, learning_rate=0.1, weight_decay=0.0) + >>> # the conv_params's parameters will use a learning rate of 0.01 and a weight decay of 0.01 + >>> # the no_cov_params's parameters don't set learning and weight decay. So they will use a + >>> # learning rate of 0.1 and a weight decay of 0.0. 
+ >>> + >>> loss = nn.SoftmaxCrossEntropyWithLogits() + >>> model = Model(net, loss_fn=loss, optimizer=optim) """ def __init__(self, params, learning_rate=0.1, momentum=0.0, dampening=0.0, weight_decay=0.0, nesterov=False, loss_scale=1.0): @@ -109,5 +139,8 @@ class SGD(Optimizer): gradients = self.decay_weight(gradients) gradients = self.scale_grad(gradients) lr = self.get_lr() - success = self.hyper_map(F.partial(sgd_opt, self.opt, lr, self.momentum), gradients, params, accum, stat) + if self.is_group: + success = self.hyper_map(F.partial(sgd_opt, self.opt, self.momentum), lr, gradients, params, accum, stat) + else: + success = self.hyper_map(F.partial(sgd_opt, self.opt, self.momentum, lr), gradients, params, accum, stat) return success diff --git a/mindspore/nn/wrap/cell_wrapper.py b/mindspore/nn/wrap/cell_wrapper.py index 60718ec2b1..499d85b34b 100644 --- a/mindspore/nn/wrap/cell_wrapper.py +++ b/mindspore/nn/wrap/cell_wrapper.py @@ -167,7 +167,7 @@ class TrainOneStepCell(Cell): super(TrainOneStepCell, self).__init__(auto_prefix=False) self.network = network self.network.add_flags(defer_inline=True) - self.weights = ParameterTuple(network.trainable_params()) + self.weights = optimizer.parameters self.optimizer = optimizer self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True) self.sens = sens diff --git a/tests/ut/python/nn/optim/test_adam.py b/tests/ut/python/nn/optim/test_adam.py index d9321b1d26..269f276376 100644 --- a/tests/ut/python/nn/optim/test_adam.py +++ b/tests/ut/python/nn/optim/test_adam.py @@ -50,7 +50,7 @@ class NetWithoutWeight(nn.Cell): def test_adamwithoutparam(): net = NetWithoutWeight() net.set_train() - with pytest.raises(ValueError, match=r"optimizer got an empty parameter list"): + with pytest.raises(ValueError, match=r"Optimizer got an empty parameter list"): AdamWeightDecay(net.trainable_params(), learning_rate=0.1) @@ -104,5 +104,5 @@ def test_AdamWeightDecayDynamicLR(): def test_adam_mindspore_flatten(): net = nn.Flatten() - with pytest.raises(ValueError, match=r"optimizer got an empty parameter list"): + with pytest.raises(ValueError, match=r"Optimizer got an empty parameter list"): AdamWeightDecay(net.get_parameters()) diff --git a/tests/ut/python/nn/optim/test_optimizer.py b/tests/ut/python/nn/optim/test_optimizer.py index 89fb1d812b..9f1ec9a36f 100644 --- a/tests/ut/python/nn/optim/test_optimizer.py +++ b/tests/ut/python/nn/optim/test_optimizer.py @@ -69,19 +69,19 @@ class TestSGD(): class TestNullParam(): """ TestNullParam definition """ def test_optim_init(self): - with pytest.raises(TypeError): + with pytest.raises(ValueError): Optimizer(0.1, None) def test_AdamWightDecay_init(self): - with pytest.raises(TypeError): + with pytest.raises(ValueError): AdamWeightDecay(None) def test_AdamWeightDecayDynamicLR_init(self): - with pytest.raises(TypeError): + with pytest.raises(ValueError): AdamWeightDecayDynamicLR(None, 10) def test_Sgd_init(self): - with pytest.raises(TypeError): + with pytest.raises(ValueError): SGD(None) class TestUnsupportParam(): diff --git a/tests/ut/python/optimizer/test_optimize_with_parameter_groups.py b/tests/ut/python/optimizer/test_optimize_with_parameter_groups.py new file mode 100644 index 0000000000..8dd98990fa --- /dev/null +++ b/tests/ut/python/optimizer/test_optimize_with_parameter_groups.py @@ -0,0 +1,210 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +import numpy as np +import pytest +import mindspore.common.dtype as mstype +import mindspore.nn as nn +from mindspore.nn.optim import Momentum, SGD, RMSProp, Adam +from mindspore import context +from mindspore.common.api import _executor +from mindspore.common.tensor import Tensor +from mindspore.ops import operations as P +from mindspore.nn import TrainOneStepCell, WithLossCell + +context.set_context(mode=context.GRAPH_MODE) + + +class LeNet5(nn.Cell): + """ LeNet5 definition """ + def __init__(self): + super(LeNet5, self).__init__() + self.conv1 = nn.Conv2d(1, 6, 5, pad_mode='valid') + self.conv2 = nn.Conv2d(6, 16, 5, pad_mode='valid') + self.fc1 = nn.Dense(16 * 5 * 5, 120) + self.fc2 = nn.Dense(120, 84) + self.fc3 = nn.Dense(84, 10) + self.relu = nn.ReLU() + self.max_pool2d = nn.MaxPool2d(kernel_size=2, stride=2) + self.flatten = P.Flatten() + + def construct(self, x): + x = self.max_pool2d(self.relu(self.conv1(x))) + x = self.max_pool2d(self.relu(self.conv2(x))) + x = self.flatten(x) + x = self.relu(self.fc1(x)) + x = self.relu(self.fc2(x)) + x = self.fc3(x) + return x + + +def test_group_lr(): + inputs = Tensor(np.ones([1, 1, 32, 32]).astype(np.float32) * 0.01) + label = Tensor(np.ones([1, 10]).astype(np.float32)) + + net = LeNet5() + conv_lr = 0.8 + default_lr = 0.1 + conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params())) + no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params())) + group_params = [{'params': conv_params, 'lr': conv_lr}, + {'params': no_conv_params}] + net.set_train() + loss = nn.SoftmaxCrossEntropyWithLogits() + + opt = Momentum(group_params, learning_rate=default_lr, momentum=0.9) + assert opt.is_group is True + assert opt.dynamic_lr is False + for lr, param in zip(opt.learning_rate, opt.parameters): + if param in conv_params: + assert lr.data == Tensor(conv_lr, mstype.float32) + else: + assert lr.data == Tensor(default_lr, mstype.float32) + + net_with_loss = WithLossCell(net, loss) + train_network = TrainOneStepCell(net_with_loss, opt) + _executor.compile(train_network, inputs, label) + + +def test_group_dynamic_1(): + inputs = Tensor(np.ones([1, 1, 32, 32]).astype(np.float32) * 0.01) + label = Tensor(np.ones([1, 10]).astype(np.float32)) + + net = LeNet5() + conv_lr = 0.8 + default_lr = (0.1, 0.2, 0.3) + conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params())) + no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params())) + group_params = [{'params': conv_params, 'lr': conv_lr}, + {'params': no_conv_params}] + net.set_train() + loss = nn.SoftmaxCrossEntropyWithLogits() + + opt = Momentum(group_params, learning_rate=default_lr, momentum=0.9) + assert opt.is_group is True + assert opt.dynamic_lr is True + for lr, param in zip(opt.learning_rate, opt.parameters): + if param in conv_params: + assert lr.data == Tensor(np.array([conv_lr] * 3).astype(np.float32)) + else: + assert lr.data == Tensor(np.array(list(default_lr)).astype(np.float32)) + + net_with_loss = WithLossCell(net, loss) 
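
These tests compile a TrainOneStepCell over grouped parameters; that only works because the wrapper now takes its weight list from `optimizer.parameters` (see the cell_wrapper.py hunk above). A rough, simplified sketch of such a wrapper follows; the `construct` body is reconstructed here for illustration and omits the sens/parallel-mode handling of the real cell.

    # Rough sketch of a one-step training wrapper in the style of TrainOneStepCell after
    # this change: the weight list automatically matches the (possibly grouped) parameters
    # the optimizer was built with.  `network` is expected to return a scalar loss
    # (e.g. a WithLossCell).
    import mindspore.nn as nn
    from mindspore.ops import composite as C
    from mindspore.ops import functional as F
    from mindspore.ops import operations as P

    class SimpleTrainStep(nn.Cell):
        def __init__(self, network, optimizer, sens=1.0):
            super(SimpleTrainStep, self).__init__(auto_prefix=False)
            self.network = network
            self.weights = optimizer.parameters      # was ParameterTuple(network.trainable_params())
            self.optimizer = optimizer
            self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True)
            self.sens = sens

        def construct(self, data, label):
            loss = self.network(data, label)
            sens = P.Fill()(P.DType()(loss), P.Shape()(loss), self.sens)
            grads = self.grad(self.network, self.weights)(data, label, sens)
            return F.depend(loss, self.optimizer(grads))
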
+ train_network = TrainOneStepCell(net_with_loss, opt) + _executor.compile(train_network, inputs, label) + + +def test_group_dynamic_2(): + inputs = Tensor(np.ones([1, 1, 32, 32]).astype(np.float32) * 0.01) + label = Tensor(np.ones([1, 10]).astype(np.float32)) + + net = LeNet5() + conv_lr = (0.1, 0.2, 0.3) + default_lr = 0.8 + conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params())) + no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params())) + group_params = [{'params': conv_params, 'lr': conv_lr}, + {'params': no_conv_params}] + net.set_train() + loss = nn.SoftmaxCrossEntropyWithLogits() + + opt = RMSProp(group_params, learning_rate=default_lr) + assert opt.is_group is True + assert opt.dynamic_lr is True + for lr, param in zip(opt.learning_rate, opt.parameters): + if param in conv_params: + assert lr.data == Tensor(np.array(list(conv_lr)).astype(np.float32)) + else: + assert lr.data == Tensor(np.array([default_lr] * 3).astype(np.float32)) + + net_with_loss = WithLossCell(net, loss) + train_network = TrainOneStepCell(net_with_loss, opt) + _executor.compile(train_network, inputs, label) + + +def test_group_dynamic_no_same_size(): + net = LeNet5() + conv_lr = (0.1, 0.2, 0.3) + default_lr = (0.1, 0.2) + conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params())) + no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params())) + group_params = [{'params': conv_params, 'lr': conv_lr}, + {'params': no_conv_params}] + with pytest.raises(ValueError): + Momentum(group_params, learning_rate=default_lr, momentum=0.9) + + +def test_group_not_float_lr(): + net = LeNet5() + conv_lr = 1 + default_lr = 0.3 + conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params())) + no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params())) + group_params = [{'params': conv_params, 'lr': conv_lr}, + {'params': no_conv_params}] + with pytest.raises(TypeError): + Momentum(group_params, learning_rate=default_lr, momentum=0.9) + + +def test_group_not_float_weight_decay(): + net = LeNet5() + conv_weight_decay = 1 + conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params())) + no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params())) + group_params = [{'params': conv_params, 'weight_decay': conv_weight_decay}, + {'params': no_conv_params}] + with pytest.raises(TypeError): + Momentum(group_params, learning_rate=0.1, momentum=0.9) + + +def test_weight_decay(): + inputs = Tensor(np.ones([1, 1, 32, 32]).astype(np.float32) * 0.01) + label = Tensor(np.ones([1, 10]).astype(np.float32)) + + net = LeNet5() + conv_weight_decay = 0.8 + default_weight_decay = 0.0 + conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params())) + no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params())) + group_params = [{'params': conv_params, 'weight_decay': conv_weight_decay}, + {'params': no_conv_params}] + net.set_train() + loss = nn.SoftmaxCrossEntropyWithLogits() + + opt = SGD(group_params, learning_rate=0.1, weight_decay=default_weight_decay) + assert opt.is_group is True + for weight_decay, decay_flags, param in zip(opt.weight_decay, opt.decay_flags, opt.parameters): + if param in conv_params: + assert weight_decay == conv_weight_decay + assert decay_flags is True + else: + assert weight_decay == default_weight_decay + assert decay_flags is False + + net_with_loss = WithLossCell(net, loss) + train_network = 
TrainOneStepCell(net_with_loss, opt) + _executor.compile(train_network, inputs, label) + + +def test_group_repeat_param(): + net = LeNet5() + conv_lr = 0.1 + default_lr = 0.3 + conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params())) + no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params())) + group_params = [{'params': conv_params, 'lr': conv_lr}, + {'params': conv_params, 'lr': default_lr}, + {'params': no_conv_params}] + with pytest.raises(RuntimeError): + Adam(group_params, learning_rate=default_lr) From 5a259eb67e7105e40b5994a3978ea906ab5f79bc Mon Sep 17 00:00:00 2001 From: guohongzilong <2713219276@qq.com> Date: Fri, 15 May 2020 09:54:28 +0800 Subject: [PATCH 11/36] make optimizer parameter same as gradient --- mindspore/nn/wrap/grad_reducer.py | 2 +- mindspore/nn/wrap/loss_scale.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mindspore/nn/wrap/grad_reducer.py b/mindspore/nn/wrap/grad_reducer.py index ee57297fe0..8383910a60 100644 --- a/mindspore/nn/wrap/grad_reducer.py +++ b/mindspore/nn/wrap/grad_reducer.py @@ -141,7 +141,7 @@ class DistributedGradReducer(Cell): >>> super(TrainingWrapper, self).__init__(auto_prefix=False) >>> self.network = network >>> self.network.add_flags(defer_inline=True) - >>> self.weights = ParameterTuple(network.trainable_params()) + >>> self.weights = optimizer.parameters >>> self.optimizer = optimizer >>> self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True) >>> self.sens = sens diff --git a/mindspore/nn/wrap/loss_scale.py b/mindspore/nn/wrap/loss_scale.py index 65d66f0150..ae76cb055f 100644 --- a/mindspore/nn/wrap/loss_scale.py +++ b/mindspore/nn/wrap/loss_scale.py @@ -18,7 +18,7 @@ from mindspore.nn.wrap.grad_reducer import DistributedGradReducer from mindspore.train.parallel_utils import ParallelMode from mindspore.parallel._utils import _get_device_num, _get_parallel_mode, _get_mirror_mean from ..cell import Cell -from ...common import Tensor, ParameterTuple +from ...common import Tensor from ...common.parameter import Parameter from ...ops import functional as F from ...ops import composite as C @@ -201,7 +201,7 @@ class TrainOneStepWithLossScaleCell(Cell): super(TrainOneStepWithLossScaleCell, self).__init__(auto_prefix=False) self.network = network self.network.add_flags(defer_inline=True) - self.weights = ParameterTuple(network.trainable_params()) + self.weights = optimizer.parameters self.optimizer = optimizer self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True) self.hyper_map = C.HyperMap() From 22866fbe2577bbb1bcfe2befe15a8acdcb8f3c7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=A2=81=E6=88=90=E8=BE=89?= Date: Sat, 16 May 2020 12:17:06 +0800 Subject: [PATCH 12/36] Adapt to TBE Cast operator latest interface --- mindspore/ccsrc/transform/op_declare.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mindspore/ccsrc/transform/op_declare.cc b/mindspore/ccsrc/transform/op_declare.cc index 27c1d306aa..5cae6c77f7 100644 --- a/mindspore/ccsrc/transform/op_declare.cc +++ b/mindspore/ccsrc/transform/op_declare.cc @@ -823,7 +823,7 @@ OUTPUT_MAP(RealDiv) = {{0, OUTPUT_DESC(y)}}; // Cast INPUT_MAP(Cast) = {{1, INPUT_DESC(x)}}; INPUT_ATTR_MAP(Cast) = {{2, ATTR_DESC(dst_type, AnyTraits())}}; -ATTR_MAP(Cast) = {{"Truncate", ATTR_DESC(truncate, AnyTraits())}}; +ATTR_MAP(Cast) = EMPTY_ATTR_MAP; OUTPUT_MAP(Cast) = {{0, OUTPUT_DESC(y)}}; // Reciprocal From 7de7073a585f284ea3f7c9125aee592e3cea1a9f Mon Sep 17 00:00:00 2001 From: jinyaohui 
Date: Mon, 18 May 2020 15:16:51 +0800 Subject: [PATCH 13/36] add profiling --- mindspore/ccsrc/pipeline/init.cc | 6 ++- mindspore/ccsrc/utils/context/ms_context.cc | 6 +++ mindspore/ccsrc/utils/context/ms_context.h | 8 ++++ mindspore/context.py | 38 ++++++++++++++++++- tests/ut/python/pynative_mode/test_context.py | 31 ++++++++++++++- 5 files changed, 86 insertions(+), 3 deletions(-) diff --git a/mindspore/ccsrc/pipeline/init.cc b/mindspore/ccsrc/pipeline/init.cc index f1feedb64f..216f87765a 100644 --- a/mindspore/ccsrc/pipeline/init.cc +++ b/mindspore/ccsrc/pipeline/init.cc @@ -151,7 +151,11 @@ PYBIND11_MODULE(_c_expression, m) { "Set whether to enable dynamic mem pool.") .def("set_graph_memory_max_size", &mindspore::MsContext::set_graph_memory_max_size, "set graph memory max size.") .def("set_variable_memory_max_size", &mindspore::MsContext::set_variable_memory_max_size, - "set variable memory max size"); + "set variable memory max size") + .def("get_enable_profiling", &mindspore::MsContext::enable_profiling, "Get whether to open profiling.") + .def("set_enable_profiling", &mindspore::MsContext::set_enable_profiling, "Set whether to open profiling.") + .def("get_profiling_options", &mindspore::MsContext::profiling_options, "Get options to profiling.") + .def("set_profiling_options", &mindspore::MsContext::set_profiling_options, "Set options to profiling."); (void)py::class_>(m, "AutoParallelContext") .def_static("get_instance", &ParallelContext::GetInstance, "Get auto parallel context instance.") diff --git a/mindspore/ccsrc/utils/context/ms_context.cc b/mindspore/ccsrc/utils/context/ms_context.cc index b8b4b3d8a1..df9fe32833 100644 --- a/mindspore/ccsrc/utils/context/ms_context.cc +++ b/mindspore/ccsrc/utils/context/ms_context.cc @@ -78,6 +78,8 @@ MsContext::MsContext(const std::string &policy, const std::string &target) { graph_memory_max_size_ = "0"; variable_memory_max_size_ = "0"; enable_loop_sink_ = target == kAscendDevice || target == kDavinciDevice; + profiling_mode_ = false; + profiling_options_ = "training_trace"; } std::shared_ptr MsContext::GetInstance() { @@ -279,6 +281,10 @@ void MsContext::GetGeOptions(std::map *ge_options) con (*ge_options)["device_id"] = "0"; (*ge_options)["ge.exec.enableDump"] = std::to_string(enable_dump_); (*ge_options)["ge.exec.dumpPath"] = save_dump_path_; + (*ge_options)["ge.exec.profilingMode"] = std::to_string(profiling_mode_); + if (profiling_mode_) { + (*ge_options)["ge.exec.profilingOptions"] = profiling_options_; + } // only not supported in ge auto tbe_plugin_path = common::GetEnv("ME_TBE_PLUGIN_PATH"); if (!tbe_plugin_path.empty()) { diff --git a/mindspore/ccsrc/utils/context/ms_context.h b/mindspore/ccsrc/utils/context/ms_context.h index b2d594d10e..9895e70463 100644 --- a/mindspore/ccsrc/utils/context/ms_context.h +++ b/mindspore/ccsrc/utils/context/ms_context.h @@ -138,6 +138,12 @@ class MsContext { variable_memory_max_size_ = variable_memory_max_size; } + void set_enable_profiling(bool flag) { profiling_mode_ = flag; } + bool enable_profiling() const { return profiling_mode_; } + + void set_profiling_options(const std::string &options) { profiling_options_ = options; } + std::string profiling_options() const { return profiling_options_; } + private: MsContext(const std::string &backend_policy, const std::string &target); void GetGeOptions(std::map *ge_options) const; @@ -174,6 +180,8 @@ class MsContext { std::string graph_memory_max_size_; std::string variable_memory_max_size_; std::thread tdt_print_; + bool profiling_mode_; + 
std::string profiling_options_; }; } // namespace mindspore diff --git a/mindspore/context.py b/mindspore/context.py index 74acd7cd01..1f8bbbb423 100644 --- a/mindspore/context.py +++ b/mindspore/context.py @@ -305,6 +305,26 @@ class _Context: def save_dump_path(self, save_dump_path): self._context_handle.set_save_dump_path(save_dump_path) + @property + def enable_profiling(self): + return self._context_handle.get_enable_profiling() + + @enable_profiling.setter + def enable_profiling(self, flag): + self._context_handle.set_enable_profiling(flag) + + @property + def profiling_options(self): + return self._context_handle.get_profiling_options() + + @profiling_options.setter + def profiling_options(self, option): + options = ["training_trace", "task_trace", "task_trace:training_trace", "training_trace:task_trace", "op_trace"] + if option not in options: + raise ValueError("Profiling options must be in 'training_trace' 'task_trace' " + "'task_trace:training_trace' 'training_trace:task_trace' or 'op_trace'.") + self._context_handle.set_profiling_options(option) + @property def reserve_class_name_in_scope(self): """Gets whether to save the network class name in the scope.""" @@ -485,7 +505,7 @@ def reset_auto_parallel_context(): enable_mem_reuse=bool, save_ms_model=bool, save_ms_model_path=str, enable_gpu_summary=bool, enable_auto_mixed_precision=bool, enable_dump=bool, save_dump_path=str, enable_reduce_precision=bool, enable_dynamic_memory=bool, graph_memory_max_size=str, - variable_memory_max_size=str) + variable_memory_max_size=str, enable_profiling=bool, profiling_options=str) def set_context(**kwargs): """ Sets context for running environment. @@ -530,6 +550,21 @@ def set_context(**kwargs): enable_dynamic_memory (bool): Whether to enable dynamic memory. Default: False. graph_memory_max_size (str): Sets graph memory max size. Default: "26GB". variable_memory_max_size (str): Sets variable memory max size. Default: "5GB". + enable_profiling (bool): Whether to open profiling. Default: False. + profiling_options (str): Sets profiling collection options, operators can profiling data here. + Profiling collection options, the values are as follows, supporting the collection of multiple data. + + - training_trace: collect iterative trajectory data, that is, the training task and software information of + the AI software stack, to achieve performance analysis of the training task, focusing on data + enhancement, forward and backward calculation, gradient aggregation update and other related data. + + - task_trace: collect task trajectory data, that is, the hardware information of the HWTS/AICore of + the Ascend 910 processor, and analyze the information of start and end of the task. + + - op_trace: collect single operator performance data. + The profiling can choose training_trace, task_trace, training_trace and task_trace combination and + separated by colons; single operator can choose op_trace, op_trace cannot be combined with + training_trace and task_trace. Default: "training_trace". Raises: ValueError: If input key is not an attribute in context. 
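
A short usage sketch of the new profiling switches; the accepted option strings are exactly those whitelisted in the setter above, and "op_trace" cannot be combined with the trace options.

    # Usage sketch: turning profiling on and off through the context API.
    from mindspore import context

    context.set_context(enable_profiling=True, profiling_options="training_trace:task_trace")
    assert context.get_context("enable_profiling") is True

    # Accepted values: "training_trace", "task_trace", "task_trace:training_trace",
    # "training_trace:task_trace", "op_trace"; anything else raises ValueError.
    context.set_context(enable_profiling=False)      # switch profiling back off
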
@@ -553,6 +588,7 @@ def set_context(**kwargs): >>> context.set_context(mode=context.GRAPH_MODE, >>> device_target="Ascend",device_id=0, save_graphs=True, >>> save_graphs_path="/mindspore") + >>> context.set_context(enable_profiling=True, profiling_options="training_trace") """ for key, value in kwargs.items(): if not hasattr(_context(), key): diff --git a/tests/ut/python/pynative_mode/test_context.py b/tests/ut/python/pynative_mode/test_context.py index 2425b53f42..4b002eb0c8 100644 --- a/tests/ut/python/pynative_mode/test_context.py +++ b/tests/ut/python/pynative_mode/test_context.py @@ -16,6 +16,8 @@ import os import pytest from mindspore import context + + # pylint: disable=W0212 # W0212: protected-access @@ -72,6 +74,34 @@ def test_dump_target(): assert context.get_context("save_dump_path") == "." +def test_enable_profiling(): + """ test_profiling_mode """ + with pytest.raises(TypeError): + context.set_context(enable_profiling=1) + with pytest.raises(TypeError): + context.set_context(enable_profiling="1") + context.set_context(enable_profiling=True) + assert context.get_context("enable_profiling") is True + context.set_context(enable_profiling=False) + assert context.get_context("enable_profiling") is False + + +def test_profiling_options(): + """ test_profiling_options """ + with pytest.raises(TypeError): + context.set_context(profiling_options=True) + with pytest.raises(TypeError): + context.set_context(profiling_options=1) + with pytest.raises(ValueError): + context.set_context(profiling_options="training_") + with pytest.raises(ValueError): + context.set_context(profiling_options="training_trace:op_trace") + context.set_context(profiling_options="training_trace") + assert context.get_context("profiling_options") == "training_trace" + context.set_context(profiling_options="training_trace:task_trace") + assert context.get_context("profiling_options") == "training_trace:task_trace" + + def test_set_context(): """ test_set_context """ context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", @@ -101,4 +131,3 @@ def teardown_module(): os.rmdir(item_name) elif os.path.isfile(item_name): os.remove(item_name) - From 11089e6077154b337d7e0bc3cc7affc44c361b21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=A2=81=E6=88=90=E8=BE=89?= Date: Tue, 19 May 2020 10:56:22 +0800 Subject: [PATCH 14/36] Adapte ge lib name change form ge_client_train to ge_runner. 
--- mindspore/ccsrc/CMakeLists.txt | 2 +- tests/ut/cpp/CMakeLists.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mindspore/ccsrc/CMakeLists.txt b/mindspore/ccsrc/CMakeLists.txt index 8d3818a777..37842820a2 100644 --- a/mindspore/ccsrc/CMakeLists.txt +++ b/mindspore/ccsrc/CMakeLists.txt @@ -125,7 +125,7 @@ endif() if (ENABLE_GE) if(ENABLE_TRAIN) - target_link_libraries(mindspore ge_client_train hccl) + target_link_libraries(mindspore ge_runner hccl) else () target_link_libraries(mindspore ge_client) endif () diff --git a/tests/ut/cpp/CMakeLists.txt b/tests/ut/cpp/CMakeLists.txt index f5bc07ff69..8176c4fd37 100644 --- a/tests/ut/cpp/CMakeLists.txt +++ b/tests/ut/cpp/CMakeLists.txt @@ -128,7 +128,7 @@ add_executable(ut_tests ${UT_SRCS} ${MINDSPORE_SRC_LIST} ${UT_SUTB_SRC_LIST}) if (ENABLE_GE) if(ENABLE_TRAIN) - target_link_libraries(ut_tests PRIVATE graph ge_client_train) + target_link_libraries(ut_tests PRIVATE graph ge_runner) else() target_link_libraries(ut_tests PRIVATE graph ge_client) endif() From 69cd90ae431d56c8476eed316dfc245e90b55414 Mon Sep 17 00:00:00 2001 From: zhoufeng Date: Thu, 21 May 2020 22:35:25 +0800 Subject: [PATCH 15/36] Use low version cxx11 abi Signed-off-by: zhoufeng --- CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index dc07ccae8b..6b69c510d5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,6 +7,9 @@ endif () include(${CMAKE_SOURCE_DIR}/cmake/options.cmake) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/modules/") +if (ENABLE_GE) + add_compile_definitions(_GLIBCXX_USE_CXX11_ABI=0) +endif () if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin") set(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -O2 -Werror -Wno-return-std-move -Wno-unused-private-field -Wno-unused-lambda-capture -Wno-sign-compare -Wno-overloaded-virtual -Wno-unneeded-internal-declaration -Wno-unused-variable -Wno-pessimizing-move -Wno-inconsistent-missing-override -DHALF_ENABLE_CPP11_USER_LITERALS=0 -D_FORTIFY_SOURCE=2") From 3fcc6f0c718be4bd3679b6a1544dde288e05e285 Mon Sep 17 00:00:00 2001 From: zhaojichen Date: Tue, 26 May 2020 08:38:36 -0400 Subject: [PATCH 16/36] fix bn train/eval problem --- mindspore/nn/layer/normalization.py | 32 +++++++++++++++++++---------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/mindspore/nn/layer/normalization.py b/mindspore/nn/layer/normalization.py index fd9279cf04..a66fe93e27 100644 --- a/mindspore/nn/layer/normalization.py +++ b/mindspore/nn/layer/normalization.py @@ -41,7 +41,7 @@ class _BatchNorm(Cell): beta_init='zeros', moving_mean_init='zeros', moving_var_init='ones', - use_batch_statistics=True, + use_batch_statistics=None, device_num_each_group=1): super(_BatchNorm, self).__init__() if num_features < 1: @@ -143,7 +143,11 @@ class _BatchNorm(Cell): return y def construct(self, x): - if self.training and self.use_batch_statistics: + if self.use_batch_statistics is None: + flag = self.training + else: + flag = self.use_batch_statistics + if flag: if self.is_ge_backend and self.is_global: axes, re_shape = _shape_infer(F.shape(x), self.num_features) y = self._global_sync(x, axes, re_shape) @@ -228,8 +232,10 @@ class BatchNorm1d(_BatchNorm): moving_var_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the moving variance. The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform', 'he_uniform', etc. Default: 'ones'. 
- use_batch_statistics (bool): If true, use the mean value and variance value of current batch data, else use - the mean value and variance value of specified value. Default: True. + use_batch_statistics (bool): If true, use the mean value and variance value of current batch data. If false, + use the mean value and variance value of specified value. If None, training process will use the mean and + variance of current batch data and track the running mean and variance, eval process will use the running + mean and variance. Default: None. Inputs: - **input** (Tensor) - Tensor of shape :math:`(N, C_{in}, H_{in}, W_{in})`. @@ -251,7 +257,7 @@ class BatchNorm1d(_BatchNorm): beta_init='zeros', moving_mean_init='zeros', moving_var_init='ones', - use_batch_statistics=True): + use_batch_statistics=None): super(BatchNorm1d, self).__init__(num_features, eps, momentum, @@ -299,8 +305,10 @@ class BatchNorm2d(_BatchNorm): moving_var_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the moving variance. The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform', 'he_uniform', etc. Default: 'ones'. - use_batch_statistics (bool): If true, use the mean value and variance value of current batch data, else use - the mean value and variance value of specified value. Default: True. + use_batch_statistics (bool): If true, use the mean value and variance value of current batch data. If false, + use the mean value and variance value of specified value. If None, training process will use the mean and + variance of current batch data and track the running mean and variance, eval process will use the running + mean and variance. Default: None. Inputs: - **input** (Tensor) - Tensor of shape :math:`(N, C_{in}, H_{in}, W_{in})`. @@ -322,7 +330,7 @@ class BatchNorm2d(_BatchNorm): beta_init='zeros', moving_mean_init='zeros', moving_var_init='ones', - use_batch_statistics=True): + use_batch_statistics=None): super(BatchNorm2d, self).__init__(num_features, eps, momentum, @@ -368,8 +376,10 @@ class GlobalBatchNorm(_BatchNorm): moving_var_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the moving variance. The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform', 'he_uniform', etc. Default: 'ones'. - use_batch_statistics (bool): If true, use the mean value and variance value of current batch data, else use - the mean value and variance value of specified value. Default: True. + use_batch_statistics (bool): If true, use the mean value and variance value of current batch data. If false, + use the mean value and variance value of specified value. If None, training process will use the mean and + variance of current batch data and track the running mean and variance, eval process will use the running + mean and variance. Default: None. Inputs: - **input** (Tensor) - Tensor of shape :math:`(N, C_{in}, H_{in}, W_{in})`. 
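
A behaviour sketch of the new `use_batch_statistics` default: with None the layer follows the cell's training flag (batch statistics in training, tracked running statistics in evaluation), True always uses the current batch, and False always uses the running statistics. This is a small PyNative-style example; whether it executes depends on the available backend.

    # Sketch: BatchNorm2d with the new default (use_batch_statistics=None).
    import numpy as np
    import mindspore.nn as nn
    from mindspore import Tensor

    bn = nn.BatchNorm2d(num_features=3)                  # use_batch_statistics defaults to None
    x = Tensor(np.random.randn(2, 3, 4, 4).astype(np.float32))

    bn.set_train(True)
    y_train = bn(x)      # normalizes with batch statistics and updates the moving stats
    bn.set_train(False)
    y_eval = bn(x)       # normalizes with the tracked moving mean and variance
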
@@ -391,7 +401,7 @@ class GlobalBatchNorm(_BatchNorm): beta_init='zeros', moving_mean_init='zeros', moving_var_init='ones', - use_batch_statistics=True, + use_batch_statistics=None, device_num_each_group=1): super(GlobalBatchNorm, self).__init__(num_features, eps, From 3bf9ca38bf31e1e10c205c49d8db149953c60125 Mon Sep 17 00:00:00 2001 From: xutianchun Date: Tue, 26 May 2020 16:53:38 +0800 Subject: [PATCH 17/36] register op: ctcloss --- mindspore/ops/_op_impl/aicpu/__init__.py | 1 + mindspore/ops/_op_impl/aicpu/ctcloss.py | 42 ++++++++++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 mindspore/ops/_op_impl/aicpu/ctcloss.py diff --git a/mindspore/ops/_op_impl/aicpu/__init__.py b/mindspore/ops/_op_impl/aicpu/__init__.py index 37d008940d..5138d0f28c 100644 --- a/mindspore/ops/_op_impl/aicpu/__init__.py +++ b/mindspore/ops/_op_impl/aicpu/__init__.py @@ -24,3 +24,4 @@ from .flatten import _flatten_aicpu from .squeeze import _squeeze_aicpu from .expand_dims import _expand_dims_aicpu from .random_choice_with_mask import _random_choice_with_mask_aicpu +from .ctcloss import _ctcloss_aicpu diff --git a/mindspore/ops/_op_impl/aicpu/ctcloss.py b/mindspore/ops/_op_impl/aicpu/ctcloss.py new file mode 100644 index 0000000000..c393cb04b6 --- /dev/null +++ b/mindspore/ops/_op_impl/aicpu/ctcloss.py @@ -0,0 +1,42 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""CTCLoss op""" +from mindspore.ops.op_info_register import op_info_register, AiCPURegOp, DataType +ctcloss_op_info = AiCPURegOp("CTCLoss") \ + .fusion_type("OPAQUE") \ + .input(0, "inputs", "required") \ + .input(1, "labels_indices", "required") \ + .input(2, "labels_values", "required") \ + .input(3, "sequence_length", "required") \ + .output(0, "loss", "required") \ + .output(1, "gradient", "required") \ + .attr("preprocess_collapse_repeated", "bool") \ + .attr("ctc_merge_repeated", "bool") \ + .attr("ignore_longer_outputs_than_inputs", "bool") \ + .dtype_format(DataType.F32_Default, DataType.I64_Default, DataType.I32_Default, DataType.I32_Default, + DataType.F32_Default, DataType.F32_Default) \ + .dtype_format(DataType.F64_Default, DataType.I64_Default, DataType.I32_Default, DataType.I32_Default, + DataType.F64_Default, DataType.F64_Default) \ + .dtype_format(DataType.F32_NCHW, DataType.I64_NCHW, DataType.I32_NCHW, DataType.I32_NCHW, + DataType.F32_NCHW, DataType.F32_NCHW) \ + .dtype_format(DataType.F64_NCHW, DataType.I64_NCHW, DataType.I32_NCHW, DataType.I32_NCHW, + DataType.F64_NCHW, DataType.F64_NCHW) \ + .get_op_info() + +@op_info_register(ctcloss_op_info) +def _ctcloss_aicpu(): + """CTCLoss AiCPU register""" + return From 1f5d3ce567dcf35e962ae0e81137dff9804ef9a9 Mon Sep 17 00:00:00 2001 From: chenhaozhe Date: Thu, 28 May 2020 10:30:56 +0800 Subject: [PATCH 18/36] fix performance of bert --- mindspore/ccsrc/pre_activate/common/helper.cc | 2 +- mindspore/nn/optim/lamb.py | 2 +- mindspore/ops/_grad/grad_math_ops.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/mindspore/ccsrc/pre_activate/common/helper.cc b/mindspore/ccsrc/pre_activate/common/helper.cc index decaaaca62..bfdf092205 100644 --- a/mindspore/ccsrc/pre_activate/common/helper.cc +++ b/mindspore/ccsrc/pre_activate/common/helper.cc @@ -686,7 +686,7 @@ bool IsSameNode(const EquivPtr &equiv1, const EquivPtr &equiv2, const VarPtr &va MS_EXCEPTION_IF_NULL(equiv1_node); auto equiv2_node = GetAnfNodeByVar(equiv2, var_node); MS_EXCEPTION_IF_NULL(equiv2_node); - return equiv1_node == equiv2_node; + return *equiv1_node == *equiv2_node; } AnfNodePtr GetAnfNodeByVar(const EquivPtr &equiv, const VarPtr &var_node) { diff --git a/mindspore/nn/optim/lamb.py b/mindspore/nn/optim/lamb.py index b4d478f52a..a6a38f164a 100755 --- a/mindspore/nn/optim/lamb.py +++ b/mindspore/nn/optim/lamb.py @@ -180,7 +180,7 @@ class Lamb(Optimizer): beta2=0.999, eps=1e-6, weight_decay=0.0, - decay_filter=lambda x: 'LayerNorm' not in x.name and 'bias' not in x.name): + decay_filter=lambda x: 'layernorm' not in x.name.lower() and 'bias' not in x.name.lower()): super(Lamb, self).__init__(start_learning_rate, params) if self.is_group: diff --git a/mindspore/ops/_grad/grad_math_ops.py b/mindspore/ops/_grad/grad_math_ops.py index f457148d51..8edf6d82f2 100755 --- a/mindspore/ops/_grad/grad_math_ops.py +++ b/mindspore/ops/_grad/grad_math_ops.py @@ -191,8 +191,8 @@ def get_bprop_mul(self): mul_func = P.Mul() def bprop(x, y, out, dout): - bc_dx = mul_func(dout, y) - bc_dy = mul_func(dout, x) + bc_dx = mul_func(y, dout) + bc_dy = mul_func(x, dout) return binop_grad_common(x, y, bc_dx, bc_dy) return bprop From e3d1e2f55b43d74e313ae5810e2945018414b2f3 Mon Sep 17 00:00:00 2001 From: yanzhenxiang2020 Date: Tue, 26 May 2020 09:57:48 +0800 Subject: [PATCH 19/36] add RNNTLoss and RandomCategorical op for aicpu --- mindspore/ops/_grad/grad_nn_ops.py | 12 ++++ 
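With the AiCPU kernel above in place, the existing P.CTCLoss primitive can be dispatched on Ascend. A rough usage sketch following the input/output list registered above; the sparse-label layout and the shapes here are assumptions, not taken from this patch:

import numpy as np
from mindspore import Tensor
from mindspore.ops import operations as P

max_time, batch_size, num_classes = 6, 2, 5
inputs = Tensor(np.random.randn(max_time, batch_size, num_classes).astype(np.float32))
# Labels in sparse form: (sample, position) indices plus the label values themselves.
labels_indices = Tensor(np.array([[0, 0], [0, 1], [1, 0]], dtype=np.int64))
labels_values = Tensor(np.array([2, 3, 1], dtype=np.int32))
sequence_length = Tensor(np.array([max_time] * batch_size, dtype=np.int32))

ctc_loss = P.CTCLoss()
loss, gradient = ctc_loss(inputs, labels_indices, labels_values, sequence_length)
# Per the dtype rows registered above, loss and gradient share the dtype of `inputs`
# (float32 or float64); the gradient is consumed by the CTCLoss bprop.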
mindspore/ops/_op_impl/aicpu/__init__.py | 2 + .../ops/_op_impl/aicpu/random_categorical.py | 48 +++++++++++++++ mindspore/ops/_op_impl/aicpu/rnnt_loss.py | 37 ++++++++++++ mindspore/ops/operations/__init__.py | 5 +- mindspore/ops/operations/nn_ops.py | 55 ++++++++++++++++++ mindspore/ops/operations/random_ops.py | 58 +++++++++++++++++++ .../test_aicpu_ops/test_random_categorical.py | 38 ++++++++++++ .../ascend/test_aicpu_ops/test_rnnt_loss.py | 43 ++++++++++++++ 9 files changed, 297 insertions(+), 1 deletion(-) create mode 100644 mindspore/ops/_op_impl/aicpu/random_categorical.py create mode 100644 mindspore/ops/_op_impl/aicpu/rnnt_loss.py create mode 100644 tests/st/ops/ascend/test_aicpu_ops/test_random_categorical.py create mode 100644 tests/st/ops/ascend/test_aicpu_ops/test_rnnt_loss.py diff --git a/mindspore/ops/_grad/grad_nn_ops.py b/mindspore/ops/_grad/grad_nn_ops.py index c557301285..9f543c63cd 100755 --- a/mindspore/ops/_grad/grad_nn_ops.py +++ b/mindspore/ops/_grad/grad_nn_ops.py @@ -518,6 +518,18 @@ def get_bprop_l2_loss(self): return bprop +@bprop_getters.register(P.RNNTLoss) +def get_bprop_rnnt_loss(self): + """Grad definition for `RNNTLoss` operation.""" + expand = P.ExpandDims() + + def bprop(acts, labels, act_lens, label_lens, out, dout): + grad_loss = out[1] + grad = grad_loss * expand(expand(expand(dout[0], -1), -1), -1) + return grad, zeros_like(labels), zeros_like(act_lens), zeros_like(label_lens) + return bprop + + @bprop_getters.register(P.PReLU) def get_bprop_prelu(self): """Grad definition for `PReLU` operation.""" diff --git a/mindspore/ops/_op_impl/aicpu/__init__.py b/mindspore/ops/_op_impl/aicpu/__init__.py index 5138d0f28c..bb490d050b 100644 --- a/mindspore/ops/_op_impl/aicpu/__init__.py +++ b/mindspore/ops/_op_impl/aicpu/__init__.py @@ -25,3 +25,5 @@ from .squeeze import _squeeze_aicpu from .expand_dims import _expand_dims_aicpu from .random_choice_with_mask import _random_choice_with_mask_aicpu from .ctcloss import _ctcloss_aicpu +from .rnnt_loss import _rnnt_loss_aicpu +from .random_categorical import _random_categorical_aicpu diff --git a/mindspore/ops/_op_impl/aicpu/random_categorical.py b/mindspore/ops/_op_impl/aicpu/random_categorical.py new file mode 100644 index 0000000000..a0c6f64c97 --- /dev/null +++ b/mindspore/ops/_op_impl/aicpu/random_categorical.py @@ -0,0 +1,48 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
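The RNNTLoss bprop registered above does no new math of its own: the forward kernel already returns the gradient with respect to `acts`, and the bprop only rescales it by the incoming cost gradient, with the three ExpandDims calls used purely for broadcasting. The same arithmetic in NumPy, with illustrative shapes:

import numpy as np

B, T, U, V = 2, 3, 4, 5
saved_grads = np.random.randn(B, T, U, V).astype(np.float32)  # out[1] saved by the forward op
cost_grad = np.random.randn(B).astype(np.float32)             # dout[0], gradient w.r.t. costs

# (B,) -> (B, 1, 1, 1) so it broadcasts across (B, T, U, V),
# mirroring expand(expand(expand(dout[0], -1), -1), -1) in the bprop.
acts_grad = saved_grads * cost_grad.reshape(B, 1, 1, 1)
assert acts_grad.shape == (B, T, U, V)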
+# ============================================================================ + +"""RandomCategorical op""" +from mindspore.ops.op_info_register import op_info_register, AiCPURegOp, DataType + +random_categorical_op_info = AiCPURegOp("RandomCategorical") \ + .fusion_type("OPAQUE") \ + .input(0, "logits", "required") \ + .input(1, "num_sample", "required") \ + .input(2, "seed", "required") \ + .output(0, "output", "required") \ + .dtype_format(DataType.F16_Default, DataType.I32_Default, DataType.I32_Default, DataType.I16_Default) \ + .dtype_format(DataType.F32_Default, DataType.I32_Default, DataType.I32_Default, DataType.I16_Default) \ + .dtype_format(DataType.F64_Default, DataType.I32_Default, DataType.I32_Default, DataType.I16_Default) \ + .dtype_format(DataType.F16_Default, DataType.I32_Default, DataType.I32_Default, DataType.I32_Default) \ + .dtype_format(DataType.F32_Default, DataType.I32_Default, DataType.I32_Default, DataType.I32_Default) \ + .dtype_format(DataType.F64_Default, DataType.I32_Default, DataType.I32_Default, DataType.I32_Default) \ + .dtype_format(DataType.F16_Default, DataType.I32_Default, DataType.I32_Default, DataType.I64_Default) \ + .dtype_format(DataType.F32_Default, DataType.I32_Default, DataType.I32_Default, DataType.I64_Default) \ + .dtype_format(DataType.F64_Default, DataType.I32_Default, DataType.I32_Default, DataType.I64_Default) \ + .dtype_format(DataType.F16_Default, DataType.I64_Default, DataType.I64_Default, DataType.I16_Default) \ + .dtype_format(DataType.F32_Default, DataType.I64_Default, DataType.I64_Default, DataType.I16_Default) \ + .dtype_format(DataType.F64_Default, DataType.I64_Default, DataType.I64_Default, DataType.I16_Default) \ + .dtype_format(DataType.F16_Default, DataType.I64_Default, DataType.I64_Default, DataType.I32_Default) \ + .dtype_format(DataType.F32_Default, DataType.I64_Default, DataType.I64_Default, DataType.I32_Default) \ + .dtype_format(DataType.F64_Default, DataType.I64_Default, DataType.I64_Default, DataType.I32_Default) \ + .dtype_format(DataType.F16_Default, DataType.I64_Default, DataType.I64_Default, DataType.I64_Default) \ + .dtype_format(DataType.F32_Default, DataType.I64_Default, DataType.I64_Default, DataType.I64_Default) \ + .dtype_format(DataType.F64_Default, DataType.I64_Default, DataType.I64_Default, DataType.I64_Default) \ + .get_op_info() + +@op_info_register(random_categorical_op_info) +def _random_categorical_aicpu(): + """RandomCategorical AiCPU register""" + return diff --git a/mindspore/ops/_op_impl/aicpu/rnnt_loss.py b/mindspore/ops/_op_impl/aicpu/rnnt_loss.py new file mode 100644 index 0000000000..d35d102048 --- /dev/null +++ b/mindspore/ops/_op_impl/aicpu/rnnt_loss.py @@ -0,0 +1,37 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""RNNTLoss op""" +from mindspore.ops.op_info_register import op_info_register, AiCPURegOp, DataType + +rnnt_loss_op_info = AiCPURegOp("RNNTLoss") \ + .fusion_type("OPAQUE") \ + .input(0, "acts", "required") \ + .input(1, "labels", "required") \ + .input(2, "input_lengths", "required") \ + .input(3, "label_lengths", "required") \ + .output(0, "costs", "required") \ + .output(1, "grads", "required") \ + .attr("blank_label", "int") \ + .dtype_format(DataType.F32_NCHW, DataType.I32_NCHW, DataType.I32_NCHW, DataType.I32_NCHW, DataType.F32_NCHW, + DataType.F32_NCHW) \ + .dtype_format(DataType.F32_Default, DataType.I32_Default, DataType.I32_Default, DataType.I32_Default, + DataType.F32_Default, DataType.F32_Default) \ + .get_op_info() + +@op_info_register(rnnt_loss_op_info) +def _rnnt_loss_aicpu(): + """RNNTLoss AiCPU register""" + return diff --git a/mindspore/ops/operations/__init__.py b/mindspore/ops/operations/__init__.py index af9e84685a..87601c5592 100644 --- a/mindspore/ops/operations/__init__.py +++ b/mindspore/ops/operations/__init__.py @@ -48,7 +48,7 @@ from .math_ops import (Abs, ACos, AddN, AssignAdd, AssignSub, Atan2, BatchMatMul Reciprocal, CumSum, Sin, Sqrt, Rsqrt, Square, Sub, TensorAdd, Sign, Round, SquareSumAll) -from .random_ops import (RandomChoiceWithMask) +from .random_ops import (RandomChoiceWithMask, RandomCategorical) from .nn_ops import (LSTM, SGD, Adam, ApplyMomentum, BatchNorm, BiasAdd, Conv2D, DepthwiseConv2dNative, @@ -63,6 +63,7 @@ from .nn_ops import (LSTM, SGD, Adam, ApplyMomentum, BatchNorm, ResizeBilinear, Sigmoid, SigmoidCrossEntropyWithLogits, SmoothL1Loss, Softmax, Softplus, + RNNTLoss, SoftmaxCrossEntropyWithLogits, ROIAlign, SparseSoftmaxCrossEntropyWithLogits, Tanh, TopK, BinaryCrossEntropy, SparseApplyAdagrad, LARSUpdate, ApplyFtrl, @@ -147,6 +148,7 @@ __all__ = [ 'HSigmoid', 'Tanh', 'RandomChoiceWithMask', + 'RandomCategorical', 'ResizeBilinear', 'ScalarSummary', 'ImageSummary', @@ -174,6 +176,7 @@ __all__ = [ 'SmoothL1Loss', 'L2Loss', 'CTCLoss', + 'RNNTLoss', 'ReduceAll', 'ScalarToArray', 'ScalarToTensor', diff --git a/mindspore/ops/operations/nn_ops.py b/mindspore/ops/operations/nn_ops.py index dcc5810105..4a19d0e113 100644 --- a/mindspore/ops/operations/nn_ops.py +++ b/mindspore/ops/operations/nn_ops.py @@ -1599,6 +1599,61 @@ class L2Loss(PrimitiveWithInfer): return x_type +class RNNTLoss(PrimitiveWithInfer): + """ + Computes the RNNTLoss and its gradient with respect to the softmax outputs. + + Args: + blank_label (int): blank label. Default: 0. + + Inputs: + - **acts** (Tensor[float32]) - Tensor of shape :math:`(B, T, U, V)`. + - **labels** (Tensor[int32]) - Tensor of shape :math:`(B, N)`. + - **input_lengths** (Tensor[int32]) - Tensor of shape :math:`(B,)`. + - **label_lebgths** (Tensor[int32]) - Tensor of shape :math:`(B,)`. + + Outputs: + - **costs** (Tensor[int32]) - Tensor of shape :math:`(B,)`. + - **grads** (Tensor[int32]) - Has the same shape as `acts`. 
+ + Examples: + >>> B, T, U, V = 1, 2, 3, 5 + >>> acts = np.random.random((B, T, U, V)).astype(np.float32) + >>> labels = np.array([[1, 2]]).astype(np.int32) + >>> input_length = np.array([T] * B).astype(np.int32) + >>> label_length = np.array([len(l) for l in labels]).astype(np.int32) + >>> rnnt_loss = P.RNNTLoss(blank_label=blank) + >>> costs, grads = rnnt_loss(Tensor(acts), Tensor(labels), Tensor(input_length), Tensor(label_length)) + """ + @prim_attr_register + def __init__(self, blank_label=0): + validator.check_value_type('blank_label', blank_label, [int], self.name) + self.init_prim_io_names(inputs=['acts', 'labels', 'input_length', 'label_length'], + outputs=['costs', 'grads']) + + def infer_shape(self, acts_shape, labels_shape, input_length_shape, label_length_shape): + validator.check_integer('acts_rank', len(acts_shape), 4, Rel.EQ, self.name) + validator.check_integer('labels_rank', len(labels_shape), 2, Rel.EQ, self.name) + validator.check_integer('input_length_rank', len(input_length_shape), 1, Rel.EQ, self.name) + validator.check_integer('label_length_rank', len(label_length_shape), 1, Rel.EQ, self.name) + validator.check('labels shape[0]', labels_shape[0], 'acts shape[0]', acts_shape[0], Rel.EQ, self.name) + validator.check('input_length size', input_length_shape[0], 'acts shape[0]', acts_shape[0], Rel.EQ, self.name) + validator.check('label_length size', label_length_shape[0], 'acts shape[0]', acts_shape[0], Rel.EQ, self.name) + costs_shape = (acts_shape[0],) + return (costs_shape, acts_shape) + + def infer_dtype(self, acts_type, labels_type, input_length_type, label_length_type): + validator.check_subclass("acts_type", acts_type, mstype.tensor, self.name) + validator.check_subclass("labels_type", labels_type, mstype.tensor, self.name) + validator.check_subclass("input_length_type", input_length_type, mstype.tensor, self.name) + validator.check_subclass("label_length_type", label_length_type, mstype.tensor, self.name) + validator.check_tensor_type_same({"acts_type": acts_type}, [mstype.float32], self.name) + validator.check_tensor_type_same({"labels_type": labels_type}, [mstype.int32], self.name) + validator.check_tensor_type_same({"input_length_type": input_length_type}, [mstype.int32], self.name) + validator.check_tensor_type_same({"label_length_type": label_length_type}, [mstype.int32], self.name) + return (acts_type, acts_type) + + class SGD(PrimitiveWithInfer): """ Computes stochastic gradient descent (optionally with momentum). diff --git a/mindspore/ops/operations/random_ops.py b/mindspore/ops/operations/random_ops.py index 2692b43b46..77201c25f9 100644 --- a/mindspore/ops/operations/random_ops.py +++ b/mindspore/ops/operations/random_ops.py @@ -64,3 +64,61 @@ class RandomChoiceWithMask(PrimitiveWithInfer): def infer_dtype(self, x_dtype): validator.check_tensor_type_same({'x': x_dtype}, [mstype.bool_], self.name) return (mstype.int32, mstype.bool_) + + +class RandomCategorical(PrimitiveWithInfer): + """ + Generates random samples from a given categorical distribution tensor. + + Args: + dtype (mindspore.dtype): The type of output. Its value should be one of [mindspore.int16, + mindspore.int32, mindspore.int64]. Default: mindspore.int64. + + Inputs: + - **logits** (Tensor) - The input tensor. 2-D Tensor with shape [batch_size, num_classes]. + - **num_sample** (int) - Number of sample to be drawn. Only constant values is allowed. + - **seed** (int) - Random seed. Default: 0. + + Outputs: + - **output** (Tensor) - The output Tensor with shape [batch_size, num_samples]. 
+ + Examples: + >>> class Net(nn.Cell): + >>> def __init__(self, num_sample): + >>> super(Net, self).__init__() + >>> self.random_categorical = P.RandomCategorical(mindspore.int64) + >>> self.num_sample = num_sample + >>> def construct(self, logits, seed=0): + >>> return self.random_categorical(logits, self.num_sample, seed) + >>> + >>> x = np.random.random((10, 5)).astype(np.float32) + >>> net = Net(8) + >>> output = net(Tensor(x)) + """ + @prim_attr_register + def __init__(self, dtype=mstype.int64): + """Init RandomCategorical""" + self.dtype = dtype + + valid_values = (mstype.int32, mstype.int16, mstype.int64) + validator.check_type_name("dtype", dtype, valid_values, self.name) + self.init_prim_io_names(inputs=['logits', 'num_samples', 'seed'], + outputs=['output']) + + def __infer__(self, logits, num_samples, seed): + logits_dtype = logits['dtype'] + valid_types = (mstype.float32, mstype.float16, mstype.float64) + validator.check_tensor_type_same({'logits': logits_dtype}, valid_types, self.name) + num_samples_v = num_samples['value'] + seed_v = seed['value'] + validator.check_value_type('num_samples', num_samples_v, (int,), self.name) + validator.check_value_type('seed', seed_v, (int,), self.name) + validator.check_integer("num_samples", num_samples_v, 0, Rel.GT, self.name) + x_shape = list(logits['shape']) + if len(x_shape) != 2: + raise ValueError("RandomCategorical shape should be 2-dimension.") + ndim = len(x_shape) - 1 + x_shape[ndim] = num_samples_v + return {'shape': (x_shape), + 'dtype': (self.dtype), + 'value': None} diff --git a/tests/st/ops/ascend/test_aicpu_ops/test_random_categorical.py b/tests/st/ops/ascend/test_aicpu_ops/test_random_categorical.py new file mode 100644 index 0000000000..6304e8b111 --- /dev/null +++ b/tests/st/ops/ascend/test_aicpu_ops/test_random_categorical.py @@ -0,0 +1,38 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +import mindspore +from mindspore import Tensor +from mindspore.ops import operations as P +import mindspore.nn as nn +from mindspore.common.api import ms_function +import numpy as np +import mindspore.context as context +context.set_context(mode=context.GRAPH_MODE, device_target="Ascend") +class Net(nn.Cell): + def __init__(self, num_sample): + super(Net, self).__init__() + self.random_categorical = P.RandomCategorical(mindspore.int64) + self.num_sample = num_sample + + def construct(self, logits, seed=0): + return self.random_categorical(logits, self.num_sample, seed) + +def test_net(): + x = np.random.random((10, 5)).astype(np.float32) + net = Net(8) + output = net(Tensor(x)) + print(x) + print(output.asnumpy()) + print(output.dtype()) diff --git a/tests/st/ops/ascend/test_aicpu_ops/test_rnnt_loss.py b/tests/st/ops/ascend/test_aicpu_ops/test_rnnt_loss.py new file mode 100644 index 0000000000..c7e2df07f8 --- /dev/null +++ b/tests/st/ops/ascend/test_aicpu_ops/test_rnnt_loss.py @@ -0,0 +1,43 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +import mindspore as ms +from mindspore import Tensor +from mindspore.ops import operations as P +import mindspore.nn as nn +from mindspore.common.api import ms_function +import numpy as np +import mindspore.context as context +context.set_context(mode=context.GRAPH_MODE, device_target="Ascend") +class Net(nn.Cell): + def __init__(self): + super(Net, self).__init__() + self.rnnt_loss = P.RNNTLoss(blank_label=0) + + def construct(self, acts, labels, act_lens, label_lens): + return self.rnnt_loss(acts, labels, act_lens, label_lens) + + +def test_net(): + B, T, U, V = 1, 2, 3, 5 + acts = np.random.random((B, T, U, V)).astype(np.float32) + labels = np.array([[np.random.randint(1, V-1) for _ in range(U-1)]]).astype(np.int32) + input_length = np.array([T] * B).astype(np.int32) + label_length = np.array([len(l) for l in labels]).astype(np.int32) + + rnnt_loss = Net() + costs, grads = rnnt_loss(Tensor(acts), Tensor(labels), Tensor(input_length), Tensor(label_length)) + print(Tensor(acts), Tensor(labels), Tensor(input_length), Tensor(label_length)) + print(costs.asnumpy()) + print(grads.asnumpy()) From 7716f7618c5a5d2d3b7d705511492769f41954ac Mon Sep 17 00:00:00 2001 From: zhaozhenlong Date: Fri, 22 May 2020 11:22:27 +0800 Subject: [PATCH 20/36] add op BasicLSTMCell --- mindspore/_checkparam.py | 3 + mindspore/ops/_grad/grad_nn_ops.py | 22 ++++ mindspore/ops/_op_impl/tbe/__init__.py | 4 + mindspore/ops/_op_impl/tbe/basic_lstm_cell.py | 57 ++++++++++ .../tbe/basic_lstm_cell_c_state_grad.py | 50 +++++++++ .../tbe/basic_lstm_cell_input_grad.py | 42 +++++++ .../tbe/basic_lstm_cell_weight_grad.py | 41 +++++++ mindspore/ops/operations/__init__.py | 5 +- mindspore/ops/operations/_grad_ops.py | 103 +++++++++++++++++ mindspore/ops/operations/nn_ops.py | 106 
++++++++++++++++++ tests/ut/python/ops/test_ops.py | 5 + 11 files changed, 436 insertions(+), 2 deletions(-) create mode 100644 mindspore/ops/_op_impl/tbe/basic_lstm_cell.py create mode 100644 mindspore/ops/_op_impl/tbe/basic_lstm_cell_c_state_grad.py create mode 100644 mindspore/ops/_op_impl/tbe/basic_lstm_cell_input_grad.py create mode 100644 mindspore/ops/_op_impl/tbe/basic_lstm_cell_weight_grad.py diff --git a/mindspore/_checkparam.py b/mindspore/_checkparam.py index 0c101bf1a8..d8ca5a9845 100644 --- a/mindspore/_checkparam.py +++ b/mindspore/_checkparam.py @@ -299,6 +299,9 @@ class Validator: def get_typename(t): return t.__name__ if hasattr(t, '__name__') else str(t) + if isinstance(arg_type, type(mstype.tensor)): + arg_type = arg_type.element_type() + if arg_type in valid_types: return arg_type type_names = [get_typename(t) for t in valid_types] diff --git a/mindspore/ops/_grad/grad_nn_ops.py b/mindspore/ops/_grad/grad_nn_ops.py index fa34ac545f..e998afb269 100755 --- a/mindspore/ops/_grad/grad_nn_ops.py +++ b/mindspore/ops/_grad/grad_nn_ops.py @@ -709,3 +709,25 @@ def get_bprop_ctc_loss(self): return grad, zeros_like(labels_indices), zeros_like(labels_values), zeros_like(sequence_length) return bprop + + +@bprop_getters.register(P.BasicLSTMCell) +def get_bprop_basic_lstm_cell(self): + """Grad definition for `BasicLSTMCell` operation.""" + basic_lstm_cell_cstate_grad = G.BasicLSTMCellCStateGrad( + forget_bias=self.forget_bias, + activation=self.activation + ) + + basic_lstm_cell_weight_grad = G.BasicLSTMCellWeightGrad() + + basic_lstm_cell_input_grad = G.BasicLSTMCellInputGrad(keep_prob=self.keep_prob) + + def bprop(x, h, c, w, b, out, dout): + _, _, it, jt, ft, ot, tanhct = out + dct, dht, _, _, _, _, _ = dout + dgate, dct_1 = basic_lstm_cell_cstate_grad(c, dht, dct, it, jt, ft, ot, tanhct) + dxt, dht = basic_lstm_cell_input_grad(dgate, w) + dw, db = basic_lstm_cell_weight_grad(F.depend(x, dxt), h, dgate) + return dxt, dht, dct_1, dw, db + return bprop diff --git a/mindspore/ops/_op_impl/tbe/__init__.py b/mindspore/ops/_op_impl/tbe/__init__.py index a85db03759..3d1825b53e 100644 --- a/mindspore/ops/_op_impl/tbe/__init__.py +++ b/mindspore/ops/_op_impl/tbe/__init__.py @@ -227,3 +227,7 @@ from .asinh_grad import _asinh_grad_tbe from .atan import _atan_tbe from .atan_grad import _atan_grad_tbe from .atanh import _atanh_tbe +from .basic_lstm_cell import _basic_lstm_cell_tbe +from .basic_lstm_cell_c_state_grad import _basic_lstm_cell_c_state_grad_tbe +from .basic_lstm_cell_weight_grad import _basic_lstm_cell_weight_grad_tbe +from .basic_lstm_cell_input_grad import _basic_lstm_cell_input_grad_tbe diff --git a/mindspore/ops/_op_impl/tbe/basic_lstm_cell.py b/mindspore/ops/_op_impl/tbe/basic_lstm_cell.py new file mode 100644 index 0000000000..76ad1e4607 --- /dev/null +++ b/mindspore/ops/_op_impl/tbe/basic_lstm_cell.py @@ -0,0 +1,57 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
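The BasicLSTMCell bprop above chains three dedicated grad kernels, and the shape flow is easier to follow written out. A shape-bookkeeping sketch with illustrative sizes (the names b, x, h are not from the patch; they stand for batch_size, input_size and hidden_size):

b, x, h = 16, 32, 64
dgate = (b, 4 * h)                               # BasicLSTMCellCStateGrad -> dgate
dct_1 = (b, h)                                   # BasicLSTMCellCStateGrad -> dct_1
dxt, dht = (b, x), (b, h)                        # BasicLSTMCellInputGrad(dgate, w)
dw, db = (4 * h, x + h, 1, 1), (4 * h, 1, 1, 1)  # BasicLSTMCellWeightGrad(x, h, dgate)
# The bprop returns (dxt, dht, dct_1, dw, db), one gradient per forward input (x, h, c, w, b),
# matching the infer_shape definitions added to _grad_ops.py later in this patch.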
+# ============================================================================ + +"""BasicLSTMCell op""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +basic_lstm_cell_op_info = TBERegOp("BasicLSTMCell") \ + .fusion_type("OPAQUE") \ + .async_flag(False) \ + .binfile_name("basic_lstm_cell.so") \ + .compute_cost(10) \ + .kernel_name("basic_lstm_cell") \ + .attr("keep_prob", "optional", "float", "all") \ + .attr("forget_bias", "optional", "float", "all") \ + .attr("state_is_tuple", "optional", "bool", "true") \ + .attr("activation", "optional", "str", "all") \ + .partial_flag(True) \ + .input(0, "x", False, "required", "all") \ + .input(1, "h", False, "required", "all") \ + .input(2, "c", False, "required", "all") \ + .input(3, "w", False, "required", "all") \ + .input(4, "b", False, "required", "all") \ + .input(5, "mask", False, "optional", "all") \ + .output(0, "ct", False, "required", "all") \ + .output(1, "ht", False, "required", "all") \ + .output(2, "it", False, "optional", "all") \ + .output(3, "jt", False, "optional", "all") \ + .output(4, "ft", False, "optional", "all") \ + .output(5, "ot", False, "optional", "all") \ + .output(6, "tanhct", False, "optional", "all") \ + .dtype_format(DataType.F16_FracNZ, DataType.F16_FracNZ, DataType.F32_FracNZ, DataType.F16_FracZ, + DataType.F32_Default, DataType.U8_Default, DataType.F32_FracNZ, DataType.F16_FracNZ, + DataType.F32_FracNZ, DataType.F32_FracNZ, DataType.F32_FracNZ, DataType.F32_FracNZ, + DataType.F32_FracNZ) \ + .dtype_format(DataType.F16_FracNZ, DataType.F16_FracNZ, DataType.F16_FracNZ, DataType.F16_FracZ, + DataType.F16_Default, DataType.U8_Default, DataType.F16_FracNZ, DataType.F16_FracNZ, + DataType.F16_FracNZ, DataType.F16_FracNZ, DataType.F16_FracNZ, DataType.F16_FracNZ, + DataType.F16_FracNZ) \ + .get_op_info() + + +@op_info_register(basic_lstm_cell_op_info) +def _basic_lstm_cell_tbe(): + """BasicLSTMCell TBE register""" + return diff --git a/mindspore/ops/_op_impl/tbe/basic_lstm_cell_c_state_grad.py b/mindspore/ops/_op_impl/tbe/basic_lstm_cell_c_state_grad.py new file mode 100644 index 0000000000..099756ad35 --- /dev/null +++ b/mindspore/ops/_op_impl/tbe/basic_lstm_cell_c_state_grad.py @@ -0,0 +1,50 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""BasicLSTMCellCStateGrad op""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +basic_lstm_cell_c_state_grad_op_info = TBERegOp("BasicLSTMCellCStateGrad") \ + .fusion_type("OPAQUE") \ + .async_flag(False) \ + .binfile_name("basic_lstm_cell_c_state_grad.so") \ + .compute_cost(10) \ + .kernel_name("basic_lstm_cell_c_state_grad") \ + .attr("forget_bias", "optional", "float", "all") \ + .attr("activation", "optional", "str", "all") \ + .partial_flag(True) \ + .input(0, "c", False, "required", "all") \ + .input(1, "dht", False, "required", "all") \ + .input(2, "dct", False, "required", "all") \ + .input(3, "it", False, "required", "all") \ + .input(4, "ft", False, "required", "all") \ + .input(5, "jt", False, "required", "all") \ + .input(6, "ot", False, "required", "all") \ + .input(7, "tanhct", False, "required", "all") \ + .output(0, "dgate", False, "required", "all") \ + .output(1, "dct_1", False, "required", "all") \ + .dtype_format(DataType.F32_FracNZ, DataType.F32_FracNZ, DataType.F32_FracNZ, DataType.F32_FracNZ, + DataType.F32_FracNZ, DataType.F32_FracNZ, DataType.F32_FracNZ, DataType.F32_FracNZ, + DataType.F16_FracNZ, DataType.F16_FracNZ) \ + .dtype_format(DataType.F16_FracNZ, DataType.F16_FracNZ, DataType.F16_FracNZ, DataType.F16_FracNZ, + DataType.F16_FracNZ, DataType.F16_FracNZ, DataType.F16_FracNZ, DataType.F16_FracNZ, + DataType.F32_FracNZ, DataType.F16_FracNZ) \ + .get_op_info() + + +@op_info_register(basic_lstm_cell_c_state_grad_op_info) +def _basic_lstm_cell_c_state_grad_tbe(): + """BasicLSTMCellCStateGrad TBE register""" + return diff --git a/mindspore/ops/_op_impl/tbe/basic_lstm_cell_input_grad.py b/mindspore/ops/_op_impl/tbe/basic_lstm_cell_input_grad.py new file mode 100644 index 0000000000..d976d1143b --- /dev/null +++ b/mindspore/ops/_op_impl/tbe/basic_lstm_cell_input_grad.py @@ -0,0 +1,42 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""BasicLSTMCellInputGrad op""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +basic_lstm_cell_input_grad_op_info = TBERegOp("BasicLSTMCellInputGrad") \ + .fusion_type("OPAQUE") \ + .async_flag(False) \ + .binfile_name("basic_lstm_cell_input_grad.so") \ + .compute_cost(10) \ + .kernel_name("basic_lstm_cell_input_grad") \ + .attr("keep_prob", "optional", "float", "all") \ + .partial_flag(True) \ + .input(0, "dgate", False, "required", "all") \ + .input(1, "w", False, "required", "all") \ + .input(2, "dropout_mask", False, "optional", "all") \ + .output(0, "dxt", False, "required", "all") \ + .output(1, "dht", False, "required", "all") \ + .dtype_format(DataType.F16_FracNZ, DataType.F16_FracZ, DataType.U8_Default, DataType.F32_FracNZ, + DataType.F32_FracNZ) \ + .dtype_format(DataType.F16_FracNZ, DataType.F16_FracZ, DataType.U8_Default, DataType.F16_FracNZ, + DataType.F16_FracNZ) \ + .get_op_info() + + +@op_info_register(basic_lstm_cell_input_grad_op_info) +def _basic_lstm_cell_input_grad_tbe(): + """BasicLSTMCellInputGrad TBE register""" + return diff --git a/mindspore/ops/_op_impl/tbe/basic_lstm_cell_weight_grad.py b/mindspore/ops/_op_impl/tbe/basic_lstm_cell_weight_grad.py new file mode 100644 index 0000000000..83726bc510 --- /dev/null +++ b/mindspore/ops/_op_impl/tbe/basic_lstm_cell_weight_grad.py @@ -0,0 +1,41 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""BasicLSTMCellWeightGrad op""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +basic_lstm_cell_weight_grad_op_info = TBERegOp("BasicLSTMCellWeightGrad") \ + .fusion_type("OPAQUE") \ + .async_flag(False) \ + .binfile_name("basic_lstm_cell_weight_grad.so") \ + .compute_cost(10) \ + .kernel_name("basic_lstm_cell_weight_grad") \ + .partial_flag(True) \ + .input(0, "x", False, "required", "all") \ + .input(1, "h", False, "required", "all") \ + .input(2, "dgate", False, "required", "all") \ + .output(0, "dw", False, "required", "all") \ + .output(1, "db", False, "required", "all") \ + .dtype_format(DataType.F16_FracNZ, DataType.F16_FracNZ, DataType.F16_FracNZ, DataType.F16_FracZ, + DataType.F32_Default) \ + .dtype_format(DataType.F16_FracNZ, DataType.F16_FracNZ, DataType.F16_FracNZ, DataType.F16_FracZ, + DataType.F16_Default) \ + .get_op_info() + + +@op_info_register(basic_lstm_cell_weight_grad_op_info) +def _basic_lstm_cell_weight_grad_tbe(): + """BasicLSTMCellWeightGrad TBE register""" + return diff --git a/mindspore/ops/operations/__init__.py b/mindspore/ops/operations/__init__.py index a5c2e9edbb..e282219983 100644 --- a/mindspore/ops/operations/__init__.py +++ b/mindspore/ops/operations/__init__.py @@ -71,7 +71,7 @@ from .nn_ops import (LSTM, SGD, Adam, ApplyMomentum, BatchNorm, SparseSoftmaxCrossEntropyWithLogits, Tanh, TopK, BinaryCrossEntropy, SparseApplyAdagrad, LARSUpdate, ApplyFtrl, SparseApplyFtrl, ApplyProximalAdagrad, SparseApplyProximalAdagrad, - ApplyRMSProp, ApplyCenteredRMSProp) + ApplyRMSProp, ApplyCenteredRMSProp, BasicLSTMCell) from .other_ops import Assign, IOU, BoundingBoxDecode, BoundingBoxEncode, CheckValid, MakeRefKey, CheckBprop from . 
import _quant_ops from ._quant_ops import * @@ -285,7 +285,8 @@ __all__ = [ "BesselI0e", "BesselI1e", "Atan", - "Atanh" + "Atanh", + "BasicLSTMCell" ] __all__.extend(_quant_ops.__all__) diff --git a/mindspore/ops/operations/_grad_ops.py b/mindspore/ops/operations/_grad_ops.py index 6a2bf43e83..008f5f0edb 100644 --- a/mindspore/ops/operations/_grad_ops.py +++ b/mindspore/ops/operations/_grad_ops.py @@ -1173,3 +1173,106 @@ class AtanGrad(PrimitiveWithInfer): args = {"x": x, "dout": dout} validator.check_tensor_type_same(args, mstype.number_type, self.name) return x + + +class BasicLSTMCellCStateGrad(PrimitiveWithInfer): + """Computes the state gradients of BasicLSTMCell.""" + + @prim_attr_register + def __init__(self, forget_bias, activation): + self.forget_bias = validator.check_value_type("forget_bias", forget_bias, [float], self.name) + self.activation = validator.check_string("activation", activation, ['tanh'], self.name) + + def infer_shape(self, c_shape, dht_shape, dct_shape, it_shape, jt_shape, ft_shape, ot_shape, tanhct_shape): + # dhy and dcy should be same shape + validator.check_integer("c rank", len(c_shape), 2, Rel.EQ, self.name) + validator.check("dht rank", len(dht_shape), "c rank", len(c_shape), Rel.EQ, self.name) + validator.check("dct rank", len(dct_shape), "c rank", len(c_shape), Rel.EQ, self.name) + validator.check("it rank", len(it_shape), "c rank", len(c_shape), Rel.EQ, self.name) + validator.check("jt rank", len(jt_shape), "c rank", len(c_shape), Rel.EQ, self.name) + validator.check("ft rank", len(ft_shape), "c rank", len(c_shape), Rel.EQ, self.name) + validator.check("ot rank", len(ot_shape), "c rank", len(c_shape), Rel.EQ, self.name) + validator.check("tanhct rank", len(tanhct_shape), "c rank", len(c_shape), Rel.EQ, self.name) + validator.check("dht shape", dht_shape, "c shape", c_shape, Rel.EQ, self.name) + validator.check("dct shape", dct_shape, "c shape", c_shape, Rel.EQ, self.name) + validator.check("it shape", it_shape, "c shape", c_shape, Rel.EQ, self.name) + validator.check("jt shape", jt_shape, "c shape", c_shape, Rel.EQ, self.name) + validator.check("ft shape", ft_shape, "c shape", c_shape, Rel.EQ, self.name) + validator.check("ot shape", ot_shape, "c shape", c_shape, Rel.EQ, self.name) + validator.check("tanhct shape", tanhct_shape, "c shape", c_shape, Rel.EQ, self.name) + + dgate_shape = (c_shape[0], 4 * c_shape[1]) + dct_1_shape = c_shape + + return (dgate_shape, dct_1_shape) + + def infer_dtype(self, c_dtype, dht_dtype, dct_dtype, it_dtype, jt_dtype, ft_dtype, ot_dtype, tanhct_dtype): + validator.check_subclass("c", c_dtype, [mstype.tensor], self.name) + validator.check_subclass("dht", dht_dtype, [mstype.tensor], self.name) + validator.check_subclass("dct", dct_dtype, [mstype.tensor], self.name) + validator.check_subclass("it", it_dtype, [mstype.tensor], self.name) + validator.check_subclass("jt", jt_dtype, [mstype.tensor], self.name) + validator.check_subclass("ft", ft_dtype, [mstype.tensor], self.name) + validator.check_subclass("ot", ot_dtype, [mstype.tensor], self.name) + validator.check_subclass("tanhct", tanhct_dtype, [mstype.tensor], self.name) + validator.check_type_name("c", c_dtype, [mstype.float16, mstype.float32], self.name) + validator.check_type_name("dht", dht_dtype, [mstype.float16, mstype.float32], self.name) + validator.check_type_name("dct", dct_dtype, [mstype.float16, mstype.float32], self.name) + validator.check_type_name("it", it_dtype, [mstype.float16, mstype.float32], self.name) + validator.check_type_name("jt", jt_dtype, 
[mstype.float16, mstype.float32], self.name) + validator.check_type_name("ft", ft_dtype, [mstype.float16, mstype.float32], self.name) + validator.check_type_name("ot", ot_dtype, [mstype.float16, mstype.float32], self.name) + validator.check_type_name("tanhct", tanhct_dtype, [mstype.float16, mstype.float32], self.name) + return (c_dtype, c_dtype) + + +class BasicLSTMCellWeightGrad(PrimitiveWithInfer): + """Computes the weight gradients of BasicLSTM.""" + + @prim_attr_register + def __init__(self): + pass + + def infer_shape(self, x_shape, h_shape, dgate_shape): + validator.check_integer("x rank", len(x_shape), 2, Rel.EQ, self.name) + validator.check("h rank", len(h_shape), " x rank", len(x_shape), Rel.EQ, self.name) + validator.check("dgate rank", len(dgate_shape), "x rank", len(x_shape), Rel.EQ, self.name) + validator.check("h_shape[0]", h_shape[0], "x_shape[0]", x_shape[0], Rel.EQ, self.name) + validator.check("dgate_shape[0]", dgate_shape[0], "h_shape[0]", h_shape[0], Rel.EQ, self.name) + validator.check("dgate_shape[1]", dgate_shape[1], "4*h_shape[1]", 4 * h_shape[1], Rel.EQ, self.name) + dw_shape = (dgate_shape[1], x_shape[1] + h_shape[1], 1, 1) + db_shape = (dgate_shape[1], 1, 1, 1) + return (dw_shape, db_shape) + + def infer_dtype(self, x_dtype, h_dtype, dgate_dtype): + validator.check_subclass("x", x_dtype, mstype.tensor, self.name) + validator.check_subclass("h", h_dtype, mstype.tensor, self.name) + validator.check_subclass("dgate", dgate_dtype, mstype.tensor, self.name) + validator.check_type_name("x", x_dtype, [mstype.float16, mstype.float32], self.name) + validator.check_type_name("h", h_dtype, [mstype.float16, mstype.float32], self.name) + validator.check_type_name("dgate", dgate_dtype, [mstype.float16, mstype.float32], self.name) + return (x_dtype, x_dtype) + + +class BasicLSTMCellInputGrad(PrimitiveWithInfer): + """Computes the input gradients of BasicLSTM.""" + + @prim_attr_register + def __init__(self, keep_prob): + self.keep_prob = validator.check_value_type("keep_prob", keep_prob, [float], self.name) + self.keep_prob = validator.check_number_range("keep_prob", keep_prob, 0.0, 1.0, Rel.INC_BOTH, self.name) + + def infer_shape(self, dgate_shape, w_shape): + validator.check_integer("dgate rank", len(dgate_shape), 2, Rel.EQ, self.name) + validator.check_integer("w rank", len(w_shape), 4, Rel.EQ, self.name) + validator.check("dgate_shape[1]", dgate_shape[1], "w_shape[0]", w_shape[0], Rel.EQ, self.name) + dxt_shape = (dgate_shape[0], w_shape[1] - w_shape[0] // 4) + dht_shape = (dgate_shape[0], dgate_shape[1] // 4) + return (dxt_shape, dht_shape) + + def infer_dtype(self, dgate_dtype, w_dtype): + validator.check_subclass("dgate", dgate_dtype, mstype.tensor, self.name) + validator.check_subclass("w", w_dtype, mstype.tensor, self.name) + validator.check_type_name("dgate", dgate_dtype, [mstype.float16, mstype.float32], self.name) + validator.check_type_name("w", w_dtype, [mstype.float16, mstype.float32], self.name) + return (dgate_dtype, dgate_dtype) diff --git a/mindspore/ops/operations/nn_ops.py b/mindspore/ops/operations/nn_ops.py index 98a3ccd9a7..027a9e9525 100644 --- a/mindspore/ops/operations/nn_ops.py +++ b/mindspore/ops/operations/nn_ops.py @@ -3418,3 +3418,109 @@ class CTCLoss(PrimitiveWithInfer): validator.check_tensor_type_same({"labels_values_dtype": labels_values}, [mstype.int32], self.name) validator.check_tensor_type_same({"sequence_length_dtype": sequence_length}, [mstype.int32], self.name) return inputs, inputs + + +class BasicLSTMCell(PrimitiveWithInfer): + r""" 
+ Performs the long short term memory(LSTM) on the input. + + .. math:: + \begin{array}{ll} \\ + i_t = \sigma(W_{ix} x_t + b_{ix} + W_{ih} h_{(t-1)} + b_{ih}) \\ + f_t = \sigma(W_{fx} x_t + b_{fx} + W_{fh} h_{(t-1)} + b_{fh}) \\ + \tilde{c}_t = \tanh(W_{cx} x_t + b_{cx} + W_{ch} h_{(t-1)} + b_{ch}) \\ + o_t = \sigma(W_{ox} x_t + b_{ox} + W_{oh} h_{(t-1)} + b_{oh}) \\ + c_t = f_t * c_{(t-1)} + i_t * \tilde{c}_t \\ + h_t = o_t * \tanh(c_t) \\ + \end{array} + + Here :math:`\sigma` is the sigmoid function, and :math:`*` is the Hadamard product. :math:`W, b` + are learnable weights between the output and the input in the formula. For instance, + :math:`W_{ix}, b_{ix}` are the weight and bias used to transform from input :math:`x` to :math:`i`. + Details can be found in paper `LONG SHORT-TERM MEMORY + `_ and + `Long Short-Term Memory Recurrent Neural Network Architectures for Large Scale Acoustic Modeling + `_. + + Args: + keep_prob (float): If not 1.0, append `Dropout` layer on the outputs of each + LSTM layer except the last layer. Default 1.0. The range of dropout is [0.0, 1.0]. + forget_bias (float): Add forget bias to forget gate biases in order to decrease former scale. Default to 1.0. + state_is_tuple (bool): If True, state is tensor tuple, containing h and c; If False, one tensor, + need split first. Default to True. + activation (str): Activation. Default to "tanh". + + Inputs: + - **x** (Tensor) - Current words. Tensor of shape (`batch_size`, `input_size`). + - **h** (Tensor) - Hidden state last moment. Tensor of shape (`batch_size`, `hidden_size`). + - **c** (Tensor) - Cell state last moment. Tensor of shape (`batch_size`, `hidden_size`). + - **w** (Tensor) - Weight. Tensor of shape (`4 x hidden_size`, `input_size + hidden_size`, 1, 1). + - **b** (Tensor) - Bias. Tensor of shape (`4 x hidden_size`, 1, 1, 1). + + Outputs: + - **ct** (Tensor) - Forward :math:`c_t` cache at moment `t`. Tensor of shape (`batch_size`, `hidden_size`). + - **ht** (Tensor) - Cell output. Tensor of shape (`batch_size`, `hidden_size`). + - **it** (Tensor) - Forward :math:`i_t` cache at moment `t`. Tensor of shape (`batch_size`, `4 x hidden_size`). + - **jt** (Tensor) - Forward :math:`j_t` cache at moment `t`. Tensor of shape (`batch_size`, `4 x hidden_size`). + - **ft** (Tensor) - Forward :math:`f_t` cache at moment `t`. Tensor of shape (`batch_size`, `4 x hidden_size`). + - **ot** (Tensor) - Forward :math:`o_t` cache at moment `t`. Tensor of shape (`batch_size`, `4 x hidden_size`). + - **tanhct** (Tensor) - Forward :math:`tanh c_t` cache at moment `t`. + Tensor of shape (`batch_size`, `4 x hidden_size`). 
+ + Examples: + 'block': P.BasicLSTMCell(keep_prob=1.0, forget_bias=1.0, state_is_tuple=True, activation='tanh'), + 'desc_inputs': [[128, 128], [128, 128], [128, 128], [512, 256, 1, 1],[512, 1, 1, 1]], + 'desc_bprop': [[128, 128], [128, 128], [128, 128], [128, 128], [128, 128], [128, 128], [128, 128]], + + >>> x = Tensor(np.random.rand(128, 128).astype(np.float16)) + >>> h = Tensor(np.random.rand(128, 128).astype(np.float16)) + >>> c = Tensor(np.random.rand(128, 128).astype(np.float16)) + >>> w = Tensor(np.random.rand(512, 256, 1, 1).astype(np.float16)) + >>> b = Tensor(np.random.rand(512, 1, 1, 1).astype(np.float16)) + >>> lstm = P.BasicLSTMCell(keep_prob=1.0, forget_bias=1.0, state_is_tuple=True, activation='tanh') + >>> lstm(x, h, c, w, b) + """ + + @prim_attr_register + def __init__(self, keep_prob=1.0, forget_bias=1.0, state_is_tuple=True, activation="tanh"): + self.keep_prob = validator.check_value_type("keep_prob", keep_prob, [float], self.name) + self.keep_prob = validator.check_number_range("keep_prob", keep_prob, 0.0, 1.0, Rel.INC_BOTH, self.name) + self.forget_bias = validator.check_value_type("forget_bias", forget_bias, [float], self.name) + self.state_is_tuple = validator.check_value_type("state_is_tuple", state_is_tuple, [bool], self.name) + self.activation = validator.check_string("activation", activation, ['tanh'], self.name) + + def infer_shape(self, x_shape, h_shape, c_shape, w_shape, b_shape): + # (batch_size, input_size) + validator.check_integer("x_shape", len(x_shape), 2, Rel.EQ, self.name) + + # h and c should be same shape + validator.check_integer("h_shape", len(h_shape), 2, Rel.EQ, self.name) + validator.check("h rank", len(h_shape), "c rank", len(c_shape), Rel.EQ, self.name) + validator.check("h shape", h_shape, "c shape", c_shape, Rel.EQ, self.name) + validator.check_integer("w rank", len(w_shape), 4, Rel.EQ, self.name) + validator.check_integer("b rank", len(b_shape), 4, Rel.EQ, self.name) + validator.check("w_shape[0]", w_shape[0], "4*h_shape[1]", 4 * h_shape[1], Rel.EQ, self.name) + validator.check("w_shape[1]", w_shape[1], "x_shape[1]+h_shape[1]", x_shape[1] + h_shape[1], Rel.EQ, self.name) + validator.check("b_shape[0]", b_shape[0], "4*h_shape[1]", 4*h_shape[1], Rel.EQ, self.name) + ct_shape = c_shape + ht_shape = h_shape + it_shape = h_shape + jt_shape = h_shape + ft_shape = h_shape + ot_shape = h_shape + tanhct_shape = h_shape + + return (ct_shape, ht_shape, it_shape, jt_shape, ft_shape, ot_shape, tanhct_shape) + + def infer_dtype(self, x_dtype, h_dtype, c_dtype, w_dtype, b_dtype): + validator.check_subclass("x", x_dtype, [mstype.tensor], self.name) + validator.check_subclass("h", h_dtype, [mstype.tensor], self.name) + validator.check_subclass("c", c_dtype, [mstype.tensor], self.name) + validator.check_subclass("w", w_dtype, [mstype.tensor], self.name) + validator.check_subclass("b", b_dtype, [mstype.tensor], self.name) + validator.check_type_name("x", x_dtype, [mstype.float16, mstype.float32], self.name) + validator.check_type_name("h", h_dtype, [mstype.float16, mstype.float32], self.name) + validator.check_type_name("c", c_dtype, [mstype.float16, mstype.float32], self.name) + validator.check_type_name("w", w_dtype, [mstype.float16, mstype.float32], self.name) + validator.check_type_name("b", b_dtype, [mstype.float16, mstype.float32], self.name) + return (x_dtype, x_dtype, x_dtype, x_dtype, x_dtype, x_dtype, x_dtype) diff --git a/tests/ut/python/ops/test_ops.py b/tests/ut/python/ops/test_ops.py index 9482d7b1ee..bbf33b3c94 100755 --- 
a/tests/ut/python/ops/test_ops.py +++ b/tests/ut/python/ops/test_ops.py @@ -878,6 +878,11 @@ test_case_nn_ops = [ 'desc_inputs': [[128, 64, 32, 32], [128, 64, 32, 32], [64], [64], [64]], 'desc_bprop': [[128, 64, 32, 32], [64], [64], [64], [64]], 'skip': ['backward']}), + ('BasicLSTMCell', { + 'block': P.BasicLSTMCell(keep_prob=1.0, forget_bias=1.0, state_is_tuple=True, activation='tanh'), + 'desc_inputs': [[128, 128], [128, 128], [128, 128], [512, 256, 1, 1],[512, 1, 1, 1]], + 'desc_bprop': [[128, 128], [128, 128], [128, 128], [128, 128], [128, 128], [128, 128], [128, 128]], + 'skip': []}), ('TopK', { 'block': P.TopK(), 'desc_const': [5], From 021c87700ff90b3752058595f54b9c3c02cbe6bc Mon Sep 17 00:00:00 2001 From: Wei Luning Date: Mon, 1 Jun 2020 16:59:23 +0800 Subject: [PATCH 21/36] fix bug in do signature --- .gitignore | 1 + mindspore/ccsrc/kernel/common_utils.cc | 2 +- .../ccsrc/operator/composite/do_signature.cc | 51 +++++++------ mindspore/ccsrc/operator/prim_others.cc | 2 +- mindspore/ccsrc/optimizer/irpass.cc | 4 +- .../ccsrc/optimizer/irpass/ref_eliminate.h | 5 ++ .../ccsrc/pipeline/parse/function_block.cc | 6 +- .../static_analysis/abstract_value.cc | 4 +- .../ccsrc/pipeline/static_analysis/prim.cc | 2 +- mindspore/common/dtype.py | 2 +- mindspore/ops/operations/array_ops.py | 20 +++-- mindspore/ops/operations/math_ops.py | 4 +- mindspore/ops/operations/other_ops.py | 2 + tests/ut/python/ops/test_layer_switch.py | 15 ++++ tests/ut/python/ops/test_nn_ops.py | 17 ----- tests/ut/python/ops/test_signature.py | 75 +++++++++++++++++++ 16 files changed, 153 insertions(+), 59 deletions(-) create mode 100644 tests/ut/python/ops/test_signature.py diff --git a/.gitignore b/.gitignore index b5d3193101..77ff222a1a 100644 --- a/.gitignore +++ b/.gitignore @@ -65,6 +65,7 @@ test_temp_summary_event_file/ *.ckpt *.shp *.pkl +*.pb .clangd mindspore/version.py mindspore/default_config.py diff --git a/mindspore/ccsrc/kernel/common_utils.cc b/mindspore/ccsrc/kernel/common_utils.cc index 54980c2cb7..2769e0c42a 100644 --- a/mindspore/ccsrc/kernel/common_utils.cc +++ b/mindspore/ccsrc/kernel/common_utils.cc @@ -253,7 +253,7 @@ std::string Dtype2String(const std::string &dtypes) { std::string TypeId2String(TypeId type_id) { auto iter = type_id_str_map.find(type_id); if (iter == type_id_str_map.end()) { - MS_EXCEPTION(ArgumentError) << "Illegal input dtype." 
<< TypeIdLabel(type_id); + return std::string(TypeIdLabel(type_id)); } return iter->second; } diff --git a/mindspore/ccsrc/operator/composite/do_signature.cc b/mindspore/ccsrc/operator/composite/do_signature.cc index 5300a3dede..0cc4ee0483 100644 --- a/mindspore/ccsrc/operator/composite/do_signature.cc +++ b/mindspore/ccsrc/operator/composite/do_signature.cc @@ -47,16 +47,6 @@ const std::vector &GetSignature(const ValuePtr &function) { return empty; } -const std::string GetOpName(const ValuePtr &function) { - std::string name = ""; - if (function->isa()) { - name = function->cast()->name(); - } else if (function->isa()) { - name = function->cast()->name(); - } - return name; -} - void ProcessDefault(const std::string &func_name, const AbstractBasePtrList &args_spec_list, const std::vector &signature, bool has_var, std::vector *const op_inputs) { std::size_t sig_size = signature.size(); @@ -93,7 +83,8 @@ void setMaxType(TypeId *max_type_id, TypeId *max_type, size_t *max_type_number, *max_type_number = type_number; } -TypeId GetMaxTypeId(const abstract::AbstractBasePtrList &args_spec_list, std::vector indexs) { +TypeId GetMaxTypeId(const abstract::AbstractBasePtrList &args_spec_list, std::vector indexs, + const std::set &write_indexs) { TypeId max_type_id = kTypeUnknown; TypeId max_type = kTypeUnknown; size_t max_type_number = 0; @@ -103,7 +94,12 @@ TypeId GetMaxTypeId(const abstract::AbstractBasePtrList &args_spec_list, std::ve TypeId arg_type = kTypeUnknown; AbstractBasePtr arg_value = args_spec_list[index]; if (arg_value->isa()) { - arg_value = arg_value->cast()->ref(); + auto is_write = (write_indexs.find(index) != write_indexs.end()); + if (is_write) { + arg_value = arg_value->cast()->ref_origin(); + } else { + arg_value = arg_value->cast()->ref(); + } } if (arg_value->isa()) { auto tensor = arg_value->cast(); @@ -157,7 +153,8 @@ TypeId GetMaxTypeId(const abstract::AbstractBasePtrList &args_spec_list, std::ve // Get the largest type of index in the same SignatureEnumDType of arguments. std::map GetMaxDtype(const std::vector &dtypes, - const abstract::AbstractBasePtrList &args_spec_list) { + const abstract::AbstractBasePtrList &args_spec_list, + const std::set &write_indexs) { // record index for signature.dtypes of the same type // eg. [T, T1, T, T2, T, T1, T3] -> {{T:(0,2,4)}, {T1:(1,5)}, {T2:(3)}, {T3:(6)}} std::map> type_indexs; @@ -192,7 +189,7 @@ std::map GetMaxDtype(const std::vector &signature, const abstract::AbstractBasePtrList &args_spec_list, - const FuncGraphPtr &graph, std::vector *const op_inputs, - const std::set &write_indexs) { +void DoAutoCast(const std::string &func_name, const std::vector &signature, + const abstract::AbstractBasePtrList &args_spec_list, const FuncGraphPtr &graph, + std::vector *const op_inputs, const std::set &write_indexs) { std::vector dtypes; (void)std::transform(signature.begin(), signature.end(), std::back_inserter(dtypes), [](const Signature &sig) { return sig.dtype; }); @@ -216,16 +213,23 @@ void DoAutoCast(const std::vector &signature, const abstract::Abstrac return; } // Stat the index of the arguments with the largest type in the same SignatureEnumDType. 
- std::map dst_type = GetMaxDtype(dtypes, args_spec_list); + std::map dst_type = GetMaxDtype(dtypes, args_spec_list, write_indexs); // Identify which arg requires auto cast for (size_t i = 0; i < args_spec_list.size(); ++i) { auto it = dst_type.find(dtypes[i]); if (it == dst_type.end() || it->second == kTypeUnknown) { continue; } + auto rw_it = write_indexs.find(i); + auto is_write = (rw_it != write_indexs.end()); + AbstractBasePtr arg_value = args_spec_list[i]; if (arg_value->isa()) { - arg_value = arg_value->cast()->ref(); + if (is_write) { + arg_value = arg_value->cast()->ref_origin(); + } else { + arg_value = arg_value->cast()->ref(); + } } TypeId arg_type_id = kTypeUnknown; if (arg_value->isa()) { @@ -243,10 +247,9 @@ void DoAutoCast(const std::vector &signature, const abstract::Abstrac if (it_map == type_map.end()) { continue; } - auto rw_it = write_indexs.find(i); - if (rw_it != write_indexs.end()) { + if (is_write) { if (arg_type_id != it->second) { - MS_LOG(EXCEPTION) << "In op '" << GetOpName(graph) << "', argument '" << args_spec_list[i] + MS_LOG(EXCEPTION) << "In op '" << func_name << "', argument '" << args_spec_list[i] << "' can not cast type from '" << TypeIdLabel(arg_type_id) << "' to '" << TypeIdLabel(it->second) << "' automatically."; } @@ -297,8 +300,8 @@ AnfNodePtr BuildNewCNode(const FuncGraphPtr &func_graph, const std::string &func if (sig == SignatureEnumRW::kRWRead) { param = func_graph->NewCNode({NewValueNode(prim::kPrimGetRefValue), param}); } else if (sig == SignatureEnumRW::kRWWrite) { + param = func_graph->NewCNode({NewValueNode(prim::kPrimGetRefOrigin), param}); write_indexs.insert(i); - param = func_graph->NewCNode({NewValueNode(prim::kPrimGetRefKey), param}); } // If sig is SignatureEnumRW::kRWRef, not do anything. } else if (sig == SignatureEnumRW::kRWWrite && type->type_id() != kObjectTypeRefKey) { @@ -308,7 +311,7 @@ AnfNodePtr BuildNewCNode(const FuncGraphPtr &func_graph, const std::string &func } // process default ProcessDefault(func_name, args_spec_list, signature, has_var, &op_inputs); - DoAutoCast(signature, args_spec_list, func_graph, &op_inputs, write_indexs); + DoAutoCast(func_name, signature, args_spec_list, func_graph, &op_inputs, write_indexs); return func_graph->NewCNode(op_inputs); } } // namespace diff --git a/mindspore/ccsrc/operator/prim_others.cc b/mindspore/ccsrc/operator/prim_others.cc index 84144380f8..b8e89378e6 100644 --- a/mindspore/ccsrc/operator/prim_others.cc +++ b/mindspore/ccsrc/operator/prim_others.cc @@ -160,7 +160,7 @@ AbstractBasePtr InferImplGetRefOrigin(const AnalysisEnginePtr &, const Primitive const AbstractBasePtrList &args_spec_list) { // arguments: value if (args_spec_list.size() != 1) { - MS_LOG(EXCEPTION) << "get_ref_value requires 1 parameters, while the input size is " << args_spec_list.size() + MS_LOG(EXCEPTION) << "get_ref_origin requires 1 parameters, while the input size is " << args_spec_list.size() << "."; } TypePtr type = args_spec_list[0]->GetTypeTrack(); diff --git a/mindspore/ccsrc/optimizer/irpass.cc b/mindspore/ccsrc/optimizer/irpass.cc index 4a867546b6..107bf1eb57 100644 --- a/mindspore/ccsrc/optimizer/irpass.cc +++ b/mindspore/ccsrc/optimizer/irpass.cc @@ -81,8 +81,8 @@ OptimizeIRPassLib::OptimizeIRPassLib() { // Ref eliminate make_ref_eliminate_ = MakeSubstitution(MakeRefEliminater(), "make_ref_eliminate", prim::kPrimMakeRef); - get_make_ref_eliminate_ = - MakeSubstitution(GetMakeRefEliminater(), "get_make_ref_eliminate", {prim::kPrimGetRefKey, prim::kPrimGetRefValue}); + get_make_ref_eliminate_ = 
MakeSubstitution(GetMakeRefEliminater(), "get_make_ref_eliminate", + {prim::kPrimGetRefKey, prim::kPrimGetRefValue, prim::kPrimGetRefOrigin}); replace_refkey_by_param_ = MakeSubstitution(ReplaceRefkeyByParam(), "replace_refkey_by_param", IsValueNode, opt::FORCE_RENORM); diff --git a/mindspore/ccsrc/optimizer/irpass/ref_eliminate.h b/mindspore/ccsrc/optimizer/irpass/ref_eliminate.h index 01bdd0906e..201992ef13 100644 --- a/mindspore/ccsrc/optimizer/irpass/ref_eliminate.h +++ b/mindspore/ccsrc/optimizer/irpass/ref_eliminate.h @@ -48,6 +48,7 @@ class MakeRefEliminater : public AnfVisitor { // {prim::kPrimGetRefKey, {prim::kPrimMakeRef, X, Y, Z}} -> X // {prim::kPrimGetRefValue, {prim::kPrimMakeRef, X, Y, Z}} -> Y +// {prim::kPrimGetRefOrigin, {prim::kPrimMakeRef, X, Y, Z}} -> Z class GetMakeRefEliminater : public AnfVisitor { public: AnfNodePtr operator()(const OptimizerPtr &, const AnfNodePtr &node) override { @@ -71,6 +72,10 @@ class GetMakeRefEliminater : public AnfVisitor { return ref->input(2); } + if (cnode->IsApply(prim::kPrimGetRefOrigin)) { + return ref->input(3); + } + return nullptr; } }; diff --git a/mindspore/ccsrc/pipeline/parse/function_block.cc b/mindspore/ccsrc/pipeline/parse/function_block.cc index 24e7ae74fb..66534390a0 100644 --- a/mindspore/ccsrc/pipeline/parse/function_block.cc +++ b/mindspore/ccsrc/pipeline/parse/function_block.cc @@ -315,7 +315,7 @@ void FunctionBlock::InsertDependItemsBeforeReturn() { ValueNodePtr make_tuple_op = NewValueNode(prim::kPrimMakeTuple); ValueNodePtr depend_op = NewValueNode(prim::kPrimDepend); - ValueNodePtr get_refkey_op = NewValueNode(prim::kPrimGetRefKey); + ValueNodePtr get_ref_origin_op = NewValueNode(prim::kPrimGetRefOrigin); ValueNodePtr stop_gradient_op = NewValueNode(prim::kPrimStopGradient); const std::string primitive_name("assign"); const std::string module_name("mindspore.ops.functional"); @@ -329,8 +329,8 @@ void FunctionBlock::InsertDependItemsBeforeReturn() { vec_states.emplace_back(make_tuple_op); for (auto &item : state_assign_) { auto source = ReadVariable(item.second); - auto refkey = func_graph()->NewCNode({get_refkey_op, item.first}); - auto assign = func_graph()->NewCNode({assign_op, refkey, source}); + auto origin = func_graph()->NewCNode({get_ref_origin_op, item.first}); + auto assign = func_graph()->NewCNode({assign_op, origin, source}); MS_LOG(INFO) << "SetState read " << item.first->ToString() << ", " << item.second; vec_states.emplace_back(assign); } diff --git a/mindspore/ccsrc/pipeline/static_analysis/abstract_value.cc b/mindspore/ccsrc/pipeline/static_analysis/abstract_value.cc index e7f6579b95..d4f0c6f8d4 100644 --- a/mindspore/ccsrc/pipeline/static_analysis/abstract_value.cc +++ b/mindspore/ccsrc/pipeline/static_analysis/abstract_value.cc @@ -801,8 +801,8 @@ bool AbstractRef::operator==(const AbstractBase &other) const { std::string AbstractRef::ToString() const { std::ostringstream buffer; buffer << type_name() << "(" - << "key: " << ref_key_->ToString() << "ref_value: " << ref_->ToString() - << "origin_value: " << ref_origin_->ToString(); + << "key: " << ref_key_->ToString() << " ref_value: " << ref_->ToString() + << " origin_value: " << ref_origin_->ToString(); auto value = GetValueTrack(); if (value) { buffer << ", value: " << value->ToString(); diff --git a/mindspore/ccsrc/pipeline/static_analysis/prim.cc b/mindspore/ccsrc/pipeline/static_analysis/prim.cc index cf5fd59390..f2f85df430 100644 --- a/mindspore/ccsrc/pipeline/static_analysis/prim.cc +++ b/mindspore/ccsrc/pipeline/static_analysis/prim.cc @@ 
-783,7 +783,7 @@ class RefToEmbedEvaluator : public SymbolicPrimEvaluator { AbstractBasePtr abs = node_conf->GetEvaluatedValue()->abstract(); AbstractRefPtr ref_abs = abs->cast(); if (ref_abs == nullptr) { - MS_LOG(ERROR) << "The first parameter of RefToEmbed should be Ref."; + MS_LOG(ERROR) << "The first parameter of RefToEmbed should be Ref, but " << abs->ToString(); return nullptr; } auto key_abs = ref_abs->ref_key(); diff --git a/mindspore/common/dtype.py b/mindspore/common/dtype.py index 2c1ff60298..02a27591d4 100644 --- a/mindspore/common/dtype.py +++ b/mindspore/common/dtype.py @@ -170,7 +170,7 @@ def get_py_obj_dtype(obj): Type of MindSpore type. """ # Tensor - if hasattr(obj, 'dtype'): + if hasattr(obj, 'dtype') and callable(obj.dtype) and isinstance(obj.dtype(), typing.Type): return tensor_type(obj.dtype()) if hasattr(obj, '__primitive_flag__') or hasattr(obj, 'construct'): return function diff --git a/mindspore/ops/operations/array_ops.py b/mindspore/ops/operations/array_ops.py index 19828d3871..43a48b67b3 100644 --- a/mindspore/ops/operations/array_ops.py +++ b/mindspore/ops/operations/array_ops.py @@ -31,7 +31,9 @@ from ...common.tensor import Tensor from ..operations.math_ops import _infer_shape_reduce from .._utils import get_concat_offset from ..primitive import Primitive, PrimitiveWithInfer, prim_attr_register - +from ..._c_expression import signature_rw as sig_rw +from ..._c_expression import signature_kind as sig_kind +from ..._c_expression import signature_dtype as sig_dtype def _check_infer_attr_reduce(axis, keep_dims, prim_name): validator.check_value_type('keep_dims', keep_dims, [bool], prim_name) @@ -2140,13 +2142,17 @@ class ScatterUpdate(PrimitiveWithInfer): >>> input_x = mindspore.Parameter(Tensor(np.array([[-0.1, 0.3, 3.6], [0.4, 0.5, -3.2]]), mindspore.float32)) >>> indices = Tensor(np.array([[0, 0], [1, 1]]), mindspore.int32) >>> update = Tensor(np.array([1.0, 2.2]), mindspore.float32) - >>> op = P.ScatterNdUpdate() + >>> op = P.ScatterUpdate() >>> output = op(input_x, indices, update) """ - + __mindspore_signature__ = ( + ('x', sig_rw.RW_WRITE, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T), + ('indices', sig_rw.RW_READ, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T1), + ('value', sig_rw.RW_READ, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T) + ) @prim_attr_register def __init__(self, use_locking=True): - """Init ScatterNdUpdate""" + """Init ScatterUpdate""" self.init_prim_io_names(inputs=['x', 'indices', 'value'], outputs=['y']) def infer_shape(self, x_shape, indices_shape, value_shape): @@ -2185,7 +2191,11 @@ class ScatterNdUpdate(PrimitiveWithInfer): >>> op = P.ScatterNdUpdate() >>> output = op(input_x, indices, update) """ - + __mindspore_signature__ = ( + ('x', sig_rw.RW_WRITE, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T), + ('indices', sig_rw.RW_READ, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T1), + ('value', sig_rw.RW_READ, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T) + ) @prim_attr_register def __init__(self, use_locking=True): """Init ScatterNdUpdate""" diff --git a/mindspore/ops/operations/math_ops.py b/mindspore/ops/operations/math_ops.py index 3481ef9efc..9afdc50caa 100644 --- a/mindspore/ops/operations/math_ops.py +++ b/mindspore/ops/operations/math_ops.py @@ -179,7 +179,7 @@ class AssignAdd(PrimitiveWithInfer): return value def 
infer_dtype(self, variable, value): - args = {"value": value} + args = {"variable": variable, "value": value} validator.check_scalar_or_tensor_type_same(args, mstype.number_type, self.name) return value @@ -222,7 +222,7 @@ class AssignSub(PrimitiveWithInfer): return value def infer_dtype(self, variable, value): - args = {"value": value} + args = {"variable": variable, "value": value} validator.check_scalar_or_tensor_type_same(args, mstype.number_type, self.name) return value diff --git a/mindspore/ops/operations/other_ops.py b/mindspore/ops/operations/other_ops.py index 95f9df440c..d73f53eb6a 100644 --- a/mindspore/ops/operations/other_ops.py +++ b/mindspore/ops/operations/other_ops.py @@ -58,6 +58,8 @@ class Assign(PrimitiveWithInfer): return variable def infer_dtype(self, variable, value): + args = {"variable": variable, "value": value} + validator.check_tensor_type_same(args, (mstype.bool_,) + mstype.number_type, self.name) return variable diff --git a/tests/ut/python/ops/test_layer_switch.py b/tests/ut/python/ops/test_layer_switch.py index 35636637a4..82aa6db39f 100644 --- a/tests/ut/python/ops/test_layer_switch.py +++ b/tests/ut/python/ops/test_layer_switch.py @@ -1,3 +1,18 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""test layer switch""" import numpy as np import mindspore diff --git a/tests/ut/python/ops/test_nn_ops.py b/tests/ut/python/ops/test_nn_ops.py index 15ff49e2c0..2c97b49e15 100644 --- a/tests/ut/python/ops/test_nn_ops.py +++ b/tests/ut/python/ops/test_nn_ops.py @@ -345,19 +345,6 @@ class Conv2dNativeNet(nn.Cell): return self.flatten(self.conv(input_x, self.weight)) -class MakeRefKeyNet(nn.Cell): - """ MakeRefKeyNet definition """ - - def __init__(self): - super(MakeRefKeyNet, self).__init__() - self.y = Parameter(Tensor([1.0], mindspore.float32), name="y") - - def construct(self, x): - key = P.MakeRefKey("y")() - P.Assign()(key, x) - return x - - class StateNet(nn.Cell): """ StateTestTensor definition """ @@ -538,10 +525,6 @@ test_cases = [ 'block': Grad(NetWithLossClass(Conv2dNativeNet())), 'desc_inputs': [Tensor(np.ones([1, 3, 16, 16], np.float32)), Tensor(np.zeros([1, 1764], np.float32))], }), - ('MakeRefKey', { - 'block': MakeRefKeyNet(), - 'desc_inputs': [Tensor([2.0], mindspore.float32)], - }), ('StateTest', { 'block': StateNet(), 'desc_inputs': [Tensor(np.ones([2, 1, 2, 2]).astype(np.float32))], diff --git a/tests/ut/python/ops/test_signature.py b/tests/ut/python/ops/test_signature.py new file mode 100644 index 0000000000..e6447be8f3 --- /dev/null +++ b/tests/ut/python/ops/test_signature.py @@ -0,0 +1,75 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
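For context, a minimal sketch of the behaviour the stricter infer_dtype checks above enforce: the target parameter is now validated together with the value handed to AssignAdd, rather than the value's type being checked in isolation. The cell and tensor values below are illustrative, not taken from the patch.

import numpy as np
import mindspore.nn as nn
from mindspore import Tensor
from mindspore.common.parameter import Parameter
from mindspore.ops import operations as P


class AssignAddNet(nn.Cell):
    def __init__(self):
        super(AssignAddNet, self).__init__()
        self.var = Parameter(Tensor(np.ones([2]).astype(np.float32)), name="var")
        self.assign_add = P.AssignAdd()

    def construct(self, value):
        self.assign_add(self.var, value)
        return self.var


net = AssignAddNet()
net(Tensor(np.ones([2]).astype(np.float32)))    # same dtype: passes the new check
# net(Tensor(np.ones([2]).astype(np.float16)))
# A mismatched value dtype is now validated against `variable` too, so it is
# either promoted by the framework's cast rules or reported, rather than
# slipping through with only `value` checked.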
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +""" +test assign sub +""" +import numpy as np +import pytest + +import mindspore.context as context +import mindspore.nn as nn +import mindspore.ops.operations as P +from mindspore import Tensor +from mindspore.common.initializer import initializer +from mindspore.common.parameter import Parameter +import mindspore as ms + +class AssignW(nn.Cell): + def __init__(self): + super(AssignW, self).__init__() + self.assign = P.Assign() + + def construct(self, x, w): + self.assign(x, w) + return x + + +class Net(nn.Cell): + def __init__(self): + super(Net, self).__init__() + self.b = Parameter(initializer('ones', [5]), name='b') + self.assign = AssignW() + + def construct(self, value): + return self.assign(self.b, value) + + +def test_assign_through_cell(): + context.set_context(mode=context.GRAPH_MODE, save_graphs=True) + net = Net() + net.to_float(ms.float16) + net.add_flags_recursive(fp16=False) + input_data = Tensor(np.ones([5]).astype(np.float32)) + net(input_data) + with pytest.raises(TypeError): + net(None) + + +class NetScatterNdUpdate(nn.Cell): + def __init__(self): + super(NetScatterNdUpdate, self).__init__() + self.b = Parameter(initializer('ones', [5, 5]), name='b') + self.scatter = P.ScatterNdUpdate() + + def construct(self, idx, x): + return self.scatter(self.b, idx, x) + + +def test_scatter_nd_update(): + context.set_context(mode=context.GRAPH_MODE) + net = NetScatterNdUpdate() + x = Tensor(np.ones([5]).astype(np.float16)) + idx = Tensor(np.ones([1]).astype(np.int32)) + net(idx, x) From 73a836f286d7cae466deb240da462f27ce580f08 Mon Sep 17 00:00:00 2001 From: zhaojichen Date: Wed, 3 Jun 2020 23:04:50 -0400 Subject: [PATCH 22/36] fix applyrmsprop --- mindspore/ccsrc/transform/op_declare.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mindspore/ccsrc/transform/op_declare.cc b/mindspore/ccsrc/transform/op_declare.cc index 735c9aac09..ee59d56003 100644 --- a/mindspore/ccsrc/transform/op_declare.cc +++ b/mindspore/ccsrc/transform/op_declare.cc @@ -1226,7 +1226,7 @@ OUTPUT_MAP(Atan2) = {{0, OUTPUT_DESC(y)}}; // ApplyRMSPropD INPUT_MAP(ApplyRMSPropD) = { - {1, INPUT_DESC(var)}, {2, INPUT_DESC(ms)}, {3, INPUT_DESC(mom)}, {4, INPUT_DESC(grad)}, {5, INPUT_DESC(lr)}}; + {1, INPUT_DESC(var)}, {2, INPUT_DESC(ms)}, {3, INPUT_DESC(mom)}, {4, INPUT_DESC(lr)}, {5, INPUT_DESC(grad)}}; INPUT_ATTR_MAP(ApplyRMSPropD) = {{6, ATTR_DESC(rho, AnyTraits())}, {7, ATTR_DESC(momentum, AnyTraits())}, {8, ATTR_DESC(epsilon, AnyTraits())}}; From 56913ff1cfc095dab1083c9fb542f39ba6aa203d Mon Sep 17 00:00:00 2001 From: xutianchun Date: Thu, 4 Jun 2020 19:19:10 +0800 Subject: [PATCH 23/36] add op: ReverseSequence --- mindspore/ops/_op_impl/aicpu/__init__.py | 1 + .../ops/_op_impl/aicpu/reverse_sequence.py | 78 +++++++++++++++++++ mindspore/ops/operations/__init__.py | 3 +- mindspore/ops/operations/array_ops.py | 51 ++++++++++++ 4 files changed, 132 insertions(+), 1 deletion(-) create mode 100644 mindspore/ops/_op_impl/aicpu/reverse_sequence.py diff --git a/mindspore/ops/_op_impl/aicpu/__init__.py 
b/mindspore/ops/_op_impl/aicpu/__init__.py index bb490d050b..58db081de3 100644 --- a/mindspore/ops/_op_impl/aicpu/__init__.py +++ b/mindspore/ops/_op_impl/aicpu/__init__.py @@ -27,3 +27,4 @@ from .random_choice_with_mask import _random_choice_with_mask_aicpu from .ctcloss import _ctcloss_aicpu from .rnnt_loss import _rnnt_loss_aicpu from .random_categorical import _random_categorical_aicpu +from .reverse_sequence import _reverse_sequence_aicpu diff --git a/mindspore/ops/_op_impl/aicpu/reverse_sequence.py b/mindspore/ops/_op_impl/aicpu/reverse_sequence.py new file mode 100644 index 0000000000..678a4a61f3 --- /dev/null +++ b/mindspore/ops/_op_impl/aicpu/reverse_sequence.py @@ -0,0 +1,78 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""ReverseSequence op""" +from mindspore.ops.op_info_register import op_info_register, AiCPURegOp, DataType +reverse_sequence_op_info = AiCPURegOp("ReverseSequence") \ + .fusion_type("OPAQUE") \ + .input(0, "x", "required") \ + .input(1, "seq_lengths", "required") \ + .output(0, "y", "required") \ + .attr("seq_dim", "int") \ + .attr("batch_dim", "int") \ + .dtype_format(DataType.BOOL_Default, DataType.I32_Default, DataType.BOOL_Default) \ + .dtype_format(DataType.I8_Default, DataType.I32_Default, DataType.I8_Default) \ + .dtype_format(DataType.I16_Default, DataType.I32_Default, DataType.I16_Default) \ + .dtype_format(DataType.I32_Default, DataType.I32_Default, DataType.I32_Default) \ + .dtype_format(DataType.I64_Default, DataType.I32_Default, DataType.I64_Default) \ + .dtype_format(DataType.U8_Default, DataType.I32_Default, DataType.U8_Default) \ + .dtype_format(DataType.U16_Default, DataType.I32_Default, DataType.U16_Default) \ + .dtype_format(DataType.U32_Default, DataType.I32_Default, DataType.U32_Default) \ + .dtype_format(DataType.U64_Default, DataType.I32_Default, DataType.U64_Default) \ + .dtype_format(DataType.F16_Default, DataType.I32_Default, DataType.F16_Default) \ + .dtype_format(DataType.F32_Default, DataType.I32_Default, DataType.F32_Default) \ + .dtype_format(DataType.F64_Default, DataType.I32_Default, DataType.F64_Default) \ + .dtype_format(DataType.BOOL_NCHW, DataType.I32_NCHW, DataType.BOOL_NCHW) \ + .dtype_format(DataType.I8_NCHW, DataType.I32_NCHW, DataType.I8_NCHW) \ + .dtype_format(DataType.I16_NCHW, DataType.I32_NCHW, DataType.I16_NCHW) \ + .dtype_format(DataType.I32_NCHW, DataType.I32_NCHW, DataType.I32_NCHW) \ + .dtype_format(DataType.I64_NCHW, DataType.I32_NCHW, DataType.I64_NCHW) \ + .dtype_format(DataType.U8_NCHW, DataType.I32_NCHW, DataType.U8_NCHW) \ + .dtype_format(DataType.U16_NCHW, DataType.I32_NCHW, DataType.U16_NCHW) \ + .dtype_format(DataType.U32_NCHW, DataType.I32_NCHW, DataType.U32_NCHW) \ + .dtype_format(DataType.U64_NCHW, DataType.I32_NCHW, DataType.U64_NCHW) \ + .dtype_format(DataType.F16_NCHW, DataType.I32_NCHW, DataType.F16_NCHW) \ + .dtype_format(DataType.F32_NCHW, DataType.I32_NCHW, 
DataType.F32_NCHW) \ + .dtype_format(DataType.F64_NCHW, DataType.I32_NCHW, DataType.F64_NCHW) \ + .dtype_format(DataType.BOOL_Default, DataType.I64_Default, DataType.BOOL_Default) \ + .dtype_format(DataType.I8_Default, DataType.I64_Default, DataType.I8_Default) \ + .dtype_format(DataType.I16_Default, DataType.I64_Default, DataType.I16_Default) \ + .dtype_format(DataType.I64_Default, DataType.I64_Default, DataType.I32_Default) \ + .dtype_format(DataType.I64_Default, DataType.I64_Default, DataType.I64_Default) \ + .dtype_format(DataType.U8_Default, DataType.I64_Default, DataType.U8_Default) \ + .dtype_format(DataType.U16_Default, DataType.I64_Default, DataType.U16_Default) \ + .dtype_format(DataType.U32_Default, DataType.I64_Default, DataType.U32_Default) \ + .dtype_format(DataType.U64_Default, DataType.I64_Default, DataType.U64_Default) \ + .dtype_format(DataType.F16_Default, DataType.I64_Default, DataType.F16_Default) \ + .dtype_format(DataType.F32_Default, DataType.I64_Default, DataType.F32_Default) \ + .dtype_format(DataType.F64_Default, DataType.I64_Default, DataType.F64_Default) \ + .dtype_format(DataType.BOOL_NCHW, DataType.I64_NCHW, DataType.BOOL_NCHW) \ + .dtype_format(DataType.I8_NCHW, DataType.I64_NCHW, DataType.I8_NCHW) \ + .dtype_format(DataType.I16_NCHW, DataType.I64_NCHW, DataType.I16_NCHW) \ + .dtype_format(DataType.I32_NCHW, DataType.I64_NCHW, DataType.I32_NCHW) \ + .dtype_format(DataType.I64_NCHW, DataType.I64_NCHW, DataType.I64_NCHW) \ + .dtype_format(DataType.U8_NCHW, DataType.I64_NCHW, DataType.U8_NCHW) \ + .dtype_format(DataType.U16_NCHW, DataType.I64_NCHW, DataType.U16_NCHW) \ + .dtype_format(DataType.U32_NCHW, DataType.I64_NCHW, DataType.U32_NCHW) \ + .dtype_format(DataType.U64_NCHW, DataType.I64_NCHW, DataType.U64_NCHW) \ + .dtype_format(DataType.F16_NCHW, DataType.I64_NCHW, DataType.F16_NCHW) \ + .dtype_format(DataType.F32_NCHW, DataType.I64_NCHW, DataType.F32_NCHW) \ + .dtype_format(DataType.F64_NCHW, DataType.I64_NCHW, DataType.F64_NCHW) \ + .get_op_info() + +@op_info_register(reverse_sequence_op_info) +def _reverse_sequence_aicpu(): + """ReverseSequence AiCPU register""" + return diff --git a/mindspore/ops/operations/__init__.py b/mindspore/ops/operations/__init__.py index e282219983..9168364583 100644 --- a/mindspore/ops/operations/__init__.py +++ b/mindspore/ops/operations/__init__.py @@ -30,7 +30,7 @@ from .array_ops import (Argmax, Argmin, Cast, Concat, Pack, Unpack, Squeeze, StridedSlice, Tile, Transpose, TruncatedNormal, TupleToArray, UnsortedSegmentMin, UnsortedSegmentSum, SpaceToDepth, DepthToSpace, SpaceToBatch, BatchToSpace, - SpaceToBatchND, BatchToSpaceND) + SpaceToBatchND, BatchToSpaceND, ReverseSequence) from .comm_ops import (AllGather, AllReduce, _AlltoAll, ReduceScatter, Broadcast, _MirrorOperator, ReduceOp, _VirtualDataset, _VirtualDiv, _GetTensorSlice, @@ -278,6 +278,7 @@ __all__ = [ "ApplyCenteredRMSProp", "SpaceToBatchND", "BatchToSpaceND", + "ReverseSequence", "SquareSumAll", "BitwiseAnd", "BitwiseOr", diff --git a/mindspore/ops/operations/array_ops.py b/mindspore/ops/operations/array_ops.py index 43a48b67b3..d53f92c2a3 100644 --- a/mindspore/ops/operations/array_ops.py +++ b/mindspore/ops/operations/array_ops.py @@ -2720,3 +2720,54 @@ class BatchToSpaceND(PrimitiveWithInfer): f'block_shape_prod {block_shape_prod}') out_shape[0] = out_shape[0] // block_shape_prod return out_shape + + +class ReverseSequence(PrimitiveWithInfer): + """ + Reverses variable length slices. 
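A NumPy sketch of the intended reversal, assuming the conventional ReverseSequence semantics; it covers only the 2-D case with batch_dim=0 and seq_dim=1, and the helper is illustrative rather than part of the op.

import numpy as np

def reverse_sequence_ref(x, seq_lengths):
    # For each row i, reverse the first seq_lengths[i] elements along the
    # sequence dimension and leave the remaining elements untouched.
    y = np.array(x, copy=True)
    for i, length in enumerate(seq_lengths):
        y[i, :length] = x[i, :length][::-1]
    return y

x = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], np.float32)
print(reverse_sequence_ref(x, [1, 2, 3]))
# expected:
# [[1. 2. 3.]
#  [5. 4. 6.]
#  [9. 8. 7.]]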
+ + Note: + If the specified axis is a negative number, the index is counted + backward from the end and starts at 1. + + Raises: + ValueError: If axis is not an integer or not in the valid range. + Args: + seq_dim (int): The dimension which is partially reversed. Required. + batch_dim (int): The dimension along which reversal is performed. Default: 0 + + Inputs: + - **x** (Tensor) - The input to reverse. + - **seq_lengths** (int) - Must be 1-D vector with types: int32, int64 + + Outputs: + Reversed tensor with the same shape and data type as x. + + Examples: + >>> x = Tensor(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), mindspore.float32) + >>> seq_lengths = Tensor(np.array([1, 2, 3])) + >>> reverse_sequence = P.ReverseSequence(seq_dim=1) + >>> output = reverse_sequence(x, seq_lengths) + """ + + @prim_attr_register + def __init__(self, seq_dim, batch_dim=0): + """init ReverseSequence""" + self.init_prim_io_names(inputs=['x', 'seq_lengths'], outputs=['y']) + validator.check_value_type("seq_dim", seq_dim, [int], self.name) + self.seq_dim_ = seq_dim + validator.check_value_type("batch_dim", batch_dim, [int], self.name) + self.batch_dim_ = batch_dim + + def infer_shape(self, x, seq_lengths): + validator.check("seq_dim", self.seq_dim_, "x rank", len(x), Rel.LE, self.name) + validator.check("batch_dim", self.batch_dim_, "x rank", len(x), Rel.LE, self.name) + validator.check("batch_dim", self.batch_dim_, "seq_dim", self.seq_dim_, Rel.NE, self.name) + validator.check("seq_lengths rank", len(seq_lengths), "expected", 1, Rel.EQ, self.name) + validator.check("seq_lengths vector size", seq_lengths[0], + "input size along batch_dim", x[self.batch_dim_], Rel.EQ, self.name) + return x + + def infer_dtype(self, x, seq_lengths): + validator.check_tensor_type_same({"seq_lengths_dtype": seq_lengths}, [mstype.int32, mstype.int64], self.name) + return x From 1642be4a676a5c380da6762a31f2df2d549b3877 Mon Sep 17 00:00:00 2001 From: huangdongrun Date: Thu, 4 Jun 2020 21:36:54 +0800 Subject: [PATCH 24/36] fix initiliazer --- mindspore/common/initializer.py | 15 +++++++++------ tests/ut/python/utils/test_initializer.py | 13 ++++++++++++- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/mindspore/common/initializer.py b/mindspore/common/initializer.py index 820c5b59de..54c0a1debe 100644 --- a/mindspore/common/initializer.py +++ b/mindspore/common/initializer.py @@ -338,12 +338,6 @@ def initializer(init, shape=None, dtype=mstype.float32): "the variable shape {}.".format(list(init.shape()), shape)) return init - if isinstance(init, str): - init_obj = _INITIALIZER_ALIAS[init.lower()]() - if init_obj is None: - raise ValueError("The class corresponding to '{}' was not found.".format(init)) - init = init_obj - if isinstance(shape, list): shape = tuple(shape) elif isinstance(shape, numbers.Number): @@ -354,6 +348,15 @@ def initializer(init, shape=None, dtype=mstype.float32): raise ValueError("Error shape={}".format(shape)) if isinstance(init, Initializer): + init.shape = init.shape if init.shape is not None else shape + init.dtype = init.dtype if init.dtype is not None else dtype + return init + + if isinstance(init, str): + init_obj = _INITIALIZER_ALIAS[init.lower()]() + if init_obj is None: + raise ValueError("The class corresponding to '{}' was not found.".format(init)) + init = init_obj init.shape = shape init.dtype = dtype return init diff --git a/tests/ut/python/utils/test_initializer.py b/tests/ut/python/utils/test_initializer.py index 417d0bb2b1..57709baa76 100644 --- 
a/tests/ut/python/utils/test_initializer.py +++ b/tests/ut/python/utils/test_initializer.py @@ -141,7 +141,18 @@ def test_init_abnormal(): with py.raises(TypeError): init.initializer([''], [5, 4], ms.float32) - +def test_initializer_reinit(): + weights = init.initializer("XavierUniform", shape=(10, 1, 10, 10), dtype=ms.float16) + assert weights.dtype == ms.float16 + assert weights.shape == (10, 1, 10, 10) + weights = init.initializer(weights) + assert weights.dtype == ms.float16 + assert weights.shape == (10, 1, 10, 10) + weights.shape = None + weights = init.initializer(weights, (10, 1)) + assert weights.dtype == ms.float16 + assert weights.shape == (10, 1) + def test_init_xavier_uniform(): """ test_init_xavier_uniform """ gain = 1.2 From 89302a60cfab9e510a68cb02aceb4810245c82f1 Mon Sep 17 00:00:00 2001 From: yanzhenxiang2020 Date: Tue, 2 Jun 2020 20:40:58 +0800 Subject: [PATCH 25/36] add pack op for aicpu --- .../kernel/aicpu/aicpu_kernel_metadata.cc | 4 +- mindspore/ccsrc/kernel/aicpu/aicpu_util.h | 1 + mindspore/ops/_op_impl/aicpu/__init__.py | 1 + mindspore/ops/_op_impl/aicpu/pack.py | 41 ++++ .../st/ops/ascend/test_aicpu_ops/test_pack.py | 176 ++++++++++++++++++ 5 files changed, 221 insertions(+), 2 deletions(-) create mode 100644 mindspore/ops/_op_impl/aicpu/pack.py create mode 100644 tests/st/ops/ascend/test_aicpu_ops/test_pack.py diff --git a/mindspore/ccsrc/kernel/aicpu/aicpu_kernel_metadata.cc b/mindspore/ccsrc/kernel/aicpu/aicpu_kernel_metadata.cc index e8636ffa2e..3670a2d76f 100644 --- a/mindspore/ccsrc/kernel/aicpu/aicpu_kernel_metadata.cc +++ b/mindspore/ccsrc/kernel/aicpu/aicpu_kernel_metadata.cc @@ -38,10 +38,10 @@ void AicpuMetadataInfo(const CNodePtr &kernel_node, std::vector inputs_format{}; std::vector inputs_type{}; - if (op_name == kPrint) { + if (op_name == kPrint || op_name == kPack) { for (size_t input_index = 0; input_index < AnfAlgo::GetInputTensorNum(kernel_node); ++input_index) { inputs_format.emplace_back(kOpFormat_DEFAULT); inputs_type.push_back(AnfAlgo::GetPrevNodeOutputInferDataType(kernel_node, input_index)); diff --git a/mindspore/ccsrc/kernel/aicpu/aicpu_util.h b/mindspore/ccsrc/kernel/aicpu/aicpu_util.h index b6f43414e3..50f7b36d94 100644 --- a/mindspore/ccsrc/kernel/aicpu/aicpu_util.h +++ b/mindspore/ccsrc/kernel/aicpu/aicpu_util.h @@ -28,6 +28,7 @@ constexpr auto kInitDataSetQueue = "InitDataSetQueue"; constexpr auto kInitData = "InitData"; constexpr auto kGetNext = "GetNext"; constexpr auto kPrint = "Print"; +constexpr auto kPack = "Pack"; constexpr auto kOutputTypes = "output_types"; constexpr auto kOutputShapes = "output_shapes"; diff --git a/mindspore/ops/_op_impl/aicpu/__init__.py b/mindspore/ops/_op_impl/aicpu/__init__.py index 58db081de3..7d90d72b88 100644 --- a/mindspore/ops/_op_impl/aicpu/__init__.py +++ b/mindspore/ops/_op_impl/aicpu/__init__.py @@ -28,3 +28,4 @@ from .ctcloss import _ctcloss_aicpu from .rnnt_loss import _rnnt_loss_aicpu from .random_categorical import _random_categorical_aicpu from .reverse_sequence import _reverse_sequence_aicpu +from .pack import _pack_aicpu diff --git a/mindspore/ops/_op_impl/aicpu/pack.py b/mindspore/ops/_op_impl/aicpu/pack.py new file mode 100644 index 0000000000..179651d884 --- /dev/null +++ b/mindspore/ops/_op_impl/aicpu/pack.py @@ -0,0 +1,41 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Pack op""" +from mindspore.ops.op_info_register import op_info_register, AiCPURegOp, DataType + +pack_op_info = AiCPURegOp("Pack") \ + .fusion_type("OPAQUE") \ + .attr("axis", "int") \ + .input(0, "x", "dynamic") \ + .output(0, "y", "required") \ + .dtype_format(DataType.I8_Default, DataType.I8_Default) \ + .dtype_format(DataType.I16_Default, DataType.I16_Default) \ + .dtype_format(DataType.I32_Default, DataType.I32_Default) \ + .dtype_format(DataType.I64_Default, DataType.I64_Default) \ + .dtype_format(DataType.U8_Default, DataType.U8_Default) \ + .dtype_format(DataType.U16_Default, DataType.U16_Default) \ + .dtype_format(DataType.U32_Default, DataType.U32_Default) \ + .dtype_format(DataType.U64_Default, DataType.U64_Default) \ + .dtype_format(DataType.F16_Default, DataType.F16_Default) \ + .dtype_format(DataType.F32_Default, DataType.F32_Default) \ + .dtype_format(DataType.F64_Default, DataType.F64_Default) \ + .dtype_format(DataType.BOOL_Default, DataType.BOOL_Default) \ + .get_op_info() + +@op_info_register(pack_op_info) +def _pack_aicpu(): + """Pack AiCPU register""" + return diff --git a/tests/st/ops/ascend/test_aicpu_ops/test_pack.py b/tests/st/ops/ascend/test_aicpu_ops/test_pack.py new file mode 100644 index 0000000000..affb9b90ef --- /dev/null +++ b/tests/st/ops/ascend/test_aicpu_ops/test_pack.py @@ -0,0 +1,176 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +import numpy as np + +import mindspore.context as context +import mindspore.nn as nn +from mindspore import Tensor +from mindspore.ops import operations as P + +context.set_context(mode=context.GRAPH_MODE, device_target="Ascend") + + +class Net(nn.Cell): + def __init__(self, x, axis): + super(Net, self).__init__() + self.pack = P.Pack(axis) + self.x = x + + def construct(self): + return self.pack(self.x) + + +def test_net_bool(): + x = np.random.randn(3, 5, 4)>0 + y = np.random.randn(3, 5, 4)>0 + axis = -1 + net = Net((Tensor(x), Tensor(y)), axis) + output = net() + print(x) + print(y) + print(output.asnumpy()) + assert np.array_equal(output.asnumpy(), np.stack([x,y], axis)) + + +def test_net_int8(): + x = np.random.randn(3, 5, 4).astype(np.int8) + y = np.random.randn(3, 5, 4).astype(np.int8) + axis = -1 + net = Net((Tensor(x), Tensor(y)), axis) + output = net() + print(x) + print(y) + print(output.asnumpy()) + assert np.array_equal(output.asnumpy(), np.stack([x,y], axis)) + + +def test_net_uint8(): + x = np.random.randn(3, 5, 4).astype(np.uint8) + y = np.random.randn(3, 5, 4).astype(np.uint8) + axis = -1 + net = Net((Tensor(x), Tensor(y)), axis) + output = net() + print(x) + print(y) + print(output.asnumpy()) + assert np.array_equal(output.asnumpy(), np.stack([x,y], axis)) + + +def test_net_int16(): + x = np.random.randn(3, 5, 4).astype(np.int16) + y = np.random.randn(3, 5, 4).astype(np.int16) + axis = -1 + net = Net((Tensor(x), Tensor(y)), axis) + output = net() + print(x) + print(y) + print(output.asnumpy()) + assert np.array_equal(output.asnumpy(), np.stack([x,y], axis)) + + +def test_net_uint16(): + x = np.random.randn(3, 5, 4).astype(np.uint16) + y = np.random.randn(3, 5, 4).astype(np.uint16) + axis = -1 + net = Net((Tensor(x), Tensor(y)), axis) + output = net() + print(x) + print(y) + print(output.asnumpy()) + assert np.array_equal(output.asnumpy(), np.stack([x,y], axis)) + + +def test_net_int32(): + x = np.random.randn(3, 5, 4).astype(np.int32) + y = np.random.randn(3, 5, 4).astype(np.int32) + axis = -1 + net = Net((Tensor(x), Tensor(y)), axis) + output = net() + print(x) + print(y) + print(output.asnumpy()) + assert np.array_equal(output.asnumpy(), np.stack([x,y], axis)) + + +def test_net_uint32(): + x = np.random.randn(3, 5, 4).astype(np.uint32) + y = np.random.randn(3, 5, 4).astype(np.uint32) + axis = -1 + net = Net((Tensor(x), Tensor(y)), axis) + output = net() + print(x) + print(y) + print(output.asnumpy()) + assert np.array_equal(output.asnumpy(), np.stack([x,y], axis)) + + +def test_net_int64(): + x = np.random.randn(3, 5, 4).astype(np.int64) + y = np.random.randn(3, 5, 4).astype(np.int64) + axis = -1 + net = Net((Tensor(x), Tensor(y)), axis) + output = net() + print(x) + print(y) + print(output.asnumpy()) + assert np.array_equal(output.asnumpy(), np.stack([x,y], axis)) + + +def test_net_uint64(): + x = np.random.randn(3, 5, 4).astype(np.uint64) + y = np.random.randn(3, 5, 4).astype(np.uint64) + axis = -1 + net = Net((Tensor(x), Tensor(y)), axis) + output = net() + print(x) + print(y) + print(output.asnumpy()) + assert np.array_equal(output.asnumpy(), np.stack([x,y], axis)) + + +def test_net_float16(): + x = np.random.randn(3, 5, 4).astype(np.float16) + y = np.random.randn(3, 5, 4).astype(np.float16) + axis = -1 + net = Net((Tensor(x), Tensor(y)), axis) + output = net() + print(x) + print(y) + print(output.asnumpy()) + assert np.array_equal(output.asnumpy(), np.stack([x,y], axis)) + + +def 
test_net_float32(): + x = np.random.randn(3, 5, 4).astype(np.float32) + y = np.random.randn(3, 5, 4).astype(np.float32) + axis = -1 + net = Net((Tensor(x), Tensor(y)), axis) + output = net() + print(x) + print(y) + print(output.asnumpy()) + assert np.array_equal(output.asnumpy(), np.stack([x,y], axis)) + + +def test_net_float64(): + x = np.random.randn(3, 5, 4).astype(np.float64) + y = np.random.randn(3, 5, 4).astype(np.float64) + axis = -1 + net = Net((Tensor(x), Tensor(y)), axis) + output = net() + print(x) + print(y) + print(output.asnumpy()) + assert np.array_equal(output.asnumpy(), np.stack([x,y], axis)) From 9ce86e8832965031a4ffea2074588fac5f0eaeb9 Mon Sep 17 00:00:00 2001 From: zhaozhenlong Date: Tue, 9 Jun 2020 11:58:13 +0800 Subject: [PATCH 26/36] composed op CosineEmbeddingLoss --- mindspore/_checkparam.py | 9 +++++ mindspore/nn/loss/__init__.py | 5 ++- mindspore/nn/loss/loss.py | 70 +++++++++++++++++++++++++++++++++ tests/ut/python/nn/test_loss.py | 8 ++++ 4 files changed, 90 insertions(+), 2 deletions(-) diff --git a/mindspore/_checkparam.py b/mindspore/_checkparam.py index d8ca5a9845..880d26bfad 100644 --- a/mindspore/_checkparam.py +++ b/mindspore/_checkparam.py @@ -322,6 +322,15 @@ class Validator: return arg_value raise TypeError(f"{msg_prefix} `{arg_name}` must be float.") + @staticmethod + def check_reduce_shape(ori_shape, shape, axis, prim_name): + """Checks whether shape is ori_shape reduced on axis""" + axis = axis if isinstance(axis, Iterable) else (axis,) + exp_shape = [ori_shape[i] for i in range(len(ori_shape)) if i not in axis] + if list(shape) != exp_shape: + raise ValueError(f'For {prim_name}, {ori_shape} reduce on {axis} should be ' + f'{tuple(exp_shape)}, but got {shape}.') + class ParamValidator: """Parameter validator. NOTICE: this class will be replaced by `class Validator`""" diff --git a/mindspore/nn/loss/__init__.py b/mindspore/nn/loss/__init__.py index f08f5aa721..ce5870699b 100644 --- a/mindspore/nn/loss/__init__.py +++ b/mindspore/nn/loss/__init__.py @@ -20,8 +20,9 @@ It shows how well the model works on a dataset and the optimization target which """ from .loss import L1Loss, MSELoss, SmoothL1Loss, \ - SoftmaxCrossEntropyWithLogits, SoftmaxCrossEntropyExpand + SoftmaxCrossEntropyWithLogits, SoftmaxCrossEntropyExpand, CosineEmbeddingLoss __all__ = ['L1Loss', 'MSELoss', 'SmoothL1Loss', 'SoftmaxCrossEntropyWithLogits', - 'SoftmaxCrossEntropyExpand'] + 'SoftmaxCrossEntropyExpand', + 'CosineEmbeddingLoss'] diff --git a/mindspore/nn/loss/loss.py b/mindspore/nn/loss/loss.py index ac419c32c3..c7e38fd943 100644 --- a/mindspore/nn/loss/loss.py +++ b/mindspore/nn/loss/loss.py @@ -17,9 +17,11 @@ import mindspore.common.dtype as mstype from mindspore.common.tensor import Tensor from mindspore.ops import operations as P from mindspore.ops import functional as F +from mindspore.ops.primitive import constexpr from mindspore.nn.cell import Cell from mindspore._checkparam import Validator as validator from mindspore._checkparam import Rel +from mindspore.ops.composite.multitype_ops import _constexpr_utils as const_utils from ... import context @@ -329,3 +331,71 @@ class SoftmaxCrossEntropyExpand(Cell): loss = self.reduce_mean(loss, -1) return loss + + +@constexpr +def _check_reduced_shape_valid(ori_shape, reduced_shape, axis, cls_name): + validator.check_reduce_shape(ori_shape, reduced_shape, axis, cls_name) + +class CosineEmbeddingLoss(_Loss): + r""" + Computes the similarity between two tensors using cosine distance. 
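A NumPy cross-check of the computation this cell performs, written as a sketch: the small epsilon the real construct adds to the denominators is omitted, so the value can differ in the last digits, and the default margin=0.0 with mean reduction is assumed.

import numpy as np

def cosine_embedding_loss_ref(x1, x2, y, margin=0.0):
    # Cosine similarity along axis 1, then 1 - cos where y == 1 and
    # max(0, cos - margin) where y == -1, reduced by the mean.
    cos = (x1 * x2).sum(axis=1) / (np.linalg.norm(x1, axis=1) * np.linalg.norm(x2, axis=1))
    loss = np.where(y == 1, 1.0 - cos, np.maximum(cos - margin, 0.0))
    return loss.mean()

x1 = np.array([[0.3, 0.8], [0.4, 0.3]], np.float32)
x2 = np.array([[0.4, 1.2], [-0.4, -0.9]], np.float32)
y = np.array([1, -1], np.int32)
print(cosine_embedding_loss_ref(x1, x2, y))  # roughly 3.4e-4 for these inputs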
+ + Given two tensors `x1`, `x2`, and a Tensor label `y` with values 1 or -1: + + .. math:: + loss(x_1, x_2, y) = \begin{cases} + 1-cos(x_1, x_2), & \text{if } y = 1\\ + max(0, cos(x_1, x_2)-margin), & \text{if } y = -1\\ + \end{cases} + + Args: + margin (float): Should be in [-1.0, 1.0]. Default 0.0. + reduction (str): Specifies which reduction to apply to the output. It should be one of + "none", "mean", "sum", meaning no reduction, reduce mean or sum on output, respectively. Default "mean". + + Inputs: + - **input_x1** (Tensor) - Input tensor. + - **input_x2** (Tensor) - Its shape and data type should be the same as `input_x1`'s shape and data type. + - **y** (Tensor) - Contains value 1 or -1. Suppose `input_x1` shape is + :math:`(x_1, x_2, x_3,..., x_R)`, then `target` shape should be :math:`(x_1, x_3, x_4, ..., x_R)`. + + Outputs: + - **loss** (Tensor) - If `reduction` is "none", its shape is the same as `y`'s shape, loss value otherwise. + + Examples: + >>> x1 = Tensor(np.array([[0.3, 0.8], [0.4, 0.3]]), mindspore.float32) + >>> x2 = Tensor(np.array([[0.4, 1.2], [-0.4, -0.9]]), mindspore.float32) + >>> y = Tensor(np.array([1,-1]), mindspore.int32) + >>> cosine_embedding_loss = P.CosineEmbeddingLoss() + >>> cosine_embedding_loss(x1, x2, target) + [0.0003426671] + """ + def __init__(self, margin=0.0, reduction="mean"): + super(CosineEmbeddingLoss, self).__init__(reduction) + self.reduce_sum = P.ReduceSum() + self.maximum = P.Maximum() + validator.check_value_type("margin", margin, [float], self.cls_name) + self.margin = validator.check_number_range("margin", margin, -1.0, 1.0, Rel.INC_BOTH, self.cls_name) + + def construct(self, x1, x2, y): + F.same_type_shape(x1, x2) + _check_reduced_shape_valid(F.shape(x1), F.shape(y), (1,), self.cls_name) + # if target > 0, 1-cosine(x1, x2) + # else, max(0, cosine(x1, x2)-margin) + np_eps = const_utils.get_np_eps(F.dtype(x1)) + eps = F.cast(np_eps, F.dtype(x1)) + prod_sum = self.reduce_sum(x1 * x2, (1,)) + square1 = self.reduce_sum(F.square(x1), (1,)) + eps + square2 = self.reduce_sum(F.square(x2), (1,)) + eps + denom = F.sqrt(square1 * square2) + cosine = prod_sum / denom + + pos_value = 1.0 - cosine + neg_value = self.maximum(cosine - self.margin, 0.0) + zeros = F.zeros_like_tensor(cosine) + pos_part = F.select(y == 1, pos_value, zeros) + neg_part = F.select(y == -1, neg_value, zeros) + output_unreduced = pos_part + neg_part + + return self.get_loss(output_unreduced) diff --git a/tests/ut/python/nn/test_loss.py b/tests/ut/python/nn/test_loss.py index 21e3006818..7d2329dcfe 100644 --- a/tests/ut/python/nn/test_loss.py +++ b/tests/ut/python/nn/test_loss.py @@ -62,3 +62,11 @@ def test_SoftmaxCrossEntropyExpand(): logits = Tensor(np.random.randint(0, 9, [100, 10]).astype(np.float32)) labels = Tensor(np.random.randint(0, 9, [10,]).astype(np.float32)) _executor.compile(loss, logits, labels) + +def test_cosine_embedding_loss(): + """ test CosineEmbeddingLoss """ + loss = nn.CosineEmbeddingLoss() + x1 = Tensor(np.array([[0.3, 0.8], [0.4, 0.3]]).astype(np.float32)) + x2 = Tensor(np.array([[0.4, 1.2], [-0.4, -0.9]]).astype(np.float32)) + label = Tensor(np.array([1, -1]).astype(np.int32)) + loss(x1, x2, label) From 9baf1ba99c2d20d61b6f6a11e5b53cc70753b4d8 Mon Sep 17 00:00:00 2001 From: xutianchun Date: Tue, 9 Jun 2020 22:11:38 +0800 Subject: [PATCH 27/36] add ReverseSequenceGrad --- mindspore/ops/_grad/grad_array_ops.py | 11 +++++++++++ tests/ut/python/ops/test_ops.py | 5 +++++ 2 files changed, 16 insertions(+) diff --git 
a/mindspore/ops/_grad/grad_array_ops.py b/mindspore/ops/_grad/grad_array_ops.py index 1861a4d726..72d8d74f46 100644 --- a/mindspore/ops/_grad/grad_array_ops.py +++ b/mindspore/ops/_grad/grad_array_ops.py @@ -580,3 +580,14 @@ def get_bprop_batch_to_space_nd(self): dx = batch_to_space_nd_grad(dout) return (dx,) return bprop + + +@bprop_getters.register(P.ReverseSequence) +def get_bprop_reverse_sequence(self): + """Generate bprop for ReverseSequence""" + reverse_sequence_grad = P.ReverseSequence(batch_dim=self.batch_dim_, seq_dim=self.seq_dim_) + + def bprop(x, seq_lengths, out, dout): + dx = reverse_sequence_grad(dout, seq_lengths) + return dx, zeros_like(seq_lengths) + return bprop diff --git a/tests/ut/python/ops/test_ops.py b/tests/ut/python/ops/test_ops.py index bbf33b3c94..5251fe81f9 100755 --- a/tests/ut/python/ops/test_ops.py +++ b/tests/ut/python/ops/test_ops.py @@ -1378,6 +1378,11 @@ test_case_array_ops = [ 'desc_inputs': [Tensor(np.array([[1, 2, 3], [4, 5, 6], [4, 2, 1]]).astype(np.float32)), Tensor(np.array([0, 1, 1]).astype(np.int32))], 'desc_bprop': [Tensor(np.array([[1, 2, 3], [4, 2, 1]]).astype(np.float32))]}), + ('ReverseSequence', { + 'block': P.ReverseSequence(1, 0), + 'desc_inputs': [Tensor(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]).astype(np.float32)), + Tensor(np.array([1, 2, 3]).astype(np.int32))], + 'desc_bprop': [[3, 3]]}), ] test_case_other_ops = [ From dda612d649b289ea654eab0e97d174d9913e45a4 Mon Sep 17 00:00:00 2001 From: kswang Date: Thu, 11 Jun 2020 21:19:03 +0800 Subject: [PATCH 28/36] default fusion group for ge --- mindspore/parallel/_auto_parallel_context.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/mindspore/parallel/_auto_parallel_context.py b/mindspore/parallel/_auto_parallel_context.py index 0219029037..21ef1d59f2 100644 --- a/mindspore/parallel/_auto_parallel_context.py +++ b/mindspore/parallel/_auto_parallel_context.py @@ -274,10 +274,7 @@ class _AutoParallelContext: self._context_handle.set_all_reduce_fusion_split_indices(indices, group) if context.get_context("device_target") == "Ascend": - if group == "": - _set_fusion_strategy_by_idx(indices) - else: - _set_fusion_strategy_by_idx(indices, group) + _set_fusion_strategy_by_idx(indices) def get_all_reduce_fusion_split_indices(self, group="hccl_world_groupsum1"): """ @@ -330,10 +327,7 @@ class _AutoParallelContext: self._context_handle.set_all_reduce_fusion_split_sizes(sizes, group) if context.get_context("device_target") == "Ascend": - if group == "": - _set_fusion_strategy_by_size(sizes) - else: - _set_fusion_strategy_by_size(sizes, group) + _set_fusion_strategy_by_size(sizes) def get_all_reduce_fusion_split_sizes(self, group="hccl_world_groupsum1"): """ From d557f00fc7e95913ab3712f36b2da1ac0e0b3a6e Mon Sep 17 00:00:00 2001 From: changzherui Date: Thu, 11 Jun 2020 22:40:41 +0800 Subject: [PATCH 29/36] fix ckpt --- mindspore/ccsrc/utils/callbacks_ge.cc | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/mindspore/ccsrc/utils/callbacks_ge.cc b/mindspore/ccsrc/utils/callbacks_ge.cc index 1f11ac4d0d..151b78d010 100644 --- a/mindspore/ccsrc/utils/callbacks_ge.cc +++ b/mindspore/ccsrc/utils/callbacks_ge.cc @@ -95,14 +95,18 @@ uint32_t CheckpointSaveCallback(uint32_t graph_id, const std::map ge_tensor_ptr = std::make_shared(item.second); - TensorPtr tensor_ptr = GetMeTensorTransformed(graph_id, name, ge_tensor_ptr); - if (tensor_ptr == nullptr) { - MS_LOG(EXCEPTION) << "Transform ge tensor to me tensor failed"; + if (name.size() > 5 && 
name.compare(name.size() - 5, 5, "_temp") == 0) { + continue; + } else { + TensorPtr tensor_ptr = GetMeTensorTransformed(graph_id, name, ge_tensor_ptr); + if (tensor_ptr == nullptr) { + MS_LOG(EXCEPTION) << "Transform ge tensor to me tensor failed"; + } + py::dict param_dict; + param_dict["name"] = name; + param_dict["data"] = tensor_ptr; + parameter_list.append(param_dict); } - py::dict param_dict; - param_dict["name"] = name; - param_dict["data"] = tensor_ptr; - parameter_list.append(param_dict); } py::bool_ ret = parse::python_adapter::CallPyFn(PYTHON_MOD_CALLBACK_MODULE, PYTHON_FUN_PROCESS_CHECKPOINT, parameter_list); From 3629c343e0cfa7c9efbe8c855930e95d9e35d6cb Mon Sep 17 00:00:00 2001 From: guohongzilong <2713219276@qq.com> Date: Fri, 12 Jun 2020 17:26:43 +0800 Subject: [PATCH 30/36] add dumpmode for dump function --- mindspore/ccsrc/utils/context/ms_context.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/mindspore/ccsrc/utils/context/ms_context.cc b/mindspore/ccsrc/utils/context/ms_context.cc index a726d79cd5..aa6fe1157e 100644 --- a/mindspore/ccsrc/utils/context/ms_context.cc +++ b/mindspore/ccsrc/utils/context/ms_context.cc @@ -281,6 +281,7 @@ void MsContext::GetGeOptions(std::map *ge_options) con (*ge_options)["device_id"] = "0"; (*ge_options)["ge.exec.enableDump"] = std::to_string(enable_dump_); (*ge_options)["ge.exec.dumpPath"] = save_dump_path_; + (*ge_options)["ge.exec.dumpMode"] = "output"; MS_LOG(INFO) << "The enable dump state is " << std::to_string(enable_dump_) << " and save dump path is " << save_dump_path_ << "."; (*ge_options)["ge.exec.profilingMode"] = std::to_string(profiling_mode_); From dd593c674a8fe247c337249a0f65b31b54497a65 Mon Sep 17 00:00:00 2001 From: zhaozhenlong Date: Mon, 15 Jun 2020 10:39:43 +0800 Subject: [PATCH 31/36] fix basic lstm cell bp error --- mindspore/ops/_grad/grad_nn_ops.py | 2 +- mindspore/ops/_op_impl/tbe/basic_lstm_cell_c_state_grad.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mindspore/ops/_grad/grad_nn_ops.py b/mindspore/ops/_grad/grad_nn_ops.py index e998afb269..4c4acb802c 100755 --- a/mindspore/ops/_grad/grad_nn_ops.py +++ b/mindspore/ops/_grad/grad_nn_ops.py @@ -726,7 +726,7 @@ def get_bprop_basic_lstm_cell(self): def bprop(x, h, c, w, b, out, dout): _, _, it, jt, ft, ot, tanhct = out dct, dht, _, _, _, _, _ = dout - dgate, dct_1 = basic_lstm_cell_cstate_grad(c, dht, dct, it, jt, ft, ot, tanhct) + dgate, dct_1 = basic_lstm_cell_cstate_grad(c, dht, dct, it, ft, jt, ot, tanhct) dxt, dht = basic_lstm_cell_input_grad(dgate, w) dw, db = basic_lstm_cell_weight_grad(F.depend(x, dxt), h, dgate) return dxt, dht, dct_1, dw, db diff --git a/mindspore/ops/_op_impl/tbe/basic_lstm_cell_c_state_grad.py b/mindspore/ops/_op_impl/tbe/basic_lstm_cell_c_state_grad.py index 099756ad35..440b1ce2c7 100644 --- a/mindspore/ops/_op_impl/tbe/basic_lstm_cell_c_state_grad.py +++ b/mindspore/ops/_op_impl/tbe/basic_lstm_cell_c_state_grad.py @@ -37,10 +37,10 @@ basic_lstm_cell_c_state_grad_op_info = TBERegOp("BasicLSTMCellCStateGrad") \ .output(1, "dct_1", False, "required", "all") \ .dtype_format(DataType.F32_FracNZ, DataType.F32_FracNZ, DataType.F32_FracNZ, DataType.F32_FracNZ, DataType.F32_FracNZ, DataType.F32_FracNZ, DataType.F32_FracNZ, DataType.F32_FracNZ, - DataType.F16_FracNZ, DataType.F16_FracNZ) \ + DataType.F16_FracNZ, DataType.F32_FracNZ) \ .dtype_format(DataType.F16_FracNZ, DataType.F16_FracNZ, DataType.F16_FracNZ, DataType.F16_FracNZ, DataType.F16_FracNZ, DataType.F16_FracNZ, DataType.F16_FracNZ, DataType.F16_FracNZ, 
- DataType.F32_FracNZ, DataType.F16_FracNZ) \ + DataType.F16_FracNZ, DataType.F16_FracNZ) \ .get_op_info() From 10555b77026e92ee7e75d13e4436230ba279ce23 Mon Sep 17 00:00:00 2001 From: xutianchun Date: Thu, 11 Jun 2020 15:36:36 +0800 Subject: [PATCH 32/36] add CropAndResize op --- mindspore/ops/_op_impl/aicpu/__init__.py | 1 + .../ops/_op_impl/aicpu/crop_and_resize.py | 69 ++++++++++ mindspore/ops/operations/__init__.py | 4 +- mindspore/ops/operations/image_ops.py | 126 ++++++++++++++++++ 4 files changed, 199 insertions(+), 1 deletion(-) create mode 100644 mindspore/ops/_op_impl/aicpu/crop_and_resize.py create mode 100644 mindspore/ops/operations/image_ops.py diff --git a/mindspore/ops/_op_impl/aicpu/__init__.py b/mindspore/ops/_op_impl/aicpu/__init__.py index 7d90d72b88..4709714de0 100644 --- a/mindspore/ops/_op_impl/aicpu/__init__.py +++ b/mindspore/ops/_op_impl/aicpu/__init__.py @@ -29,3 +29,4 @@ from .rnnt_loss import _rnnt_loss_aicpu from .random_categorical import _random_categorical_aicpu from .reverse_sequence import _reverse_sequence_aicpu from .pack import _pack_aicpu +from .crop_and_resize import _crop_and_resize_aicpu diff --git a/mindspore/ops/_op_impl/aicpu/crop_and_resize.py b/mindspore/ops/_op_impl/aicpu/crop_and_resize.py new file mode 100644 index 0000000000..f52e6b00ee --- /dev/null +++ b/mindspore/ops/_op_impl/aicpu/crop_and_resize.py @@ -0,0 +1,69 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""CropAndResize op""" +from mindspore.ops.op_info_register import op_info_register, AiCPURegOp, DataType +crop_and_resize_op_info = AiCPURegOp("CropAndResize") \ + .fusion_type("OPAQUE") \ + .input(0, "image", "required") \ + .input(1, "boxes", "required") \ + .input(2, "box_index", "required") \ + .input(3, "crop_size", "required") \ + .output(0, "y", "required") \ + .attr("method", "str") \ + .attr("extrapolation_value", "float") \ + .dtype_format(DataType.I8_Default, DataType.F32_Default, DataType.I32_Default, DataType.I32_Default, + DataType.F32_Default) \ + .dtype_format(DataType.I16_Default, DataType.F32_Default, DataType.I32_Default, DataType.I32_Default, + DataType.F32_Default) \ + .dtype_format(DataType.I32_Default, DataType.F32_Default, DataType.I32_Default, DataType.I32_Default, + DataType.F32_Default) \ + .dtype_format(DataType.I64_Default, DataType.F32_Default, DataType.I32_Default, DataType.I32_Default, + DataType.F32_Default) \ + .dtype_format(DataType.F16_Default, DataType.F32_Default, DataType.I32_Default, DataType.I32_Default, + DataType.F32_Default) \ + .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.I32_Default, DataType.I32_Default, + DataType.F32_Default) \ + .dtype_format(DataType.F64_Default, DataType.F32_Default, DataType.I32_Default, DataType.I32_Default, + DataType.F32_Default) \ + .dtype_format(DataType.U8_Default, DataType.F32_Default, DataType.I32_Default, DataType.I32_Default, + DataType.F32_Default) \ + .dtype_format(DataType.U16_Default, DataType.F32_Default, DataType.I32_Default, DataType.I32_Default, + DataType.F32_Default) \ + .dtype_format(DataType.I8_NHWC, DataType.F32_NHWC, DataType.I32_NHWC, DataType.I32_NHWC, + DataType.F32_NHWC) \ + .dtype_format(DataType.I16_NHWC, DataType.F32_NHWC, DataType.I32_NHWC, DataType.I32_NHWC, + DataType.F32_NHWC) \ + .dtype_format(DataType.I32_NHWC, DataType.F32_NHWC, DataType.I32_NHWC, DataType.I32_NHWC, + DataType.F32_NHWC) \ + .dtype_format(DataType.I64_NHWC, DataType.F32_NHWC, DataType.I32_NHWC, DataType.I32_NHWC, + DataType.F32_NHWC) \ + .dtype_format(DataType.F16_NHWC, DataType.F32_NHWC, DataType.I32_NHWC, DataType.I32_NHWC, + DataType.F32_NHWC) \ + .dtype_format(DataType.F32_NHWC, DataType.F32_NHWC, DataType.I32_NHWC, DataType.I32_NHWC, + DataType.F32_NHWC) \ + .dtype_format(DataType.F64_NHWC, DataType.F32_NHWC, DataType.I32_NHWC, DataType.I32_NHWC, + DataType.F32_NHWC) \ + .dtype_format(DataType.U8_NHWC, DataType.F32_NHWC, DataType.I32_NHWC, DataType.I32_NHWC, + DataType.F32_NHWC) \ + .dtype_format(DataType.U16_NHWC, DataType.F32_NHWC, DataType.I32_NHWC, DataType.I32_NHWC, + DataType.F32_NHWC) \ + .get_op_info() + + +@op_info_register(crop_and_resize_op_info) +def _crop_and_resize_aicpu(): + """CropAndResize AiCPU register""" + return diff --git a/mindspore/ops/operations/__init__.py b/mindspore/ops/operations/__init__.py index 9168364583..47c14f592c 100644 --- a/mindspore/ops/operations/__init__.py +++ b/mindspore/ops/operations/__init__.py @@ -19,6 +19,7 @@ Primitive operator classes. A collection of operators to build nerual networks or computing functions. 
""" +from .image_ops import (CropAndResize) from .array_ops import (Argmax, Argmin, Cast, Concat, Pack, Unpack, Diag, DiagPart, DType, ExpandDims, Eye, Fill, GatherNd, GatherV2, InvertPermutation, @@ -287,7 +288,8 @@ __all__ = [ "BesselI1e", "Atan", "Atanh", - "BasicLSTMCell" + "BasicLSTMCell", + "CropAndResize" ] __all__.extend(_quant_ops.__all__) diff --git a/mindspore/ops/operations/image_ops.py b/mindspore/ops/operations/image_ops.py new file mode 100644 index 0000000000..68dae34530 --- /dev/null +++ b/mindspore/ops/operations/image_ops.py @@ -0,0 +1,126 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""image_ops""" +from ..._checkparam import Validator as validator +from ..._checkparam import Rel +from ...common import dtype as mstype +from ..primitive import PrimitiveWithInfer, prim_attr_register + + +class CropAndResize(PrimitiveWithInfer): + """ + Extracts crops from the input image tensor and resizes them. + + Note: + In case that the output shape depends on crop_size, the crop_size should be constant. + + Args: + method (str): An optional string specifying the sampling method for resizing. + It can be either "bilinear" or "nearest" and default to "bilinear" + extrapolation_value (float): An optional float defaults to 0. Value used for extrapolation, when applicable. + + Inputs: + - **x** (Tensor) - The input image must be a 4-D tensor of shape [batch, image_height, image_width, depth]. + Types allowed: int8, int16, int32, int64, float16, float32, float64, uint8, uint16. + - **boxes** (Tensor) - A 2-D tensor of shape [num_boxes, 4]. + The i-th row of the tensor specifies the coordinates of a box in the box_ind[i] image + and is specified in normalized coordinates [y1, x1, y2, x2]. A normalized coordinate value of y is mapped to + the image coordinate at y * (image_height - 1), so as the [0, 1] interval of normalized image height is + mapped to [0, image_height - 1] in image height coordinates. We do allow y1 > y2, in which case the sampled + crop is an up-down flipped version of the original image. The width dimension is treated similarly. + Normalized coordinates outside the [0, 1] range are allowed, in which case we use extrapolation_value to + extrapolate the input image values. Types allowd: float32. + - **box_index** (Tensor) - A 1-D tensor of shape [num_boxes] with int32 values in [0, batch). + The value of box_ind[i] specifies the image that the i-th box refers to. Types allowd: int32. + - **crop_size** (Tensor) - Only constant value is allowd. Types allowed: int32. + A 1-D tensor of 2 elements, size = [crop_height, crop_width]. + All cropped image patches are resized to this size. The aspect ratio of the image content is not preserved. + Both crop_height and crop_width need to be positive. + Outputs: + A 4-D tensor of shape [num_boxes, crop_height, crop_width, depth] with type: float32. 
+
+    Examples:
+        >>> class CropAndResizeNet(nn.Cell):
+        >>>     def __init__(self, crop_size):
+        >>>         super(CropAndResizeNet, self).__init__()
+        >>>         self.crop_and_resize = P.CropAndResize()
+        >>>         self.crop_size = crop_size
+        >>>     @ms_function
+        >>>     def construct(self, x, boxes, box_index):
+        >>>         return self.crop_and_resize(x, boxes, box_index, self.crop_size)
+        >>>
+        >>> BATCH_SIZE = 1
+        >>> NUM_BOXES = 5
+        >>> IMAGE_HEIGHT = 256
+        >>> IMAGE_WIDTH = 256
+        >>> CHANNELS = 3
+        >>> image = np.random.normal(size=[BATCH_SIZE, IMAGE_HEIGHT, IMAGE_WIDTH, CHANNELS]).astype(np.float32)
+        >>> boxes = np.random.uniform(size=[NUM_BOXES, 4]).astype(np.float32)
+        >>> box_index = np.random.uniform(size=[NUM_BOXES], low=0, high=BATCH_SIZE).astype(np.int32)
+        >>> crop_size = np.array([24, 24]).astype(np.int32)
+        >>> crop_and_resize = CropAndResizeNet(crop_size=Tensor(crop_size))
+        >>> output = crop_and_resize(Tensor(image), Tensor(boxes), Tensor(box_index))
+        >>> print(output.asnumpy())
+    """
+
+    @prim_attr_register
+    def __init__(self, method="bilinear", extrapolation_value=0.0):
+        """init CropAndResize"""
+        self.init_prim_io_names(inputs=['x', 'boxes', 'box_index', 'crop_size'], outputs=['y'])
+        validator.check_value_type("method", method, [str], self.name)
+        validator.check_string("method", method, ["bilinear", "nearest"], self.name)
+        self.method = method
+        validator.check_value_type("extrapolation_value", extrapolation_value, [float], self.name)
+        self.extrapolation_value = extrapolation_value
+
+    def __infer__(self, x, boxes, box_index, crop_size):
+        # get shape
+        x_shape = list(x['shape'])
+        boxes_shape = list(boxes['shape'])
+        box_index_shape = list(box_index['shape'])
+        crop_size_shape = list(crop_size['shape'])
+        # get value
+        if crop_size['value'] is None:
+            raise ValueError(f"For {self.name}, crop_size must be const.")
+        crop_size_value = crop_size['value'].asnumpy()
+        # get dtype
+        x_dtype = x['dtype']
+        boxes_dtype = boxes['dtype']
+        box_index_dtype = box_index['dtype']
+        crop_size_dtype = crop_size['dtype']
+        # check dtype
+        validator.check_tensor_type_same({"x": x_dtype},
+                                         [mstype.int8, mstype.int16, mstype.int32, mstype.int64, mstype.float16,
+                                          mstype.float32, mstype.float64, mstype.uint8, mstype.uint16], self.name)
+        validator.check_tensor_type_same({"boxes": boxes_dtype}, [mstype.float32], self.name)
+        validator.check_tensor_type_same({"box_index": box_index_dtype}, [mstype.int32], self.name)
+        validator.check_tensor_type_same({"crop_size": crop_size_dtype}, [mstype.int32], self.name)
+        # check input shape rank
+        validator.check("x rank", len(x_shape), "expected", 4, Rel.EQ, self.name)
+        validator.check("boxes rank", len(boxes_shape), "expected", 2, Rel.EQ, self.name)
+        validator.check("box_index rank", len(box_index_shape), "expected", 1, Rel.EQ, self.name)
+        validator.check("crop_size rank", len(crop_size_shape), "expected", 1, Rel.EQ, self.name)
+
+        validator.check("boxes dim_0", boxes_shape[0], "box_index dim_0", box_index_shape[0], Rel.EQ, self.name)
+        validator.check("boxes dim_1", boxes_shape[1], "expected", 4, Rel.EQ, self.name)
+
+        num_boxes = boxes_shape[0]
+        crop_height = crop_size_value[0]
+        crop_width = crop_size_value[1]
+        depth = x_shape[3]
+        return {'shape': (num_boxes, crop_height, crop_width, depth),
+                'dtype': mstype.float32,
+                'value': None}
From ed1a6dbc1c15d864f2bff43afbbb08bb3e562d5c Mon Sep 17 00:00:00 2001
From: hanjun996
Date: Thu, 18 Jun 2020 21:26:04 +0800
Subject: [PATCH 33/36] sync change of tdt
---
 mindspore/ccsrc/CMakeLists.txt | 2 +-
mindspore/ccsrc/utils/context/ms_context.cc | 11 +++++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/mindspore/ccsrc/CMakeLists.txt b/mindspore/ccsrc/CMakeLists.txt index 48a3f5d65e..4184d29281 100644 --- a/mindspore/ccsrc/CMakeLists.txt +++ b/mindspore/ccsrc/CMakeLists.txt @@ -131,7 +131,7 @@ if (ENABLE_GE) else () target_link_libraries(mindspore ge_client) endif () - target_link_libraries(mindspore graph tsdclient) + target_link_libraries(mindspore graph tsdclient datatransfer) endif() if (ENABLE_D) diff --git a/mindspore/ccsrc/utils/context/ms_context.cc b/mindspore/ccsrc/utils/context/ms_context.cc index aa6fe1157e..0aacf2d2a1 100644 --- a/mindspore/ccsrc/utils/context/ms_context.cc +++ b/mindspore/ccsrc/utils/context/ms_context.cc @@ -177,14 +177,21 @@ bool MsContext::OpenTsd() { } MS_LOG(INFO) << "Device id = " << device_id << ", rank size = " << rank_size << "."; - +#if (defined(ENABLE_TDTQUE) && defined(ENABLE_GE)) + int32_t initStatus = tdt::TdtHostInit(device_id); + if (initStatus != TDT_OK_CODE) { + MS_LOG(EXCEPTION) << "Init tsd failed, status = " << initStatus << "."; + return false; + } + tdt_print_ = std::thread(TensorPrint()); +#endif TDT_StatusT status = tdt::TsdClient::GetInstance()->Open(device_id, rank_size); if (status != TDT_OK) { MS_LOG(EXCEPTION) << "Device " << device_id << " is occupied, open tsd failed, status = " << status << "."; return false; } tsd_ref_++; -#ifdef ENABLE_TDTQUE +#if (defined(ENABLE_TDTQUE) && !defined(ENABLE_GE)) int32_t initStatus = tdt::TdtHostInit(device_id); if (initStatus != TDT_OK_CODE) { MS_LOG(EXCEPTION) << "Init tsd failed, status = " << initStatus << "."; From 7eb3749f26b0b943df7d6c49a477c9d7b0a38f20 Mon Sep 17 00:00:00 2001 From: yanzhenxiang2020 Date: Wed, 3 Jun 2020 15:08:20 +0800 Subject: [PATCH 34/36] add cast op for aicpu --- mindspore/ops/_op_impl/aicpu/__init__.py | 1 + mindspore/ops/_op_impl/aicpu/cast.py | 172 ++++++++++++++++++ .../st/ops/ascend/test_aicpu_ops/test_cast.py | 75 ++++++++ 3 files changed, 248 insertions(+) create mode 100644 mindspore/ops/_op_impl/aicpu/cast.py create mode 100644 tests/st/ops/ascend/test_aicpu_ops/test_cast.py diff --git a/mindspore/ops/_op_impl/aicpu/__init__.py b/mindspore/ops/_op_impl/aicpu/__init__.py index 4709714de0..d2c9fa14f3 100644 --- a/mindspore/ops/_op_impl/aicpu/__init__.py +++ b/mindspore/ops/_op_impl/aicpu/__init__.py @@ -30,3 +30,4 @@ from .random_categorical import _random_categorical_aicpu from .reverse_sequence import _reverse_sequence_aicpu from .pack import _pack_aicpu from .crop_and_resize import _crop_and_resize_aicpu +from .cast import _cast_aicpu diff --git a/mindspore/ops/_op_impl/aicpu/cast.py b/mindspore/ops/_op_impl/aicpu/cast.py new file mode 100644 index 0000000000..32dbea3147 --- /dev/null +++ b/mindspore/ops/_op_impl/aicpu/cast.py @@ -0,0 +1,172 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""Cast op""" +from mindspore.ops.op_info_register import op_info_register, AiCPURegOp, DataType + +cast_op_info = AiCPURegOp("Cast") \ + .fusion_type("OPAQUE") \ + .input(0, "x", "required") \ + .output(0, "y", "required") \ + .dtype_format(DataType.U8_Default, DataType.U8_Default) \ + .dtype_format(DataType.U8_Default, DataType.U16_Default) \ + .dtype_format(DataType.U8_Default, DataType.U32_Default) \ + .dtype_format(DataType.U8_Default, DataType.U64_Default) \ + .dtype_format(DataType.U8_Default, DataType.I8_Default) \ + .dtype_format(DataType.U8_Default, DataType.I16_Default) \ + .dtype_format(DataType.U8_Default, DataType.I32_Default) \ + .dtype_format(DataType.U8_Default, DataType.I64_Default) \ + .dtype_format(DataType.U8_Default, DataType.F16_Default) \ + .dtype_format(DataType.U8_Default, DataType.F32_Default) \ + .dtype_format(DataType.U8_Default, DataType.F64_Default) \ + .dtype_format(DataType.U8_Default, DataType.BOOL_Default) \ + .dtype_format(DataType.U16_Default, DataType.U8_Default) \ + .dtype_format(DataType.U16_Default, DataType.U16_Default) \ + .dtype_format(DataType.U16_Default, DataType.U32_Default) \ + .dtype_format(DataType.U16_Default, DataType.U64_Default) \ + .dtype_format(DataType.U16_Default, DataType.I8_Default) \ + .dtype_format(DataType.U16_Default, DataType.I16_Default) \ + .dtype_format(DataType.U16_Default, DataType.I32_Default) \ + .dtype_format(DataType.U16_Default, DataType.I64_Default) \ + .dtype_format(DataType.U16_Default, DataType.F16_Default) \ + .dtype_format(DataType.U16_Default, DataType.F32_Default) \ + .dtype_format(DataType.U16_Default, DataType.F64_Default) \ + .dtype_format(DataType.U16_Default, DataType.BOOL_Default) \ + .dtype_format(DataType.U32_Default, DataType.U8_Default) \ + .dtype_format(DataType.U32_Default, DataType.U16_Default) \ + .dtype_format(DataType.U32_Default, DataType.U32_Default) \ + .dtype_format(DataType.U32_Default, DataType.U64_Default) \ + .dtype_format(DataType.U32_Default, DataType.I8_Default) \ + .dtype_format(DataType.U32_Default, DataType.I16_Default) \ + .dtype_format(DataType.U32_Default, DataType.I32_Default) \ + .dtype_format(DataType.U32_Default, DataType.I64_Default) \ + .dtype_format(DataType.U32_Default, DataType.F16_Default) \ + .dtype_format(DataType.U32_Default, DataType.F32_Default) \ + .dtype_format(DataType.U32_Default, DataType.F64_Default) \ + .dtype_format(DataType.U32_Default, DataType.BOOL_Default) \ + .dtype_format(DataType.U64_Default, DataType.U8_Default) \ + .dtype_format(DataType.U64_Default, DataType.U16_Default) \ + .dtype_format(DataType.U64_Default, DataType.U32_Default) \ + .dtype_format(DataType.U64_Default, DataType.U64_Default) \ + .dtype_format(DataType.U64_Default, DataType.I8_Default) \ + .dtype_format(DataType.U64_Default, DataType.I16_Default) \ + .dtype_format(DataType.U64_Default, DataType.I32_Default) \ + .dtype_format(DataType.U64_Default, DataType.I64_Default) \ + .dtype_format(DataType.U64_Default, DataType.F16_Default) \ + .dtype_format(DataType.U64_Default, DataType.F32_Default) \ + .dtype_format(DataType.U64_Default, DataType.F64_Default) \ + .dtype_format(DataType.U64_Default, DataType.BOOL_Default) \ + .dtype_format(DataType.I8_Default, DataType.U8_Default) \ + .dtype_format(DataType.I8_Default, DataType.U16_Default) \ + .dtype_format(DataType.I8_Default, DataType.U32_Default) \ + .dtype_format(DataType.I8_Default, DataType.U64_Default) \ + .dtype_format(DataType.I8_Default, 
DataType.I8_Default) \ + .dtype_format(DataType.I8_Default, DataType.I16_Default) \ + .dtype_format(DataType.I8_Default, DataType.I32_Default) \ + .dtype_format(DataType.I8_Default, DataType.I64_Default) \ + .dtype_format(DataType.I8_Default, DataType.F16_Default) \ + .dtype_format(DataType.I8_Default, DataType.F32_Default) \ + .dtype_format(DataType.I8_Default, DataType.F64_Default) \ + .dtype_format(DataType.I8_Default, DataType.BOOL_Default) \ + .dtype_format(DataType.I16_Default, DataType.U8_Default) \ + .dtype_format(DataType.I16_Default, DataType.U16_Default) \ + .dtype_format(DataType.I16_Default, DataType.U32_Default) \ + .dtype_format(DataType.I16_Default, DataType.U64_Default) \ + .dtype_format(DataType.I16_Default, DataType.I8_Default) \ + .dtype_format(DataType.I16_Default, DataType.I16_Default) \ + .dtype_format(DataType.I16_Default, DataType.I32_Default) \ + .dtype_format(DataType.I16_Default, DataType.I64_Default) \ + .dtype_format(DataType.I16_Default, DataType.F16_Default) \ + .dtype_format(DataType.I16_Default, DataType.F32_Default) \ + .dtype_format(DataType.I16_Default, DataType.F64_Default) \ + .dtype_format(DataType.I16_Default, DataType.BOOL_Default) \ + .dtype_format(DataType.I32_Default, DataType.U8_Default) \ + .dtype_format(DataType.I32_Default, DataType.U16_Default) \ + .dtype_format(DataType.I32_Default, DataType.U32_Default) \ + .dtype_format(DataType.I32_Default, DataType.U64_Default) \ + .dtype_format(DataType.I32_Default, DataType.I8_Default) \ + .dtype_format(DataType.I32_Default, DataType.I16_Default) \ + .dtype_format(DataType.I32_Default, DataType.I32_Default) \ + .dtype_format(DataType.I32_Default, DataType.I64_Default) \ + .dtype_format(DataType.I32_Default, DataType.F16_Default) \ + .dtype_format(DataType.I32_Default, DataType.F32_Default) \ + .dtype_format(DataType.I32_Default, DataType.F64_Default) \ + .dtype_format(DataType.I32_Default, DataType.BOOL_Default) \ + .dtype_format(DataType.I64_Default, DataType.U8_Default) \ + .dtype_format(DataType.I64_Default, DataType.U16_Default) \ + .dtype_format(DataType.I64_Default, DataType.U32_Default) \ + .dtype_format(DataType.I64_Default, DataType.U64_Default) \ + .dtype_format(DataType.I64_Default, DataType.I8_Default) \ + .dtype_format(DataType.I64_Default, DataType.I16_Default) \ + .dtype_format(DataType.I64_Default, DataType.I32_Default) \ + .dtype_format(DataType.I64_Default, DataType.I64_Default) \ + .dtype_format(DataType.I64_Default, DataType.F16_Default) \ + .dtype_format(DataType.I64_Default, DataType.F32_Default) \ + .dtype_format(DataType.I64_Default, DataType.F64_Default) \ + .dtype_format(DataType.I64_Default, DataType.BOOL_Default) \ + .dtype_format(DataType.F16_Default, DataType.U8_Default) \ + .dtype_format(DataType.F16_Default, DataType.U16_Default) \ + .dtype_format(DataType.F16_Default, DataType.U32_Default) \ + .dtype_format(DataType.F16_Default, DataType.U64_Default) \ + .dtype_format(DataType.F16_Default, DataType.I8_Default) \ + .dtype_format(DataType.F16_Default, DataType.I16_Default) \ + .dtype_format(DataType.F16_Default, DataType.I32_Default) \ + .dtype_format(DataType.F16_Default, DataType.I64_Default) \ + .dtype_format(DataType.F16_Default, DataType.F16_Default) \ + .dtype_format(DataType.F16_Default, DataType.F32_Default) \ + .dtype_format(DataType.F16_Default, DataType.F64_Default) \ + .dtype_format(DataType.F16_Default, DataType.BOOL_Default) \ + .dtype_format(DataType.F32_Default, DataType.U8_Default) \ + .dtype_format(DataType.F32_Default, DataType.U16_Default) \ + 
.dtype_format(DataType.F32_Default, DataType.U32_Default) \ + .dtype_format(DataType.F32_Default, DataType.U64_Default) \ + .dtype_format(DataType.F32_Default, DataType.I8_Default) \ + .dtype_format(DataType.F32_Default, DataType.I16_Default) \ + .dtype_format(DataType.F32_Default, DataType.I32_Default) \ + .dtype_format(DataType.F32_Default, DataType.I64_Default) \ + .dtype_format(DataType.F32_Default, DataType.F16_Default) \ + .dtype_format(DataType.F32_Default, DataType.F32_Default) \ + .dtype_format(DataType.F32_Default, DataType.F64_Default) \ + .dtype_format(DataType.F32_Default, DataType.BOOL_Default) \ + .dtype_format(DataType.F64_Default, DataType.U8_Default) \ + .dtype_format(DataType.F64_Default, DataType.U16_Default) \ + .dtype_format(DataType.F64_Default, DataType.U32_Default) \ + .dtype_format(DataType.F64_Default, DataType.U64_Default) \ + .dtype_format(DataType.F64_Default, DataType.I8_Default) \ + .dtype_format(DataType.F64_Default, DataType.I16_Default) \ + .dtype_format(DataType.F64_Default, DataType.I32_Default) \ + .dtype_format(DataType.F64_Default, DataType.I64_Default) \ + .dtype_format(DataType.F64_Default, DataType.F16_Default) \ + .dtype_format(DataType.F64_Default, DataType.F32_Default) \ + .dtype_format(DataType.F64_Default, DataType.F64_Default) \ + .dtype_format(DataType.F64_Default, DataType.BOOL_Default) \ + .dtype_format(DataType.BOOL_Default, DataType.U8_Default) \ + .dtype_format(DataType.BOOL_Default, DataType.U16_Default) \ + .dtype_format(DataType.BOOL_Default, DataType.U32_Default) \ + .dtype_format(DataType.BOOL_Default, DataType.U64_Default) \ + .dtype_format(DataType.BOOL_Default, DataType.I8_Default) \ + .dtype_format(DataType.BOOL_Default, DataType.I16_Default) \ + .dtype_format(DataType.BOOL_Default, DataType.I32_Default) \ + .dtype_format(DataType.BOOL_Default, DataType.I64_Default) \ + .dtype_format(DataType.BOOL_Default, DataType.F16_Default) \ + .dtype_format(DataType.BOOL_Default, DataType.F32_Default) \ + .dtype_format(DataType.BOOL_Default, DataType.F64_Default) \ + .dtype_format(DataType.BOOL_Default, DataType.BOOL_Default) \ + .get_op_info() + +@op_info_register(cast_op_info) +def _cast_aicpu(): + """Cast AiCPU register""" + return diff --git a/tests/st/ops/ascend/test_aicpu_ops/test_cast.py b/tests/st/ops/ascend/test_aicpu_ops/test_cast.py new file mode 100644 index 0000000000..c236c866c0 --- /dev/null +++ b/tests/st/ops/ascend/test_aicpu_ops/test_cast.py @@ -0,0 +1,75 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +import numpy as np +import mindspore.common.dtype as mstype +import mindspore.context as context +import mindspore.nn as nn +from mindspore import Tensor +from mindspore.ops import operations as P + +context.set_context(mode=context.PYNATIVE_MODE, device_target="Ascend") + +class Net(nn.Cell): + def __init__(self, x, dtype): + super(Net, self).__init__() + self.cast = P.Cast() + self.x = x + self.dtype = dtype + + def construct(self): + return self.cast(self.x, self.dtype) + +def test_net_f32_bool(): + x = np.random.randn(3,4).astype(np.float32) + x[:,1] = 0 + net = Net(Tensor(x), mstype.bool_) + output = net() + print(output.asnumpy()) + print(Tensor(x).dtype) + print(output.dtype) + +def test_net_f16_bool(): + x = np.random.randn(3,4).astype(np.float16) + x[:,1] = 0 + net = Net(Tensor(x), mstype.bool_) + output = net() + print(output.asnumpy()) + print(Tensor(x).dtype) + print(output.dtype) + +def test_net_f64_bool(): + x = np.random.randn(3,4).astype(np.float64) + x[:,1] = 0 + net = Net(Tensor(x), mstype.bool_) + output = net() + print(output.asnumpy()) + print(Tensor(x).dtype) + print(output.dtype) + +def test_net_int16_float16(): + x = np.random.randint(-512, 512, size=(3,4)).astype(np.int16) + net = Net(Tensor(x), mstype.float16) + output = net() + print(output.asnumpy()) + print(Tensor(x).dtype) + print(output.dtype) + +def test_net_int64_float16(): + x = np.random.randint(-512, 512, size=(3,4)).astype(np.int64) + net = Net(Tensor(x), mstype.float16) + output = net() + print(output.asnumpy()) + print(Tensor(x).dtype) + print(output.dtype) From 5888d745bf95b3eeea593069d2607e2b078453ea Mon Sep 17 00:00:00 2001 From: zhaozhenlong Date: Tue, 16 Jun 2020 12:16:24 +0800 Subject: [PATCH 35/36] fix ssim filter size check --- mindspore/nn/layer/image.py | 9 +++++++-- tests/ut/python/ops/test_ops.py | 4 ++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/mindspore/nn/layer/image.py b/mindspore/nn/layer/image.py index 7d8eef4d6f..39cc7895f3 100644 --- a/mindspore/nn/layer/image.py +++ b/mindspore/nn/layer/image.py @@ -104,6 +104,12 @@ def _check_input_4d(input_shape, param_name, func_name): raise ValueError(f"{func_name} {param_name} should be 4d, but got shape {input_shape}") return True +@constexpr +def _check_input_filter_size(input_shape, param_name, filter_size, func_name): + _check_input_4d(input_shape, param_name, func_name) + validator.check(param_name + " shape[2]", input_shape[2], "filter_size", filter_size, Rel.GE, func_name) + validator.check(param_name + " shape[3]", input_shape[3], "filter_size", filter_size, Rel.GE, func_name) + class SSIM(Cell): r""" Returns SSIM index between img1 and img2. 
@@ -154,8 +160,7 @@ class SSIM(Cell): self.mean = P.DepthwiseConv2dNative(channel_multiplier=1, kernel_size=filter_size) def construct(self, img1, img2): - _check_input_4d(F.shape(img1), "img1", self.cls_name) - _check_input_4d(F.shape(img2), "img2", self.cls_name) + _check_input_filter_size(F.shape(img1), "img1", self.filter_size, self.cls_name) P.SameTypeShape()(img1, img2) max_val = _convert_img_dtype_to_float32(self.max_val, self.max_val) img1 = _convert_img_dtype_to_float32(img1, self.max_val) diff --git a/tests/ut/python/ops/test_ops.py b/tests/ut/python/ops/test_ops.py index 5251fe81f9..752c99960a 100755 --- a/tests/ut/python/ops/test_ops.py +++ b/tests/ut/python/ops/test_ops.py @@ -1523,6 +1523,10 @@ raise_set = [ 'block': (P.PReLU(), {'exception': ValueError}), 'desc_inputs': [[2], [1]], 'desc_bprop': [[1]]}), + ('SSIM', { + 'block': (nn.SSIM(), {'exception': ValueError}), + 'desc_inputs': [Tensor(np.ones((1, 3, 8, 8)), mstype.float32), + Tensor(np.ones((1, 3, 8, 8)), mstype.float32)]}), ] From 92880788f385abba0b81a6af37814f2ecdfb1732 Mon Sep 17 00:00:00 2001 From: wuxuejian Date: Mon, 22 Jun 2020 09:40:57 +0800 Subject: [PATCH 36/36] add aicpu embeddinglookup move embeddinglookup to the internal --- mindspore/ops/_grad/grad_array_ops.py | 26 +++++ mindspore/ops/_op_impl/aicpu/__init__.py | 1 + .../ops/_op_impl/aicpu/embedding_lookup.py | 102 ++++++++++++++++++ mindspore/ops/operations/_inner_ops.py | 70 ++++++++++++ mindspore/ops/operations/array_ops.py | 55 +++------- tests/st/ops/ascend/test_embedding_lookup.py | 42 ++++++++ .../python/parallel/test_embeddinglookup.py | 18 ++-- 7 files changed, 266 insertions(+), 48 deletions(-) create mode 100644 mindspore/ops/_op_impl/aicpu/embedding_lookup.py create mode 100644 tests/st/ops/ascend/test_embedding_lookup.py diff --git a/mindspore/ops/_grad/grad_array_ops.py b/mindspore/ops/_grad/grad_array_ops.py index 72d8d74f46..b7b7af8082 100644 --- a/mindspore/ops/_grad/grad_array_ops.py +++ b/mindspore/ops/_grad/grad_array_ops.py @@ -17,6 +17,7 @@ from .. import operations as P from ..operations import _grad_ops as G +from ..operations import _inner_ops as inner from ..composite.multitype_ops.zeros_like_impl import zeros_like from .. 
import functional as F
 from .grad_base import bprop_getters
@@ -188,6 +189,31 @@ def get_bprop_tile(self):
     return bprop
 
 
+@bprop_getters.register(inner.EmbeddingLookup)
+def get_bprop_embedding_lookup(self):
+    """Generate bprop for EmbeddingLookup"""
+    host_sub = P.Sub().add_prim_attr('primitive_target', 'CPU')
+    host_reshape = P.Reshape().add_prim_attr('primitive_target', 'CPU')
+    def bprop_sparse(x, indices, offset, reduce_scatter_flag, split_num, out, dout):
+        x_shp = shape_op(x)
+        if reduce_scatter_flag is True:
+            elu_grad = G.EmbeddingLookupCommGrad()
+            actual_dout = elu_grad(dout, split_num)
+        else:
+            actual_dout = dout
+        new_indices = host_sub(indices, offset)
+        # Reshape the 'new_indices'
+        new_indices_shape_changed = (size_op(new_indices),)
+        new_indices = host_reshape(new_indices, new_indices_shape_changed)
+        # Reshape the 'actual_dout'
+        x_shp_tail = x_shp[1:]
+        actual_dout_shape_changed = new_indices_shape_changed + x_shp_tail
+        actual_dout = host_reshape(actual_dout, actual_dout_shape_changed)
+        return (new_indices, actual_dout, x_shp), zeros_like(new_indices), zeros_like(offset), \
+               zeros_like(reduce_scatter_flag), zeros_like(split_num)
+    return bprop_sparse
+
+
 @bprop_getters.register(P.Transpose)
 def get_bprop_transpose(self):
     """Generate bprop for Transpose"""
diff --git a/mindspore/ops/_op_impl/aicpu/__init__.py b/mindspore/ops/_op_impl/aicpu/__init__.py
index 4709714de0..48df11c23a 100644
--- a/mindspore/ops/_op_impl/aicpu/__init__.py
+++ b/mindspore/ops/_op_impl/aicpu/__init__.py
@@ -14,6 +14,7 @@
 """aicpu ops"""
 from .init_data_set_queue import _init_data_set_queue_aicpu
+from .embedding_lookup import _embedding_lookup_aicpu
 from .dropout_genmask import _dropout_genmask_aicpu
 from .get_next import _get_next_aicpu
 from .print_tensor import _print_aicpu
diff --git a/mindspore/ops/_op_impl/aicpu/embedding_lookup.py b/mindspore/ops/_op_impl/aicpu/embedding_lookup.py
new file mode 100644
index 0000000000..8eecc5145d
--- /dev/null
+++ b/mindspore/ops/_op_impl/aicpu/embedding_lookup.py
@@ -0,0 +1,102 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================ + +"""EmbeddingLookup op""" +from mindspore.ops.op_info_register import op_info_register, AiCPURegOp, DataType + +embeddingLookup_op_info = AiCPURegOp("EmbeddingLookup") \ + .fusion_type("OPAQUE") \ + .input(0, "params", "required") \ + .input(1, "indices", "required") \ + .input(2, "offset", "required") \ + .output(0, "output", "required") \ + .dtype_format(DataType.I8_Default, DataType.I32_Default, \ + DataType.I32_Default, DataType.I8_Default) \ + .dtype_format(DataType.I16_Default, DataType.I32_Default, \ + DataType.I32_Default, DataType.I16_Default) \ + .dtype_format(DataType.I32_Default, DataType.I32_Default, \ + DataType.I32_Default, DataType.I32_Default) \ + .dtype_format(DataType.I64_Default, DataType.I32_Default, \ + DataType.I32_Default, DataType.I64_Default) \ + .dtype_format(DataType.U8_Default, DataType.I32_Default, \ + DataType.I32_Default, DataType.U8_Default) \ + .dtype_format(DataType.U16_Default, DataType.I32_Default, \ + DataType.I32_Default, DataType.U16_Default) \ + .dtype_format(DataType.U32_Default, DataType.I32_Default, \ + DataType.I32_Default, DataType.U32_Default) \ + .dtype_format(DataType.U64_Default, DataType.I32_Default, \ + DataType.I32_Default, DataType.U64_Default) \ + .dtype_format(DataType.F16_Default, DataType.I32_Default, \ + DataType.I32_Default, DataType.F16_Default) \ + .dtype_format(DataType.F32_Default, DataType.I32_Default, \ + DataType.I32_Default, DataType.F32_Default) \ + .dtype_format(DataType.F64_Default, DataType.I32_Default, \ + DataType.I32_Default, DataType.F64_Default) \ + .dtype_format(DataType.BOOL_Default, DataType.I32_Default, \ + DataType.I32_Default, DataType.BOOL_Default) \ + .dtype_format(DataType.I8_Default, DataType.I64_Default, \ + DataType.I64_Default, DataType.I8_Default) \ + .dtype_format(DataType.I16_Default, DataType.I64_Default, \ + DataType.I64_Default, DataType.I16_Default) \ + .dtype_format(DataType.I32_Default, DataType.I64_Default, \ + DataType.I64_Default, DataType.I32_Default) \ + .dtype_format(DataType.I64_Default, DataType.I64_Default, \ + DataType.I64_Default, DataType.I64_Default) \ + .dtype_format(DataType.U8_Default, DataType.I64_Default, \ + DataType.I64_Default, DataType.U8_Default) \ + .dtype_format(DataType.U16_Default, DataType.I64_Default, \ + DataType.I64_Default, DataType.U16_Default) \ + .dtype_format(DataType.U32_Default, DataType.I64_Default, \ + DataType.I64_Default, DataType.U32_Default) \ + .dtype_format(DataType.U64_Default, DataType.I64_Default, \ + DataType.I64_Default, DataType.U64_Default) \ + .dtype_format(DataType.F16_Default, DataType.I64_Default, \ + DataType.I64_Default, DataType.F16_Default) \ + .dtype_format(DataType.F32_Default, DataType.I64_Default, \ + DataType.I64_Default, DataType.F32_Default) \ + .dtype_format(DataType.F64_Default, DataType.I64_Default, \ + DataType.I64_Default, DataType.F64_Default) \ + .dtype_format(DataType.BOOL_Default, DataType.I64_Default, \ + DataType.I64_Default, DataType.BOOL_Default) \ + .dtype_format(DataType.I8_Default, DataType.I64_Default, \ + DataType.I32_Default, DataType.I8_Default) \ + .dtype_format(DataType.I16_Default, DataType.I64_Default, \ + DataType.I32_Default, DataType.I16_Default) \ + .dtype_format(DataType.I32_Default, DataType.I64_Default, \ + DataType.I32_Default, DataType.I32_Default) \ + .dtype_format(DataType.I64_Default, DataType.I64_Default, \ + DataType.I32_Default, DataType.I64_Default) \ + .dtype_format(DataType.U8_Default, 
DataType.I64_Default, \ + DataType.I32_Default, DataType.U8_Default) \ + .dtype_format(DataType.U16_Default, DataType.I64_Default, \ + DataType.I32_Default, DataType.U16_Default) \ + .dtype_format(DataType.U32_Default, DataType.I64_Default, \ + DataType.I32_Default, DataType.U32_Default) \ + .dtype_format(DataType.U64_Default, DataType.I64_Default, \ + DataType.I32_Default, DataType.U64_Default) \ + .dtype_format(DataType.F16_Default, DataType.I64_Default, \ + DataType.I32_Default, DataType.F16_Default) \ + .dtype_format(DataType.F32_Default, DataType.I64_Default, \ + DataType.I32_Default, DataType.F32_Default) \ + .dtype_format(DataType.F64_Default, DataType.I64_Default, \ + DataType.I32_Default, DataType.F64_Default) \ + .dtype_format(DataType.BOOL_Default, DataType.I64_Default, \ + DataType.I32_Default, DataType.BOOL_Default) \ + .get_op_info() + +@op_info_register(embeddingLookup_op_info) +def _embedding_lookup_aicpu(): + """EmbeddingLookup AiCPU register""" + return diff --git a/mindspore/ops/operations/_inner_ops.py b/mindspore/ops/operations/_inner_ops.py index 38f399316a..2f9970eb0c 100644 --- a/mindspore/ops/operations/_inner_ops.py +++ b/mindspore/ops/operations/_inner_ops.py @@ -96,3 +96,73 @@ class ExtractImagePatches(PrimitiveWithInfer): """infer dtype""" validator.check_tensor_type_same({"input_x": input_x}, mstype.number_type, self.name) return input_x + + +class EmbeddingLookup(PrimitiveWithInfer): + """ + Returns a slice of input tensor based on the specified indices. + + This Primitive has the similar functionality as GatherV2 operating on `axis = 0`, but has three more inputs: + `offset`, `reduce_scatter_flag` and `split_num`. This primitive runs on the host instead of devices. + + Inputs: + - **input_params** (Tensor) - The shape of tensor is :math:`(x_1, x_2, ..., x_R)`. + The Tensor slice, instead of the entire Tensor. + - **input_indices** (Tensor) - The shape of tensor is :math:`(y_1, y_2, ..., y_S)`. + Specifies the indices of elements of the original Tensor. Values can be out of range of `input_params`, + and the exceeding part will be filled with 0 in the output. + - **offset** (int) - Specifies the offset value of this `input_params` slice. Thus the real indices + are equal to `input_indices` minus `offset`. + - **reduce_scatter_flag** (bool) - Specifies whether perform reduce_scatter on host or not. + Only constant value is allowed. + - **split_num** (int) - Specifies the number of partitions of the reduce_scatter produces. This variable + is used only if `reduce_scatter_flag` is True. Only constant value is allowed. + + + Outputs: + Tensor, the shape of tensor is :math:`(z_1, z_2, ..., z_N)`. 
+
+    Examples:
+        >>> input_params = Tensor(np.array([[8, 9], [10, 11], [12, 13], [14, 15]]), mindspore.float32)
+        >>> input_indices = Tensor(np.array([[5, 2], [8, 5]]), mindspore.int32)
+        >>> offset = 4
+        >>> reduce_scatter_flag = False
+        >>> split_num = 1
+        >>> out = P.EmbeddingLookup()(input_params, input_indices, offset, reduce_scatter_flag, split_num)
+        [[[10, 11], [0 ,0]], [[0, 0], [10, 11]]]
+    """
+    @prim_attr_register
+    def __init__(self):
+        """init index_select"""
+        self.__setattr_flag__ = True
+        self.init_prim_io_names(inputs=['params', 'indices', 'offset', 'reduce_scatter_flag', 'split_num'],
+                                outputs=['output'])
+        self.add_prim_attr('primitive_target', 'CPU')
+
+    def __infer__(self, params, indices, offset, reduce_scatter_flag=False, split_num=2):
+        validator.check_subclass("params", params['dtype'], mstype.tensor, self.name)
+        validator.check_tensor_type_same({"indices": indices['dtype']}, mstype.int_type, self.name)
+        validator.check_subclass("offset", offset['dtype'], mstype.int_, self.name)
+        validator.check_subclass("split_num", split_num['dtype'], mstype.int_, self.name)
+        if split_num['value'] < 1:
+            raise ValueError("The parameter 'split_num' must be positive, but got %d." % split_num['value'])
+        params_shp = params['shape']
+        out_shape = indices['shape'] + params_shp[1:]
+        if reduce_scatter_flag is None:
+            raise ValueError("The value of 'reduce_scatter_flag' is None.")
+        reduce_scatter_flag_value = reduce_scatter_flag['value']
+        if split_num is None:
+            raise ValueError("The value of 'split_num_value' is None.")
+        split_num_value = split_num['value']
+        if reduce_scatter_flag_value is True:
+            # Partition the tensor along the dimension 0. The shape size of dimension 0 should be divisible by
+            # (split_num * 8)
+            if out_shape[0] % (split_num_value * 8) != 0:
+                raise ValueError("The dimension 0 of the shape: %d, is not divisible by: %d." %
+                                 (out_shape[0], (split_num_value * 8)))
+            # After 'Concat' on host, the shape size of dimension 0 is: out_shape[0] // 8
+            out_shape[0] = out_shape[0] // 8
+        out = {'shape': out_shape,
+               'dtype': params['dtype'],
+               'value': None}
+        return out
diff --git a/mindspore/ops/operations/array_ops.py b/mindspore/ops/operations/array_ops.py
index d53f92c2a3..79a92ed7c8 100644
--- a/mindspore/ops/operations/array_ops.py
+++ b/mindspore/ops/operations/array_ops.py
@@ -577,64 +577,43 @@ class Range(PrimitiveWithInfer):
 class EmbeddingLookup(PrimitiveWithInfer):
     """
     Returns a slice of input tensor based on the specified indices and axis. This Primitive has the similar
-    functionality as GatherV2, but has three more inputs: `offset`, `reduce_scatter_flag` and `split_num`.
+    functionality as GatherV2, but has one more input: `offset`.
+    This primitive runs on AICPU devices.
 
     Inputs:
-        - **input_params** (Tensor) - The shape of tensor is :math:`(x_1, x_2, ..., x_R)`.
+        - **params** (Tensor) - The shape of tensor is :math:`(x_1, x_2, ..., x_R)`.
           The Tensor slice, instead of the entire Tensor.
-        - **input_indices** (Tensor) - The shape of tensor is :math:`(y_1, y_2, ..., y_S)`.
-          Specifies the indices of elements of the original Tensor. Must be in the range
-          `[0, input_param.shape()[axis])`.
-        - **axis** (int) - Specifies the dimension index to gather indices.
-        - **offset** (int) - Specifies the offset value of this `input_params` slice. Thus the real indices
-          are equal to `input_indices` minus `offset`.
-        - **reduce_scatter_flag** (bool) - Specifies whether perform reduce_scatter on host or not.
- - **split_num** (int) - Specifies the number of partitions of the reduce_scatter produces. This variable - is used only if `reduce_scatter_flag` is True. + - **indices** (Tensor) - The shape of tensor is :math:`(y_1, y_2, ..., y_S)`. + Specifies the indices of elements of the original Tensor. Values can be out of range of `params`, + and the exceeding part will be filled with 0 in the output. + The indices to do lookup operation whose data type should be mindspore.int32 or mindspore.int64. + - **offset** (int) - Specifies the offset value of this `params` slice. Thus the real indices + are equal to `indices` minus `offset`. Outputs: Tensor, the shape of tensor is :math:`(z_1, z_2, ..., z_N)`. Examples: - >>> input_params = Tensor(np.array([[8, 9], [10, 11], [12, 13], [14, 15]]), mindspore.float32) - >>> input_indices = Tensor(np.array([[5, 2], [8, 5]]), mindspore.int32) - >>> axis = 0 + >>> params = Tensor(np.array([[8, 9], [10, 11], [12, 13], [14, 15]]), mindspore.float32) + >>> indices = Tensor(np.array([[5, 2], [8, 5]]), mindspore.int32) >>> offset = 4 - >>> reduce_scatter_flag = False - >>> split_num = 1 - >>> out = P.EmbeddingLookup()(input_params, input_indices, axis, offset, reduce_scatter_flag, split_num) + >>> out = P.EmbeddingLookup()(params, indices, offset) [[[10, 11], [0 ,0]], [[0, 0], [10, 11]]] """ @prim_attr_register def __init__(self): """init index_select""" - self.__setattr_flag__ = True - self.init_prim_io_names(inputs=['params', 'indices', 'axis', 'offset', 'reduce_scatter_flag', 'split_num'], + self.init_prim_io_names(inputs=['params', 'indices', 'offset'], outputs=['output']) - self.add_prim_attr('target', 'CPU') - def __infer__(self, params, indices, axis, offset, reduce_scatter_flag=False, split_num=2): + def __infer__(self, params, indices, offset): validator.check_subclass("params", params['dtype'], mstype.tensor, self.name) - validator.check_tensor_type_same({"indices": indices['dtype']}, mstype.int_type, self.name) - validator.check_subclass("axis", axis['dtype'], mstype.int_, self.name) + valid_types = (mstype.int32, mstype.int64) + validator.check_tensor_type_same({"indices": indices['dtype']}, valid_types, self.name) validator.check_subclass("offset", offset['dtype'], mstype.int_, self.name) - validator.check_subclass("split_num", split_num['dtype'], mstype.int_, self.name) - if split_num['value'] < 1: - raise ValueError("The parameter 'split_num' must be positive, but got %d." % split_num) - axis_v = axis['value'] params_shp = params['shape'] - rank = len(params_shp) - validator.check_int_range("axis", axis_v, -rank, rank, Rel.INC_LEFT, self.name) - if axis_v < 0: - axis_v += rank - out_shape = params_shp[:axis_v] + indices['shape'] + params_shp[axis_v + 1:] - if reduce_scatter_flag: - # partition the tensor along the dimension 0. - if out_shape[0] % split_num['value'] != 0: - raise ValueError("The dimension 0 of the shape: %d, is not divisible by split_num: %d." 
%
-                                 (out_shape[0], split_num['value']))
-            out_shape[0] = out_shape[0] // split_num['value']
+        out_shape = indices['shape'] + params_shp[1:]
         out = {'shape': out_shape,
                'dtype': params['dtype'],
                'value': None}
diff --git a/tests/st/ops/ascend/test_embedding_lookup.py b/tests/st/ops/ascend/test_embedding_lookup.py
new file mode 100644
index 0000000000..483fdcdbc4
--- /dev/null
+++ b/tests/st/ops/ascend/test_embedding_lookup.py
@@ -0,0 +1,43 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+import numpy as np
+
+import mindspore.context as context
+import mindspore.common.dtype as mstype
+import mindspore.nn as nn
+from mindspore import Tensor
+from mindspore.ops import operations as P
+
+context.set_context(mode=context.GRAPH_MODE,
+                    device_target="Ascend")
+
+
+class Net(nn.Cell):
+    def __init__(self, offset):
+        super(Net, self).__init__()
+        self.embedding = P.EmbeddingLookup()
+        self.offset = offset
+
+    def construct(self, param, index):
+        return self.embedding(param, index, self.offset)
+
+
+def test_embedding_lookup_sparse():
+    params = Tensor(np.array([[8, 9], [10, 11], [12, 13], [14, 15]]), mstype.int32)
+    indices = Tensor(np.array([[5, 2], [8, 5]]), mstype.int32)
+    offset = 4
+    embedding = Net(offset)
+    out = embedding(params, indices)
+    assert(out.asnumpy() == [[[10, 11], [0, 0]], [[0, 0], [10, 11]]]).all()
diff --git a/tests/ut/python/parallel/test_embeddinglookup.py b/tests/ut/python/parallel/test_embeddinglookup.py
index b934028a48..b306061981 100644
--- a/tests/ut/python/parallel/test_embeddinglookup.py
+++ b/tests/ut/python/parallel/test_embeddinglookup.py
@@ -19,6 +19,7 @@ import mindspore.nn as nn
 from mindspore import Tensor
 from mindspore.common.api import _executor
 from mindspore.ops import operations as P
+from mindspore.ops.operations import _inner_ops as inner
 from tests.ut.python.ops.test_math_ops import VirtualLoss
 
 
@@ -33,29 +34,27 @@ class NetWithLoss(nn.Cell):
         return self.loss(predict)
 
 class Net(nn.Cell):
-    def __init__(self, shape, axis, offset, reduce_scatter_flag, split_num):
+    def __init__(self, shape, offset, reduce_scatter_flag, split_num):
         super().__init__()
         self.index = Tensor(np.ones(shape), dtype=ms.int32)
-        self.axis = axis
         self.offset = offset
         self.reduce_scatter_flag = reduce_scatter_flag
         self.split_num = split_num
-        self.elu = P.EmbeddingLookup()
+        self.elu = inner.EmbeddingLookup()
         self.mm = P.BatchMatMul()
 
     def construct(self, x, y):
-        out = self.elu(x, self.index, self.axis, self.offset, self.reduce_scatter_flag, self.split_num)
+        out = self.elu(x, self.index, self.offset, self.reduce_scatter_flag, self.split_num)
         out = self.mm(out, y)
         return out
 
 
 def test_embeddinglookup_reducescatter_false():
     shape = [8, 8]
-    axis = 0
     offset = 8
     reduce_scatter_flag = False
     split_num = 1
-    net = NetWithLoss(Net(shape, axis, offset, reduce_scatter_flag, split_num))
+    net = NetWithLoss(Net(shape, offset, reduce_scatter_flag, split_num))
     net.set_auto_parallel()
     x =
Tensor(np.ones([64, 32]), dtype=ms.float32) @@ -64,14 +63,13 @@ def test_embeddinglookup_reducescatter_false(): def test_embeddinglookup_reducescatter_true(): - shape = [8, 8] - axis = 0 + shape = [64, 8] offset = 8 reduce_scatter_flag = True split_num = 8 - net = NetWithLoss(Net(shape, axis, offset, reduce_scatter_flag, split_num)) + net = NetWithLoss(Net(shape, offset, reduce_scatter_flag, split_num)) net.set_auto_parallel() x = Tensor(np.ones([64, 32]), dtype=ms.float32) - y = Tensor(np.ones([1, 32, 8]), dtype=ms.float32) + y = Tensor(np.ones([8, 32, 8]), dtype=ms.float32) _executor.compile(net, x, y)
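
The snippet below is a minimal, standalone usage sketch of the reworked P.EmbeddingLookup(params, indices, offset) interface introduced in PATCH 36; it is not part of the patch series itself. The EmbeddingLookupNet wrapper name and the GRAPH_MODE/Ascend context setup are illustrative assumptions, while the input values and expected result mirror the array_ops.py docstring example and the new st test above.

# Usage sketch (assumes an Ascend environment with the AiCPU EmbeddingLookup kernel available).
import numpy as np

import mindspore.common.dtype as mstype
import mindspore.context as context
import mindspore.nn as nn
from mindspore import Tensor
from mindspore.ops import operations as P

context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")


class EmbeddingLookupNet(nn.Cell):
    """Hypothetical wrapper cell around the reworked EmbeddingLookup primitive."""
    def __init__(self, offset):
        super(EmbeddingLookupNet, self).__init__()
        self.embedding = P.EmbeddingLookup()
        self.offset = offset

    def construct(self, params, indices):
        # Real lookup indices are `indices - offset`; indices that fall outside
        # [0, params.shape[0]) produce zero-filled rows in the output.
        return self.embedding(params, indices, self.offset)


params = Tensor(np.array([[8, 9], [10, 11], [12, 13], [14, 15]]), mstype.float32)
indices = Tensor(np.array([[5, 2], [8, 5]]), mstype.int32)
net = EmbeddingLookupNet(offset=4)
print(net(params, indices).asnumpy())
# Expected (per the docstring example): [[[10, 11], [0, 0]], [[0, 0], [10, 11]]]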