From 92dc6de049735cd0232c9a38bd9e0d86faf52095 Mon Sep 17 00:00:00 2001 From: simson <526422051@qq.com> Date: Wed, 1 Apr 2020 11:55:29 +0800 Subject: [PATCH 01/13] modify graphengine --- .gitmodules | 3 --- graphengine | 1 - 2 files changed, 4 deletions(-) delete mode 160000 graphengine diff --git a/.gitmodules b/.gitmodules index a241b6d69b..1f5fbad2b9 100644 --- a/.gitmodules +++ b/.gitmodules @@ -10,6 +10,3 @@ [submodule "third_party/protobuf"] path = third_party/protobuf url = https://github.com/protocolbuffers/protobuf.git -[submodule "graphengine"] - path = graphengine - url = https://gitee.com/mindspore/graphengine.git diff --git a/graphengine b/graphengine deleted file mode 160000 index 5f763679fa..0000000000 --- a/graphengine +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 5f763679fa33de1608d07f7651c6f16012b953ea From f338eb3a606efc4c36bd49690a629b6ab186643f Mon Sep 17 00:00:00 2001 From: simson <526422051@qq.com> Date: Wed, 1 Apr 2020 11:57:09 +0800 Subject: [PATCH 02/13] add graphengine --- .gitmodules | 3 +++ graphengine | 1 + 2 files changed, 4 insertions(+) create mode 160000 graphengine diff --git a/.gitmodules b/.gitmodules index 1f5fbad2b9..a024019b14 100644 --- a/.gitmodules +++ b/.gitmodules @@ -10,3 +10,6 @@ [submodule "third_party/protobuf"] path = third_party/protobuf url = https://github.com/protocolbuffers/protobuf.git +[submodule "graphengine"] + path = graphengine + url = https://gitee.com/ms-incubator/graphengine.git diff --git a/graphengine b/graphengine new file mode 160000 index 0000000000..21d3700f66 --- /dev/null +++ b/graphengine @@ -0,0 +1 @@ +Subproject commit 21d3700f661576edc37607a3bc961874ee5189a7 From 6f2b7abe04a97b4b8fb3b6de51124eed95cef4e9 Mon Sep 17 00:00:00 2001 From: yanghaoran Date: Thu, 2 Apr 2020 17:15:41 +0800 Subject: [PATCH 03/13] modify reduceminD and reducemaxD IR --- graphengine | 2 +- mindspore/ccsrc/pipeline/pipeline.cc | 2 +- mindspore/ccsrc/transform/convert.cc | 16 +- mindspore/ccsrc/transform/graph_runner.cc 
| 7 + mindspore/ccsrc/transform/op_declare.cc | 159 +++++++++----------- mindspore/ccsrc/transform/op_declare.h | 14 +- mindspore/ccsrc/transform/util.cc | 11 +- mindspore/ccsrc/utils/context/ms_context.cc | 4 +- mindspore/ops/operations/__init__.py | 3 +- mindspore/ops/operations/nn_ops.py | 75 ++++++++- tests/ut/python/ops/test_ops.py | 5 + 11 files changed, 188 insertions(+), 110 deletions(-) diff --git a/graphengine b/graphengine index 21d3700f66..092c7a1f65 160000 --- a/graphengine +++ b/graphengine @@ -1 +1 @@ -Subproject commit 21d3700f661576edc37607a3bc961874ee5189a7 +Subproject commit 092c7a1f6548cac7d40e677af3498c3c49ea2bfd diff --git a/mindspore/ccsrc/pipeline/pipeline.cc b/mindspore/ccsrc/pipeline/pipeline.cc index 35336e975b..70ef9a5407 100644 --- a/mindspore/ccsrc/pipeline/pipeline.cc +++ b/mindspore/ccsrc/pipeline/pipeline.cc @@ -1071,7 +1071,7 @@ bool ExecutorPy::AddDFGraph(const py::dict& init_params, const std::string& phas } std::string init_graph = "init_subgraph." + net_id; std::string checkpoint_name = "save." 
+ net_id; - if (phase == "train") { + if (phase.find("train") != std::string::npos) { (void)DfGraphManager::GetInstance().AddGraph(phase, convertor.GetComputeGraph(), {{"ge.exec.variable_acc", "1"}}); } else { (void)DfGraphManager::GetInstance().AddGraph(phase, convertor.GetComputeGraph()); diff --git a/mindspore/ccsrc/transform/convert.cc b/mindspore/ccsrc/transform/convert.cc index 74b0695cff..87bfc8f6d8 100755 --- a/mindspore/ccsrc/transform/convert.cc +++ b/mindspore/ccsrc/transform/convert.cc @@ -171,6 +171,7 @@ const char kNameAbsGrad[] = "AbsGrad"; const char kNameBinaryCrossEntropy[] = "BinaryCrossEntropy"; const char kNameBinaryCrossEntropyGrad[] = "BinaryCrossEntropyGrad"; const char kNameSparseApplyAdagrad[] = "SparseApplyAdagrad"; +const char kNameSparseApplyFtrlD[] = "SparseApplyFtrlD"; const char kNameSpaceToDepth[] = "SpaceToDepth"; const char kNameDepthToSpace[] = "DepthToSpace"; const char kNameSign[] = "Sign"; @@ -189,7 +190,7 @@ std::unordered_map &DfGraphConvertor::get_adpt_ma {string(kNameApplyMomentum), ADPT_DESC(ApplyMomentum)}, {string(kNameMaxPool), ADPT_DESC(MaxPool)}, {string(kNameAvgPool), ADPT_DESC(AvgPool)}, - {string(kNameTopK), ADPT_DESC(TopKV2)}, + {string(kNameTopK), ADPT_DESC(TopK)}, {string(kNamePack), ADPT_DESC(Pack)}, {string(kNameSplitD), ADPT_DESC(SplitD)}, {string(kNameAllReduce), ADPT_DESC(HcomAllReduce)}, @@ -310,7 +311,7 @@ std::unordered_map &DfGraphConvertor::get_adpt_ma {prim::kPrimMinimum->name(), ADPT_DESC(Minimum)}, {prim::kPrimSelect->name(), ADPT_DESC(Select)}, {string(kNameLessEqual), ADPT_DESC(LessEqual)}, - {prim::kPrimLogSoftmax->name(), ADPT_DESC(LogSoftmax)}, + {prim::kPrimLogSoftmax->name(), ADPT_DESC(LogSoftmaxV2)}, {string(kNameTruncatedNormal), ADPT_DESC(TruncatedNormal)}, {string(kNameStridedSliceGrad), ADPT_DESC(StridedSliceGrad)}, {prim::kPrimGelu->name(), ADPT_DESC(Gelu)}, @@ -343,7 +344,7 @@ std::unordered_map &DfGraphConvertor::get_adpt_ma {prim::kPrimMatMul->name(), ADPT_DESC(MatMul)}, 
{string(kNameConst), ADPT_DESC(Constant, Const)}, - {string(kNameSoftmax), ADPT_DESC(Softmax)}, + {string(kNameSoftmax), ADPT_DESC(SoftmaxV2)}, {string(kNameSoftmaxGrad), ADPT_DESC(SoftmaxGrad)}, {string(kNameParam), ADPT_DESC(Data)}, {string(kNameROIAlign), ADPT_DESC(ROIAlign)}, @@ -353,6 +354,7 @@ std::unordered_map &DfGraphConvertor::get_adpt_ma {string(kNameBinaryCrossEntropy), ADPT_DESC(BinaryCrossEntropy)}, {string(kNameBinaryCrossEntropyGrad), ADPT_DESC(BinaryCrossEntropyGrad)}, {string(kNameSparseApplyAdagrad), ADPT_DESC(SparseApplyAdagradD)}, + {string(kNameSparseApplyFtrlD), ADPT_DESC(SparseApplyFtrlD)}, {string(kNameSpaceToDepth), ADPT_DESC(SpaceToDepth)}, {string(kNameDepthToSpace), ADPT_DESC(DepthToSpace)}, {string(kNameSign), ADPT_DESC(Sign)}, @@ -1017,8 +1019,8 @@ DfGraphConvertor &DfGraphConvertor::BuildGraph() { } } - // set up dependices - MS_LOG(DEBUG) << "set up dependices"; + // set up dependencies + MS_LOG(DEBUG) << "set up dependencies"; std::vector nodes = ::mindspore::TopoSort(anf_graph_->get_return()); for (auto &it : nodes) { SetNodeInput(it); @@ -1115,8 +1117,8 @@ void DfGraphConvertor::UpdateDataOpDesc(const AnfNodePtr &it, const OperatorPtr if (desc == nullptr) { MS_LOG(ERROR) << "Update data op descriptor failed! 
TensorDesc is null."; } else { - (void)std::static_pointer_cast(op)->update_input_desc_data(*desc); - (void)std::static_pointer_cast(op)->update_output_desc_out(*desc); + (void)std::static_pointer_cast(op)->update_input_desc_x(*desc); + (void)std::static_pointer_cast(op)->update_output_desc_y(*desc); } } diff --git a/mindspore/ccsrc/transform/graph_runner.cc b/mindspore/ccsrc/transform/graph_runner.cc index e77b1bcd73..2bff1a740c 100644 --- a/mindspore/ccsrc/transform/graph_runner.cc +++ b/mindspore/ccsrc/transform/graph_runner.cc @@ -135,6 +135,13 @@ Status GraphRunner::RunGraph(const RunOptions& options, const std::vectorIsGraphNeedRebuild(wrap_ptr->id_)) { + sess_->RemoveGraph(wrap_ptr->id_); + sess_->AddGraph(wrap_ptr->id_, *(wrap_ptr->graph_ptr_), wrap_ptr->options_); + } + ge::Status ret = sess_->RunGraph(wrap_ptr->id_, ge_inputs, ge_outputs); if (ret != ge::GRAPH_SUCCESS) { MS_LOG(ERROR) << "Call GE RunGraph Failed, ret is: " << ret; diff --git a/mindspore/ccsrc/transform/op_declare.cc b/mindspore/ccsrc/transform/op_declare.cc index 78b949c525..07c5e9f5fe 100755 --- a/mindspore/ccsrc/transform/op_declare.cc +++ b/mindspore/ccsrc/transform/op_declare.cc @@ -138,11 +138,10 @@ OUTPUT_MAP(ApplyMomentum) = {{0, OUTPUT_DESC(var)}}; INPUT_MAP(Summary) = {{2, INPUT_DESC(x)}}; ATTR_MAP(Summary) = EMPTY_ATTR_MAP; -// data +// Data INPUT_MAP(Data) = EMPTY_INPUT_MAP; ATTR_MAP(Data) = EMPTY_ATTR_MAP; -// resnet ops in ge // BatchNorm INPUT_MAP(BatchNorm) = {{1, INPUT_DESC(x)}, {2, INPUT_DESC(scale)}, @@ -194,9 +193,9 @@ OUTPUT_MAP(PRelu) = {{0, OUTPUT_DESC(y)}}; // PReluGrad INPUT_MAP(PReluGrad) = { - {1, INPUT_DESC(input_gradients)}, {2, INPUT_DESC(input_features)}, {3, INPUT_DESC(input_weights)}}; + {1, INPUT_DESC(grads)}, {2, INPUT_DESC(features)}, {3, INPUT_DESC(weights)}}; ATTR_MAP(PReluGrad) = EMPTY_ATTR_MAP; -OUTPUT_MAP(PReluGrad) = {{0, OUTPUT_DESC(output_backprops_dx)}, {1, OUTPUT_DESC(output_backprops_da)}}; +OUTPUT_MAP(PReluGrad) = {{0, OUTPUT_DESC(dx)}, {1, 
OUTPUT_DESC(da)}}; // Sigmoid INPUT_MAP(Sigmoid) = {{1, INPUT_DESC(x)}}; @@ -241,12 +240,12 @@ ATTR_MAP(CumsumD) = {{"exclusive", ATTR_DESC(exclusive, AnyTraits())}, {"reverse", ATTR_DESC(reverse, AnyTraits())}}; OUTPUT_MAP(CumsumD) = {{0, OUTPUT_DESC(y)}}; -// softmax -INPUT_MAP(Softmax) = {{1, INPUT_DESC(x)}}; -ATTR_MAP(Softmax) = { - {"axis", ATTR_DESC(axis, AnyTraits>(), AnyTraits>())}, +// SoftmaxV2 +INPUT_MAP(SoftmaxV2) = {{1, INPUT_DESC(x)}}; +ATTR_MAP(SoftmaxV2) = { + {"axis", ATTR_DESC(axes, AnyTraits>(), AnyTraits>())}, }; -OUTPUT_MAP(Softmax) = {{0, OUTPUT_DESC(y)}}; +OUTPUT_MAP(SoftmaxV2) = {{0, OUTPUT_DESC(y)}}; // SoftmaxGrad INPUT_MAP(SoftmaxGrad) = {{1, INPUT_DESC(softmax)}, {2, INPUT_DESC(grad_softmax)}}; @@ -269,21 +268,21 @@ ATTR_MAP(GatherV2) = EMPTY_ATTR_MAP; OUTPUT_MAP(GatherV2) = {{0, OUTPUT_DESC(y)}}; // ReduceSum -INPUT_MAP(ReduceSum) = {{1, INPUT_DESC(x)}, {2, INPUT_DESC(axis)}}; +INPUT_MAP(ReduceSum) = {{1, INPUT_DESC(x)}, {2, INPUT_DESC(axes)}}; ATTR_MAP(ReduceSum) = {{"keep_dims", ATTR_DESC(keep_dims, AnyTraits())}}; OUTPUT_MAP(ReduceSum) = {{0, OUTPUT_DESC(y)}}; // ReduceSumD INPUT_MAP(ReduceSumD) = {{1, INPUT_DESC(x)}}; INPUT_ATTR_MAP(ReduceSumD) = { - {2, ATTR_DESC(axis, AnyTraits>(), AnyTraits>())}}; + {2, ATTR_DESC(axes, AnyTraits>(), AnyTraits>())}}; ATTR_MAP(ReduceSumD) = {{"keep_dims", ATTR_DESC(keep_dims, AnyTraits())}}; OUTPUT_MAP(ReduceSumD) = {{0, OUTPUT_DESC(y)}}; // ReduceProdD INPUT_MAP(ReduceProdD) = {{1, INPUT_DESC(x)}}; INPUT_ATTR_MAP(ReduceProdD) = { - {2, ATTR_DESC(axis, AnyTraits>(), AnyTraits>())}}; + {2, ATTR_DESC(axes, AnyTraits>(), AnyTraits>())}}; ATTR_MAP(ReduceProdD) = {{"keep_dims", ATTR_DESC(keep_dims, AnyTraits())}}; OUTPUT_MAP(ReduceProdD) = {{0, OUTPUT_DESC(y)}}; @@ -294,7 +293,7 @@ ATTR_MAP(CumprodD) = {{"exclusive", ATTR_DESC(exclusive, AnyTraits())}, {"reverse", ATTR_DESC(reverse, AnyTraits())}}; OUTPUT_MAP(CumprodD) = {{0, OUTPUT_DESC(y)}}; -// SoftmaxCrossEntropyWithLogits/ +// 
SoftmaxCrossEntropyWithLogits INPUT_MAP(SoftmaxCrossEntropyWithLogits) = {{1, INPUT_DESC(features)}, {2, INPUT_DESC(labels)}}; ATTR_MAP(SoftmaxCrossEntropyWithLogits) = EMPTY_ATTR_MAP; OUTPUT_MAP(SoftmaxCrossEntropyWithLogits) = {{0, OUTPUT_DESC(loss)}, {1, OUTPUT_DESC(backprop)}}; @@ -306,7 +305,7 @@ INPUT_ATTR_MAP(MeanGrad) = {{2, ATTR_DESC(mean_grad_output_shape_value, kOpForma ATTR_MAP(MeanGrad) = {{"mode", ATTR_DESC(mode, AnyTraits())}}; INPUT_MAP(SliceD) = {{1, INPUT_DESC(x)}}; -INPUT_ATTR_MAP(SliceD) = {{2, ATTR_DESC(begin, AnyTraits(), AnyTraits>())}, +INPUT_ATTR_MAP(SliceD) = {{2, ATTR_DESC(offsets, AnyTraits(), AnyTraits>())}, {3, ATTR_DESC(size, AnyTraits(), AnyTraits>())}}; ATTR_MAP(SliceD) = EMPTY_ATTR_MAP; OUTPUT_MAP(SliceD) = {{0, OUTPUT_DESC(y)}}; @@ -401,42 +400,10 @@ ATTR_MAP(BoundingBoxDecode) = { }; OUTPUT_MAP(BoundingBoxDecode) = {{0, OUTPUT_DESC(bboxes)}}; -#ifdef VALID_CODE - -// Less -INPUT_MAP(Less) = {{1, INPUT_DESC(x)}, {2, INPUT_DESC(y)}}; -ATTR_MAP(Less) = EMPTY_ATTR_MAP; -OUTPUT_MAP(Less) = {{0, OUTPUT_DESC(z)}}; - -// Cast -INPUT_MAP(Cast) = {{1, INPUT_DESC(x)}}; -INPUT_ATTR_MAP(Cast) = {{2, ATTR_DESC(dst_type, AnyTraits())}}; -ATTR_MAP(Cast) = {{"Truncate", ATTR_DESC(truncate, AnyTraits())}}; -OUTPUT_MAP(Cast) = {{0, OUTPUT_DESC(y)}}; - -// Minimum -INPUT_MAP(Minimum) = {{1, INPUT_DESC(x)}, {2, INPUT_DESC(y)}}; -ATTR_MAP(Minimum) = {{"alpha", ATTR_DESC(alpha, AnyTraits())}, {"beta", ATTR_DESC(beta, AnyTraits())}}; -OUTPUT_MAP(Minimum) = {{0, OUTPUT_DESC(z)}}; - -// Sub -INPUT_MAP(Sub) = {{1, INPUT_DESC(x1)}, {2, INPUT_DESC(x2)}}; -ATTR_MAP(Sub) = {{"alpha", ATTR_DESC(alpha, AnyTraits())}, {"beta", ATTR_DESC(beta, AnyTraits())}}; - -#endif - -// TopKV2 -INPUT_MAP(TopKV2) = { - {1, INPUT_DESC(input)}, - {2, INPUT_DESC(k)}, -}; - -ATTR_MAP(TopKV2) = {{"T", ATTR_DESC(T, AnyTraits())}, {"sorted", ATTR_DESC(sorted, AnyTraits())}}; - -OUTPUT_MAP(TopKV2) = { - {0, OUTPUT_DESC(values)}, - {1, OUTPUT_DESC(indices)}, -}; +// TopK 
+INPUT_MAP(TopK) = {{1, INPUT_DESC(x)}, {2, INPUT_DESC(k)}}; +ATTR_MAP(TopK) = {{"sorted", ATTR_DESC(sorted, AnyTraits())}}; +OUTPUT_MAP(TopK) = {{0, OUTPUT_DESC(values)}, {1, OUTPUT_DESC(indices)}}; // Multiply INPUT_MAP(Multiply) = {{1, INPUT_DESC(x)}, {2, INPUT_DESC(y)}}; @@ -476,7 +443,7 @@ ATTR_MAP(Iou) = {{"mode", ATTR_DESC(mode, AnyTraits())}}; OUTPUT_MAP(Iou) = {{0, OUTPUT_DESC(overlap)}}; // ResizeNearestNeighborD -INPUT_MAP(ResizeNearestNeighborD) = {{1, INPUT_DESC(images)}}; +INPUT_MAP(ResizeNearestNeighborD) = {{1, INPUT_DESC(x)}}; ATTR_MAP(ResizeNearestNeighborD) = { {"size", ATTR_DESC(size, AnyTraits>(), AnyTraits>())}, {"align_corners", ATTR_DESC(align_corners, AnyTraits())}}; @@ -506,17 +473,17 @@ ATTR_MAP(Relu6) = EMPTY_ATTR_MAP; OUTPUT_MAP(Relu6) = {{0, OUTPUT_DESC(activations)}}; // Relu6Grad -INPUT_MAP(Relu6Grad) = {{1, INPUT_DESC(dy)}, {2, INPUT_DESC(y)}}; +INPUT_MAP(Relu6Grad) = {{1, INPUT_DESC(features)}, {2, INPUT_DESC(gradients)}}; ATTR_MAP(Relu6Grad) = EMPTY_ATTR_MAP; -OUTPUT_MAP(Relu6Grad) = {{0, OUTPUT_DESC(z)}}; +OUTPUT_MAP(Relu6Grad) = {{0, OUTPUT_DESC(backprops)}}; // ResizeBilinearGrad INPUT_MAP(ResizeBilinearGrad) = {{1, INPUT_DESC(grads)}, {2, INPUT_DESC(original_image)}}; ATTR_MAP(ResizeBilinearGrad) = {{"align_corners", ATTR_DESC(align_corners, AnyTraits())}}; OUTPUT_MAP(ResizeBilinearGrad) = {{0, OUTPUT_DESC(y)}}; -// ResizeBilinear -INPUT_MAP(ResizeBilinearD) = {{1, INPUT_DESC(images)}}; +// ResizeBilinearD +INPUT_MAP(ResizeBilinearD) = {{1, INPUT_DESC(x)}}; ATTR_MAP(ResizeBilinearD) = { {"size", ATTR_DESC(size, AnyTraits>(), AnyTraits>())}, {"align_corners", ATTR_DESC(align_corners, AnyTraits())}}; @@ -539,9 +506,9 @@ OUTPUT_MAP(NMSWithMask) = { {0, OUTPUT_DESC(selected_boxes)}, {1, OUTPUT_DESC(selected_idx)}, {2, OUTPUT_DESC(selected_mask)}}; // Unpack -INPUT_MAP(Unpack) = {{1, INPUT_DESC(value)}}; +INPUT_MAP(Unpack) = {{1, INPUT_DESC(x)}}; ATTR_MAP(Unpack) = {{"axis", ATTR_DESC(axis, AnyTraits())}, {"num", ATTR_DESC(num, 
AnyTraits())}}; -DYN_OUTPUT_MAP(Unpack) = {{0, DYN_OUTPUT_DESC(output)}}; +DYN_OUTPUT_MAP(Unpack) = {{0, DYN_OUTPUT_DESC(y)}}; // ScatterNdUpdate INPUT_MAP(ScatterNdUpdate) = {{1, INPUT_DESC(var)}, {2, INPUT_DESC(indices)}, {3, INPUT_DESC(updates)}}; @@ -574,8 +541,8 @@ INPUT_MAP(SigmoidCrossEntropyWithLogitsGrad) = { ATTR_MAP(SigmoidCrossEntropyWithLogitsGrad) = EMPTY_ATTR_MAP; OUTPUT_MAP(SigmoidCrossEntropyWithLogitsGrad) = {{0, OUTPUT_DESC(gradient)}}; -// ScatterNd -INPUT_MAP(ScatterNdD) = {{1, INPUT_DESC(indices)}, {2, INPUT_DESC(updates)}}; +// ScatterNdD +INPUT_MAP(ScatterNdD) = {{1, INPUT_DESC(indices)}, {2, INPUT_DESC(x)}}; INPUT_ATTR_MAP(ScatterNdD) = { {3, ATTR_DESC(shape, AnyTraits>(), AnyTraits>())}}; ATTR_MAP(ScatterNdD) = EMPTY_ATTR_MAP; @@ -587,7 +554,7 @@ ATTR_MAP(PadD) = {{"paddings", ATTR_DESC(paddings, AnyTraits())}, - {"output_type", ATTR_DESC(output_type, AnyTraits())}}; + {"output_type", ATTR_DESC(dtype, AnyTraits())}}; OUTPUT_MAP(ArgMaxD) = {{0, OUTPUT_DESC(y)}}; // ArgMinD INPUT_MAP(ArgMinD) = {{1, INPUT_DESC(x)}}; ATTR_MAP(ArgMinD) = {{"axis", ATTR_DESC(dimension, AnyTraits())}, - {"output_type", ATTR_DESC(output_type, AnyTraits())}}; + {"output_type", ATTR_DESC(dtype, AnyTraits())}}; OUTPUT_MAP(ArgMinD) = {{0, OUTPUT_DESC(y)}}; // ArgMaxWithValue @@ -634,14 +601,14 @@ ATTR_MAP(ArgMinWithValue) = {{"axis", ATTR_DESC(dimension, AnyTraits())}, OUTPUT_MAP(ArgMinWithValue) = {{0, OUTPUT_DESC(indice)}, {1, OUTPUT_DESC(values)}}; // ReduceAll -INPUT_MAP(ReduceAll) = {{1, INPUT_DESC(x)}, {2, INPUT_DESC(axis)}}; +INPUT_MAP(ReduceAll) = {{1, INPUT_DESC(x)}, {2, INPUT_DESC(axes)}}; ATTR_MAP(ReduceAll) = {{"keep_dims", ATTR_DESC(keep_dims, AnyTraits())}}; OUTPUT_MAP(ReduceAll) = {{0, OUTPUT_DESC(y)}}; // ReduceMeanD INPUT_MAP(ReduceMeanD) = {{1, INPUT_DESC(x)}}; INPUT_ATTR_MAP(ReduceMeanD) = { - {2, ATTR_DESC(axis, AnyTraits>(), AnyTraits>())}}; + {2, ATTR_DESC(axes, AnyTraits>(), AnyTraits>())}}; ATTR_MAP(ReduceMeanD) = {{"keep_dims", 
ATTR_DESC(keep_dims, AnyTraits())}}; OUTPUT_MAP(ReduceMeanD) = {{0, OUTPUT_DESC(y)}}; @@ -708,11 +675,12 @@ INPUT_MAP(BiasAddGrad) = {{1, INPUT_DESC(x)}}; ATTR_MAP(BiasAddGrad) = {{"data_format", ATTR_DESC(data_format, AnyTraits())}}; OUTPUT_MAP(BiasAddGrad) = {{0, OUTPUT_DESC(y)}}; -// maxpoolgrad +// MaxPoolGrad INPUT_MAP(MaxPoolGrad) = {{1, INPUT_DESC(x1)}, {2, INPUT_DESC(x2)}, {3, INPUT_DESC(grad)}}; ATTR_MAP(MaxPoolGrad) = {{"ksize", ATTR_DESC(ksize, AnyTraits(), AnyTraits>())}, {"strides", ATTR_DESC(strides, AnyTraits(), AnyTraits>())}, - {"padding", ATTR_DESC(padding, AnyTraits())}}; + {"padding", ATTR_DESC(padding, AnyTraits())}, + {"data_format", ATTR_DESC(data_format, AnyTraits())}}; OUTPUT_MAP(MaxPoolGrad) = {{0, OUTPUT_DESC(y)}}; // avgpoolgrad @@ -739,28 +707,34 @@ ATTR_MAP(Conv2D) = { {"stride", ATTR_DESC(strides, "pad", AnyTraits>())}, {"pad_list", ATTR_DESC(pads, AnyTraits>(), AnyTraits>())}, {"dilation", ATTR_DESC(dilations, "pad", AnyTraits>())}, + {"data_format", ATTR_DESC(data_format, AnyTraits())}, + {"group", ATTR_DESC(groups, AnyTraits())} }; OUTPUT_MAP(Conv2D) = {{0, OUTPUT_DESC(y)}}; // Conv2DBackpropInputD -INPUT_MAP(Conv2DBackpropInputD) = {{1, INPUT_DESC(out_backprop)}, {2, INPUT_DESC(filters)}}; +INPUT_MAP(Conv2DBackpropInputD) = {{1, INPUT_DESC(out_backprop)}, {2, INPUT_DESC(filter)}}; INPUT_ATTR_MAP(Conv2DBackpropInputD) = { - {3, ATTR_DESC(input_sizes, AnyTraits>(), AnyTraits>())}}; + {3, ATTR_DESC(input_size, AnyTraits>(), AnyTraits>())}}; ATTR_MAP(Conv2DBackpropInputD) = { {"pad_list", ATTR_DESC(pads, AnyTraits>(), AnyTraits>())}, - {"stride", ATTR_DESC(strides, "strides", AnyTraits>())}, + {"stride", ATTR_DESC(strides, "pad", AnyTraits>())}, {"dilation", ATTR_DESC(dilations, "pad", AnyTraits>())}, + {"data_format", ATTR_DESC(data_format, AnyTraits())}, + {"group", ATTR_DESC(groups, AnyTraits())} }; OUTPUT_MAP(Conv2DBackpropInputD) = {{0, OUTPUT_DESC(y)}}; // Conv2DBackpropFilterD INPUT_MAP(Conv2DBackpropFilterD) = {{1, 
INPUT_DESC(out_backprop)}, {2, INPUT_DESC(x)}}; INPUT_ATTR_MAP(Conv2DBackpropFilterD) = { - {3, ATTR_DESC(filter_sizes, AnyTraits>(), AnyTraits>())}}; + {3, ATTR_DESC(filter_size, AnyTraits>(), AnyTraits>())}}; ATTR_MAP(Conv2DBackpropFilterD) = { {"pad_list", ATTR_DESC(pads, AnyTraits>(), AnyTraits>())}, - {"stride", ATTR_DESC(strides, "strides", AnyTraits>())}, + {"stride", ATTR_DESC(strides, "pad", AnyTraits>())}, {"dilation", ATTR_DESC(dilations, "pad", AnyTraits>())}, + {"data_format", ATTR_DESC(data_format, AnyTraits())}, + {"group", ATTR_DESC(groups, AnyTraits())} }; OUTPUT_MAP(Conv2DBackpropFilterD) = {{0, OUTPUT_DESC(y)}}; @@ -798,8 +772,8 @@ OUTPUT_MAP(DepthwiseConv2DBackpropFilterD) = {{0, OUTPUT_DESC(filter_grad)}}; // MatMul INPUT_MAP(MatMul) = {{1, INPUT_DESC(x1)}, {2, INPUT_DESC(x2)}}; -ATTR_MAP(MatMul) = {{"transpose_a", ATTR_DESC(transpose_a, AnyTraits())}, - {"transpose_b", ATTR_DESC(transpose_b, AnyTraits())}}; +ATTR_MAP(MatMul) = {{"transpose_a", ATTR_DESC(transpose_x1, AnyTraits())}, + {"transpose_b", ATTR_DESC(transpose_x2, AnyTraits())}}; OUTPUT_MAP(MatMul) = {{0, OUTPUT_DESC(y)}}; // Merge @@ -846,10 +820,10 @@ ATTR_MAP(Sub) = EMPTY_ATTR_MAP; OUTPUT_MAP(Sub) = {{0, OUTPUT_DESC(y)}}; // SplitD -INPUT_MAP(SplitD) = {{1, INPUT_DESC(value)}}; +INPUT_MAP(SplitD) = {{1, INPUT_DESC(x)}}; ATTR_MAP(SplitD) = {{"axis", ATTR_DESC(split_dim, AnyTraits())}, {"output_num", ATTR_DESC(num_split, AnyTraits())}}; -DYN_OUTPUT_MAP(SplitD) = {{0, DYN_OUTPUT_DESC(output)}}; +DYN_OUTPUT_MAP(SplitD) = {{0, DYN_OUTPUT_DESC(y)}}; // Neg INPUT_MAP(Neg) = {{1, INPUT_DESC(x)}}; @@ -876,12 +850,12 @@ OUTPUT_MAP(Pack) = {{0, OUTPUT_DESC(y)}}; // ConcatD INPUT_MAP(ConcatD) = EMPTY_INPUT_MAP; -DYN_INPUT_MAP(ConcatD) = {{1, DYN_INPUT_DESC(input_values)}}; +DYN_INPUT_MAP(ConcatD) = {{1, DYN_INPUT_DESC(x)}}; ATTR_MAP(ConcatD) = { {"axis", ATTR_DESC(concat_dim, AnyTraits())}, {"inputNums", ATTR_DESC(N, AnyTraits())}, }; -OUTPUT_MAP(ConcatD) = {{0, OUTPUT_DESC(output_data)}}; 
+OUTPUT_MAP(ConcatD) = {{0, OUTPUT_DESC(y)}}; // Less INPUT_MAP(Less) = {{1, INPUT_DESC(x1)}, {2, INPUT_DESC(x2)}}; @@ -916,14 +890,14 @@ OUTPUT_MAP(TanhGrad) = {{0, OUTPUT_DESC(z)}}; // ReduceMinD INPUT_MAP(ReduceMinD) = {{1, INPUT_DESC(x)}}; INPUT_ATTR_MAP(ReduceMinD) = { - {2, ATTR_DESC(axis, AnyTraits>(), AnyTraits>())}}; + {2, ATTR_DESC(axes, AnyTraits>(), AnyTraits>())}}; ATTR_MAP(ReduceMinD) = {{"keep_dims", ATTR_DESC(keep_dims, AnyTraits())}}; OUTPUT_MAP(ReduceMinD) = {{0, OUTPUT_DESC(y)}}; // ReduceMaxD INPUT_MAP(ReduceMaxD) = {{1, INPUT_DESC(x)}}; INPUT_ATTR_MAP(ReduceMaxD) = { - {2, ATTR_DESC(axis, AnyTraits>(), AnyTraits>())}}; + {2, ATTR_DESC(axes, AnyTraits>(), AnyTraits>())}}; ATTR_MAP(ReduceMaxD) = {{"keep_dims", ATTR_DESC(keep_dims, AnyTraits())}}; OUTPUT_MAP(ReduceMaxD) = {{0, OUTPUT_DESC(y)}}; @@ -1008,11 +982,11 @@ INPUT_MAP(LessEqual) = {{1, INPUT_DESC(x1)}, {2, INPUT_DESC(x2)}}; ATTR_MAP(LessEqual) = EMPTY_ATTR_MAP; OUTPUT_MAP(LessEqual) = {{0, OUTPUT_DESC(y)}}; -// LogSoftmax -INPUT_MAP(LogSoftmax) = {{1, INPUT_DESC(logits)}}; -ATTR_MAP(LogSoftmax) = { - {"axis", ATTR_DESC(axis, AnyTraits>(), AnyTraits>())}}; -OUTPUT_MAP(LogSoftmax) = {{0, OUTPUT_DESC(logsoftmax)}}; +// LogSoftmaxV2 +INPUT_MAP(LogSoftmaxV2) = {{1, INPUT_DESC(logits)}}; +ATTR_MAP(LogSoftmaxV2) = { + {"axis", ATTR_DESC(axes, AnyTraits>(), AnyTraits>())}}; +OUTPUT_MAP(LogSoftmaxV2) = {{0, OUTPUT_DESC(logsoftmax)}}; // RandomChoiceWithMask INPUT_MAP(RandomChoiceWithMask) = {{1, INPUT_DESC(x)}}; @@ -1094,8 +1068,8 @@ OUTPUT_MAP(LayerNormGrad) = {{0, OUTPUT_DESC(pd_x)}, {1, OUTPUT_DESC(pd_gamma)}, // BatchMatMul INPUT_MAP(BatchMatMul) = {{1, INPUT_DESC(x1)}, {2, INPUT_DESC(x2)}}; -ATTR_MAP(BatchMatMul) = {{"transpose_x1", ATTR_DESC(adj_x, AnyTraits())}, - {"transpose_x2", ATTR_DESC(adj_y, AnyTraits())}}; +ATTR_MAP(BatchMatMul) = {{"transpose_x1", ATTR_DESC(adj_x1, AnyTraits())}, + {"transpose_x2", ATTR_DESC(adj_x2, AnyTraits())}}; OUTPUT_MAP(BatchMatMul) = {{0, OUTPUT_DESC(y)}}; // 
DropoutDoMask @@ -1146,6 +1120,19 @@ ATTR_MAP(SparseApplyAdagradD) = {{"lr", ATTR_DESC(lr, AnyTraits())}, {"use_locking", ATTR_DESC(use_locking, AnyTraits())}}; OUTPUT_MAP(SparseApplyAdagradD) = {{0, OUTPUT_DESC(var)}}; +// SparseApplyFtrlD +INPUT_MAP(SparseApplyFtrlD) = {{1, INPUT_DESC(var)}, + {2, INPUT_DESC(accum)}, + {3, INPUT_DESC(linear)}, + {4, INPUT_DESC(grad)}, + {5, INPUT_DESC(indices)}}; +ATTR_MAP(SparseApplyFtrlD) = {{"use_locking", ATTR_DESC(use_locking, AnyTraits())}, + {"lr", ATTR_DESC(lr, AnyTraits())}, + {"l1", ATTR_DESC(l1, AnyTraits())}, + {"l2", ATTR_DESC(l2, AnyTraits())}, + {"lr_power", ATTR_DESC(lr_power, AnyTraits())}}; +OUTPUT_MAP(SparseApplyFtrlD) = {{0, OUTPUT_DESC(var)}}; + // SpaceToDepth INPUT_MAP(SpaceToDepth) = {{1, INPUT_DESC(x)}}; ATTR_MAP(SpaceToDepth) = {{"block_size", ATTR_DESC(block_size, AnyTraits())}}; diff --git a/mindspore/ccsrc/transform/op_declare.h b/mindspore/ccsrc/transform/op_declare.h index 03463b978f..9e4f407ebb 100755 --- a/mindspore/ccsrc/transform/op_declare.h +++ b/mindspore/ccsrc/transform/op_declare.h @@ -209,8 +209,8 @@ DECLARE_OP_USE_OUTPUT(Merge) DECLARE_OP_ADAPTER(Switch) DECLARE_OP_USE_OUTPUT(Switch) -DECLARE_OP_ADAPTER(TopKV2) -DECLARE_OP_USE_OUTPUT(TopKV2) +DECLARE_OP_ADAPTER(TopK) +DECLARE_OP_USE_OUTPUT(TopK) DECLARE_OP_ADAPTER(RealDiv) DECLARE_OP_USE_OUTPUT(RealDiv) @@ -260,8 +260,8 @@ DECLARE_OP_ADAPTER(Select) DECLARE_OP_USE_OUTPUT(Select) DECLARE_OP_ADAPTER(LessEqual) DECLARE_OP_USE_OUTPUT(LessEqual) -DECLARE_OP_ADAPTER(LogSoftmax) -DECLARE_OP_USE_OUTPUT(LogSoftmax) +DECLARE_OP_ADAPTER(LogSoftmaxV2) +DECLARE_OP_USE_OUTPUT(LogSoftmaxV2) DECLARE_OP_ADAPTER(TruncatedNormal) DECLARE_OP_USE_OUTPUT(TruncatedNormal) DECLARE_OP_ADAPTER(StridedSliceGrad) @@ -391,8 +391,8 @@ DECLARE_OP_ADAPTER(Sigmoid) DECLARE_OP_USE_OUTPUT(Sigmoid) DECLARE_OP_ADAPTER(SigmoidGrad) DECLARE_OP_USE_OUTPUT(SigmoidGrad) -DECLARE_OP_ADAPTER(Softmax) -DECLARE_OP_USE_OUTPUT(Softmax) +DECLARE_OP_ADAPTER(SoftmaxV2) 
+DECLARE_OP_USE_OUTPUT(SoftmaxV2) DECLARE_OP_ADAPTER(SoftmaxGrad) DECLARE_OP_USE_OUTPUT(SoftmaxGrad) DECLARE_OP_ADAPTER(Greater) @@ -435,6 +435,8 @@ DECLARE_OP_ADAPTER(Round) DECLARE_OP_USE_OUTPUT(Round) DECLARE_OP_ADAPTER(ApplyFtrl) DECLARE_OP_USE_OUTPUT(ApplyFtrl) +DECLARE_OP_ADAPTER(SparseApplyFtrlD) +DECLARE_OP_USE_OUTPUT(SparseApplyFtrlD) #ifdef ENABLE_GE DECLARE_OP_ADAPTER(Print) DECLARE_OP_USE_DYN_INPUT(Print) diff --git a/mindspore/ccsrc/transform/util.cc b/mindspore/ccsrc/transform/util.cc index a106a20ad8..0a18763d12 100644 --- a/mindspore/ccsrc/transform/util.cc +++ b/mindspore/ccsrc/transform/util.cc @@ -361,12 +361,11 @@ MeTensorPtr TransformUtil::GenerateMeTensor(const GeTensorPtr& ge_tensor, const MS_LOG(ERROR) << "GE tensor data size is zero!"; return nullptr; } - errno_t ret = memcpy_s(me_data_ptr, me_data_size, ge_tensor->GetData(), ge_tensor->GetSize()); - if (ret != EOK) { - MS_LOG(INFO) << "GE tensor data size is " << ge_tensor->GetSize() << " bytes"; - MS_LOG(ERROR) << "Copy GE tensor data to me tensor failed"; - return nullptr; - } + + // Use memcpy here, not memcpy_s, just because the size of ge_tensor may be bigger than 2GB + // which is the size limit of memcpy_s + memcpy(me_data_ptr, ge_tensor->GetData(), ge_tensor->GetSize()); + return make_shared(me_tensor); } diff --git a/mindspore/ccsrc/utils/context/ms_context.cc b/mindspore/ccsrc/utils/context/ms_context.cc index bf05af9858..e9b4586b21 100644 --- a/mindspore/ccsrc/utils/context/ms_context.cc +++ b/mindspore/ccsrc/utils/context/ms_context.cc @@ -355,7 +355,9 @@ void MsContext::GetGeOptions(std::map* ge_options) con MS_LOG(ERROR) << "Set proto lib path failed!"; } - // Disbale the global variable acc, only enable it whlie adding training graph in pipeline + // Enable auto mixed precision according to the context options + (*ge_options)["ge.exec.auto_mix_precision"] = std::to_string(auto_mixed_precision_flag_); + // Disable the global variable acc, only enable it whlie adding training 
graph in pipeline (*ge_options)["ge.exec.variable_acc"] = "0"; #endif } diff --git a/mindspore/ops/operations/__init__.py b/mindspore/ops/operations/__init__.py index a75b078df8..77bb6d0ff3 100644 --- a/mindspore/ops/operations/__init__.py +++ b/mindspore/ops/operations/__init__.py @@ -65,7 +65,7 @@ from .nn_ops import (LSTM, SGD, Adam, ApplyMomentum, BatchNorm, SmoothL1Loss, Softmax, SoftmaxCrossEntropyWithLogits, ROIAlign, SparseSoftmaxCrossEntropyWithLogits, Tanh, - TopK, BinaryCrossEntropy, SparseApplyAdagrad, LARSUpdate, ApplyFtrl) + TopK, BinaryCrossEntropy, SparseApplyAdagrad, LARSUpdate, ApplyFtrl, SparseApplyFtrlD) from .other_ops import Assign, IOU, BoundingBoxDecode, BoundingBoxEncode, CheckValid, MakeRefKey @@ -217,6 +217,7 @@ __all__ = [ "Abs", "BinaryCrossEntropy", "SparseApplyAdagrad", + "SparseApplyFtrlD", "SpaceToDepth", "DepthToSpace", "Conv2DBackpropInput", diff --git a/mindspore/ops/operations/nn_ops.py b/mindspore/ops/operations/nn_ops.py index afa4c7dfe3..57e409b44f 100644 --- a/mindspore/ops/operations/nn_ops.py +++ b/mindspore/ops/operations/nn_ops.py @@ -2141,6 +2141,79 @@ class SparseApplyAdagrad(PrimitiveWithInfer): return var_type +class SparseApplyFtrlD(PrimitiveWithInfer): + r""" + Conduct experiment on updating on parameters related to FTRL optimization algorithm. + + .. math :: + \text{accum} = \text{grad} * \text{grad} + + .. math :: + \text{linear} += \text{grad} + (\text{accum} ^ {\text{-lr_power}} - + \frac{\text{accum} ^ \text{-lr_power}}{\text{lr}} * \text{var}) + + .. math :: + \text{quadratic} = {\text{1.0}/({\text{accum}^\text{lr_power} * \text{lr}}) + 2*\text{l2} + + .. math :: + \text{var} = {\text{sign}({linear}) * \text{l1} - \text{linear}})/{ quadratic } + if \vert linear \vert > l1 \ else \ 0.0 + + Args: + lr (float): Learning rate. + l1 (float): temp value NO.1. + l2 (float): temp value No.2. + lr_power (float): temp value used as power number. 
+ use_locking (bool): If true, updating the var and accum tensors will be protected. Default: False. + + Inputs: + - **var** (Tensor) - Variable to be update. The type must be float32. + - **accum** (Tensor) - Accum to be update. The shape must be the same as `var`'s shape, + the type must be float32. + - **linear** (Tensor) - Linear to be update. The shape must be the same as `var`'s shape, + the type must be float32. + - **grad** (Tensor) - Gradient. The shape must be the same as `var`'s shape, + the type must be float32. + - **indices** (Tensor) - A vector of indices into the first dimension of 'var' and 'accum', + the shape of `indices` must be the same as `grad` in first dimension, the type must be int32. + + Output: + Tensors, has the same shape and type as `var`. + + """ + + @prim_attr_register + def __init__(self, lr, l1, l2, lr_power, use_locking=False): + """init SparseApplyFtrlD""" + self.lr = validator.check_type("lr", lr, [float]) + self.l1 = validator.check_type("l1", l1, [float]) + self.l2 = validator.check_type("l2", l2, [float]) + self.lr_power = validator.check_type("lr_power", lr_power, [float]) + self.use_locking = validator.check_type("use_locking", use_locking, [bool]) + + def infer_shape(self, var_shape, accum_shape, linear_shape, grad_shape, indices_shape): + validator.check_param_equal('var shape', var_shape, 'accum shape', accum_shape) + validator.check_param_equal('len of var shape', len(var_shape), 'len of grad shape', len(grad_shape)) + validator.check_param_equal('len of var shape', len(var_shape), 'len of linear shape', len(linear_shape)) + if len(var_shape) > 1: + validator.check_param_equal('var_shape', var_shape[1:], 'grad_shape', grad_shape[1:]) + validator.check_param_equal('var_shape', var_shape[1:], 'linear_shape', linear_shape[1:]) + validator.check_integer("len of indices shape", len(indices_shape), 1, Rel.EQ) + validator.check('the first dimension of grad', grad_shape[0], + 'the shape of indices', indices_shape[0], Rel.EQ) + 
+ return var_shape + + def infer_dtype(self, var_type, accum_type, linear_type, grad_type, indices_type): + validator.check_subclass("var_type", var_type, mstype.tensor) + validator.check_subclass("accum_type", accum_type, mstype.tensor) + validator.check_subclass("linear_type", linear_type, mstype.tensor) + validator.check_subclass("grad_type", grad_type, mstype.tensor) + validator.check_subclass("indices_type", indices_type, mstype.tensor) + + return var_type + + class LARSUpdate(PrimitiveWithInfer): """ Conduct lars (layer-wise adaptive rate scaling) update on the square sum of gradient. @@ -2244,4 +2317,4 @@ class ApplyFtrl(PrimitiveWithInfer): validator.check_typename("l1", l1_type,[mstype.float16, mstype.float32]) validator.check_typename("l2", l2_type,[mstype.float16, mstype.float32]) validator.check_typename("lr_power", lr_power_type,[mstype.float16, mstype.float32]) - return var_type \ No newline at end of file + return var_type diff --git a/tests/ut/python/ops/test_ops.py b/tests/ut/python/ops/test_ops.py index bfe8075972..8d7dd95072 100755 --- a/tests/ut/python/ops/test_ops.py +++ b/tests/ut/python/ops/test_ops.py @@ -749,6 +749,11 @@ test_case_nn_ops = [ 'desc_inputs': [[3, 3], [3, 3], [3, 3], Tensor(np.ones((3,), np.int32))], 'desc_bprop': [3, 3], 'skip': ['backward']}), + ('SparseApplyFtrlD', { + 'block': P.SparseApplyFtrlD(0.1, 0.1, 0.1, -0.1), + 'desc_inputs': [[3, 3], [3, 3], [3, 3], [3, 3], Tensor(2*np.ones((3,), np.int32))], + 'desc_bprop': [3, 3], + 'skip': ['backward']}), ('Flatten_1', { 'block': NetForFlatten(), 'desc_inputs': [Tensor(np.ones([2, 3, 4]).astype(np.int32)), Tensor(np.ones([2, 12]).astype(np.int32))], From 0a977aa19dc216b43ddc8adc11490bb00d001c3e Mon Sep 17 00:00:00 2001 From: simson <526422051@qq.com> Date: Wed, 6 May 2020 18:48:30 +0800 Subject: [PATCH 04/13] revert the limitation of end learning rate --- mindspore/nn/optim/lamb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mindspore/nn/optim/lamb.py 
b/mindspore/nn/optim/lamb.py index e026b1c560..97a81a590b 100755 --- a/mindspore/nn/optim/lamb.py +++ b/mindspore/nn/optim/lamb.py @@ -114,7 +114,7 @@ def _check_param_value(decay_steps, warmup_steps, start_learning_rate, _ = warmup_steps validator.check_float_positive('start_learning_rate', start_learning_rate, prim_name) validator.check_float_legal_value('start_learning_rate', start_learning_rate, prim_name) - validator.check_float_positive('end_learning_rate', end_learning_rate, prim_name) + validator.check_value_type("end_learning_rate", end_learning_rate, [float], prim_name) validator.check_float_legal_value('end_learning_rate', end_learning_rate, prim_name) validator.check_float_positive('power', power, prim_name) validator.check_float_legal_value('power', power, prim_name) From 187568a833b1c7478c7526058588f2b9e755b76a Mon Sep 17 00:00:00 2001 From: zhaozhenlong Date: Wed, 6 May 2020 21:20:32 +0800 Subject: [PATCH 05/13] adapt assign assignAdd relu6 adapt ResizeNearestNeighbourV2 with grad and ApplyAdam --- mindspore/ccsrc/kernel/tbe/tbe_adapter.cc | 6 ++--- mindspore/ops/_op_impl/tbe/assign.py | 30 ++++++++++++++++++++--- mindspore/ops/_op_impl/tbe/assign_add.py | 12 +++++++++ mindspore/ops/_op_impl/tbe/relu6.py | 4 +-- 4 files changed, 43 insertions(+), 9 deletions(-) diff --git a/mindspore/ccsrc/kernel/tbe/tbe_adapter.cc b/mindspore/ccsrc/kernel/tbe/tbe_adapter.cc index 8ce5504b8e..005c290aba 100644 --- a/mindspore/ccsrc/kernel/tbe/tbe_adapter.cc +++ b/mindspore/ccsrc/kernel/tbe/tbe_adapter.cc @@ -72,10 +72,10 @@ static std::map tbe_func_adapter_map = { {"lamb_next_mv_with_decay_v1", "lamb_next_m_v_with_decay_v1"}, {"lamb_next_mv", "lamb_next_m_v"}, {"split", "split_d"}, - {"resize_nearest_neighbor", "resize_nearest_neighbor_d"}, - {"resize_nearest_neighbor_grad", "resize_nearest_neighbor_grad_d"}, + {"resize_nearest_neighbor", "resize_nearest_neighbor_v2_d"}, + {"resize_nearest_neighbor_grad", "resize_nearest_neighbor_v2_grad_d"}, {"pad", "pad_d"}, - 
{"adam", "apply_adam"}}; + {"adam", "apply_adam_d"}}; void TbeAdapter::NormalizeFuncName(std::string *func_name) { if (func_name == nullptr) { diff --git a/mindspore/ops/_op_impl/tbe/assign.py b/mindspore/ops/_op_impl/tbe/assign.py index 2fbd152c78..ff673a03c4 100644 --- a/mindspore/ops/_op_impl/tbe/assign.py +++ b/mindspore/ops/_op_impl/tbe/assign.py @@ -23,31 +23,53 @@ assign_op_info = TBERegOp("Assign") \ .compute_cost(10) \ .kernel_name("assign") \ .partial_flag(True) \ - .input(0, "resource", False, "required", "all") \ + .input(0, "ref", False, "required", "all") \ .input(1, "value", False, "required", "all") \ - .output(0, "y", False, "required", "all") \ - .dtype_format(DataType.I8_Default, DataType.I8_Default, DataType.I8_Default) \ + .output(0, "ref", False, "required", "all") \ .dtype_format(DataType.BOOL_Default, DataType.BOOL_Default, DataType.BOOL_Default) \ + .dtype_format(DataType.BOOL_5HD, DataType.BOOL_5HD, DataType.BOOL_5HD) \ + .dtype_format(DataType.BOOL_C1HWNCoC0, DataType.BOOL_C1HWNCoC0, DataType.BOOL_C1HWNCoC0) \ + .dtype_format(DataType.BOOL_FracZ, DataType.BOOL_FracZ, DataType.BOOL_FracZ) \ + .dtype_format(DataType.I8_Default, DataType.I8_Default, DataType.I8_Default) \ .dtype_format(DataType.I8_5HD, DataType.I8_5HD, DataType.I8_5HD) \ + .dtype_format(DataType.I8_C1HWNCoC0, DataType.I8_C1HWNCoC0, DataType.I8_C1HWNCoC0) \ + .dtype_format(DataType.I8_FracZ, DataType.I8_FracZ, DataType.I8_FracZ) \ .dtype_format(DataType.U8_Default, DataType.U8_Default, DataType.U8_Default) \ .dtype_format(DataType.U8_5HD, DataType.U8_5HD, DataType.U8_5HD) \ + .dtype_format(DataType.U8_C1HWNCoC0, DataType.U8_C1HWNCoC0, DataType.U8_C1HWNCoC0) \ + .dtype_format(DataType.U8_FracZ, DataType.U8_FracZ, DataType.U8_FracZ) \ .dtype_format(DataType.I16_Default, DataType.I16_Default, DataType.I16_Default) \ .dtype_format(DataType.I16_5HD, DataType.I16_5HD, DataType.I16_5HD) \ + .dtype_format(DataType.I16_C1HWNCoC0, DataType.I16_C1HWNCoC0, DataType.I16_C1HWNCoC0) \ + 
.dtype_format(DataType.I16_FracZ, DataType.I16_FracZ, DataType.I16_FracZ) \ .dtype_format(DataType.U16_Default, DataType.U16_Default, DataType.U16_Default) \ .dtype_format(DataType.U16_5HD, DataType.U16_5HD, DataType.U16_5HD) \ + .dtype_format(DataType.U16_C1HWNCoC0, DataType.U16_C1HWNCoC0, DataType.U16_C1HWNCoC0) \ + .dtype_format(DataType.U16_FracZ, DataType.U16_FracZ, DataType.U16_FracZ) \ .dtype_format(DataType.I32_Default, DataType.I32_Default, DataType.I32_Default) \ .dtype_format(DataType.I32_5HD, DataType.I32_5HD, DataType.I32_5HD) \ + .dtype_format(DataType.I32_C1HWNCoC0, DataType.I32_C1HWNCoC0, DataType.I32_C1HWNCoC0) \ + .dtype_format(DataType.I32_FracZ, DataType.I32_FracZ, DataType.I32_FracZ) \ .dtype_format(DataType.U32_Default, DataType.U32_Default, DataType.U32_Default) \ .dtype_format(DataType.U32_5HD, DataType.U32_5HD, DataType.U32_5HD) \ + .dtype_format(DataType.U32_C1HWNCoC0, DataType.U32_C1HWNCoC0, DataType.U32_C1HWNCoC0) \ + .dtype_format(DataType.U32_FracZ, DataType.U32_FracZ, DataType.U32_FracZ) \ .dtype_format(DataType.I64_Default, DataType.I64_Default, DataType.I64_Default) \ .dtype_format(DataType.I64_5HD, DataType.I64_5HD, DataType.I64_5HD) \ + .dtype_format(DataType.I64_C1HWNCoC0, DataType.I64_C1HWNCoC0, DataType.I64_C1HWNCoC0) \ + .dtype_format(DataType.I64_FracZ, DataType.I64_FracZ, DataType.I64_FracZ) \ .dtype_format(DataType.U64_Default, DataType.U64_Default, DataType.U64_Default) \ .dtype_format(DataType.U64_5HD, DataType.U64_5HD, DataType.U64_5HD) \ + .dtype_format(DataType.U64_C1HWNCoC0, DataType.U64_C1HWNCoC0, DataType.U64_C1HWNCoC0) \ + .dtype_format(DataType.U64_FracZ, DataType.U64_FracZ, DataType.U64_FracZ) \ .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.F16_Default) \ .dtype_format(DataType.F16_5HD, DataType.F16_5HD, DataType.F16_5HD) \ + .dtype_format(DataType.F16_C1HWNCoC0, DataType.F16_C1HWNCoC0, DataType.F16_C1HWNCoC0) \ + .dtype_format(DataType.F16_FracZ, DataType.F16_FracZ, DataType.F16_FracZ) \ 
.dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default) \ .dtype_format(DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD) \ - .dtype_format(DataType.F32_FracNZ, DataType.F32_FracNZ, DataType.F32_FracNZ) \ + .dtype_format(DataType.F32_C1HWNCoC0, DataType.F32_C1HWNCoC0, DataType.F32_C1HWNCoC0) \ + .dtype_format(DataType.F32_FracZ, DataType.F32_FracZ, DataType.F32_FracZ) \ .get_op_info() diff --git a/mindspore/ops/_op_impl/tbe/assign_add.py b/mindspore/ops/_op_impl/tbe/assign_add.py index 2b20a7781d..7ad23ff3bc 100644 --- a/mindspore/ops/_op_impl/tbe/assign_add.py +++ b/mindspore/ops/_op_impl/tbe/assign_add.py @@ -28,16 +28,28 @@ assign_add_op_info = TBERegOp("AssignAdd") \ .output(0, "ref", False, "required", "all") \ .dtype_format(DataType.I8_Default, DataType.I8_Default, DataType.I8_Default) \ .dtype_format(DataType.I8_5HD, DataType.I8_5HD, DataType.I8_5HD) \ + .dtype_format(DataType.I8_C1HWNCoC0, DataType.I8_C1HWNCoC0, DataType.I8_C1HWNCoC0) \ + .dtype_format(DataType.I8_FracZ, DataType.I8_FracZ, DataType.I8_FracZ) \ .dtype_format(DataType.U8_Default, DataType.U8_Default, DataType.U8_Default) \ .dtype_format(DataType.U8_5HD, DataType.U8_5HD, DataType.U8_5HD) \ + .dtype_format(DataType.U8_C1HWNCoC0, DataType.U8_C1HWNCoC0, DataType.U8_C1HWNCoC0) \ + .dtype_format(DataType.U8_FracZ, DataType.U8_FracZ, DataType.U8_FracZ) \ .dtype_format(DataType.I32_Default, DataType.I32_Default, DataType.I32_Default) \ .dtype_format(DataType.I32_5HD, DataType.I32_5HD, DataType.I32_5HD) \ + .dtype_format(DataType.I32_C1HWNCoC0, DataType.I32_C1HWNCoC0, DataType.I32_C1HWNCoC0) \ + .dtype_format(DataType.I32_FracZ, DataType.I32_FracZ, DataType.I32_FracZ) \ .dtype_format(DataType.I64_Default, DataType.I64_Default, DataType.I64_Default) \ .dtype_format(DataType.I64_5HD, DataType.I64_5HD, DataType.I64_5HD) \ + .dtype_format(DataType.I64_C1HWNCoC0, DataType.I64_C1HWNCoC0, DataType.I64_C1HWNCoC0) \ + .dtype_format(DataType.I64_FracZ, DataType.I64_FracZ, 
DataType.I64_FracZ) \ .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.F16_Default) \ .dtype_format(DataType.F16_5HD, DataType.F16_5HD, DataType.F16_5HD) \ + .dtype_format(DataType.F16_C1HWNCoC0, DataType.F16_C1HWNCoC0, DataType.F16_C1HWNCoC0) \ + .dtype_format(DataType.F16_FracZ, DataType.F16_FracZ, DataType.F16_FracZ) \ .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default) \ .dtype_format(DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD) \ + .dtype_format(DataType.F32_C1HWNCoC0, DataType.F32_C1HWNCoC0, DataType.F32_C1HWNCoC0) \ + .dtype_format(DataType.F32_FracZ, DataType.F32_FracZ, DataType.F32_FracZ) \ .get_op_info() diff --git a/mindspore/ops/_op_impl/tbe/relu6.py b/mindspore/ops/_op_impl/tbe/relu6.py index bbedfdeb0f..d9bd7f9f8e 100644 --- a/mindspore/ops/_op_impl/tbe/relu6.py +++ b/mindspore/ops/_op_impl/tbe/relu6.py @@ -23,8 +23,8 @@ relu6_op_info = TBERegOp("ReLU6") \ .compute_cost(10) \ .kernel_name("relu6") \ .partial_flag(True) \ - .input(0, "features", False, "required", "all") \ - .output(0, "activations", False, "required", "all") \ + .input(0, "x", False, "required", "all") \ + .output(0, "y", False, "required", "all") \ .dtype_format(DataType.F16_Default, DataType.F16_Default) \ .dtype_format(DataType.F16_5HD, DataType.F16_5HD) \ .dtype_format(DataType.F32_Default, DataType.F32_Default) \ From 263d82edc5660ca24d370cf7e575b69f64f49f2e Mon Sep 17 00:00:00 2001 From: zhoufeng Date: Thu, 7 May 2020 14:46:03 +0800 Subject: [PATCH 06/13] me-ge link hccl Signed-off-by: zhoufeng --- mindspore/ccsrc/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mindspore/ccsrc/CMakeLists.txt b/mindspore/ccsrc/CMakeLists.txt index 4c6ceb38e1..8d3818a777 100644 --- a/mindspore/ccsrc/CMakeLists.txt +++ b/mindspore/ccsrc/CMakeLists.txt @@ -125,7 +125,7 @@ endif() if (ENABLE_GE) if(ENABLE_TRAIN) - target_link_libraries(mindspore ge_client_train) + target_link_libraries(mindspore ge_client_train 
hccl) else () target_link_libraries(mindspore ge_client) endif () From e97d33f7720229a0504fe1fdb206e93c01e67f70 Mon Sep 17 00:00:00 2001 From: liuxiao Date: Wed, 6 May 2020 19:36:00 +0800 Subject: [PATCH 07/13] add ops for VM --- mindspore/ops/_grad/grad_nn_ops.py | 3 +- mindspore/ops/_op_impl/tbe/__init__.py | 2 ++ mindspore/ops/_op_impl/tbe/elu.py | 40 ++++++++++++++++++++++++ mindspore/ops/_op_impl/tbe/elu_grad.py | 43 ++++++++++++++++++++++++++ mindspore/ops/operations/nn_ops.py | 3 +- tests/ut/python/ops/test_ops.py | 2 +- 6 files changed, 89 insertions(+), 4 deletions(-) create mode 100644 mindspore/ops/_op_impl/tbe/elu.py create mode 100644 mindspore/ops/_op_impl/tbe/elu_grad.py diff --git a/mindspore/ops/_grad/grad_nn_ops.py b/mindspore/ops/_grad/grad_nn_ops.py index 153abc0fb6..362bda7368 100755 --- a/mindspore/ops/_grad/grad_nn_ops.py +++ b/mindspore/ops/_grad/grad_nn_ops.py @@ -600,7 +600,6 @@ def get_bprop_roi_align(self): sample_num = self.sample_num def bprop(inputs, rois, out, dout): - rois_shape = shape_op(rois) inputs_shape = shape_op(inputs) dx = G.ROIAlignGrad(inputs_shape, pooled_height, @@ -608,7 +607,7 @@ def get_bprop_roi_align(self): spatial_scale, sample_num, )(dout, rois) - return dx, zeros_like(rois_shape) + return dx, zeros_like(rois) return bprop diff --git a/mindspore/ops/_op_impl/tbe/__init__.py b/mindspore/ops/_op_impl/tbe/__init__.py index 9dbe53049b..c6a08e8ff4 100644 --- a/mindspore/ops/_op_impl/tbe/__init__.py +++ b/mindspore/ops/_op_impl/tbe/__init__.py @@ -73,6 +73,8 @@ from .strideslice_d import _strided_slice_d_tbe from .strideslicegrad_d import _strided_slice_grad_d_tbe from .split_d import _split_d_tbe from .exp import _exp_tbe +from .elu import _elu_tbe +from .elu_grad import _elu_grad_tbe from .div import _div_tbe from .log import _log_tbe from .floor_div import _floor_div_tbe diff --git a/mindspore/ops/_op_impl/tbe/elu.py b/mindspore/ops/_op_impl/tbe/elu.py new file mode 100644 index 0000000000..9125d14727 --- /dev/null 
+++ b/mindspore/ops/_op_impl/tbe/elu.py @@ -0,0 +1,40 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Elu op""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +elu_op_info = TBERegOp("Elu") \ + .fusion_type("ELEMWISE") \ + .async_flag(False) \ + .binfile_name("elu.so") \ + .compute_cost(10) \ + .kernel_name("elu") \ + .partial_flag(True) \ + .op_pattern("formatAgnostic") \ + .attr("alpha", "optional", "float", "all", "1.0") \ + .input(0, "x", False, "required", "all") \ + .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.F16_Default, DataType.F16_Default) \ + .dtype_format(DataType.F16_5HD, DataType.F16_5HD) \ + .dtype_format(DataType.F32_Default, DataType.F32_Default) \ + .dtype_format(DataType.F32_5HD, DataType.F32_5HD) \ + .get_op_info() + + +@op_info_register(elu_op_info) +def _elu_tbe(): + """Elu TBE register""" + return diff --git a/mindspore/ops/_op_impl/tbe/elu_grad.py b/mindspore/ops/_op_impl/tbe/elu_grad.py new file mode 100644 index 0000000000..c3486dd024 --- /dev/null +++ b/mindspore/ops/_op_impl/tbe/elu_grad.py @@ -0,0 +1,43 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""EluGrad op""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +elu_grad_op_info = TBERegOp("EluGrad") \ + .fusion_type("ELEMWISE") \ + .async_flag(False) \ + .binfile_name("elu_grad.so") \ + .compute_cost(10) \ + .kernel_name("elu_grad") \ + .partial_flag(True) \ + .input(0, "grads", False, "required", "all") \ + .input(1, "activations", False, "required", "all") \ + .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.F16_5HD, DataType.F16_5HD, DataType.F16_5HD) \ + .dtype_format(DataType.F16_FracZ, DataType.F16_FracZ, DataType.F16_FracZ) \ + .dtype_format(DataType.F16_C1HWNCoC0, DataType.F16_C1HWNCoC0, DataType.F16_C1HWNCoC0) \ + .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.F16_Default) \ + .dtype_format(DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD) \ + .dtype_format(DataType.F32_FracZ, DataType.F32_FracZ, DataType.F32_FracZ) \ + .dtype_format(DataType.F32_C1HWNCoC0, DataType.F32_C1HWNCoC0, DataType.F32_C1HWNCoC0) \ + .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default) \ + .get_op_info() + + +@op_info_register(elu_grad_op_info) +def _elu_grad_tbe(): + """EluGrad TBE register""" + return diff --git a/mindspore/ops/operations/nn_ops.py b/mindspore/ops/operations/nn_ops.py index 2a2dbe08a8..7ba341fd56 100644 --- a/mindspore/ops/operations/nn_ops.py +++ b/mindspore/ops/operations/nn_ops.py @@ -1527,7 +1527,8 @@ class L2Loss(PrimitiveWithInfer): def infer_dtype(self, 
x_type): validator.check_subclass("x_type", x_type, mstype.tensor, self.name) - validator.check_tensor_type_same({'x_type': x_type}, [mstype.double, mstype.float_, mstype.float16], self.name) + valid_types = [mstype.float16, mstype.float32, mstype.double] + validator.check_tensor_type_same({'x_type': x_type}, valid_types, self.name) return x_type diff --git a/tests/ut/python/ops/test_ops.py b/tests/ut/python/ops/test_ops.py index 9d7e8c898a..7a3d7d967f 100755 --- a/tests/ut/python/ops/test_ops.py +++ b/tests/ut/python/ops/test_ops.py @@ -874,7 +874,7 @@ test_case_nn_ops = [ 'skip': ['backward']}), ('L2Loss_1', { 'block': P.L2Loss(), - 'desc_inputs': [Tensor(np.array([1, 2, 3, 4]), mstype.float16)], + 'desc_inputs': [Tensor(np.array([1, 2, 3, 4]), mstype.float32)], 'desc_bprop': []}), ('L2Loss_2', { 'block': P.L2Loss(), From d6520f499650782f7717aefef9a429b8e7489828 Mon Sep 17 00:00:00 2001 From: gengdongjie Date: Thu, 7 May 2020 23:44:23 +0800 Subject: [PATCH 08/13] add mix precision option --- mindspore/ccsrc/utils/context/ms_context.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mindspore/ccsrc/utils/context/ms_context.cc b/mindspore/ccsrc/utils/context/ms_context.cc index 6da1de9cdb..d728feae82 100644 --- a/mindspore/ccsrc/utils/context/ms_context.cc +++ b/mindspore/ccsrc/utils/context/ms_context.cc @@ -359,7 +359,11 @@ void MsContext::GetGeOptions(std::map *ge_options) con } // Enable auto mixed precision according to the context options - (*ge_options)["ge.exec.auto_mix_precision"] = std::to_string(auto_mixed_precision_flag_); + if (auto_mixed_precision_flag_) { + (*ge_options)["ge.exec.precision_mode"] = "allow_mix_precision"; + } else { + (*ge_options)["ge.exec.precision_mode"] = "must_keep_origin_dtype"; + } // Disable the global variable acc, only enable it whlie adding training graph in pipeline (*ge_options)["ge.exec.variable_acc"] = "0"; #endif From 460a1e25c82131d70a5d2bb076f262645e504843 Mon Sep 17 00:00:00 2001 From: 
gengdongjie Date: Fri, 8 May 2020 19:28:31 +0800 Subject: [PATCH 09/13] reset auto mix precision default off option --- mindspore/ccsrc/utils/context/ms_context.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mindspore/ccsrc/utils/context/ms_context.cc b/mindspore/ccsrc/utils/context/ms_context.cc index d728feae82..b8b4b3d8a1 100644 --- a/mindspore/ccsrc/utils/context/ms_context.cc +++ b/mindspore/ccsrc/utils/context/ms_context.cc @@ -362,7 +362,7 @@ void MsContext::GetGeOptions(std::map *ge_options) con if (auto_mixed_precision_flag_) { (*ge_options)["ge.exec.precision_mode"] = "allow_mix_precision"; } else { - (*ge_options)["ge.exec.precision_mode"] = "must_keep_origin_dtype"; + (*ge_options)["ge.exec.precision_mode"] = "allow_fp32_to_fp16"; } // Disable the global variable acc, only enable it whlie adding training graph in pipeline (*ge_options)["ge.exec.variable_acc"] = "0"; From 0b8cea801862a7fe7dae41d61bee6a9e94bb60ad Mon Sep 17 00:00:00 2001 From: guohongzilong <2713219276@qq.com> Date: Thu, 23 Apr 2020 17:39:24 +0800 Subject: [PATCH 10/13] learning rate and weight decay support group mode --- mindspore/nn/optim/adam.py | 65 ++++-- mindspore/nn/optim/ftrl.py | 3 +- mindspore/nn/optim/lamb.py | 2 + mindspore/nn/optim/momentum.py | 50 +++- mindspore/nn/optim/optimizer.py | 215 ++++++++++++++---- mindspore/nn/optim/rmsprop.py | 71 ++++-- mindspore/nn/optim/sgd.py | 45 +++- mindspore/nn/wrap/cell_wrapper.py | 2 +- tests/ut/python/nn/optim/test_adam.py | 4 +- tests/ut/python/nn/optim/test_optimizer.py | 8 +- .../test_optimize_with_parameter_groups.py | 210 +++++++++++++++++ 11 files changed, 570 insertions(+), 105 deletions(-) create mode 100644 tests/ut/python/optimizer/test_optimize_with_parameter_groups.py diff --git a/mindspore/nn/optim/adam.py b/mindspore/nn/optim/adam.py index 1a386556d9..9893a81923 100755 --- a/mindspore/nn/optim/adam.py +++ b/mindspore/nn/optim/adam.py @@ -103,9 +103,9 @@ def _check_learning_rate_value(learning_rate, 
end_learning_rate, decay_steps, po validator.check_integer('decay_steps', decay_steps, 0, Rel.GT, prim_name) -@adam_opt.register("Function", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Number", "Tensor", "Tensor", "Tensor", +@adam_opt.register("Function", "Tensor", "Tensor", "Tensor", "Tensor", "Number", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor") -def _run_opt_with_one_number(opt, lr, beta1_power, beta2_power, beta1, beta2, eps, gradient, params, moment1, +def _run_opt_with_one_number(opt, beta1_power, beta2_power, beta1, beta2, eps, lr, gradient, params, moment1, moment2): """Apply adam optimizer to the weight parameter using Tensor.""" success = True @@ -136,9 +136,27 @@ class Adam(Optimizer): `beta1_power` and `beta2_power`, :math:`\alpha` represents `learning_rate`, :math:`w` represents `params`, :math:`\epsilon` represents `eps`. + Note: + The Adam optimizer supports separating parameter groups. Different parameter groups can set different + `learning_rate` and `weight_decay`. + + When separating parameter groups, the weight decay in each group will be applied on the parameters if the + value of weight_decay > 0. When not separating parameter groups, the `weight_decay` in the API will be + applied on the parameters if `weight_decay` > 0 and the 'beta' and 'gamma' are not in the name of parameters. + Args: - params (list[Parameter]): A list of parameter, which will be updated. The element in `params` - should be class mindspore.Parameter. + params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated, + the element in `params` should be class `Parameter`. When the `params` is a list of `dict`, the "params", + "lr" and "weight_decay" are the keys that can be parsed. + + - params: Required. The value should be a list of `Parameter`. + + - lr: Optional. If "lr" in the keys, the value of corresponding learning rate will be used. + If not, the `learning_rate` in the API will be used. 
+ + - weight_decay: Optional. If "weight_decay" in the keys, the value of corresponding weight decay + will be used. If not, the `weight_decay` in the API will be used. + learning_rate (Union[float, Tensor, Iterable]): A value for the learning rate. When the learning_rate is Iterable or a Tensor and the dims of the Tensor is 1, use dynamic learning rate, then the i-th step will @@ -161,8 +179,6 @@ class Adam(Optimizer): weight_decay (float): Weight decay (L2 penalty). Default: 0.0. loss_scale (float): A floating point value for the loss scale. Should be equal to or greater than 1. Default: 1.0. - decay_filter (Function): A function to determine whether to apply weight decay on parameters. Default: - lambda x: 'LayerNorm' not in x.name and 'bias' not in x.name. Inputs: - **gradients** (tuple[Tensor]) - The gradients of `params`, the shape is the same as `params`. @@ -172,15 +188,26 @@ class Adam(Optimizer): Examples: >>> net = Net() - >>> loss = nn.SoftmaxCrossEntropyWithLogits() + >>> #1) All parameters use the same learning rate and weight decay >>> optim = nn.Adam(params=net.trainable_params()) - >>> model = Model(net, loss_fn=loss, optimizer=optim, metrics=None) + >>> + >>> #2) Use parameter groups and set different values + >>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params())) + >>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params())) + >>> group_params = [{'params': conv_params, 'weight_decay': 0.01, 'lr': 0.01}, + >>> {'params': no_conv_params}] + >>> opt = nn.Adam(group_params, learning_rate=0.1, weight_decay=0.0) + >>> # the conv_params's parameters will use a learning rate of 0.01 and a weight decay of 0.01 + >>> # the no_conv_params's parameters don't set learning rate and weight decay. So they will use a + >>> # learning rate of 0.1 and a weight decay of 0.0. 
+ >>> + >>> loss = nn.SoftmaxCrossEntropyWithLogits() + >>> model = Model(net, loss_fn=loss, optimizer=optim) """ def __init__(self, params, learning_rate=1e-3, beta1=0.9, beta2=0.999, eps=1e-8, use_locking=False, - use_nesterov=False, weight_decay=0.0, loss_scale=1.0, - decay_filter=lambda x: 'beta' not in x.name and 'gamma' not in x.name): - super(Adam, self).__init__(learning_rate, params, weight_decay, loss_scale, decay_filter) + use_nesterov=False, weight_decay=0.0, loss_scale=1.0): + super(Adam, self).__init__(learning_rate, params, weight_decay, loss_scale) _check_param_value(beta1, beta2, eps, weight_decay, self.cls_name) validator.check_value_type("use_locking", use_locking, [bool], self.cls_name) validator.check_value_type("use_nesterov", use_nesterov, [bool], self.cls_name) @@ -216,10 +243,14 @@ class Adam(Optimizer): self.beta1_power = beta1_power beta2_power = self.beta2_power * self.beta2 self.beta2_power = beta2_power - success = self.hyper_map(F.partial(adam_opt, self.opt, lr, beta1_power, beta2_power, self.beta1, - self.beta2, self.eps), - gradients, params, moment1, moment2) - + if self.is_group: + success = self.hyper_map(F.partial(adam_opt, self.opt, beta1_power, beta2_power, self.beta1, + self.beta2, self.eps), + lr, gradients, params, moment1, moment2) + else: + success = self.hyper_map(F.partial(adam_opt, self.opt, beta1_power, beta2_power, self.beta1, + self.beta2, self.eps, lr), + gradients, params, moment1, moment2) return success @@ -262,6 +293,8 @@ class AdamWeightDecay(Optimizer): def __init__(self, params, learning_rate=1e-3, beta1=0.9, beta2=0.999, eps=1e-6, weight_decay=0.0, decay_filter=lambda x: 'beta' not in x.name and 'gamma' not in x.name): super(AdamWeightDecay, self).__init__(learning_rate, params) + if self.is_group: + raise RuntimeError(f"The {self.cls_name} optimizer cannot support group setting.") _check_param_value(beta1, beta2, eps, weight_decay, self.cls_name) self.beta1 = Tensor(np.array([beta1]).astype(np.float32)) 
self.beta2 = Tensor(np.array([beta2]).astype(np.float32)) @@ -329,6 +362,8 @@ class AdamWeightDecayDynamicLR(Optimizer): weight_decay=0.0, decay_filter=lambda x: 'beta' not in x.name and 'gamma' not in x.name): super(AdamWeightDecayDynamicLR, self).__init__(learning_rate, params) + if self.is_group: + raise RuntimeError(f"The {self.cls_name} optimizer cannot support group setting.") _check_param_value(beta1, beta2, eps, weight_decay, self.cls_name) _check_learning_rate_value(learning_rate, end_learning_rate, decay_steps, power, self.cls_name) # turn them to scalar when me support scalar/tensor mix operations diff --git a/mindspore/nn/optim/ftrl.py b/mindspore/nn/optim/ftrl.py index ccc1b3f10b..33edafa4e2 100644 --- a/mindspore/nn/optim/ftrl.py +++ b/mindspore/nn/optim/ftrl.py @@ -96,7 +96,8 @@ class FTRL(Optimizer): def __init__(self, params, initial_accum=0.1, learning_rate=0.001, lr_power=-0.5, l1=0.0, l2=0.0, use_locking=False, loss_scale=1.0, weight_decay=0.0): super(FTRL, self).__init__(learning_rate, params) - + if self.is_group: + raise RuntimeError(f"The {self.cls_name} optimizer cannot support group setting.") _check_param(initial_accum, learning_rate, lr_power, l1, l2, use_locking, loss_scale, weight_decay, self.cls_name) self.moments = self.parameters.clone(prefix="moments", init=initial_accum) diff --git a/mindspore/nn/optim/lamb.py b/mindspore/nn/optim/lamb.py index 97a81a590b..b4d478f52a 100755 --- a/mindspore/nn/optim/lamb.py +++ b/mindspore/nn/optim/lamb.py @@ -183,6 +183,8 @@ class Lamb(Optimizer): decay_filter=lambda x: 'LayerNorm' not in x.name and 'bias' not in x.name): super(Lamb, self).__init__(start_learning_rate, params) + if self.is_group: + raise RuntimeError(f"The {self.cls_name} optimizer cannot support group setting.") _check_param_value(decay_steps, warmup_steps, start_learning_rate, end_learning_rate, power, beta1, beta2, eps, weight_decay, self.cls_name) diff --git a/mindspore/nn/optim/momentum.py b/mindspore/nn/optim/momentum.py 
index 67de590c5f..7cfbf11183 100755 --- a/mindspore/nn/optim/momentum.py +++ b/mindspore/nn/optim/momentum.py @@ -23,7 +23,7 @@ momentum_opt = C.MultitypeFuncGraph("momentum_opt") @momentum_opt.register("Function", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor") -def _tensor_run_opt_ext(opt, learning_rate, momentum, gradient, weight, moment): +def _tensor_run_opt_ext(opt, momentum, learning_rate, gradient, weight, moment): """Apply momentum optimizer to the weight parameter using Tensor.""" success = True success = F.depend(success, opt(weight, moment, learning_rate, gradient, momentum)) @@ -36,9 +36,27 @@ class Momentum(Optimizer): Refer to the paper on the importance of initialization and momentum in deep learning for more details. + Note: + The Momentum optimizer supports separating parameter groups. Different parameter groups can set different + `learning_rate` and `weight_decay`. + + When separating parameter groups, the weight decay in each group will be applied on the parameters if the + value of weight_decay > 0. When not separating parameter groups, the `weight_decay` in the API will be + applied on the parameters if `weight_decay` > 0 and the 'beta' and 'gamma' are not in the name of parameters. + Args: - params (list[Parameter]): A list of parameter, which will be updated. The element in `parameters` - should be class mindspore.Parameter. + params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated, + the element in `params` should be class `Parameter`. When the `params` is a list of `dict`, the "params", + "lr" and "weight_decay" are the keys that can be parsed. + + - params: Required. The value should be a list of `Parameter`. + + - lr: Optional. If "lr" in the keys, the value of corresponding learning rate will be used. + If not, the `learning_rate` in the API will be used. + + - weight_decay: Optional. If "weight_decay" in the keys, the value of corresponding weight decay + will be used. 
If not, the `weight_decay` in the API will be used. + learning_rate (Union[float, Tensor, Iterable]): A value for the learning rate. When the learning_rate is Iterable or a Tensor and the dims of the Tensor is 1, use dynamic learning rate, then the i-th step will @@ -49,8 +67,6 @@ class Momentum(Optimizer): momentum (float): Hyperparameter of type float, means momentum for the moving average. weight_decay (float): Weight decay (L2 penalty). Default: 0.0. loss_scale (float): A floating point value for the loss scale. Default: 1.0. - decay_filter (Function): A function to determine whether to apply weight decay on parameters. Default: - lambda x: 'beta' not in x.name and 'gamma' not in x.name. Inputs: - **gradients** (tuple[Tensor]) - The gradients of `params`, the shape is the same as `params`. @@ -63,13 +79,24 @@ class Momentum(Optimizer): Examples: >>> net = Net() - >>> loss = nn.SoftmaxCrossEntropyWithLogits() + >>> #1) All parameters use the same learning rate and weight decay >>> optim = nn.Momentum(params=net.trainable_params(), learning_rate=0.1, momentum=0.9) + >>> + >>> #2) Use parameter groups and set different values + >>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params())) + >>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params())) + >>> group_params = [{'params': conv_params, 'weight_decay': 0.01, 'lr': 0.01}, + >>> {'params': no_conv_params}] + >>> opt = nn.Momentum(group_params, learning_rate=0.1, momentum=0.9, weight_decay=0.0) + >>> # the conv_params's parameters will use a learning rate of 0.01 and a weight decay of 0.01 + >>> # the no_conv_params's parameters don't set learning rate and weight decay. So they will use a + >>> # learning rate of 0.1 and a weight decay of 0.0. 
+ >>> + >>> loss = nn.SoftmaxCrossEntropyWithLogits() >>> model = Model(net, loss_fn=loss, optimizer=optim, metrics=None) """ - def __init__(self, params, learning_rate, momentum, weight_decay=0.0, loss_scale=1.0, - decay_filter=lambda x: 'beta' not in x.name and 'gamma' not in x.name): - super(Momentum, self).__init__(learning_rate, params, weight_decay, loss_scale, decay_filter) + def __init__(self, params, learning_rate, momentum, weight_decay=0.0, loss_scale=1.0): + super(Momentum, self).__init__(learning_rate, params, weight_decay, loss_scale) if isinstance(momentum, float) and momentum < 0.0: raise ValueError("momentum should be at least 0.0, but got momentum {}".format(momentum)) self.momentum = Parameter(Tensor(momentum, mstype.float32), name="momentum") @@ -84,5 +111,8 @@ class Momentum(Optimizer): gradients = self.decay_weight(gradients) gradients = self.scale_grad(gradients) lr = self.get_lr() - success = self.hyper_map(F.partial(momentum_opt, self.opt, lr, self.momentum), gradients, params, moments) + if self.is_group: + success = self.hyper_map(F.partial(momentum_opt, self.opt, self.momentum), lr, gradients, params, moments) + else: + success = self.hyper_map(F.partial(momentum_opt, self.opt, self.momentum, lr), gradients, params, moments) return success diff --git a/mindspore/nn/optim/optimizer.py b/mindspore/nn/optim/optimizer.py index 34abc2b1c2..671e92de3a 100755 --- a/mindspore/nn/optim/optimizer.py +++ b/mindspore/nn/optim/optimizer.py @@ -28,7 +28,6 @@ from mindspore._checkparam import Rel from mindspore.common.tensor import Tensor from mindspore import log as logger - __all__ = ['Optimizer'] @@ -42,68 +41,96 @@ class Optimizer(Cell): This class defines the API to add Ops to train a model. Never use this class directly, but instead instantiate one of its subclasses. + Some optimizers support separating parameter groups. Different parameter groups can set different + `learning_rate` and `weight_decay`. 
+ + When separating parameter groups, the weight decay in each group will be applied on the parameters if the + value of weight_decay > 0. When not separating parameter groups, the `weight_decay` in the API will be + applied on the parameters if `weight_decay` > 0 and the 'beta' and 'gamma' are not in the name of parameters. + Args: learning_rate (float): A floating point value for the learning rate. Should be greater than 0. - parameters (list): A list of parameter, which will be updated. The element in `parameters` - should be class mindspore.Parameter. + parameters (Union[list[Parameter], list[dict]]): When the `parameters` is a list of `Parameter` which will be + updated, the element in `parameters` should be class `Parameter`. When the `parameters` is a list of `dict`, + the "params", "lr" and "weight_decay" are the keys can be parsed. + + - params: Required. The value should be a list of `Parameter`. + + - lr: Optional. If "lr" in the keys, the value of corresponding learning rate will be used. + If not, the `learning_rate` in the API will be used. + + - weight_decay: Optional. If "weight_decay" in the keys, the value of corresponding weight decay + will be used. If not, the `weight_decay` in the API will be used. + weight_decay (float): A floating point value for the weight decay. It should be equal to or greater than 0. - If the type of `weight_decay` input is int, it will be convertd to float. Default: 0.0. + If the type of `weight_decay` input is int, it will be converted to float. Default: 0.0. loss_scale (float): A floating point value for the loss scale. It should be greater than 0. If the - type of `loss_scale` input is int, it will be convertd to float. Default: 1.0. - decay_filter (Function): A function to determine whether to apply weight decay on parameters. Default: lambda - x: 'beta' not in x.name and 'gamma' not in x.name. + type of `loss_scale` input is int, it will be converted to float. Default: 1.0. 
Raises: ValueError: If the learning_rate is a Tensor, but the dims of tensor is greater than 1. TypeError: If the learning_rate is not any of the three types: float, Tensor, Iterable. """ - def __init__(self, learning_rate, parameters, weight_decay=0.0, loss_scale=1.0, - decay_filter=lambda x: 'beta' not in x.name and 'gamma' not in x.name): + def __init__(self, learning_rate, parameters, weight_decay=0.0, loss_scale=1.0): super(Optimizer, self).__init__(auto_prefix=False) + if parameters and not isinstance(parameters, list): + parameters = list(parameters) + + if not parameters: + raise ValueError("Optimizer got an empty parameter list.") + + if not isinstance(parameters[0], (dict, Parameter)): + raise ValueError("Only a list of Parameter or dict can be supported.") + + if isinstance(loss_scale, int): + loss_scale = float(loss_scale) + validator.check_value_type("loss_scale", loss_scale, [float], None) + validator.check_number_range("loss_scale", loss_scale, 0.0, float("inf"), Rel.INC_NEITHER, None) + + if isinstance(weight_decay, int): + weight_decay = float(weight_decay) + validator.check_value_type("weight_decay", weight_decay, [float], None) + validator.check_number_range("weight_decay", weight_decay, 0.0, float("inf"), Rel.INC_LEFT, None) + + self.is_group = False + self.loss_scale = loss_scale if isinstance(learning_rate, float): self.dynamic_lr = False self.gather = None self.assignadd = None self.global_step = None - validator.check_number_range("learning rate", learning_rate, 0.0, float("inf"), Rel.INC_LEFT, self.cls_name) - learning_rate = Tensor(learning_rate, mstype.float32) + self.scalar_lr = learning_rate else: self.dynamic_lr = True self.gather = P.GatherV2() self.assignadd = P.AssignAdd() self.global_step = Parameter(initializer(0, [1], mindspore.int32), name='global_step') - if isinstance(learning_rate, Iterable): - learning_rate = Tensor(np.array(list(learning_rate)).astype(np.float32)) - elif isinstance(learning_rate, Tensor): - if 
learning_rate.dim() > 1: - raise ValueError("Learning rate should be a 0 or 1 dim `Tensor`," - f"but got {learning_rate.dim()}.") - if learning_rate.dim() == 1 and learning_rate.size() < 2: - logger.warning("If want to use the dynamic learning rate, please make sure that the number " - "of elements in the list, tuple or tensor passed is greater than 1.") - else: - raise TypeError("Learning rate should be float, Tensor or Iterable.") - - if isinstance(weight_decay, int): - weight_decay = float(weight_decay) - validator.check_value_type("weight_decay", weight_decay, [float], None) - validator.check_number_range("weight_decay", weight_decay, 0.0, float("inf"), Rel.INC_LEFT, None) - - if isinstance(loss_scale, int): - loss_scale = float(loss_scale) - validator.check_value_type("loss_scale", loss_scale, [float], None) - validator.check_number_range("loss_scale", loss_scale, 0.0, float("inf"), Rel.INC_NEITHER, None) - - self.loss_scale = loss_scale - self.learning_rate = Parameter(learning_rate, name="learning_rate") - self.parameters = ParameterTuple(parameters) + self.scalar_lr = None + + learning_rate = self._get_single_lr(learning_rate) + if isinstance(parameters[0], dict): + self.is_group = True + self.params = [] + self.group_lr = [] + self.group_weight_decay = [] + self._init_group_params(parameters, learning_rate, weight_decay) + + if self.is_group: + self.learning_rate = ParameterTuple(self.group_lr) + self.parameters = ParameterTuple(self.params) + self.weight_decay = tuple(self.group_weight_decay) + decay_filter = lambda x: x > 0 + self.decay_flags = tuple(decay_filter(x) for x in self.weight_decay) + else: + self.learning_rate = Parameter(learning_rate, name="learning_rate") + self.parameters = ParameterTuple(parameters) + self.weight_decay = weight_decay * loss_scale + decay_filter = lambda x: 'beta' not in x.name and 'gamma' not in x.name + self.decay_flags = tuple(decay_filter(x) for x in self.parameters) self.reciprocal_scale = 1.0 / loss_scale - 
self.weight_decay = weight_decay * loss_scale - self.decay_flags = tuple(decay_filter(x) for x in self.parameters) - - if not self.parameters: - raise ValueError("optimizer got an empty parameter list.") + self.exec_weight_decay = any(self.decay_flags) + self.param_length = len(self.parameters) def decay_weight(self, gradients): """ @@ -118,9 +145,15 @@ class Optimizer(Cell): Returns: tuple[Tensor], The gradients after weight decay. """ - if self.weight_decay > 0: - params = self.parameters - gradients = self.hyper_map(F.partial(apply_decay, self.weight_decay), self.decay_flags, params, gradients) + params = self.parameters + if self.is_group: + if self.exec_weight_decay: + gradients = self.hyper_map(F.partial(apply_decay), self.weight_decay, self.decay_flags, + params, gradients) + else: + if self.weight_decay > 0: + gradients = self.hyper_map(F.partial(apply_decay, self.weight_decay), self.decay_flags, + params, gradients) return gradients @@ -144,6 +177,83 @@ class Optimizer(Cell): return gradients + def _get_single_lr(self, learning_rate): + """Get learning rate in Tensor type.""" + if isinstance(learning_rate, float): + validator.check_number_range("learning rate", learning_rate, 0.0, float("inf"), Rel.INC_LEFT, self.cls_name) + lr = Tensor(learning_rate, mstype.float32) + elif isinstance(learning_rate, Iterable): + lr = Tensor(np.array(list(learning_rate)).astype(np.float32)) + elif isinstance(learning_rate, Tensor): + if learning_rate.dim() > 1: + raise ValueError("Learning rate should be a 0 or 1 dim `Tensor`," + f"but got {learning_rate.dim()}.") + if learning_rate.dim() == 1 and learning_rate.size() < 2: + logger.warning("If want to use the dynamic learning rate, please make sure that the number " + "of elements in the list, tuple or tensor passed is greater than 1.") + lr = learning_rate + else: + raise TypeError("Learning rate should be float, Tensor or Iterable.") + return lr + + def _init_group_params(self, parameters, learning_rate, weight_decay): + 
"""Init learning rate or weight decay in group params.""" + origin_dynamic_lr = self.dynamic_lr + if self.dynamic_lr: + dynamic_lr_length = learning_rate.size() + else: + dynamic_lr_length = 0 + + for group_param in parameters: + lr_length = dynamic_lr_length + if 'lr' in group_param.keys(): + self._get_single_lr(group_param['lr']) + if isinstance(group_param['lr'], Iterable): + lr_length = len(group_param['lr']) + self.dynamic_lr = True + elif isinstance(group_param['lr'], Tensor): + lr_length = group_param['lr'].size() + self.dynamic_lr = True + if dynamic_lr_length not in (lr_length, 0): + raise ValueError("The dynamic learning rate in group should be the same size.") + dynamic_lr_length = lr_length + + if self.dynamic_lr and not origin_dynamic_lr: + self.gather = P.GatherV2() + self.assignadd = P.AssignAdd() + self.global_step = Parameter(initializer(0, [1], mindspore.int32), name='global_step') + + params_store = [] + for group_param in parameters: + self.params += group_param['params'] + if 'lr' in group_param.keys(): + params_dynamic_lr = isinstance(group_param['lr'], (Iterable, Tensor)) + + if self.dynamic_lr and not params_dynamic_lr: + lr = Tensor(np.array([group_param['lr']] * dynamic_lr_length).astype(np.float32)) + else: + lr = self._get_single_lr(group_param['lr']) + else: + if self.dynamic_lr and not origin_dynamic_lr: + lr = Tensor(np.array([self.scalar_lr] * dynamic_lr_length).astype(np.float32)) + else: + lr = learning_rate + + if 'weight_decay' in group_param.keys(): + validator.check_float_legal_value('weight_decay', group_param['weight_decay'], None) + validator.check_number_range('weight_decay', group_param['weight_decay'], 0.0, float("inf"), + Rel.INC_LEFT, self.cls_name) + weight_decay_ = group_param['weight_decay'] * self.loss_scale + else: + weight_decay_ = weight_decay * self.loss_scale + + for param in group_param['params']: + if param in params_store: + raise RuntimeError(f"The {param.name} parameter has appeared in parameter groups.") 
+ params_store.append(param) + self.group_lr.append(Parameter(lr, name="lr_" + param.name)) + self.group_weight_decay.append(weight_decay_) + def get_lr(self): """ Get the learning rate of current step. @@ -151,11 +261,20 @@ class Optimizer(Cell): Returns: float, the learning rate of current step. """ - lr = self.learning_rate - if self.dynamic_lr: - lr = self.gather(self.learning_rate, self.global_step, 0) - F.control_depend(lr, self.assignadd(self.global_step, 1)) + if self.is_group: + lr = self.learning_rate + if self.dynamic_lr: + lr = () + for i in range(self.param_length): + current_dynamic_lr = self.gather(self.learning_rate[i], self.global_step, 0) + lr += (current_dynamic_lr,) + F.control_depend(lr, self.assignadd(self.global_step, 1)) + else: + lr = self.learning_rate + if self.dynamic_lr: + lr = self.gather(self.learning_rate, self.global_step, 0) + F.control_depend(lr, self.assignadd(self.global_step, 1)) return lr def construct(self, *hyper_params): diff --git a/mindspore/nn/optim/rmsprop.py b/mindspore/nn/optim/rmsprop.py index b1271587b4..b96d9499b2 100644 --- a/mindspore/nn/optim/rmsprop.py +++ b/mindspore/nn/optim/rmsprop.py @@ -22,17 +22,17 @@ rmsprop_opt = C.MultitypeFuncGraph("rmsprop_opt") centered_rmsprop_opt = C.MultitypeFuncGraph("rmsprop_opt") -@rmsprop_opt.register("Function", "Tensor", "Number", "Number", "Number", "Tensor", "Tensor", "Tensor", "Tensor") -def _rmsprop_opt(opt, learning_rate, decay, epsilon, momentum, weight, ms, mom, grad): +@rmsprop_opt.register("Function", "Number", "Number", "Number", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor") +def _rmsprop_opt(opt, decay, epsilon, momentum, learning_rate, weight, ms, mom, grad): """Apply rmsprop optimizer to the weight parameter using dynamic learning rate.""" success = True success = F.depend(success, opt(weight, ms, mom, grad, learning_rate, decay, momentum, epsilon)) return success -@centered_rmsprop_opt.register("Function", "Tensor", "Number", "Number", "Number", "Tensor", 
"Tensor", "Tensor", +@centered_rmsprop_opt.register("Function", "Number", "Number", "Number", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor") -def _centered_rmsprop_opt(opt, learning_rate, decay, epsilon, momentum, weight, mg, ms, mom, grad): +def _centered_rmsprop_opt(opt, decay, epsilon, momentum, learning_rate, weight, mg, ms, mom, grad): """Apply centered rmsprop optimizer to the weight parameter using dynamic learning rate.""" success = True success = F.depend(success, opt(weight, mg, ms, mom, grad, learning_rate, decay, momentum, epsilon)) @@ -44,6 +44,13 @@ class RMSProp(Optimizer): Implements Root Mean Squared Propagation (RMSProp) algorithm. Note: + The RMSProp optimizer supports separating parameter groups. Different parameter groups can set different + `learning_rate` and `weight_decay`. + + When separating parameter groups, the weight decay in each group will be applied on the parameters if the + value of weight_decay > 0. When not separating parameter groups, the `weight_decay` in the API will be + applied on the parameters if `weight_decay` > 0 and the 'beta' and 'gamma' are not in the name of parameters. + Update `params` according to the RMSProp algorithm. The equation is as follows: @@ -84,8 +91,18 @@ class RMSProp(Optimizer): represents `gradients`. Args: - params (list[Parameter]): A list of parameter, which will be updated. The element in `parameters` - should be class mindspore.Parameter. + params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated, + the element in `params` should be class `Parameter`. When the `params` is a list of `dict`, the "params", + "lr" and "weight_decay" are the keys can be parsed. + + - params: Required. The value should be a list of `Parameter`. + + - lr: Optional. If "lr" in the keys, the value of corresponding learning rate will be used. + If not, the `learning_rate` in the API will be used. + + - weight_decay: Optional. 
If "weight_decay" in the keys, the value of corresponding weight decay + will be used. If not, the `weight_decay` in the API will be used. + learning_rate (Union[float, Tensor, Iterable]): A value for the learning rate. When the learning_rate is Iterable or a Tensor and the dims of the Tensor is 1, use dynamic learning rate, then the i-th step will @@ -95,15 +112,13 @@ class RMSProp(Optimizer): Other cases are not supported. Default: 0.1. decay (float): Decay rate. Should be equal to or greater than 0. Default: 0.9. momentum (float): Hyperparameter of type float, means momentum for the moving average. Should be equal to or - greater than 0.Default: 0.0. + greater than 0. Default: 0.0. epsilon (float): Term added to the denominator to improve numerical stability. Should be greater than 0. Default: 1e-10. use_locking (bool): Enable a lock to protect the update of variable and accumlation tensors. Default: False. centered (bool): If True, gradients are normalized by the estimated variance of the gradient. Default: False. loss_scale (float): A floating point value for the loss scale. Should be greater than 0. Default: 1.0. weight_decay (float): Weight decay (L2 penalty). Should be equal to or greater than 0. Default: 0.0. - decay_filter (Function): A function to determine whether to apply weight decay on parameters. Default: - lambda x: 'beta' not in x.name and 'gamma' not in x.name. Inputs: - **gradients** (tuple[Tensor]) - The gradients of `params`, the shape is the same as `params`. 
@@ -113,14 +128,25 @@ class RMSProp(Optimizer): Examples: >>> net = Net() + >>> #1) All parameters use the same learning rate and weight decay + >>> optim = nn.RMSProp(params=net.trainable_params(), learning_rate=lr) + >>> + >>> #2) Use parameter groups and set different values + >>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params())) + >>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params())) + >>> group_params = [{'params': conv_params, 'weight_decay': 0.01, 'lr': 0.01}, + >>> {'params': no_conv_params}] + >>> opt = nn.RMSProp(group_params, learning_rate=0.1, weight_decay=0.0) + >>> # the conv_params's parameters will use a learning rate of 0.01 and a weight decay of 0.01 + >>> # the no_conv_params's parameters don't set learning rate and weight decay. So they will use a + >>> # learning rate of 0.1 and a weight decay of 0.0. + >>> >>> loss = nn.SoftmaxCrossEntropyWithLogits() - >>> opt = nn.RMSProp(params=net.trainable_params(), learning_rate=lr) - >>> model = Model(net, loss, opt) + >>> model = Model(net, loss_fn=loss, optimizer=optim) """ def __init__(self, params, learning_rate=0.1, decay=0.9, momentum=0.0, epsilon=1e-10, - use_locking=False, centered=False, loss_scale=1.0, weight_decay=0.0, - decay_filter=lambda x: 'beta' not in x.name and 'gamma' not in x.name): - super(RMSProp, self).__init__(learning_rate, params, weight_decay, loss_scale, decay_filter) + use_locking=False, centered=False, loss_scale=1.0, weight_decay=0.0): + super(RMSProp, self).__init__(learning_rate, params, weight_decay, loss_scale) validator.check_value_type("decay", decay, [float], self.cls_name) validator.check_number_range("decay", decay, 0.0, float("inf"), Rel.INC_LEFT, self.cls_name) validator.check_value_type("momentum", momentum, [float], self.cls_name) @@ -150,9 +176,18 @@ class RMSProp(Optimizer): gradients = self.scale_grad(gradients) lr = self.get_lr() if self.centered: - success =
self.hyper_map(F.partial(centered_rmsprop_opt, self.opt, lr, self.decay, self.epsilon, - self.momentum), params, self.mg, self.ms, self.moment, gradients) + if self.is_group: + success = self.hyper_map(F.partial(centered_rmsprop_opt, self.opt, self.decay, self.epsilon, + self.momentum), lr, params, self.mg, self.ms, self.moment, gradients) + else: + success = self.hyper_map(F.partial(centered_rmsprop_opt, self.opt, self.decay, self.epsilon, + self.momentum, lr), params, self.mg, self.ms, self.moment, gradients) + else: - success = self.hyper_map(F.partial(rmsprop_opt, self.opt, lr, self.decay, self.epsilon, - self.momentum), params, self.ms, self.moment, gradients) + if self.is_group: + success = self.hyper_map(F.partial(rmsprop_opt, self.opt, self.decay, self.epsilon, + self.momentum), lr, params, self.ms, self.moment, gradients) + else: + success = self.hyper_map(F.partial(rmsprop_opt, self.opt, self.decay, self.epsilon, + self.momentum, lr), params, self.ms, self.moment, gradients) return success diff --git a/mindspore/nn/optim/sgd.py b/mindspore/nn/optim/sgd.py index 388fe5db47..0db58af855 100755 --- a/mindspore/nn/optim/sgd.py +++ b/mindspore/nn/optim/sgd.py @@ -24,7 +24,7 @@ sgd_opt = C.MultitypeFuncGraph("sgd_opt") @sgd_opt.register("Function", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor") -def _tensor_run_opt_ext(opt, learning_rate, momentum, gradient, weight, accum, stat): +def _tensor_run_opt_ext(opt, momentum, learning_rate, gradient, weight, accum, stat): """Apply sgd optimizer to the weight parameter using Tensor.""" success = True success = F.depend(success, opt(weight, gradient, learning_rate, accum, momentum, stat)) @@ -39,9 +39,27 @@ class SGD(Optimizer): Nesterov momentum is based on the formula from paper `On the importance of initialization and momentum in deep learning `_. + Note: + The SGD optimizer supports separating parameter groups. Different parameter groups can set different + `learning_rate` and `weight_decay`. 
+ + When separating parameter groups, the weight decay in each group will be applied on the parameters if the + value of weight_decay > 0. When not separating parameter groups, the `weight_decay` in the API will be + applied on the parameters if `weight_decay` > 0 and the 'beta' and 'gamma' are not in the name of parameters. + Args: - params (list[Parameter]): A list of parameter, which will be updated. The element in `params` - should be class mindspore.Parameter. + params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated, + the element in `params` should be class `Parameter`. When the `params` is a list of `dict`, the "params", + "lr" and "weight_decay" are the keys can be parsed. + + - params: Required. The value should be a list of `Parameter`. + + - lr: Optional. If "lr" in the keys, the value of corresponding learning rate will be used. + If not, the `learning_rate` in the API will be used. + + - weight_decay: Optional. If "weight_decay" in the keys, the value of corresponding weight decay + will be used. If not, the `weight_decay` in the API will be used. + learning_rate (Union[float, Tensor, Iterable]): A value for the learning rate. 
When the learning_rate is Iterable or a Tensor and the dims of the Tensor is 1, use dynamic learning rate, then the i-th step will @@ -67,9 +85,21 @@ class SGD(Optimizer): Examples: >>> net = Net() - >>> loss = nn.SoftmaxCrossEntropyWithLogits() + >>> #1) All parameters use the same learning rate and weight decay >>> optim = nn.SGD(params=net.trainable_params()) - >>> model = Model(net, loss_fn=loss, optimizer=optim, metrics=None) + >>> + >>> #2) Use parameter groups and set different values + >>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params())) + >>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params())) + >>> group_params = [{'params': conv_params, 'weight_decay': 0.01, 'lr': 0.01}, + >>> {'params': no_conv_params}] + >>> opt = nn.SGD(group_params, learning_rate=0.1, weight_decay=0.0) + >>> # the conv_params's parameters will use a learning rate of 0.01 and a weight decay of 0.01 + >>> # the no_conv_params's parameters don't set learning rate and weight decay. So they will use a + >>> # learning rate of 0.1 and a weight decay of 0.0.
+ >>> + >>> loss = nn.SoftmaxCrossEntropyWithLogits() + >>> model = Model(net, loss_fn=loss, optimizer=optim) """ def __init__(self, params, learning_rate=0.1, momentum=0.0, dampening=0.0, weight_decay=0.0, nesterov=False, loss_scale=1.0): @@ -109,5 +139,8 @@ class SGD(Optimizer): gradients = self.decay_weight(gradients) gradients = self.scale_grad(gradients) lr = self.get_lr() - success = self.hyper_map(F.partial(sgd_opt, self.opt, lr, self.momentum), gradients, params, accum, stat) + if self.is_group: + success = self.hyper_map(F.partial(sgd_opt, self.opt, self.momentum), lr, gradients, params, accum, stat) + else: + success = self.hyper_map(F.partial(sgd_opt, self.opt, self.momentum, lr), gradients, params, accum, stat) return success diff --git a/mindspore/nn/wrap/cell_wrapper.py b/mindspore/nn/wrap/cell_wrapper.py index 60718ec2b1..499d85b34b 100644 --- a/mindspore/nn/wrap/cell_wrapper.py +++ b/mindspore/nn/wrap/cell_wrapper.py @@ -167,7 +167,7 @@ class TrainOneStepCell(Cell): super(TrainOneStepCell, self).__init__(auto_prefix=False) self.network = network self.network.add_flags(defer_inline=True) - self.weights = ParameterTuple(network.trainable_params()) + self.weights = optimizer.parameters self.optimizer = optimizer self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True) self.sens = sens diff --git a/tests/ut/python/nn/optim/test_adam.py b/tests/ut/python/nn/optim/test_adam.py index d9321b1d26..269f276376 100644 --- a/tests/ut/python/nn/optim/test_adam.py +++ b/tests/ut/python/nn/optim/test_adam.py @@ -50,7 +50,7 @@ class NetWithoutWeight(nn.Cell): def test_adamwithoutparam(): net = NetWithoutWeight() net.set_train() - with pytest.raises(ValueError, match=r"optimizer got an empty parameter list"): + with pytest.raises(ValueError, match=r"Optimizer got an empty parameter list"): AdamWeightDecay(net.trainable_params(), learning_rate=0.1) @@ -104,5 +104,5 @@ def test_AdamWeightDecayDynamicLR(): def test_adam_mindspore_flatten(): net = 
nn.Flatten() - with pytest.raises(ValueError, match=r"optimizer got an empty parameter list"): + with pytest.raises(ValueError, match=r"Optimizer got an empty parameter list"): AdamWeightDecay(net.get_parameters()) diff --git a/tests/ut/python/nn/optim/test_optimizer.py b/tests/ut/python/nn/optim/test_optimizer.py index 89fb1d812b..9f1ec9a36f 100644 --- a/tests/ut/python/nn/optim/test_optimizer.py +++ b/tests/ut/python/nn/optim/test_optimizer.py @@ -69,19 +69,19 @@ class TestSGD(): class TestNullParam(): """ TestNullParam definition """ def test_optim_init(self): - with pytest.raises(TypeError): + with pytest.raises(ValueError): Optimizer(0.1, None) def test_AdamWightDecay_init(self): - with pytest.raises(TypeError): + with pytest.raises(ValueError): AdamWeightDecay(None) def test_AdamWeightDecayDynamicLR_init(self): - with pytest.raises(TypeError): + with pytest.raises(ValueError): AdamWeightDecayDynamicLR(None, 10) def test_Sgd_init(self): - with pytest.raises(TypeError): + with pytest.raises(ValueError): SGD(None) class TestUnsupportParam(): diff --git a/tests/ut/python/optimizer/test_optimize_with_parameter_groups.py b/tests/ut/python/optimizer/test_optimize_with_parameter_groups.py new file mode 100644 index 0000000000..8dd98990fa --- /dev/null +++ b/tests/ut/python/optimizer/test_optimize_with_parameter_groups.py @@ -0,0 +1,210 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +import numpy as np +import pytest +import mindspore.common.dtype as mstype +import mindspore.nn as nn +from mindspore.nn.optim import Momentum, SGD, RMSProp, Adam +from mindspore import context +from mindspore.common.api import _executor +from mindspore.common.tensor import Tensor +from mindspore.ops import operations as P +from mindspore.nn import TrainOneStepCell, WithLossCell + +context.set_context(mode=context.GRAPH_MODE) + + +class LeNet5(nn.Cell): + """ LeNet5 definition """ + def __init__(self): + super(LeNet5, self).__init__() + self.conv1 = nn.Conv2d(1, 6, 5, pad_mode='valid') + self.conv2 = nn.Conv2d(6, 16, 5, pad_mode='valid') + self.fc1 = nn.Dense(16 * 5 * 5, 120) + self.fc2 = nn.Dense(120, 84) + self.fc3 = nn.Dense(84, 10) + self.relu = nn.ReLU() + self.max_pool2d = nn.MaxPool2d(kernel_size=2, stride=2) + self.flatten = P.Flatten() + + def construct(self, x): + x = self.max_pool2d(self.relu(self.conv1(x))) + x = self.max_pool2d(self.relu(self.conv2(x))) + x = self.flatten(x) + x = self.relu(self.fc1(x)) + x = self.relu(self.fc2(x)) + x = self.fc3(x) + return x + + +def test_group_lr(): + inputs = Tensor(np.ones([1, 1, 32, 32]).astype(np.float32) * 0.01) + label = Tensor(np.ones([1, 10]).astype(np.float32)) + + net = LeNet5() + conv_lr = 0.8 + default_lr = 0.1 + conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params())) + no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params())) + group_params = [{'params': conv_params, 'lr': conv_lr}, + {'params': no_conv_params}] + net.set_train() + loss = nn.SoftmaxCrossEntropyWithLogits() + + opt = Momentum(group_params, learning_rate=default_lr, momentum=0.9) + assert opt.is_group is True + assert opt.dynamic_lr is False + for lr, param in zip(opt.learning_rate, opt.parameters): + if param in conv_params: + assert lr.data == Tensor(conv_lr, mstype.float32) + else: + assert lr.data == 
Tensor(default_lr, mstype.float32) + + net_with_loss = WithLossCell(net, loss) + train_network = TrainOneStepCell(net_with_loss, opt) + _executor.compile(train_network, inputs, label) + + +def test_group_dynamic_1(): + inputs = Tensor(np.ones([1, 1, 32, 32]).astype(np.float32) * 0.01) + label = Tensor(np.ones([1, 10]).astype(np.float32)) + + net = LeNet5() + conv_lr = 0.8 + default_lr = (0.1, 0.2, 0.3) + conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params())) + no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params())) + group_params = [{'params': conv_params, 'lr': conv_lr}, + {'params': no_conv_params}] + net.set_train() + loss = nn.SoftmaxCrossEntropyWithLogits() + + opt = Momentum(group_params, learning_rate=default_lr, momentum=0.9) + assert opt.is_group is True + assert opt.dynamic_lr is True + for lr, param in zip(opt.learning_rate, opt.parameters): + if param in conv_params: + assert lr.data == Tensor(np.array([conv_lr] * 3).astype(np.float32)) + else: + assert lr.data == Tensor(np.array(list(default_lr)).astype(np.float32)) + + net_with_loss = WithLossCell(net, loss) + train_network = TrainOneStepCell(net_with_loss, opt) + _executor.compile(train_network, inputs, label) + + +def test_group_dynamic_2(): + inputs = Tensor(np.ones([1, 1, 32, 32]).astype(np.float32) * 0.01) + label = Tensor(np.ones([1, 10]).astype(np.float32)) + + net = LeNet5() + conv_lr = (0.1, 0.2, 0.3) + default_lr = 0.8 + conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params())) + no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params())) + group_params = [{'params': conv_params, 'lr': conv_lr}, + {'params': no_conv_params}] + net.set_train() + loss = nn.SoftmaxCrossEntropyWithLogits() + + opt = RMSProp(group_params, learning_rate=default_lr) + assert opt.is_group is True + assert opt.dynamic_lr is True + for lr, param in zip(opt.learning_rate, opt.parameters): + if param in conv_params: + 
assert lr.data == Tensor(np.array(list(conv_lr)).astype(np.float32)) + else: + assert lr.data == Tensor(np.array([default_lr] * 3).astype(np.float32)) + + net_with_loss = WithLossCell(net, loss) + train_network = TrainOneStepCell(net_with_loss, opt) + _executor.compile(train_network, inputs, label) + + +def test_group_dynamic_no_same_size(): + net = LeNet5() + conv_lr = (0.1, 0.2, 0.3) + default_lr = (0.1, 0.2) + conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params())) + no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params())) + group_params = [{'params': conv_params, 'lr': conv_lr}, + {'params': no_conv_params}] + with pytest.raises(ValueError): + Momentum(group_params, learning_rate=default_lr, momentum=0.9) + + +def test_group_not_float_lr(): + net = LeNet5() + conv_lr = 1 + default_lr = 0.3 + conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params())) + no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params())) + group_params = [{'params': conv_params, 'lr': conv_lr}, + {'params': no_conv_params}] + with pytest.raises(TypeError): + Momentum(group_params, learning_rate=default_lr, momentum=0.9) + + +def test_group_not_float_weight_decay(): + net = LeNet5() + conv_weight_decay = 1 + conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params())) + no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params())) + group_params = [{'params': conv_params, 'weight_decay': conv_weight_decay}, + {'params': no_conv_params}] + with pytest.raises(TypeError): + Momentum(group_params, learning_rate=0.1, momentum=0.9) + + +def test_weight_decay(): + inputs = Tensor(np.ones([1, 1, 32, 32]).astype(np.float32) * 0.01) + label = Tensor(np.ones([1, 10]).astype(np.float32)) + + net = LeNet5() + conv_weight_decay = 0.8 + default_weight_decay = 0.0 + conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params())) + no_conv_params = 
list(filter(lambda x: 'conv' not in x.name, net.trainable_params())) + group_params = [{'params': conv_params, 'weight_decay': conv_weight_decay}, + {'params': no_conv_params}] + net.set_train() + loss = nn.SoftmaxCrossEntropyWithLogits() + + opt = SGD(group_params, learning_rate=0.1, weight_decay=default_weight_decay) + assert opt.is_group is True + for weight_decay, decay_flags, param in zip(opt.weight_decay, opt.decay_flags, opt.parameters): + if param in conv_params: + assert weight_decay == conv_weight_decay + assert decay_flags is True + else: + assert weight_decay == default_weight_decay + assert decay_flags is False + + net_with_loss = WithLossCell(net, loss) + train_network = TrainOneStepCell(net_with_loss, opt) + _executor.compile(train_network, inputs, label) + + +def test_group_repeat_param(): + net = LeNet5() + conv_lr = 0.1 + default_lr = 0.3 + conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params())) + no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params())) + group_params = [{'params': conv_params, 'lr': conv_lr}, + {'params': conv_params, 'lr': default_lr}, + {'params': no_conv_params}] + with pytest.raises(RuntimeError): + Adam(group_params, learning_rate=default_lr) From 5a259eb67e7105e40b5994a3978ea906ab5f79bc Mon Sep 17 00:00:00 2001 From: guohongzilong <2713219276@qq.com> Date: Fri, 15 May 2020 09:54:28 +0800 Subject: [PATCH 11/13] make optimizer parameter same as gradient --- mindspore/nn/wrap/grad_reducer.py | 2 +- mindspore/nn/wrap/loss_scale.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mindspore/nn/wrap/grad_reducer.py b/mindspore/nn/wrap/grad_reducer.py index ee57297fe0..8383910a60 100644 --- a/mindspore/nn/wrap/grad_reducer.py +++ b/mindspore/nn/wrap/grad_reducer.py @@ -141,7 +141,7 @@ class DistributedGradReducer(Cell): >>> super(TrainingWrapper, self).__init__(auto_prefix=False) >>> self.network = network >>> self.network.add_flags(defer_inline=True) - >>> 
self.weights = ParameterTuple(network.trainable_params()) + >>> self.weights = optimizer.parameters >>> self.optimizer = optimizer >>> self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True) >>> self.sens = sens diff --git a/mindspore/nn/wrap/loss_scale.py b/mindspore/nn/wrap/loss_scale.py index 65d66f0150..ae76cb055f 100644 --- a/mindspore/nn/wrap/loss_scale.py +++ b/mindspore/nn/wrap/loss_scale.py @@ -18,7 +18,7 @@ from mindspore.nn.wrap.grad_reducer import DistributedGradReducer from mindspore.train.parallel_utils import ParallelMode from mindspore.parallel._utils import _get_device_num, _get_parallel_mode, _get_mirror_mean from ..cell import Cell -from ...common import Tensor, ParameterTuple +from ...common import Tensor from ...common.parameter import Parameter from ...ops import functional as F from ...ops import composite as C @@ -201,7 +201,7 @@ class TrainOneStepWithLossScaleCell(Cell): super(TrainOneStepWithLossScaleCell, self).__init__(auto_prefix=False) self.network = network self.network.add_flags(defer_inline=True) - self.weights = ParameterTuple(network.trainable_params()) + self.weights = optimizer.parameters self.optimizer = optimizer self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True) self.hyper_map = C.HyperMap() From 22866fbe2577bbb1bcfe2befe15a8acdcb8f3c7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=A2=81=E6=88=90=E8=BE=89?= Date: Sat, 16 May 2020 12:17:06 +0800 Subject: [PATCH 12/13] Adapt to TBE Cast operator latest interface --- mindspore/ccsrc/transform/op_declare.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mindspore/ccsrc/transform/op_declare.cc b/mindspore/ccsrc/transform/op_declare.cc index 27c1d306aa..5cae6c77f7 100644 --- a/mindspore/ccsrc/transform/op_declare.cc +++ b/mindspore/ccsrc/transform/op_declare.cc @@ -823,7 +823,7 @@ OUTPUT_MAP(RealDiv) = {{0, OUTPUT_DESC(y)}}; // Cast INPUT_MAP(Cast) = {{1, INPUT_DESC(x)}}; INPUT_ATTR_MAP(Cast) = {{2, ATTR_DESC(dst_type, 
AnyTraits())}}; -ATTR_MAP(Cast) = {{"Truncate", ATTR_DESC(truncate, AnyTraits())}}; +ATTR_MAP(Cast) = EMPTY_ATTR_MAP; OUTPUT_MAP(Cast) = {{0, OUTPUT_DESC(y)}}; // Reciprocal From 11089e6077154b337d7e0bc3cc7affc44c361b21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=A2=81=E6=88=90=E8=BE=89?= Date: Tue, 19 May 2020 10:56:22 +0800 Subject: [PATCH 13/13] Adapte ge lib name change form ge_client_train to ge_runner. --- mindspore/ccsrc/CMakeLists.txt | 2 +- tests/ut/cpp/CMakeLists.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mindspore/ccsrc/CMakeLists.txt b/mindspore/ccsrc/CMakeLists.txt index 8d3818a777..37842820a2 100644 --- a/mindspore/ccsrc/CMakeLists.txt +++ b/mindspore/ccsrc/CMakeLists.txt @@ -125,7 +125,7 @@ endif() if (ENABLE_GE) if(ENABLE_TRAIN) - target_link_libraries(mindspore ge_client_train hccl) + target_link_libraries(mindspore ge_runner hccl) else () target_link_libraries(mindspore ge_client) endif () diff --git a/tests/ut/cpp/CMakeLists.txt b/tests/ut/cpp/CMakeLists.txt index f5bc07ff69..8176c4fd37 100644 --- a/tests/ut/cpp/CMakeLists.txt +++ b/tests/ut/cpp/CMakeLists.txt @@ -128,7 +128,7 @@ add_executable(ut_tests ${UT_SRCS} ${MINDSPORE_SRC_LIST} ${UT_SUTB_SRC_LIST}) if (ENABLE_GE) if(ENABLE_TRAIN) - target_link_libraries(ut_tests PRIVATE graph ge_client_train) + target_link_libraries(ut_tests PRIVATE graph ge_runner) else() target_link_libraries(ut_tests PRIVATE graph ge_client) endif()