From 3c2057297e8e84398127a9c5cd8f1858ee385adf Mon Sep 17 00:00:00 2001
From: panyifeng
Date: Fri, 19 Jun 2020 14:46:24 +0800
Subject: [PATCH] support multi param for tuple grad

---
 mindspore/ccsrc/operator/prim_others.cc       | 43 ++++++++++++++-----
 mindspore/ccsrc/pipeline/action.cc            |  3 +-
 .../pipeline/static_analysis/abstract_value.h |  8 ++--
 mindspore/common/parameter.py                 | 10 ++---
 mindspore/nn/optim/adam.py                    |  2 +-
 mindspore/nn/optim/ftrl.py                    |  2 +-
 mindspore/nn/optim/lazyadam.py                |  5 ++-
 mindspore/ops/composite/base.py               | 38 +++-
 tests/ut/python/nn/optim/test_adam.py         |  3 +-
 .../nn/optim/test_adam_with_tuple_grad.py     |  4 +-
 tests/ut/python/nn/optim/test_ftrl.py         |  3 +-
 tests/ut/python/nn/optim/test_lazyadam.py     |  3 +-
 .../python/nn/optim/test_proximal_ada_grad.py |  3 +-
 13 files changed, 65 insertions(+), 62 deletions(-)

diff --git a/mindspore/ccsrc/operator/prim_others.cc b/mindspore/ccsrc/operator/prim_others.cc
index a7323ed3cb..432b12f83b 100644
--- a/mindspore/ccsrc/operator/prim_others.cc
+++ b/mindspore/ccsrc/operator/prim_others.cc
@@ -59,7 +59,8 @@ class UndeterminedShapeType {
  public:
   explicit UndeterminedShapeType(const std::string &env_str) {
     // param_name indices_shape indices_type values_shape values_type dense_shape
-    // export UNDETERMINED_SPARSE_SHAPE_TYPES="w1:2:Int32:2 1 2:Float32:3 1 2"
+    // export UNDETERMINED_SPARSE_SHAPE_TYPES="sparse_key_w1:2:Int32:2 1 2:Float32:3 1 2;sparse_key_w2:2:Int32:2 1
+    // 2:Float32:3 1 2"
     std::vector<std::string> fields;
     string tmp;
     std::stringstream input(env_str);
@@ -115,6 +116,20 @@ std::vector<int> UndeterminedShapeType::GetShape(const std::string &shape_str) {
 }
 const size_t UndeterminedShapeType::fields_num = 6;
 
+std::unordered_map<std::string, UndeterminedShapeType> g_undetermined_configs;
+void InitUndeterminedFromEnv(const std::string &sparse_shape_types) {
+  if (!g_undetermined_configs.empty()) {
+    return;
+  }
+  std::string tmp;
+  std::stringstream input(sparse_shape_types);
+  while (std::getline(input, tmp, ';')) {
+    auto config = UndeterminedShapeType(tmp);
+    g_undetermined_configs.insert(std::make_pair(config.param_name(), config));
+    MS_LOG(DEBUG) << "Undetermined config from env: " << tmp;
+  }
+}
+
 AbstractBasePtr InferImplEnvGetItem(const AnalysisEnginePtr &, const PrimitivePtr &primitive,
                                     const AbstractBasePtrList &args_spec_list) {
   MS_EXCEPTION_IF_NULL(primitive);
@@ -128,27 +143,33 @@ AbstractBasePtr InferImplEnvGetItem(const AnalysisEnginePt
     MS_LOG(EXCEPTION) << "EnvGetItem evaluator args[1] should be a SymbolicKeyInstance but: " << key->ToString();
   }
 
-  if (key->sparse_grad()) {
+  if (!key->sparse_grad().empty()) {
     // Will be fixed once undetermined type ready
     auto sparse_shape_types = common::GetEnv("UNDETERMINED_SPARSE_SHAPE_TYPES");
     if (sparse_shape_types.empty()) {
-      sparse_shape_types = "w1:2:Int32:2 1 2:Float32:3 1 2";
+      sparse_shape_types = "sparse_key_w1:2:Int32:2 1 2:Float32:3 1 2;sparse_key_w2:2:Int32:2 1 2:Float32:3 1 2";
     }
-    MS_LOG(DEBUG) << "EnvGetItem is sparse_grad " << key->ToString() << ", Undetermined shape is "
-                  << sparse_shape_types;
+    InitUndeterminedFromEnv(sparse_shape_types);
 
-    auto shape_types = UndeterminedShapeType(sparse_shape_types);
+    auto shape_types = g_undetermined_configs.find(key->sparse_grad());
+    if (shape_types == g_undetermined_configs.end()) {
+      MS_LOG(EXCEPTION) << "Param " << key->ToString()
+                        << " has sparse_grad, but shape/type is not configured in env UNDETERMINED_SPARSE_SHAPE_TYPES: "
+                        << sparse_shape_types;
+    }
+    MS_LOG(DEBUG) << "EnvGetItem is sparse_grad " << key->ToString();
     AbstractBasePtrList sparse_list;
     // indices
-    auto indices_ele = std::make_shared<AbstractScalar>(kAnyValue, shape_types.indices_type());
-    auto indices = std::make_shared<AbstractTensor>(indices_ele, std::make_shared<Shape>(shape_types.indices_shape()));
+    auto indices_ele = std::make_shared<AbstractScalar>(kAnyValue, shape_types->second.indices_type());
+    auto indices =
+      std::make_shared<AbstractTensor>(indices_ele, std::make_shared<Shape>(shape_types->second.indices_shape()));
     sparse_list.emplace_back(indices);
     // values
-    auto dout_ele = std::make_shared<AbstractScalar>(kAnyValue, shape_types.values_type());
-    auto dout = std::make_shared<AbstractTensor>(dout_ele, std::make_shared<Shape>(shape_types.values_shape()));
+    auto dout_ele = std::make_shared<AbstractScalar>(kAnyValue, shape_types->second.values_type());
+    auto dout = std::make_shared<AbstractTensor>(dout_ele, std::make_shared<Shape>(shape_types->second.values_shape()));
    sparse_list.emplace_back(dout);
     // dense_shape
-    sparse_list.emplace_back(std::make_shared<AbstractTuple>(shape_types.dense_shape()));
+    sparse_list.emplace_back(std::make_shared<AbstractTuple>(shape_types->second.dense_shape()));
     return std::make_shared<AbstractTuple>(sparse_list);
   }
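
Each entry in UNDETERMINED_SPARSE_SHAPE_TYPES is keyed by a parameter's `sparse_grad` string, so several sparse parameters can be configured at once: InitUndeterminedFromEnv splits the value on ';' and splits each entry on ':' into the six fields named in the comment above. A rough Python mirror of that parsing, for reference only (the helper name and dict layout are illustrative, not part of the patch):

# Illustrative Python mirror of InitUndeterminedFromEnv: split the env value on ';'
# into per-parameter entries, then split each entry on ':' into the six fields from
# the C++ comment (param_name indices_shape indices_type values_shape values_type dense_shape).
import os

FIELDS = ("param_name", "indices_shape", "indices_type",
          "values_shape", "values_type", "dense_shape")

def parse_undetermined_configs(env_value):
    """Return a dict mapping param_name to its field dict."""
    configs = {}
    for entry in env_value.split(";"):
        values = entry.split(":")
        if len(values) != len(FIELDS):
            raise ValueError("expected %d fields, got: %r" % (len(FIELDS), entry))
        config = dict(zip(FIELDS, values))
        configs[config["param_name"]] = config
    return configs

if __name__ == "__main__":
    env = os.environ.get(
        "UNDETERMINED_SPARSE_SHAPE_TYPES",
        "sparse_key_w1:2:Int32:2 1 2:Float32:3 1 2;sparse_key_w2:2:Int32:2 1 2:Float32:3 1 2")
    print(parse_undetermined_configs(env))
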
diff --git a/mindspore/ccsrc/pipeline/action.cc b/mindspore/ccsrc/pipeline/action.cc
index f127305d1b..5799ba35bd 100644
--- a/mindspore/ccsrc/pipeline/action.cc
+++ b/mindspore/ccsrc/pipeline/action.cc
@@ -229,7 +229,8 @@ bool AbstractSpecializeAction(const ResourcePtr &res) {
     if (param_node->has_default()) {
       auto param_value = std::dynamic_pointer_cast<ParamValuePy>(param_node->default_param());
       AbstractBasePtr ptr = abstract::FromValue(parse::data_converter::PyDataToValue(param_value->value()), true);
-      auto sparse_grad = py::cast<bool>(parse::python_adapter::GetPyObjAttr(param_value->value(), "sparse_grad"));
+      auto sparse_grad =
+        py::cast<std::string>(parse::python_adapter::GetPyObjAttr(param_value->value(), "sparse_grad"));
       ptr->set_sparse_grad(sparse_grad);
 
       parallel::ParallelParameterContextRestoreInNoTraining(func_graph, param_node, ptr);
diff --git a/mindspore/ccsrc/pipeline/static_analysis/abstract_value.h b/mindspore/ccsrc/pipeline/static_analysis/abstract_value.h
index dcd6f8f951..f3375d22d6 100644
--- a/mindspore/ccsrc/pipeline/static_analysis/abstract_value.h
+++ b/mindspore/ccsrc/pipeline/static_analysis/abstract_value.h
@@ -44,7 +44,7 @@ class AbstractBase : public Base {
  public:
   explicit AbstractBase(const ValuePtr &value = nullptr, const TypePtr &type = kAnyType,
                         const BaseShapePtr &shape = kNoShape)
-      : value_(value), type_(type), shape_(shape), sparse_grad_(false) {}
+      : value_(value), type_(type), shape_(shape), sparse_grad_("") {}
   ~AbstractBase() override = default;
   MS_DECLARE_PARENT(AbstractBase, Base)
 
@@ -53,13 +53,13 @@ class AbstractBase : public Base {
   virtual bool operator==(const AbstractBase &other) const;
 
   void set_value(const ValuePtr &value) { value_ = value; }
-  void set_sparse_grad(const bool &sparse_grad) { sparse_grad_ = sparse_grad; }
+  void set_sparse_grad(const std::string &sparse_grad) { sparse_grad_ = sparse_grad; }
   void set_type(const TypePtr &type) { type_ = type; }
   void set_shape(const BaseShapePtr &shape) { shape_ = shape; }
   void set_value_desc(const std::string &desc) { value_desc_ = desc; }
   const std::string &value_desc() const { return value_desc_; }
   ValuePtr GetValueTrack() const { return value_; }
-  bool sparse_grad() const { return sparse_grad_; }
+  const std::string &sparse_grad() const { return sparse_grad_; }
   TypePtr GetTypeTrack() const { return type_; }
   BaseShapePtr GetShapeTrack() const { return shape_; }
 
@@ -87,7 +87,7 @@ class AbstractBase : public Base {
   TypePtr type_;
   BaseShapePtr shape_;
   std::string value_desc_;  // store initial value description for error report
-  bool sparse_grad_;
+  std::string sparse_grad_;
 };
 
 class AbstractScalar : public AbstractBase {
diff --git a/mindspore/common/parameter.py b/mindspore/common/parameter.py
index 69affee2c3..6dca4fca9c 100644
--- a/mindspore/common/parameter.py
+++ b/mindspore/common/parameter.py
@@ -51,9 +51,9 @@ class Parameter:
         requires_grad (bool): True if the parameter requires gradient. Default: True.
         layerwise_parallel (bool): A kind of model parallel mode. When layerwise_parallel is true in paralle mode,
             broadcast and gradients communication would not be applied on parameters. Default: False.
-        sparse_grad (bool): True if the parameter's gradient is sparse. Default: False.
+        sparse_grad (str): Set if the parameter's gradient is sparse. Default: empty.
     """
-    def __init__(self, default_input, name, requires_grad=True, layerwise_parallel=False, sparse_grad=False):
+    def __init__(self, default_input, name, requires_grad=True, layerwise_parallel=False, sparse_grad=""):
         self.set_parameter_data(default_input)
         self.name = name
         self.requires_grad = requires_grad
@@ -181,9 +181,9 @@ class Parameter:
         return self._sparse_grad
 
     @sparse_grad.setter
-    def sparse_grad(self, value=True):
-        if not isinstance(value, bool):
-            raise TypeError("`sparse_grad` parameter must be bool type")
+    def sparse_grad(self, value=""):
+        if not isinstance(value, str):
+            raise TypeError("`sparse_grad` parameter must be str type")
         self._sparse_grad = value
 
     @property
diff --git a/mindspore/nn/optim/adam.py b/mindspore/nn/optim/adam.py
index 92cab56a05..786be1bd0b 100755
--- a/mindspore/nn/optim/adam.py
+++ b/mindspore/nn/optim/adam.py
@@ -156,7 +156,7 @@ class Adam(Optimizer):
         To improve parameter groups performance, the customized order of parameters can be supported.
 
         The sparse strategy is applied while the SparseGatherV2 operator being used for forward network and the
-        `sparse_grad` of `Parameter` being set as True. The sparse feature is under continuous development. The sparse
+        `sparse_grad` of `Parameter` being set. The sparse feature is under continuous development. The sparse
         behavior is currently performed on the CPU, weight decay is not supported.
 
     Args:
diff --git a/mindspore/nn/optim/ftrl.py b/mindspore/nn/optim/ftrl.py
index d1f49a3791..a40d6737cb 100644
--- a/mindspore/nn/optim/ftrl.py
+++ b/mindspore/nn/optim/ftrl.py
@@ -72,7 +72,7 @@ class FTRL(Optimizer):
 
     Note:
         The sparse strategy is applied while the SparseGatherV2 operator being used for forward network and the
-        `sparse_grad` of `Parameter` being set as True. The sparse feature is under continuous development. The sparse
+        `sparse_grad` of `Parameter` being set. The sparse feature is under continuous development. The sparse
         behavior is currently performed on the CPU, weight decay is not supported.
 
     Args:
diff --git a/mindspore/nn/optim/lazyadam.py b/mindspore/nn/optim/lazyadam.py
index d9df717b8a..48d33bf798 100644
--- a/mindspore/nn/optim/lazyadam.py
+++ b/mindspore/nn/optim/lazyadam.py
@@ -92,9 +92,10 @@ class LazyAdam(Optimizer):
         applied on the parameters if `weight_decay` > 0 and the 'beta' and 'gamma' are not in the name of parameters.
 
         The sparse strategy is applied while the SparseGatherV2 operator being used for forward network and the
-        `sparse_grad` of `Parameter` being set as True. The sparse behavior, to be notice, is not equivalent to the
+        `sparse_grad` of `Parameter` being set. The sparse behavior, to be notice, is not equivalent to the
         original Adam algorithm, as only the current indices parames will be updated. The sparse feature is under
-        continuous development. The sparse behavior is currently performed on the CPU, weight decay is not supported.
+        continuous development. The sparse behavior is currently performed on the CPU, weight decay is
+        not supported.
 
     Args:
         params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated,
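
With `sparse_grad` now a string key (an empty string means a dense gradient), tagging parameters looks as follows; a minimal sketch assuming the patched `Parameter` is importable from `mindspore`, with illustrative names and key strings:

# Minimal sketch: sparse_grad carries a string key instead of True/False.
# The key is what EnvGetItem looks up in UNDETERMINED_SPARSE_SHAPE_TYPES.
import numpy as np
from mindspore import Tensor, Parameter

# Sparse parameter: tag it with a key that matches an entry in the env config.
w1 = Parameter(Tensor(np.ones([3, 1, 2]).astype(np.float32)),
               name="w1", sparse_grad="sparse_key_w1")
# Dense parameter: leave sparse_grad at its default, the empty string.
w2 = Parameter(Tensor(np.ones([2, 1, 2]).astype(np.float32)), name="w2")

assert w1.sparse_grad == "sparse_key_w1"   # string key, no longer a bool
assert w2.sparse_grad == ""                # dense by default
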
diff --git a/mindspore/ops/composite/base.py b/mindspore/ops/composite/base.py
index 63e83a126c..e283867684 100644
--- a/mindspore/ops/composite/base.py
+++ b/mindspore/ops/composite/base.py
@@ -241,6 +241,7 @@ class HyperMap(HyperMap_):
             return func(*args_list)
         return tuple(map(hypermap, *args_list))
 
+
 class Map(Map_):
     """
     Map will apply the set operation on input sequences.
@@ -271,37 +272,12 @@ class Map(Map_):
         Map_.__init__(self)
 
     def __call__(self, *args):
-        func = args[0]
-        count = 0
-        count_max = 1
-        args_list = args[1:]
-        if self.ops is not None:
-            func = self.ops
-            args_list = args
-        for item in args_list:
-            if isinstance(item, (tuple, list)):
-                count_max = len(item)
-                break
-
-        def get_item(x):
-            nonlocal count
-            if isinstance(x, (tuple, list)):
-                return x[count]
-            return x
-
-        for i in range(count_max):
-            true_args = tuple(map(get_item, args_list))
-            func(*true_args)
-            count = i + 1
-        return True
-
-    def register(self, *type_names):
-        """Register a function for the given type string."""
-
-        def deco(fn):
-            self.register_fn(type_names, fn)
-            return fn
-        return deco
+        func = self.ops
+        args_list = args
+        if self.ops is None:
+            func = args[0]
+            args_list = args[1:]
+        return tuple(map(func, *args_list))
 
 
 class _ListAppend(ListAppend_):
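
The rewritten `Map.__call__` above simply forwards to Python's built-in `map` and returns a tuple, instead of iterating with a manual counter. A standalone sketch of the new control flow (plain Python, independent of `Map_`):

# Standalone sketch of the simplified Map.__call__ logic: if an operation was bound at
# construction time, map it over all positional args; otherwise the first positional
# argument is the function and the remaining args are the sequences to map over.
def map_call(ops, *args):
    func = ops
    args_list = args
    if ops is None:
        func = args[0]
        args_list = args[1:]
    return tuple(map(func, *args_list))

# Both call styles yield an element-wise tuple of results.
assert map_call(None, lambda x, y: x + y, (1, 2, 3), (10, 20, 30)) == (11, 22, 33)
assert map_call(lambda x: x * x, (1, 2, 3)) == (1, 4, 9)
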
diff --git a/tests/ut/python/nn/optim/test_adam.py b/tests/ut/python/nn/optim/test_adam.py
index be22c8abdc..3fd18b9664 100644
--- a/tests/ut/python/nn/optim/test_adam.py
+++ b/tests/ut/python/nn/optim/test_adam.py
@@ -53,7 +53,8 @@ class NetWithSparseGatherV2(nn.Cell):
     """ NetWithSparseGatherV2 definition """
     def __init__(self):
         super(NetWithSparseGatherV2, self).__init__()
-        self.weight1 = Parameter(Tensor(np.ones([3, 1, 2]).astype(np.float32)), name="weight1", sparse_grad=True)
+        self.weight1 = Parameter(Tensor(np.ones([3, 1, 2]).astype(np.float32)),
+                                 name="weight1", sparse_grad="sparse_key_w1")
         self.weight2 = Parameter(Tensor(np.ones([2, 1, 2]).astype((np.float32))), name="weight2")
         self.axis = 0
         self.gather = P.SparseGatherV2()
diff --git a/tests/ut/python/nn/optim/test_adam_with_tuple_grad.py b/tests/ut/python/nn/optim/test_adam_with_tuple_grad.py
index 86ea99b1ae..5222f920ba 100644
--- a/tests/ut/python/nn/optim/test_adam_with_tuple_grad.py
+++ b/tests/ut/python/nn/optim/test_adam_with_tuple_grad.py
@@ -154,8 +154,8 @@ def test_AdamWeightDecaySparse():
     class NetWithSparseGatherV2(nn.Cell):
         def __init__(self):
             super(NetWithSparseGatherV2, self).__init__()
-            self.w1 = Parameter(Tensor(np.ones([3, 1, 2]).astype(np.float32)), name="w1", sparse_grad=True)
-            self.w2 = Parameter(Tensor(np.ones([2, 1, 2]).astype(np.float32)), name="w2")
+            self.w1 = Parameter(Tensor(np.ones([3, 1, 2]).astype(np.float32)), name="w1", sparse_grad="sparse_key_w1")
+            self.w2 = Parameter(Tensor(np.ones([2, 1, 2]).astype(np.float32)), name="w2", sparse_grad="sparse_key_w2")
             self.gatherv2 = P.SparseGatherV2()
             self.axis = 0
         def construct(self, indices):
diff --git a/tests/ut/python/nn/optim/test_ftrl.py b/tests/ut/python/nn/optim/test_ftrl.py
index 213ce6c460..f0f094c177 100644
--- a/tests/ut/python/nn/optim/test_ftrl.py
+++ b/tests/ut/python/nn/optim/test_ftrl.py
@@ -41,7 +41,8 @@ class NetWithSparseGatherV2(nn.Cell):
     """ NetWithSparseGatherV2 definition """
     def __init__(self):
         super(NetWithSparseGatherV2, self).__init__()
-        self.weight1 = Parameter(Tensor(np.ones([3, 1, 2]).astype(np.float32)), name="weight1", sparse_grad=True)
+        self.weight1 = Parameter(Tensor(np.ones([3, 1, 2]).astype(np.float32)),
+                                 name="weight1", sparse_grad="sparse_key_w1")
         self.weight2 = Parameter(Tensor(np.ones([2, 1, 2]).astype((np.float32))), name="weight2")
         self.axis = 0
         self.gather = P.SparseGatherV2()
diff --git a/tests/ut/python/nn/optim/test_lazyadam.py b/tests/ut/python/nn/optim/test_lazyadam.py
index 77b02f9ff9..713fffc50d 100644
--- a/tests/ut/python/nn/optim/test_lazyadam.py
+++ b/tests/ut/python/nn/optim/test_lazyadam.py
@@ -43,7 +43,8 @@ class NetWithSparseGatherV2(nn.Cell):
     """ NetWithSparseGatherV2 definition """
     def __init__(self):
         super(NetWithSparseGatherV2, self).__init__()
-        self.weight1 = Parameter(Tensor(np.ones([3, 1, 2]).astype(np.float32)), name="weight1", sparse_grad=True)
+        self.weight1 = Parameter(Tensor(np.ones([3, 1, 2]).astype(np.float32)),
+                                 name="weight1", sparse_grad="sparse_key_w1")
         self.weight2 = Parameter(Tensor(np.ones([2, 1, 2]).astype((np.float32))), name="weight2")
         self.axis = 0
         self.gather = P.SparseGatherV2()
diff --git a/tests/ut/python/nn/optim/test_proximal_ada_grad.py b/tests/ut/python/nn/optim/test_proximal_ada_grad.py
index 52e418d39b..a43a4ad23d 100644
--- a/tests/ut/python/nn/optim/test_proximal_ada_grad.py
+++ b/tests/ut/python/nn/optim/test_proximal_ada_grad.py
@@ -40,7 +40,8 @@ class NetWithSparseGatherV2(nn.Cell):
     """ NetWithSparseGatherV2 definition """
    def __init__(self):
         super(NetWithSparseGatherV2, self).__init__()
-        self.weight1 = Parameter(Tensor(np.ones([3, 1, 2]).astype(np.float32)), name="weight1", sparse_grad=True)
+        self.weight1 = Parameter(Tensor(np.ones([3, 1, 2]).astype(np.float32)), name="weight1",
+                                 sparse_grad="sparse_key_w1")
         self.weight2 = Parameter(Tensor(np.ones([2, 1, 2]).astype(np.float32)), name="weight2")
         self.axis = 0
         self.gather = P.SparseGatherV2()
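
Taken together, the test updates exercise two sparse parameters with distinct keys, which is the multi-parameter case this patch enables. A hypothetical end-to-end setup (the env value mirrors the default built into prim_others.cc; the network and its construct body are illustrative, not copied from the tests):

# Hypothetical setup for two sparse parameters (assumes a MindSpore build with this patch
# and the CPU sparse path). Each sparse_grad key needs a matching entry in
# UNDETERMINED_SPARSE_SHAPE_TYPES so EnvGetItem can infer the sparse gradient layout.
import os
import numpy as np

os.environ["UNDETERMINED_SPARSE_SHAPE_TYPES"] = (
    "sparse_key_w1:2:Int32:2 1 2:Float32:3 1 2;"
    "sparse_key_w2:2:Int32:2 1 2:Float32:3 1 2")

import mindspore.nn as nn
from mindspore import Tensor, Parameter
from mindspore.ops import operations as P

class TwoSparseParamNet(nn.Cell):
    def __init__(self):
        super(TwoSparseParamNet, self).__init__()
        self.w1 = Parameter(Tensor(np.ones([3, 1, 2]).astype(np.float32)),
                            name="w1", sparse_grad="sparse_key_w1")
        self.w2 = Parameter(Tensor(np.ones([2, 1, 2]).astype(np.float32)),
                            name="w2", sparse_grad="sparse_key_w2")
        self.gatherv2 = P.SparseGatherV2()
        self.axis = 0

    def construct(self, indices):
        # Illustrative body: both parameters are read through SparseGatherV2 so that
        # each receives a sparse gradient (indices must be valid for both weights).
        return self.gatherv2(self.w1, indices, self.axis) + self.gatherv2(self.w2, indices, self.axis)
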