diff --git a/mindspore/ccsrc/frontend/parallel/ps/common.h b/mindspore/ccsrc/frontend/parallel/ps/common.h
index b0d557dc1f..a021dff1ad 100644
--- a/mindspore/ccsrc/frontend/parallel/ps/common.h
+++ b/mindspore/ccsrc/frontend/parallel/ps/common.h
@@ -56,9 +56,11 @@
 constexpr char kMomentum[] = "momentum";
 constexpr char kApplyMomentum[] = "ApplyMomentum";
 constexpr char kSparseAdam[] = "Adam";
+constexpr char kSparseLazyAdam[] = "LazyAdam";
 constexpr char kSparseFtrl[] = "Ftrl";
 constexpr char kApplyMomentumOp[] = "Momentum";
 constexpr char kSparseAdamOp[] = "Adam";
+constexpr char kSparseLazyAdamOp[] = "LazyAdam";
 constexpr char kSparseFtrlOp[] = "FTRL";
 
 constexpr int kInitWeightsCmd = 10;
diff --git a/mindspore/ccsrc/frontend/parallel/ps/parameter_server.h b/mindspore/ccsrc/frontend/parallel/ps/parameter_server.h
index 6eedbd76b3..5a9c4af38a 100644
--- a/mindspore/ccsrc/frontend/parallel/ps/parameter_server.h
+++ b/mindspore/ccsrc/frontend/parallel/ps/parameter_server.h
@@ -42,6 +42,7 @@
 #include "backend/kernel_compiler/kernel.h"
 #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
 #include "backend/kernel_compiler/cpu/ps/pserver_kernel.h"
+#include "backend/kernel_compiler/cpu/ps/sparse_apply_adam_ps_kernel.h"
 #include "backend/kernel_compiler/cpu/ps/sparse_apply_lazy_adam_ps_kernel.h"
 #include "backend/kernel_compiler/cpu/ps/sparse_apply_ftrl_ps_kernel.h"
 #include "backend/kernel_compiler/cpu/ps/apply_momentum_ps_kernel.h"
@@ -374,6 +375,11 @@ void ParameterServer::InitOptimInputsShape(const Keys &keys, const Values &va
       const CNodePtr cnode = GetCNode(optim_op_name);
       MS_EXCEPTION_IF_NULL(cnode);
       if (optim_name == kSparseAdam) {
+        std::shared_ptr<PServerKernel> optimizer =
+          std::make_shared<kernel::ps::SparseApplyAdamPSKernel>(rank_id_, pserver_num_);
+        optimizer->InitKernel(cnode, optim_inputs_shape_[key]);
+        optimizers_[key] = optimizer;
+      } else if (optim_name == kSparseLazyAdam) {
         std::shared_ptr<PServerKernel> optimizer =
           std::make_shared<kernel::ps::SparseApplyLazyAdamPSKernel>(rank_id_, pserver_num_);
         optimizer->InitKernel(cnode, optim_inputs_shape_[key]);
diff --git a/mindspore/ccsrc/frontend/parallel/ps/util.cc b/mindspore/ccsrc/frontend/parallel/ps/util.cc
index 1bda9c1323..c951006a8c 100644
--- a/mindspore/ccsrc/frontend/parallel/ps/util.cc
+++ b/mindspore/ccsrc/frontend/parallel/ps/util.cc
@@ -25,19 +25,22 @@ namespace ps {
 std::unordered_map<std::string, int> Util::optimizer_to_ids{
   {kApplyMomentum, 0},
   {kSparseAdam, 1},
-  {kSparseFtrl, 2},
+  {kSparseLazyAdam, 2},
+  {kSparseFtrl, 3},
 };
 
 std::unordered_map<int, std::string> Util::id_to_optimizers{
   {0, kApplyMomentum},
   {1, kSparseAdam},
-  {2, kSparseFtrl},
+  {2, kSparseLazyAdam},
+  {3, kSparseFtrl},
 };
 
 std::unordered_map<int, std::string> Util::id_to_optimizer_nodes{
   {0, kApplyMomentumOp},
   {1, kSparseAdamOp},
-  {2, kSparseFtrlOp},
+  {2, kSparseLazyAdamOp},
+  {3, kSparseFtrlOp},
 };
 
 bool Util::IsParamServerMode() { return IsRoleOfWorker() || IsRoleOfPServer() || IsRoleOfScheduler(); }
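
Commentary (not part of the patch): the server-side changes above give LazyAdam its own optimizer name, id, and kernel. The unchanged kernel-construction lines suggest that before this change the "Adam" branch built the lazy-adam PS kernel; now "Adam" gets a dedicated sparse Adam kernel, "LazyAdam" takes over the lazy variant, and Ftrl's id shifts from 2 to 3. A minimal plain-Python sketch of the resulting lookup follows; the dictionaries are illustrative only, not MindSpore API, and the kernel class names are inferred from the included header file names.

# Illustrative mirror of the tables in common.h / util.cc after this patch.
OPTIMIZER_TO_ID = {"ApplyMomentum": 0, "Adam": 1, "LazyAdam": 2, "Ftrl": 3}
ID_TO_OPTIMIZER_NODE = {0: "Momentum", 1: "Adam", 2: "LazyAdam", 3: "FTRL"}
KERNEL_BY_OPTIMIZER = {
    "ApplyMomentum": "ApplyMomentumPSKernel",
    "Adam": "SparseApplyAdamPSKernel",          # newly included header
    "LazyAdam": "SparseApplyLazyAdamPSKernel",  # previously reached through the "Adam" branch
    "Ftrl": "SparseApplyFtrlPSKernel",
}

def resolve_ps_kernel(optim_name):
    """Return (optimizer id, PS kernel name) for the optimizer name a worker registers."""
    return OPTIMIZER_TO_ID[optim_name], KERNEL_BY_OPTIMIZER[optim_name]

assert resolve_ps_kernel("LazyAdam") == (2, "SparseApplyLazyAdamPSKernel")
assert resolve_ps_kernel("Ftrl") == (3, "SparseApplyFtrlPSKernel")
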
"Tensor", "RowTensor", "Tensor", "Tensor", "Tensor", "Bool") +def _run_opt_with_sparse(opt, sparse_opt, push, pull, beta1_power, beta2_power, beta1, beta2, eps, + lr, gradient, params, moment1, moment2, ps_parameter): """Apply sparse lazy adam optimizer to the weight parameter when the gradient is sparse.""" success = True - success = F.depend(success, sparse_opt(params, moment1, moment2, beta1_power, beta2_power, lr, beta1, beta2, - eps, gradient.values, gradient.indices)) + indices = gradient.indices + values = gradient.values + if ps_parameter: + op_shape = P.Shape() + shapes = (op_shape(params), op_shape(moment1), op_shape(moment2), + op_shape(beta1_power), op_shape(beta2_power), op_shape(lr), op_shape(beta1), + op_shape(beta2), op_shape(eps), op_shape(values), op_shape(indices)) + success = F.depend(success, pull(push((beta1_power, beta2_power, lr, beta1, beta2, + eps, values, indices), shapes), params)) + else: + success = F.depend(success, sparse_opt(params, moment1, moment2, beta1_power, beta2_power, lr, beta1, beta2, + eps, values, indices)) return success -@_lazy_adam_opt.register("Function", "Function", "Tensor", "Tensor", "Tensor", "Tensor", "Number", "Tensor", "Tensor", - "Tensor", "Tensor", "Tensor") -def _run_opt_with_one_number(opt, sparse_opt, beta1_power, beta2_power, beta1, beta2, eps, lr, gradient, params, - moment1, moment2): - """Apply adam optimizer to the weight parameter using Tensor.""" +@_lazy_adam_opt.register("Function", "Function", "Function", "Function", "Tensor", "Tensor", "Tensor", "Tensor", + "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Bool") +def _run_opt_with_one_number(opt, sparse_opt, push, pull, beta1_power, beta2_power, beta1, beta2, eps, + lr, gradient, params, moment1, moment2, ps_parameter): + """Apply lazy adam optimizer to the weight parameter using Tensor.""" success = True - success = F.depend(success, opt(params, moment1, moment2, beta1_power, beta2_power, lr, beta1, beta2, - eps, gradient)) + if ps_parameter: + op_shape = P.Shape() + success = F.depend(success, pull(push((beta1_power, beta2_power, lr, beta1, beta2, eps, gradient), + (op_shape(params), op_shape(moment1), op_shape(moment2))), params)) + else: + success = F.depend(success, opt(params, moment1, moment2, beta1_power, beta2_power, lr, beta1, beta2, + eps, gradient)) return success @@ -173,7 +188,7 @@ class LazyAdam(Optimizer): self.beta2 = Tensor(beta2, mstype.float32) self.beta1_power = Parameter(initializer(1, [1], mstype.float32), name="beta1_power") self.beta2_power = Parameter(initializer(1, [1], mstype.float32), name="beta2_power") - self.eps = eps + self.eps = Tensor(eps, mstype.float32) self.use_nesterov = use_nesterov self.use_locking = use_locking @@ -184,6 +199,10 @@ class LazyAdam(Optimizer): self.opt = P.Adam(use_locking, use_nesterov) self.sparse_opt = P.FusedSparseLazyAdam(use_locking, use_nesterov) + self._ps_pull = P.Pull() + self._ps_push = P.Push("Adam", [0, 1, 2]) + self._ps_push.add_prim_attr("use_nesterov", use_nesterov) + def construct(self, gradients): gradients = self.decay_weight(gradients) gradients = self.scale_grad(gradients) @@ -193,11 +212,11 @@ class LazyAdam(Optimizer): self.beta2_power = self.beta2_power * self.beta2 if self.is_group_lr: - success = self.map_(F.partial(_lazy_adam_opt, self.opt, self.sparse_opt, self.beta1_power, - self.beta2_power, self.beta1, self.beta2, self.eps), - lr, gradients, self.parameters, self.moment1, self.moment2) + success = self.map_(F.partial(_lazy_adam_opt, self.opt, self.sparse_opt, 
diff --git a/model_zoo/official/recommend/wide_and_deep/src/wide_and_deep.py b/model_zoo/official/recommend/wide_and_deep/src/wide_and_deep.py
index d66cb0772c..bda889fe87 100644
--- a/model_zoo/official/recommend/wide_and_deep/src/wide_and_deep.py
+++ b/model_zoo/official/recommend/wide_and_deep/src/wide_and_deep.py
@@ -328,20 +328,13 @@ class TrainStepWrap(nn.Cell):
         self.weights_w = ParameterTuple(weights_w)
         self.weights_d = ParameterTuple(weights_d)
 
-        if host_device_mix and is_auto_parallel:
+        if (host_device_mix and is_auto_parallel) or parameter_server:
             self.optimizer_d = LazyAdam(
                 self.weights_d, learning_rate=3.5e-4, eps=1e-8, loss_scale=sens)
             self.optimizer_w = FTRL(learning_rate=5e-2, params=self.weights_w,
                                     l1=1e-8, l2=1e-8, initial_accum=1.0, loss_scale=sens)
             self.optimizer_w.sparse_opt.add_prim_attr("primitive_target", "CPU")
             self.optimizer_d.sparse_opt.add_prim_attr("primitive_target", "CPU")
-        elif parameter_server:
-            self.optimizer_d = Adam(
-                self.weights_d, learning_rate=3.5e-4, eps=1e-8, loss_scale=sens)
-            self.optimizer_w = FTRL(learning_rate=5e-2, params=self.weights_w,
-                                    l1=1e-8, l2=1e-8, initial_accum=1.0, loss_scale=sens)
-            self.optimizer_w.sparse_opt.add_prim_attr("primitive_target", "CPU")
-            self.optimizer_d.sparse_opt.add_prim_attr("primitive_target", "CPU")
         else:
             self.optimizer_d = Adam(
                 self.weights_d, learning_rate=3.5e-4, eps=1e-8, loss_scale=sens)
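
Commentary (not part of the patch): the separate elif parameter_server branch is folded into the host-device-mix branch, so parameter-server runs of Wide&Deep now train the deep part with LazyAdam instead of Adam while keeping FTRL for the wide part and pinning both sparse optimizers to CPU. A small standalone sketch of the resulting deep-part selection, written as a hypothetical helper for illustration only:

def select_deep_optimizer(host_device_mix, is_auto_parallel, parameter_server):
    """Mirror of the deep-part optimizer selection TrainStepWrap ends up with after this patch."""
    if (host_device_mix and is_auto_parallel) or parameter_server:
        return "LazyAdam"  # sparse-friendly; its sparse_opt is pinned to CPU in the real code
    return "Adam"

# Parameter-server runs now pick the same optimizer as host-device mix.
assert select_deep_optimizer(False, False, parameter_server=True) == "LazyAdam"
assert select_deep_optimizer(True, True, parameter_server=False) == "LazyAdam"
assert select_deep_optimizer(False, False, parameter_server=False) == "Adam"
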