
Add LazyAdam support for the parameter server (PS).

ZPaC · 5 years ago · tags/v0.7.0-beta
commit a923747eab
5 changed files with 54 additions and 31 deletions
  1. mindspore/ccsrc/frontend/parallel/ps/common.h (+2 -0)
  2. mindspore/ccsrc/frontend/parallel/ps/parameter_server.h (+6 -0)
  3. mindspore/ccsrc/frontend/parallel/ps/util.cc (+6 -3)
  4. mindspore/nn/optim/lazyadam.py (+39 -20)
  5. model_zoo/official/recommend/wide_and_deep/src/wide_and_deep.py (+1 -8)

mindspore/ccsrc/frontend/parallel/ps/common.h (+2 -0)

@@ -56,9 +56,11 @@ constexpr char kMomentum[] = "momentum";
 
 constexpr char kApplyMomentum[] = "ApplyMomentum";
 constexpr char kSparseAdam[] = "Adam";
+constexpr char kSparseLazyAdam[] = "LazyAdam";
 constexpr char kSparseFtrl[] = "Ftrl";
 constexpr char kApplyMomentumOp[] = "Momentum";
 constexpr char kSparseAdamOp[] = "Adam";
+constexpr char kSparseLazyAdamOp[] = "LazyAdam";
 constexpr char kSparseFtrlOp[] = "FTRL";
 
 constexpr int kInitWeightsCmd = 10;

mindspore/ccsrc/frontend/parallel/ps/parameter_server.h (+6 -0)

@@ -42,6 +42,7 @@
 #include "backend/kernel_compiler/kernel.h"
 #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
 #include "backend/kernel_compiler/cpu/ps/pserver_kernel.h"
+#include "backend/kernel_compiler/cpu/ps/sparse_apply_adam_ps_kernel.h"
 #include "backend/kernel_compiler/cpu/ps/sparse_apply_lazy_adam_ps_kernel.h"
 #include "backend/kernel_compiler/cpu/ps/sparse_apply_ftrl_ps_kernel.h"
 #include "backend/kernel_compiler/cpu/ps/apply_momentum_ps_kernel.h"
@@ -374,6 +375,11 @@ void ParameterServer<T>::InitOptimInputsShape(const Keys &keys, const Values &va
   const CNodePtr cnode = GetCNode(optim_op_name);
   MS_EXCEPTION_IF_NULL(cnode);
   if (optim_name == kSparseAdam) {
+    std::shared_ptr<PServerKernel> optimizer =
+      std::make_shared<kernel::ps::SparseApplyAdamPSKernel>(rank_id_, pserver_num_);
+    optimizer->InitKernel(cnode, optim_inputs_shape_[key]);
+    optimizers_[key] = optimizer;
+  } else if (optim_name == kSparseLazyAdam) {
     std::shared_ptr<PServerKernel> optimizer =
       std::make_shared<kernel::ps::SparseApplyLazyAdamPSKernel>(rank_id_, pserver_num_);
     optimizer->InitKernel(cnode, optim_inputs_shape_[key]);

mindspore/ccsrc/frontend/parallel/ps/util.cc (+6 -3)

@@ -25,19 +25,22 @@ namespace ps {
 std::unordered_map<std::string, int> Util::optimizer_to_ids{
   {kApplyMomentum, 0},
   {kSparseAdam, 1},
-  {kSparseFtrl, 2},
+  {kSparseLazyAdam, 2},
+  {kSparseFtrl, 3},
 };
 
 std::unordered_map<int, std::string> Util::id_to_optimizers{
   {0, kApplyMomentum},
   {1, kSparseAdam},
-  {2, kSparseFtrl},
+  {2, kSparseLazyAdam},
+  {3, kSparseFtrl},
 };
 
 std::unordered_map<int, std::string> Util::id_to_optimizer_nodes{
   {0, kApplyMomentumOp},
   {1, kSparseAdamOp},
-  {2, kSparseFtrlOp},
+  {2, kSparseLazyAdamOp},
+  {3, kSparseFtrlOp},
 };
 
 bool Util::IsParamServerMode() { return IsRoleOfWorker() || IsRoleOfPServer() || IsRoleOfScheduler(); }


mindspore/nn/optim/lazyadam.py (+39 -20)

@@ -27,25 +27,40 @@ from .optimizer import Optimizer
 _lazy_adam_opt = C.MultitypeFuncGraph("lazy_adam_opt")
 
 
-@_lazy_adam_opt.register("Function", "Function", "Tensor", "Tensor", "Tensor", "Tensor", "Number", "Tensor",
-                         "RowTensor", "Tensor", "Tensor", "Tensor")
-def _run_opt_with_sparse(opt, sparse_opt, beta1_power, beta2_power, beta1, beta2, eps, lr, gradient, params,
-                         moment1, moment2):
+@_lazy_adam_opt.register("Function", "Function", "Function", "Function", "Tensor", "Tensor", "Tensor", "Tensor",
+                         "Tensor", "Tensor", "RowTensor", "Tensor", "Tensor", "Tensor", "Bool")
+def _run_opt_with_sparse(opt, sparse_opt, push, pull, beta1_power, beta2_power, beta1, beta2, eps,
+                         lr, gradient, params, moment1, moment2, ps_parameter):
     """Apply sparse lazy adam optimizer to the weight parameter when the gradient is sparse."""
     success = True
-    success = F.depend(success, sparse_opt(params, moment1, moment2, beta1_power, beta2_power, lr, beta1, beta2,
-                                           eps, gradient.values, gradient.indices))
+    indices = gradient.indices
+    values = gradient.values
+    if ps_parameter:
+        op_shape = P.Shape()
+        shapes = (op_shape(params), op_shape(moment1), op_shape(moment2),
+                  op_shape(beta1_power), op_shape(beta2_power), op_shape(lr), op_shape(beta1),
+                  op_shape(beta2), op_shape(eps), op_shape(values), op_shape(indices))
+        success = F.depend(success, pull(push((beta1_power, beta2_power, lr, beta1, beta2,
+                                               eps, values, indices), shapes), params))
+    else:
+        success = F.depend(success, sparse_opt(params, moment1, moment2, beta1_power, beta2_power, lr, beta1, beta2,
+                                               eps, values, indices))
     return success
 
 
-@_lazy_adam_opt.register("Function", "Function", "Tensor", "Tensor", "Tensor", "Tensor", "Number", "Tensor", "Tensor",
-                         "Tensor", "Tensor", "Tensor")
-def _run_opt_with_one_number(opt, sparse_opt, beta1_power, beta2_power, beta1, beta2, eps, lr, gradient, params,
-                             moment1, moment2):
-    """Apply adam optimizer to the weight parameter using Tensor."""
+@_lazy_adam_opt.register("Function", "Function", "Function", "Function", "Tensor", "Tensor", "Tensor", "Tensor",
+                         "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Bool")
+def _run_opt_with_one_number(opt, sparse_opt, push, pull, beta1_power, beta2_power, beta1, beta2, eps,
+                             lr, gradient, params, moment1, moment2, ps_parameter):
+    """Apply lazy adam optimizer to the weight parameter using Tensor."""
     success = True
-    success = F.depend(success, opt(params, moment1, moment2, beta1_power, beta2_power, lr, beta1, beta2,
-                                    eps, gradient))
+    if ps_parameter:
+        op_shape = P.Shape()
+        success = F.depend(success, pull(push((beta1_power, beta2_power, lr, beta1, beta2, eps, gradient),
+                                              (op_shape(params), op_shape(moment1), op_shape(moment2))), params))
+    else:
+        success = F.depend(success, opt(params, moment1, moment2, beta1_power, beta2_power, lr, beta1, beta2,
+                                        eps, gradient))
     return success
 
 
@@ -173,7 +188,7 @@ class LazyAdam(Optimizer):
         self.beta2 = Tensor(beta2, mstype.float32)
         self.beta1_power = Parameter(initializer(1, [1], mstype.float32), name="beta1_power")
         self.beta2_power = Parameter(initializer(1, [1], mstype.float32), name="beta2_power")
-        self.eps = eps
+        self.eps = Tensor(eps, mstype.float32)
         self.use_nesterov = use_nesterov
         self.use_locking = use_locking
 
@@ -184,6 +199,10 @@ class LazyAdam(Optimizer):
         self.opt = P.Adam(use_locking, use_nesterov)
         self.sparse_opt = P.FusedSparseLazyAdam(use_locking, use_nesterov)
 
+        self._ps_pull = P.Pull()
+        self._ps_push = P.Push("Adam", [0, 1, 2])
+        self._ps_push.add_prim_attr("use_nesterov", use_nesterov)
+
     def construct(self, gradients):
         gradients = self.decay_weight(gradients)
         gradients = self.scale_grad(gradients)
@@ -193,11 +212,11 @@ class LazyAdam(Optimizer):
         self.beta2_power = self.beta2_power * self.beta2
 
         if self.is_group_lr:
-            success = self.map_(F.partial(_lazy_adam_opt, self.opt, self.sparse_opt, self.beta1_power,
-                                          self.beta2_power, self.beta1, self.beta2, self.eps),
-                                lr, gradients, self.parameters, self.moment1, self.moment2)
+            success = self.map_(F.partial(_lazy_adam_opt, self.opt, self.sparse_opt, self._ps_push, self._ps_pull,
+                                          self.beta1_power, self.beta2_power, self.beta1, self.beta2, self.eps),
+                                lr, gradients, self.parameters, self.moment1, self.moment2, self.ps_parameters)
         else:
-            success = self.map_(F.partial(_lazy_adam_opt, self.opt, self.sparse_opt, self.beta1_power,
-                                          self.beta2_power, self.beta1, self.beta2, self.eps, lr),
-                                gradients, self.parameters, self.moment1, self.moment2)
+            success = self.map_(F.partial(_lazy_adam_opt, self.opt, self.sparse_opt, self._ps_push, self._ps_pull,
+                                          self.beta1_power, self.beta2_power, self.beta1, self.beta2, self.eps, lr),
+                                gradients, self.parameters, self.moment1, self.moment2, self.ps_parameters)
         return success

model_zoo/official/recommend/wide_and_deep/src/wide_and_deep.py (+1 -8)

@@ -328,20 +328,13 @@ class TrainStepWrap(nn.Cell):
         self.weights_w = ParameterTuple(weights_w)
         self.weights_d = ParameterTuple(weights_d)
-        if host_device_mix and is_auto_parallel:
+        if (host_device_mix and is_auto_parallel) or parameter_server:
             self.optimizer_d = LazyAdam(
                 self.weights_d, learning_rate=3.5e-4, eps=1e-8, loss_scale=sens)
             self.optimizer_w = FTRL(learning_rate=5e-2, params=self.weights_w,
                                     l1=1e-8, l2=1e-8, initial_accum=1.0, loss_scale=sens)
             self.optimizer_w.sparse_opt.add_prim_attr("primitive_target", "CPU")
             self.optimizer_d.sparse_opt.add_prim_attr("primitive_target", "CPU")
-        elif parameter_server:
-            self.optimizer_d = Adam(
-                self.weights_d, learning_rate=3.5e-4, eps=1e-8, loss_scale=sens)
-            self.optimizer_w = FTRL(learning_rate=5e-2, params=self.weights_w,
-                                    l1=1e-8, l2=1e-8, initial_accum=1.0, loss_scale=sens)
-            self.optimizer_w.sparse_opt.add_prim_attr("primitive_target", "CPU")
-            self.optimizer_d.sparse_opt.add_prim_attr("primitive_target", "CPU")
         else:
             self.optimizer_d = Adam(
                 self.weights_d, learning_rate=3.5e-4, eps=1e-8, loss_scale=sens)
