| @@ -34,7 +34,8 @@ namespace parallel { | |||||
| #define OPERATOR_TO_OPERATOR_CONNECTOR "-" | #define OPERATOR_TO_OPERATOR_CONNECTOR "-" | ||||
| #define DEFAULT_DEVICE_MEMORY_CAPACITY (1024.0 * 1024.0 * 1024.0 * 16.0) | #define DEFAULT_DEVICE_MEMORY_CAPACITY (1024.0 * 1024.0 * 1024.0 * 16.0) | ||||
| #define DEFAULT_COST_MODEL_ALPHA 1.0 | #define DEFAULT_COST_MODEL_ALPHA 1.0 | ||||
| #define DEFAULT_COST_MODEL_BETA 400.0 | |||||
| #define DEFAULT_COST_MODEL_BETA_ASCEND 400.0 // for 'device_target = Ascend' | |||||
| #define DEFAULT_COST_MODEL_BETA_GPU 50.0 // for 'device_target = GPU' | |||||
| #define DEFAULT_COST_MODEL_GAMMA 0.001 | #define DEFAULT_COST_MODEL_GAMMA 0.001 | ||||
| #define DEFAULT_COST_MODEL_SIMPLIFY_CALCULATION true | #define DEFAULT_COST_MODEL_SIMPLIFY_CALCULATION true | ||||
| #define DEFAULT_COST_MODEL_COMMUNI_THRESHOLD 2048.0 | #define DEFAULT_COST_MODEL_COMMUNI_THRESHOLD 2048.0 | ||||
| @@ -73,7 +74,7 @@ class CostGraph { | |||||
| CostGraph() { | CostGraph() { | ||||
| dev_memory_ = DEFAULT_DEVICE_MEMORY_CAPACITY; | dev_memory_ = DEFAULT_DEVICE_MEMORY_CAPACITY; | ||||
| costmodel_alpha_ = DEFAULT_COST_MODEL_ALPHA; | costmodel_alpha_ = DEFAULT_COST_MODEL_ALPHA; | ||||
| costmodel_beta_ = DEFAULT_COST_MODEL_BETA; | |||||
| costmodel_beta_ = DEFAULT_COST_MODEL_BETA_ASCEND; | |||||
| } | } | ||||
| ~CostGraph() = default; | ~CostGraph() = default; | ||||
| void AddOperator(const OperatorInfoPtr &op) { ops_.push_back(op); } | void AddOperator(const OperatorInfoPtr &op) { ops_.push_back(op); } | ||||
| @@ -20,6 +20,7 @@ | |||||
| #include "frontend/parallel/allreduce_fusion/allreduce_fusion.h" | #include "frontend/parallel/allreduce_fusion/allreduce_fusion.h" | ||||
| #include "frontend/parallel/auto_parallel/graph_costmodel.h" | #include "frontend/parallel/auto_parallel/graph_costmodel.h" | ||||
| #include "utils/context/ms_context.h" | |||||
| namespace mindspore { | namespace mindspore { | ||||
| namespace parallel { | namespace parallel { | ||||
| @@ -41,7 +42,7 @@ CostModelContext::CostModelContext() { | |||||
| void CostModelContext::ResetCostModel() { | void CostModelContext::ResetCostModel() { | ||||
| device_memory_capacity_ = DEFAULT_DEVICE_MEMORY_CAPACITY; | device_memory_capacity_ = DEFAULT_DEVICE_MEMORY_CAPACITY; | ||||
| costmodel_alpha_ = DEFAULT_COST_MODEL_ALPHA; | costmodel_alpha_ = DEFAULT_COST_MODEL_ALPHA; | ||||
| costmodel_beta_ = DEFAULT_COST_MODEL_BETA; | |||||
| costmodel_beta_ = DEFAULT_COST_MODEL_BETA_ASCEND; | |||||
| costmodel_gamma_ = DEFAULT_COST_MODEL_GAMMA; | costmodel_gamma_ = DEFAULT_COST_MODEL_GAMMA; | ||||
| costmodel_communi_threshold_ = DEFAULT_COST_MODEL_COMMUNI_THRESHOLD; | costmodel_communi_threshold_ = DEFAULT_COST_MODEL_COMMUNI_THRESHOLD; | ||||
| costmodel_communi_const_ = DEFAULT_COST_MODEL_COMMUNI_CONST; | costmodel_communi_const_ = DEFAULT_COST_MODEL_COMMUNI_CONST; | ||||
| @@ -66,6 +67,12 @@ void CostModelContext::ResetAlgoParameters() { | |||||
| elementwise_stra_follow_ = DEFAULT_ELEMENTWISE_OP_STRA_FOLLOW; | elementwise_stra_follow_ = DEFAULT_ELEMENTWISE_OP_STRA_FOLLOW; | ||||
| } | } | ||||
| void CostModelContext::set_costmodel_context_for_device(const std::string &device_target) { | |||||
| if (device_target == kGPUDevice) { | |||||
| costmodel_beta_ = DEFAULT_COST_MODEL_BETA_GPU; | |||||
| } | |||||
| } | |||||
| void CostModelContext::set_device_memory_capacity(double dm_capacity) { device_memory_capacity_ = dm_capacity; } | void CostModelContext::set_device_memory_capacity(double dm_capacity) { device_memory_capacity_ = dm_capacity; } | ||||
| void CostModelContext::set_costmodel_alpha(double cm_alpha) { costmodel_alpha_ = cm_alpha; } | void CostModelContext::set_costmodel_alpha(double cm_alpha) { costmodel_alpha_ = cm_alpha; } | ||||
| @@ -35,6 +35,7 @@ class CostModelContext { | |||||
| static std::shared_ptr<CostModelContext> GetInstance(); | static std::shared_ptr<CostModelContext> GetInstance(); | ||||
| void set_costmodel_context_for_device(const std::string &); | |||||
| // DEVICE_MEMORY_CAPACITY | // DEVICE_MEMORY_CAPACITY | ||||
| void set_device_memory_capacity(double); | void set_device_memory_capacity(double); | ||||
| double device_memory_capacity() const { return device_memory_capacity_; } | double device_memory_capacity() const { return device_memory_capacity_; } | ||||
| @@ -21,6 +21,7 @@ | |||||
| #include "./common.h" | #include "./common.h" | ||||
| #include "utils/convert_utils.h" | #include "utils/convert_utils.h" | ||||
| #include "utils/tensorprint_utils.h" | #include "utils/tensorprint_utils.h" | ||||
| #include "frontend/parallel/costmodel_context.h" | |||||
| #ifndef NO_DLIB | #ifndef NO_DLIB | ||||
| #include "tdt/tsd_client.h" | #include "tdt/tsd_client.h" | ||||
| #include "tdt/tdt_host_interface.h" | #include "tdt/tdt_host_interface.h" | ||||
| @@ -146,6 +147,7 @@ bool MsContext::set_device_target(const std::string &target) { | |||||
| } else { | } else { | ||||
| device_target_ = target; | device_target_ = target; | ||||
| } | } | ||||
| parallel::CostModelContext::GetInstance()->set_costmodel_context_for_device(device_target_); | |||||
| MS_LOG(INFO) << "ms set context device target:" << target; | MS_LOG(INFO) << "ms set context device target:" << target; | ||||
| return true; | return true; | ||||
| } | } | ||||
| @@ -678,3 +678,56 @@ def test_train_64k_8p(batch_size=32, num_classes=65536): # 1048576 #131072 #327 | |||||
| assert v == [[1, 1], [dev_num, 1]] | assert v == [[1, 1], [dev_num, 1]] | ||||
| elif re.search('ReduceSum-op', k) is not None: | elif re.search('ReduceSum-op', k) is not None: | ||||
| assert v == [[1, dev_num]] | assert v == [[1, dev_num]] | ||||
def test_train_8k_8p_gpu(batch_size=32, num_classes=8192):
    """Auto-parallel ResNet-50 training on the GPU target with 8 devices.

    Builds a constant input batch with labels cycling over ``num_classes``,
    runs ``model.train`` without dataset sinking, then checks the strategy
    reported for every Conv2D, MatMul and ReduceSum operator.
    """
    dev_num = 8
    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
    context.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL, device_num=dev_num)
    set_algo_parameters(elementwise_op_strategy_follow=True)
    resset_op_id()
    np.random.seed(6)

    # All-ones images; label of sample i is i % num_classes, stored as int32.
    images = np.ones([batch_size, 3, 224, 224]).astype(np.float32)
    labels = (np.arange(batch_size) % num_classes).astype(np.int32)
    dataset = DatasetLenet(Tensor(images), Tensor(labels), 1)

    net = resnet50(num_classes)
    loss = SoftmaxCrossEntropyExpand(sparse=True)
    trainable_params = filter(lambda x: x.requires_grad, net.get_parameters())
    opt = Momentum(trainable_params, 0.01, 0.9)
    model = Model(net, loss_fn=loss, optimizer=opt)
    model.train(5, dataset, dataset_sink_mode=False)

    # Verify the strategy selected for each operator of interest.
    strategies = _executor._get_strategy(model._train_network)
    for op_name, strategy in strategies.items():
        if re.search('Conv2D-op', op_name) is not None:
            assert strategy[0][0] == dev_num
        elif re.search('MatMul-op', op_name) is not None:
            assert strategy == [[1, 1], [dev_num, 1]]
        elif re.search('ReduceSum-op', op_name) is not None:
            assert strategy == [[1, dev_num]]
def test_train_4k_8p_gpu(batch_size=32, num_classes=4096):
    """Auto-parallel ResNet-50 training on the GPU target with 8 devices.

    Builds a constant input batch with labels cycling over ``num_classes``,
    runs ``model.train`` without dataset sinking, then checks the strategy
    reported for every Conv2D, MatMul and ReduceSum operator.
    """
    dev_num = 8
    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
    context.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL, device_num=dev_num)
    set_algo_parameters(elementwise_op_strategy_follow=True)
    resset_op_id()
    np.random.seed(6)

    # All-ones images; label of sample i is i % num_classes, stored as int32.
    images = np.ones([batch_size, 3, 224, 224]).astype(np.float32)
    labels = (np.arange(batch_size) % num_classes).astype(np.int32)
    dataset = DatasetLenet(Tensor(images), Tensor(labels), 1)

    net = resnet50(num_classes)
    loss = SoftmaxCrossEntropyExpand(sparse=True)
    trainable_params = filter(lambda x: x.requires_grad, net.get_parameters())
    opt = Momentum(trainable_params, 0.01, 0.9)
    model = Model(net, loss_fn=loss, optimizer=opt)
    model.train(5, dataset, dataset_sink_mode=False)

    # Verify the strategy selected for each operator of interest.
    strategies = _executor._get_strategy(model._train_network)
    for op_name, strategy in strategies.items():
        if re.search('Conv2D-op', op_name) is not None:
            assert strategy[0][0] == dev_num
        elif re.search('MatMul-op', op_name) is not None:
            assert strategy == [[dev_num, 1], [1, 1]]
        elif re.search('ReduceSum-op', op_name) is not None:
            assert strategy == [[dev_num, 1]]