Browse Source

!751 Remove the enable_hccl context flag

Merge pull request !751 from zjun/remove_enable_hccl
tags/v0.3.0-alpha
mindspore-ci-bot Gitee 6 years ago
parent
commit
9399dffe0e
19 changed files with 39 additions and 97 deletions
  1. +5
    -11
      example/resnet101_imagenet2012/eval.py
  2. +5
    -11
      example/resnet101_imagenet2012/train.py
  3. +5
    -11
      example/resnet50_cifar10/eval.py
  4. +5
    -11
      example/resnet50_cifar10/train.py
  5. +1
    -1
      example/vgg16_cifar10/eval.py
  6. +1
    -1
      example/vgg16_cifar10/train.py
  7. +0
    -2
      example/yolov3_coco2017/train.py
  8. +0
    -2
      mindspore/ccsrc/pipeline/init.cc
  9. +1
    -1
      mindspore/ccsrc/pipeline/pipeline.cc
  10. +1
    -10
      mindspore/context.py
  11. +1
    -1
      mindspore/nn/wrap/grad_reducer.py
  12. +0
    -1
      tests/st/auto_parallel/onehot_model_parallel.py
  13. +0
    -1
      tests/st/auto_parallel/soft_entropy_loss_expand_parallel.py
  14. +0
    -1
      tests/st/auto_parallel/test_resnet50_expand_loss_2p.py
  15. +4
    -10
      tests/st/mem_reuse/resnet_cifar_memreuse.py
  16. +4
    -10
      tests/st/mem_reuse/resnet_cifar_normal.py
  17. +4
    -10
      tests/st/tbe_networks/resnet_cifar.py
  18. +0
    -1
      tests/st/tbe_networks/test_resnet_cifar_8p.py
  19. +2
    -1
      tests/ut/python/parallel/test_auto_parallel_resnet.py

+ 5
- 11
example/resnet101_imagenet2012/eval.py View File

@@ -51,17 +51,11 @@ context.set_context(enable_loop_sink=True)
context.set_context(enable_mem_reuse=True)

if __name__ == '__main__':
if args_opt.do_eval:
context.set_context(enable_hccl=False)
else:
if args_opt.run_distribute:
context.set_context(enable_hccl=True)
context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
mirror_mean=True, parameter_broadcast=True)
auto_parallel_context().set_all_reduce_fusion_split_indices([140])
init()
else:
context.set_context(enable_hccl=False)
if not args_opt.do_eval and args_opt.run_distribute:
context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
mirror_mean=True, parameter_broadcast=True)
auto_parallel_context().set_all_reduce_fusion_split_indices([140])
init()

epoch_size = config.epoch_size
net = resnet101(class_num=config.class_num)


+ 5
- 11
example/resnet101_imagenet2012/train.py View File

@@ -56,17 +56,11 @@ context.set_context(enable_loop_sink=True)
context.set_context(enable_mem_reuse=True)

if __name__ == '__main__':
if args_opt.do_eval:
context.set_context(enable_hccl=False)
else:
if args_opt.run_distribute:
context.set_context(enable_hccl=True)
context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
mirror_mean=True, parameter_broadcast=True)
auto_parallel_context().set_all_reduce_fusion_split_indices([140])
init()
else:
context.set_context(enable_hccl=False)
if not args_opt.do_eval and args_opt.run_distribute:
context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
mirror_mean=True, parameter_broadcast=True)
auto_parallel_context().set_all_reduce_fusion_split_indices([140])
init()

epoch_size = config.epoch_size
net = resnet101(class_num=config.class_num)


+ 5
- 11
example/resnet50_cifar10/eval.py View File

@@ -51,17 +51,11 @@ context.set_context(enable_loop_sink=True)
context.set_context(enable_mem_reuse=True)

if __name__ == '__main__':
if args_opt.do_eval:
context.set_context(enable_hccl=False)
else:
if args_opt.run_distribute:
context.set_context(enable_hccl=True)
context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
mirror_mean=True)
auto_parallel_context().set_all_reduce_fusion_split_indices([140])
init()
else:
context.set_context(enable_hccl=False)
if not args_opt.do_eval and args_opt.run_distribute:
context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
mirror_mean=True)
auto_parallel_context().set_all_reduce_fusion_split_indices([140])
init()

epoch_size = config.epoch_size
net = resnet50(class_num=config.class_num)


+ 5
- 11
example/resnet50_cifar10/train.py View File

@@ -54,17 +54,11 @@ context.set_context(enable_loop_sink=True)
context.set_context(enable_mem_reuse=True)

if __name__ == '__main__':
if args_opt.do_eval:
context.set_context(enable_hccl=False)
else:
if args_opt.run_distribute:
context.set_context(enable_hccl=True)
context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
mirror_mean=True)
auto_parallel_context().set_all_reduce_fusion_split_indices([140])
init()
else:
context.set_context(enable_hccl=False)
if not args_opt.do_eval and args_opt.run_distribute:
context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
mirror_mean=True)
auto_parallel_context().set_all_reduce_fusion_split_indices([140])
init()

epoch_size = config.epoch_size
net = resnet50(class_num=config.class_num)


+ 1
- 1
example/vgg16_cifar10/eval.py View File

@@ -37,7 +37,7 @@ if __name__ == '__main__':

context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target)
context.set_context(device_id=args_opt.device_id)
context.set_context(enable_mem_reuse=True, enable_hccl=False)
context.set_context(enable_mem_reuse=True)

net = vgg16(num_classes=cfg.num_classes)
opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), 0.01, cfg.momentum,


+ 1
- 1
example/vgg16_cifar10/train.py View File

@@ -66,7 +66,7 @@ if __name__ == '__main__':
context.set_context(device_id=args_opt.device_id)
context.set_context(enable_task_sink=True)
context.set_context(enable_loop_sink=True)
context.set_context(enable_mem_reuse=True, enable_hccl=False)
context.set_context(enable_mem_reuse=True)

device_num = int(os.environ.get("DEVICE_NUM", 1))
if device_num > 1:


+ 0
- 2
example/yolov3_coco2017/train.py View File

@@ -90,13 +90,11 @@ if __name__ == '__main__':
if args_opt.distribute:
device_num = args_opt.device_num
context.reset_auto_parallel_context()
context.set_context(enable_hccl=True)
context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, mirror_mean=True,
device_num=device_num)
init()
rank = args_opt.device_id % device_num
else:
context.set_context(enable_hccl=False)
rank = 0
device_num = 1



+ 0
- 2
mindspore/ccsrc/pipeline/init.cc View File

@@ -115,8 +115,6 @@ PYBIND11_MODULE(_c_expression, m) {
.def("set_device_id", &mindspore::MsContext::set_device_id, "Set device id.")
.def("open_tsd", &mindspore::MsContext::OpenTsd, "Open tdt dataset client.")
.def("close_tsd", &mindspore::MsContext::CloseTsd, "Close tdt dataset client.")
.def("set_hccl_flag", &mindspore::MsContext::set_enable_hccl, "Set enable hccl.")
.def("get_hccl_flag", &mindspore::MsContext::enable_hccl, "Get whether to enable hccl.")
.def("set_task_sink_flag", &mindspore::MsContext::set_enable_task_sink, "Set enable task sink.")
.def("get_task_sink_flag", &mindspore::MsContext::enable_task_sink, "Get whether to enable task sink.")
.def("get_save_graphs_flag", &mindspore::MsContext::save_graphs_flag, "Get whether to save graphs.")


+ 1
- 1
mindspore/ccsrc/pipeline/pipeline.cc View File

@@ -773,7 +773,7 @@ void InitHccl() {
(void)ms_context->OpenTsd();
uint32_t device_id = ms_context->device_id();
std::string device_name = ms_context->device_target();
ms_context->set_enable_hccl(true);
if (ms_context->backend_policy() == "ms" && ms_context->device_target() == kAscendDevice) {
auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(device_name, device_id);
MS_EXCEPTION_IF_NULL(runtime_instance);


+ 1
- 10
mindspore/context.py View File

@@ -225,14 +225,6 @@ class _Context:
if not success:
raise RuntimeError("Device id set failed!!!")

@property
def enable_hccl(self):
return self._context_handle.get_hccl_flag()

@enable_hccl.setter
def enable_hccl(self, hccl):
self._context_handle.set_hccl_flag(hccl)

@property
def enable_ir_fusion(self):
return self._context_handle.get_ir_fusion_flag()
@@ -482,7 +474,7 @@ def reset_auto_parallel_context():


@args_type_check(mode=int, precompile_only=bool, device_target=str,
device_id=int, enable_ir_fusion=bool, save_graphs=bool, enable_hccl=bool,
device_id=int, enable_ir_fusion=bool, save_graphs=bool,
enable_task_sink=bool, save_graphs_path=str, enable_loop_sink=bool,
enable_mem_reuse=bool, save_ms_model=bool, save_ms_model_path=str, enable_gpu_summary=bool,
enable_auto_mixed_precision=bool, enable_dump=bool, save_dump_path=str,
@@ -515,7 +507,6 @@ def set_context(**kwargs):
while device_num_per_host should no more than 4096. Default: 0.
enable_ir_fusion (bool): Whether to enable ir fusion. Default: True.
save_graphs (bool): Whether to save graphs. Default: False.
enable_hccl (bool): Whether to enable hccl. Default: False.
enable_loop_sink (bool): Whether to enable loop sink. Default: True.
enable_task_sink (bool): Whether to enable task sink. Default: True.
enable_mem_reuse (bool): Whether to enable memory reuse. Default: True.


+ 1
- 1
mindspore/nn/wrap/grad_reducer.py View File

@@ -130,7 +130,7 @@ class DistributedGradReducer(Cell):
>>>
>>> device_id = int(os.environ["DEVICE_ID"])
>>> context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=True,
>>> device_id=int(device_id), enable_hccl=True)
>>> device_id=int(device_id))
>>> init()
>>> context.reset_auto_parallel_context()
>>> context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL)


+ 0
- 1
tests/st/auto_parallel/onehot_model_parallel.py View File

@@ -33,7 +33,6 @@ def setup_module():
global rank_id
np.random.seed(0)
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
context.set_context(enable_hccl=True)
context.set_context(enable_task_sink=True,
device_id=device_id)
context.set_context(enable_ir_fusion=True)


+ 0
- 1
tests/st/auto_parallel/soft_entropy_loss_expand_parallel.py View File

@@ -46,7 +46,6 @@ def setup_module():
global rank_id
np.random.seed(0)
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
context.set_context(enable_hccl=True)
context.set_context(enable_task_sink=True,
device_id=device_id)
context.set_context(enable_ir_fusion=True)


+ 0
- 1
tests/st/auto_parallel/test_resnet50_expand_loss_2p.py View File

@@ -31,7 +31,6 @@ from mindspore.train.callback import Callback
from mindspore.parallel import set_algo_parameters

context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
context.set_context(enable_hccl=True)
context.set_context(enable_task_sink=True, device_id=int(os.getenv('DEVICE_ID')))
context.set_context(enable_ir_fusion=True)
context.set_context(enable_loop_sink=False)


+ 4
- 10
tests/st/mem_reuse/resnet_cifar_memreuse.py View File

@@ -122,16 +122,10 @@ class CrossEntropyLoss(nn.Cell):


if __name__ == '__main__':
if args_opt.do_eval:
context.set_context(enable_hccl=False)
else:
if args_opt.run_distribute:
context.set_context(enable_hccl=True)
context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL)
context.set_auto_parallel_context(all_reduce_fusion_split_indices=[140])
init()
else:
context.set_context(enable_hccl=False)
if not args_opt.do_eval and args_opt.run_distribute:
context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL)
context.set_auto_parallel_context(all_reduce_fusion_split_indices=[140])
init()

context.set_context(mode=context.GRAPH_MODE)
epoch_size = args_opt.epoch_size


+ 4
- 10
tests/st/mem_reuse/resnet_cifar_normal.py View File

@@ -123,16 +123,10 @@ class CrossEntropyLoss(nn.Cell):


if __name__ == '__main__':
if args_opt.do_eval:
context.set_context(enable_hccl=False)
else:
if args_opt.run_distribute:
context.set_context(enable_hccl=True)
context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL)
context.set_auto_parallel_context(all_reduce_fusion_split_indices=[140])
init()
else:
context.set_context(enable_hccl=False)
if not args_opt.do_eval and args_opt.run_distribute:
context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL)
context.set_auto_parallel_context(all_reduce_fusion_split_indices=[140])
init()

context.set_context(mode=context.GRAPH_MODE)
epoch_size = args_opt.epoch_size


+ 4
- 10
tests/st/tbe_networks/resnet_cifar.py View File

@@ -122,16 +122,10 @@ class CrossEntropyLoss(nn.Cell):


if __name__ == '__main__':
if args_opt.do_eval:
context.set_context(enable_hccl=False)
else:
if args_opt.run_distribute:
context.set_context(enable_hccl=True)
context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL)
auto_parallel_context().set_all_reduce_fusion_split_indices([140])
init()
else:
context.set_context(enable_hccl=False)
if not args_opt.do_eval and args_opt.run_distribute:
context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL)
auto_parallel_context().set_all_reduce_fusion_split_indices([140])
init()

context.set_context(mode=context.GRAPH_MODE)
epoch_size = args_opt.epoch_size


+ 0
- 1
tests/st/tbe_networks/test_resnet_cifar_8p.py View File

@@ -153,7 +153,6 @@ def train_process(q, device_id, epoch_size, num_classes, device_num, batch_size,
context.set_context(enable_task_sink=True, device_id=device_id)
context.set_context(enable_loop_sink=True)
context.set_context(enable_mem_reuse=True)
context.set_context(enable_hccl=enable_hccl)
os.environ['MINDSPORE_HCCL_CONFIG_PATH'] = MINDSPORE_HCCL_CONFIG_PATH
os.environ['RANK_ID'] = str(device_id)
os.environ['RANK_SIZE'] = str(device_num)


+ 2
- 1
tests/ut/python/parallel/test_auto_parallel_resnet.py View File

@@ -19,6 +19,7 @@ from mindspore import Tensor
from mindspore.ops import operations as P
from mindspore.nn.optim.momentum import Momentum
from mindspore.common.initializer import TruncatedNormal
from mindspore.communication.management import init
from mindspore.train.model import Model, ParallelMode
from mindspore import context
import os
@@ -31,10 +32,10 @@ from mindspore.parallel import set_algo_parameters
from mindspore.parallel import _cost_model_context as cost_model_context

context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
context.set_context(enable_hccl=True)
context.set_context(enable_task_sink=True, device_id= 0)
context.set_context(enable_ir_fusion=True)
context.set_context(enable_loop_sink=False)
init()

def weight_variable(shape, factor=0.1):
return TruncatedNormal(0.02)


Loading…
Cancel
Save