GitOrigin-RevId: 9ffd8a6149
tags/v1.0.0-rc1
@@ -7,8 +7,11 @@
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 import functools
+import multiprocessing as mp
 from typing import Callable
 
+from megengine.device import get_device_count
+
 from .group import group_barrier, is_distributed
@@ -26,3 +29,16 @@ def synchronized(func: Callable):
         return ret
 
     return wrapper
+
+
+def get_device_count_by_fork(device_type: str):
+    q = mp.Queue()
+
+    def worker(queue):
+        num = get_device_count(device_type)
+        queue.put(num)
+
+    p = mp.Process(target=worker, args=(q,))
+    p.start()
+    p.join()
+    return q.get()
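The new helper reads the device count in a short-lived forked child, so the process that evaluates a skip condition never has to initialize the GPU runtime itself (initializing CUDA before fork-based multiprocessing is problematic). A minimal usage sketch, assuming MegEngine is installed; the test name is illustrative:

import pytest

from megengine.distributed.helper import get_device_count_by_fork

# Evaluated at collection time; the count comes from a forked child,
# so collecting the test suite does not touch the GPU in the parent process.
@pytest.mark.skipif(
    get_device_count_by_fork("gpu") < 2, reason="need more gpu device"
)
def test_requires_two_gpus():
    ...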
@@ -21,6 +21,7 @@ import megengine as mge
 import megengine.distributed as dist
 import megengine.functional as F
 from megengine.device import get_default_device, set_default_device
+from megengine.distributed.helper import get_device_count_by_fork
 from megengine.functional.debug_param import set_conv_execution_strategy
 from megengine.module import AvgPool2d, BatchNorm2d, Conv2d, Linear, Module
 from megengine.optimizer import SGD
@@ -196,6 +197,7 @@ def run_test(
     assert p.exitcode == 0
 
 
+@pytest.mark.skipif(get_device_count_by_fork("gpu") < 4, reason="need more gpu device")
 @pytest.mark.isolated_distributed
 @pytest.mark.skipif(
     platform.system() == "Windows", reason="windows disable MGB_ENABLE_OPR_MM"
@@ -6,6 +6,8 @@
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+import platform
+
 import numpy as np
 import pytest
@@ -13,6 +15,7 @@ import megengine.functional as F
 from megengine import Buffer, Parameter, is_cuda_available, tensor
 from megengine.core._trace_option import use_tensor_shape
 from megengine.core.tensor.utils import astensor1d
+from megengine.distributed.helper import get_device_count_by_fork
 from megengine.test import assertTensorClose
@@ -323,17 +326,35 @@ def copy_test(dst, src):
     assert np.allclose(data, y.numpy())
 
 
-@pytest.mark.skipif(not is_cuda_available(), reason="CUDA is disabled")
+@pytest.mark.skipif(
+    platform.system() == "Darwin", reason="do not imp GPU mode at macos now"
+)
+@pytest.mark.skipif(
+    platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
+)
+@pytest.mark.skipif(get_device_count_by_fork("gpu") == 0, reason="CUDA is disabled")
 def test_copy_h2d():
     copy_test("cpu0", "gpu0")
 
 
-@pytest.mark.skipif(not is_cuda_available(), reason="CUDA is disabled")
+@pytest.mark.skipif(
+    platform.system() == "Darwin", reason="do not imp GPU mode at macos now"
+)
+@pytest.mark.skipif(
+    platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
+)
+@pytest.mark.skipif(get_device_count_by_fork("gpu") == 0, reason="CUDA is disabled")
 def test_copy_d2h():
     copy_test("gpu0", "cpu0")
 
 
-@pytest.mark.skipif(not is_cuda_available(), reason="CUDA is disabled")
+@pytest.mark.skipif(
+    platform.system() == "Darwin", reason="do not imp GPU mode at macos now"
+)
+@pytest.mark.skipif(
+    platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
+)
+@pytest.mark.skipif(get_device_count_by_fork("gpu") < 2, reason="need more gpu device")
 def test_copy_d2d():
     copy_test("gpu0", "gpu1")
     copy_test("gpu0:0", "gpu0:1")
@@ -14,6 +14,7 @@ import pytest
 
 import megengine as mge
 import megengine.distributed as dist
+from megengine.distributed.helper import get_device_count_by_fork
 
 
 def _assert_q_empty(q):
@@ -36,6 +37,7 @@ def _assert_q_val(q, val):
 @pytest.mark.skipif(
     platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
 )
+@pytest.mark.skipif(get_device_count_by_fork("gpu") < 2, reason="need more gpu device")
 @pytest.mark.isolated_distributed
 def test_init_process_group():
     world_size = 2
@@ -43,8 +45,6 @@ def test_init_process_group():
     server = dist.Server(port)
 
     def worker(rank, backend):
-        if mge.get_device_count("gpu") < world_size:
-            return
         dist.init_process_group("localhost", port, world_size, rank, rank, backend)
         assert dist.is_distributed() == True
         assert dist.get_rank() == rank
@@ -82,6 +82,7 @@ def test_init_process_group():
 @pytest.mark.skipif(
     platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
 )
+@pytest.mark.skipif(get_device_count_by_fork("gpu") < 2, reason="need more gpu device")
 @pytest.mark.isolated_distributed
 def test_new_group():
     world_size = 3
@@ -90,8 +91,6 @@ def test_new_group():
     server = dist.Server(port)
 
    def worker(rank):
-        if mge.get_device_count("gpu") < world_size:
-            return
         dist.init_process_group("localhost", port, world_size, rank, rank)
         if rank in ranks:
             group = dist.new_group(ranks)
@@ -117,6 +116,7 @@ def test_new_group():
 @pytest.mark.skipif(
     platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
 )
+@pytest.mark.skipif(get_device_count_by_fork("gpu") < 2, reason="need more gpu device")
 @pytest.mark.isolated_distributed
 def test_group_barrier():
     world_size = 2
@@ -124,8 +124,6 @@ def test_group_barrier():
     server = dist.Server(port)
 
     def worker(rank, q):
-        if mge.get_device_count("gpu") < world_size:
-            return
         dist.init_process_group("localhost", port, world_size, rank, rank)
         dist.group_barrier()
         if rank == 0:
@@ -154,6 +152,7 @@ def test_group_barrier():
 @pytest.mark.skipif(
     platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
 )
+@pytest.mark.skipif(get_device_count_by_fork("gpu") < 2, reason="need more gpu device")
 @pytest.mark.isolated_distributed
 def test_synchronized():
     world_size = 2
@@ -165,8 +164,6 @@ def test_synchronized():
         q.put(rank)
 
     def worker(rank, q):
-        if mge.get_device_count("gpu") < world_size:
-            return
         dist.init_process_group("localhost", port, world_size, rank, rank)
         dist.group_barrier()
         if rank == 0:
@@ -10,6 +10,14 @@ import platform
 
 import pytest
 
+import megengine as mge
+import megengine.distributed as dist
+from megengine import tensor
+from megengine.distributed.group import Group
+from megengine.distributed.helper import get_device_count_by_fork
+from megengine.module import SyncBatchNorm
+from megengine.test import assertTensorClose
+
 
 @pytest.mark.skipif(
     platform.system() == "Darwin", reason="do not imp GPU mode at macos now"
 )
@@ -17,6 +25,7 @@ import pytest
 @pytest.mark.skipif(
     platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
 )
+@pytest.mark.skipif(get_device_count_by_fork("gpu") < 4, reason="need more gpu device")
 @pytest.mark.isolated_distributed
 def test_syncbn():
     import numpy as np
@@ -39,15 +48,6 @@ def test_syncbn():
     port = server.py_server_port
 
     def worker(rank, data, yv_expect, running_mean, running_var):
-        import megengine as mge
-        import megengine.distributed as dist
-        from megengine import tensor
-        from megengine.module import SyncBatchNorm
-        from megengine.distributed.group import Group
-        from megengine.test import assertTensorClose
-
-        if mge.get_device_count("gpu") < nr_ranks:
-            return
         dist.init_process_group("localhost", port, nr_ranks, rank, rank)
         group = Group([i for i in range(nr_ranks)])
         bn = SyncBatchNorm(nr_chan, eps=eps, momentum=momentum, group=group)
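For context, the tests guarded by these markers all share the spawn-and-join pattern visible in the hunks above: the parent opens a dist.Server, starts one process per rank, and each rank calls dist.init_process_group. A condensed sketch under those assumptions; the free-port helper and the worker body are illustrative and not part of this diff:

import multiprocessing as mp
import socket

import megengine.distributed as dist


def _free_port():
    # Stdlib-only helper (assumption): ask the OS for an unused TCP port.
    with socket.socket() as s:
        s.bind(("localhost", 0))
        return s.getsockname()[1]


def worker(rank, port, world_size):
    # Each rank joins the process group; device binding is omitted here.
    dist.init_process_group("localhost", port, world_size, rank, rank)
    assert dist.get_rank() == rank


def run(world_size=2):
    port = _free_port()
    server = dist.Server(port)  # the rendezvous server must outlive the workers
    procs = [
        mp.Process(target=worker, args=(rank, port, world_size))
        for rank in range(world_size)
    ]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
        assert p.exitcode == 0  # a crashed rank surfaces here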