From 70e2ea50a951991fe4ad92772c15afb9ccd48784 Mon Sep 17 00:00:00 2001 From: zhangyinxia Date: Tue, 2 Mar 2021 16:20:12 +0800 Subject: [PATCH] add st --- example/matmul_distributed/master.py | 26 ++ example/matmul_distributed/worker.py | 31 +++ tests/st/distribute_agent_fault/__init__.py | 14 ++ .../distribute_agent_fault/kill_15_agent.sh | 214 ++++++++++++++++ .../st/distribute_agent_fault/kill_9_agent.sh | 221 +++++++++++++++++ .../test_distribute_agent_fault.py | 43 ++++ tests/st/distribute_master_fault/__init__.py | 14 ++ .../distribute_master_fault/kill_15_master.sh | 214 ++++++++++++++++ .../distribute_master_fault/kill_9_master.sh | 214 ++++++++++++++++ .../test_distribute_master_fault.py | 43 ++++ tests/st/distribute_worker_fault/__init__.py | 14 ++ .../distribute_worker_fault/kill_15_worker.sh | 214 ++++++++++++++++ .../distribute_worker_fault/kill_9_worker.sh | 228 ++++++++++++++++++ .../test_distribute_worker_fault.py | 43 ++++ tests/st/master_fault/__init__.py | 2 +- tests/st/worker_fault/__init__.py | 2 +- 16 files changed, 1535 insertions(+), 2 deletions(-) create mode 100644 example/matmul_distributed/master.py create mode 100644 example/matmul_distributed/worker.py create mode 100644 tests/st/distribute_agent_fault/__init__.py create mode 100644 tests/st/distribute_agent_fault/kill_15_agent.sh create mode 100644 tests/st/distribute_agent_fault/kill_9_agent.sh create mode 100644 tests/st/distribute_agent_fault/test_distribute_agent_fault.py create mode 100644 tests/st/distribute_master_fault/__init__.py create mode 100644 tests/st/distribute_master_fault/kill_15_master.sh create mode 100644 tests/st/distribute_master_fault/kill_9_master.sh create mode 100644 tests/st/distribute_master_fault/test_distribute_master_fault.py create mode 100644 tests/st/distribute_worker_fault/__init__.py create mode 100644 tests/st/distribute_worker_fault/kill_15_worker.sh create mode 100644 tests/st/distribute_worker_fault/kill_9_worker.sh create mode 100644 
tests/st/distribute_worker_fault/test_distribute_worker_fault.py diff --git a/example/matmul_distributed/master.py b/example/matmul_distributed/master.py new file mode 100644 index 0000000..fcc169d --- /dev/null +++ b/example/matmul_distributed/master.py @@ -0,0 +1,26 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Start Distributed Servable matmul""" + +from mindspore_serving import master + + +def start(): + master.start_grpc_server("127.0.0.1", 5500) + master.start_master_server("127.0.0.1", 6500) + + +if __name__ == "__main__": + start() diff --git a/example/matmul_distributed/worker.py b/example/matmul_distributed/worker.py new file mode 100644 index 0000000..5c3b571 --- /dev/null +++ b/example/matmul_distributed/worker.py @@ -0,0 +1,31 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""Start Distributed Servable matmul""" + +import os +import sys +from mindspore_serving.worker import distributed + + +def start(): + servable_dir = os.path.dirname(os.path.realpath(sys.argv[0])) + distributed.start_distributed_servable(servable_dir, "matmul", + rank_table_json_file="rank_table_8pcs.json", + version_number=1, + worker_ip="127.0.0.1", worker_port=6200, + master_ip="127.0.0.1", master_port=6500) + +if __name__ == "__main__": + start() diff --git a/tests/st/distribute_agent_fault/__init__.py b/tests/st/distribute_agent_fault/__init__.py new file mode 100644 index 0000000..919b057 --- /dev/null +++ b/tests/st/distribute_agent_fault/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ diff --git a/tests/st/distribute_agent_fault/kill_15_agent.sh b/tests/st/distribute_agent_fault/kill_15_agent.sh new file mode 100644 index 0000000..1914548 --- /dev/null +++ b/tests/st/distribute_agent_fault/kill_15_agent.sh @@ -0,0 +1,214 @@ +#!/bin/bash + +export GLOG_v=1 +export DEVICE_ID=1 + +MINDSPORE_INSTALL_PATH=$1 +ENV_DEVICE_ID=$DEVICE_ID +CURRPATH=$(cd "$(dirname $0)" || exit; pwd) +CURRUSER=$(whoami) +PROJECT_PATH=${CURRPATH}/../../../ +echo "MINDSPORE_INSTALL_PATH:" ${MINDSPORE_INSTALL_PATH} +echo "ENV_DEVICE_ID:" ${ENV_DEVICE_ID} +echo "CURRPATH:" ${CURRPATH} +echo "CURRUSER:" ${CURRUSER} +echo "PROJECT_PATH:" ${PROJECT_PATH} + +export LD_LIBRARY_PATH=${MINDSPORE_INSTALL_PATH}/lib:${LD_LIBRARY_PATH} +#export PYTHONPATH=${MINDSPORE_INSTALL_PATH}/:${PYTHONPATH} + +echo "LD_LIBRARY_PATH: " ${LD_LIBRARY_PATH} +echo "PYTHONPATH: " ${PYTHONPATH} +echo "-------------show MINDSPORE_INSTALL_PATH----------------" +ls -l ${MINDSPORE_INSTALL_PATH} +echo "------------------show /usr/lib64/----------------------" +ls -l /usr/local/python/python375/lib/ + +clean_pid() +{ + num=`ps -ef | grep master.py | grep -v grep | wc -l` + if [ $num -ne 0 ] + then + ps aux | grep 'master.py' | grep ${CURRUSER} | grep -v grep | awk '{print $2}' | xargs kill -9 + sleep 6 + fi + num=`ps -ef | grep worker.py | grep -v grep | wc -l` + if [ $num -ne 0 ] + then + ps aux | grep 'worker.py' | grep ${CURRUSER} | grep -v grep | awk '{print $2}' | xargs kill -9 + sleep 6 + fi + num=`ps -ef | grep agent.py | grep -v grep | wc -l` + if [ $num -ne 0 ] + then + ps aux | grep 'agent.py' | grep ${CURRUSER} | grep -v grep | awk '{print $2}' | xargs kill -9 + sleep 6 + fi +} + +prepare_model() +{ + echo "### begin to generate mode for serving test ###" + cd export_model + sh export_model.sh &> model.log + echo "### end to generate mode for serving test ###" + result=`find . 
-name matmul.mindir | wc -l` + if [ ${result} -ne 8 ] + then + cat model.log + echo "### generate model for serving test failed ###" && exit 1 + clean_pid + cd - + fi + cd - +} + +start_master() +{ + echo "### start serving master ###" + unset http_proxy https_proxy + python3 master.py > master.log 2>&1 & + if [ $? -ne 0 ] + then + echo "server master failed to start." + fi + + result=`grep -E 'Serving gRPC server start success, listening on 127.0.0.1:5500' master.log | wc -l` + count=0 + while [[ ${result} -ne 1 && ${count} -lt 50 ]] + do + sleep 1 + count=$(($count+1)) + result=`grep -E 'Serving gRPC server start success, listening on 127.0.0.1:5500' master.log | wc -l` + done + + if [ ${count} -eq 50 ] + then + clean_pid + cat master.log + echo "start serving master failed!" && exit 1 + fi + echo "### start serving master end ###" +} +start_worker() +{ + echo "### start serving worker ###" + unset http_proxy https_proxy + python3 worker.py > worker.log 2>&1 & + if [ $? -ne 0 ] + then + echo "server worker failed to start." + fi + + result=`grep -E 'Begin waiting ready of all agents' worker.log | wc -l` + count=0 + while [[ ${result} -ne 1 && ${count} -lt 100 ]] + do + sleep 1 + count=$(($count+1)) + result=`grep -E 'Begin waiting ready of all agents' worker.log | wc -l` + done + + if [ ${count} -eq 100 ] + then + clean_pid + cat worker.log + echo "start serving worker failed!" && exit 1 + fi + echo "### start serving worker end ###" +} +start_agent() +{ + echo "### start serving agent ###" + unset http_proxy https_proxy + python3 agent.py > agent.log 2>&1 & + if [ $? -ne 0 ] + then + echo "server agent failed to start." + fi + + result=`grep -E 'Child 0: Receive success' agent.log | wc -l` + count=0 + while [[ ${result} -ne 1 && ${count} -lt 100 ]] + do + sleep 1 + count=$(($count+1)) + result=`grep -E 'Child 0: Receive success' agent.log | wc -l` + done + + if [ ${count} -eq 100 ] + then + clean_pid + cat agent.log + echo "start serving agent failed!" 
&& exit 1 + fi + echo "### start serving agent end ###" +} +kill_agent() +{ + num=`ps -ef | grep master.py | grep -v grep | wc -l` + if [ $num -ne 1 ] + then + echo "master start failed" + echo $num + clean_pid && exit 1 + fi + num=`ps -ef | grep worker.py | grep -v grep | wc -l` + if [ $num -ne 1 ] + then + echo "worker start failed" + echo $num + clean_pid && exit 1 + fi + num=`ps -ef | grep agent.py | grep -v grep | wc -l` + if [ $num -ne 9 ] + then + echo "agent start failed" + echo $num + clean_pid && exit 1 + fi + ps aux | grep 'agent.py' | grep ${CURRUSER} | grep -v grep | awk '{print $2}' | xargs kill -15 + if [ $? -ne 0 ] + then + echo "kill agent failed" + fi + sleep 25 + num=`ps -ef | grep master.py | grep -v grep | wc -l` + if [ $num -ne 1 ] + then + echo "master start failed" + echo $num + clean_pid && exit 1 + fi + num=`ps -ef | grep worker.py | grep -v grep | wc -l` + if [ $num -ne 0 ] + then + echo "worker exit failed" + echo $num + clean_pid && exit 1 + fi + num=`ps -ef | grep agent.py | grep -v grep | wc -l` + if [ $num -ne 0 ] + then + echo "agent exit failed" + echo $num + clean_pid && exit 1 + fi +} + +test_agent_fault_model() +{ + start_master + start_worker + start_agent + kill_agent + clean_pid +} + +echo "-----serving start-----" +rm -rf serving *.log *.mindir *.dat ${CURRPATH}/matmul ${CURRPATH}/kernel_meta +rm -rf client.py *.json export_model master_with_worker.py master.py worker.py agent.py +cp -r ../../../example/matmul_distributed/* . 
+prepare_model +test_agent_fault_model +echo "### end to serving test ###" diff --git a/tests/st/distribute_agent_fault/kill_9_agent.sh b/tests/st/distribute_agent_fault/kill_9_agent.sh new file mode 100644 index 0000000..883a15c --- /dev/null +++ b/tests/st/distribute_agent_fault/kill_9_agent.sh @@ -0,0 +1,221 @@ +#!/bin/bash + +export GLOG_v=1 +export DEVICE_ID=1 + +MINDSPORE_INSTALL_PATH=$1 +ENV_DEVICE_ID=$DEVICE_ID +CURRPATH=$(cd "$(dirname $0)" || exit; pwd) +CURRUSER=$(whoami) +PROJECT_PATH=${CURRPATH}/../../../ +echo "MINDSPORE_INSTALL_PATH:" ${MINDSPORE_INSTALL_PATH} +echo "ENV_DEVICE_ID:" ${ENV_DEVICE_ID} +echo "CURRPATH:" ${CURRPATH} +echo "CURRUSER:" ${CURRUSER} +echo "PROJECT_PATH:" ${PROJECT_PATH} + +export LD_LIBRARY_PATH=${MINDSPORE_INSTALL_PATH}/lib:${LD_LIBRARY_PATH} +#export PYTHONPATH=${MINDSPORE_INSTALL_PATH}/:${PYTHONPATH} + +echo "LD_LIBRARY_PATH: " ${LD_LIBRARY_PATH} +echo "PYTHONPATH: " ${PYTHONPATH} +echo "-------------show MINDSPORE_INSTALL_PATH----------------" +ls -l ${MINDSPORE_INSTALL_PATH} +echo "------------------show /usr/lib64/----------------------" +ls -l /usr/local/python/python375/lib/ + +clean_pid() +{ + num=`ps -ef | grep master.py | grep -v grep | wc -l` + if [ $num -ne 0 ] + then + ps aux | grep 'master.py' | grep ${CURRUSER} | grep -v grep | awk '{print $2}' | xargs kill -9 + sleep 6 + fi + num=`ps -ef | grep worker.py | grep -v grep | wc -l` + if [ $num -ne 0 ] + then + ps aux | grep 'worker.py' | grep ${CURRUSER} | grep -v grep | awk '{print $2}' | xargs kill -9 + sleep 6 + fi + num=`ps -ef | grep agent.py | grep -v grep | wc -l` + if [ $num -ne 0 ] + then + ps aux | grep 'agent.py' | grep ${CURRUSER} | grep -v grep | awk '{print $2}' | xargs kill -9 + sleep 6 + fi +} + +prepare_model() +{ + echo "### begin to generate mode for serving test ###" + cd export_model + sh export_model.sh &> model.log + echo "### end to generate mode for serving test ###" + result=`find . 
-name matmul.mindir | wc -l` + if [ ${result} -ne 8 ] + then + cat model.log + echo "### generate model for serving test failed ###" && exit 1 + clean_pid + cd - + fi + cd - +} + +start_master() +{ + echo "### start serving master ###" + unset http_proxy https_proxy + python3 master.py > master.log 2>&1 & + if [ $? -ne 0 ] + then + echo "server master failed to start." + fi + + result=`grep -E 'Serving gRPC server start success, listening on 127.0.0.1:5500' master.log | wc -l` + count=0 + while [[ ${result} -ne 1 && ${count} -lt 50 ]] + do + sleep 1 + count=$(($count+1)) + result=`grep -E 'Serving gRPC server start success, listening on 127.0.0.1:5500' master.log | wc -l` + done + + if [ ${count} -eq 50 ] + then + clean_pid + cat master.log + echo "start serving master failed!" && exit 1 + fi + echo "### start serving master end ###" +} +start_worker() +{ + echo "### start serving worker ###" + unset http_proxy https_proxy + python3 worker.py > worker.log 2>&1 & + if [ $? -ne 0 ] + then + echo "server worker failed to start." + fi + + result=`grep -E 'Begin waiting ready of all agents' worker.log | wc -l` + count=0 + while [[ ${result} -ne 1 && ${count} -lt 100 ]] + do + sleep 1 + count=$(($count+1)) + result=`grep -E 'Begin waiting ready of all agents' worker.log | wc -l` + done + + if [ ${count} -eq 100 ] + then + clean_pid + cat worker.log + echo "start serving worker failed!" && exit 1 + fi + echo "### start serving worker end ###" +} +start_agent() +{ + echo "### start serving agent ###" + unset http_proxy https_proxy + python3 agent.py > agent.log 2>&1 & + if [ $? -ne 0 ] + then + echo "server agent failed to start." + fi + + result=`grep -E 'Child 0: Receive success' agent.log | wc -l` + count=0 + while [[ ${result} -ne 1 && ${count} -lt 100 ]] + do + sleep 1 + count=$(($count+1)) + result=`grep -E 'Child 0: Receive success' agent.log | wc -l` + done + + if [ ${count} -eq 100 ] + then + clean_pid + cat agent.log + echo "start serving agent failed!" 
&& exit 1 + fi + echo "### start serving agent end ###" +} +kill_agent() +{ + num=`ps -ef | grep master.py | grep -v grep | wc -l` + if [ $num -ne 1 ] + then + echo "master start failed" + echo $num + clean_pid && exit 1 + fi + num=`ps -ef | grep worker.py | grep -v grep | wc -l` + if [ $num -ne 1 ] + then + echo "worker start failed" + echo $num + clean_pid && exit 1 + fi + num=`ps -ef | grep agent.py | grep -v grep | wc -l` + if [ $num -ne 9 ] + then + echo "agent start failed" + echo $num + clean_pid && exit 1 + fi + num=`grep -E 'Recv Pong Time Out from' worker.log | wc -l` + if [ $num -ne 0 ] + then + echo "worker has exited" + echo $num + clean_pid && exit 1 + fi + ps aux | grep 'agent.py' | grep ${CURRUSER} | grep -v grep | awk '{print $2}' | xargs kill -9 + if [ $? -ne 0 ] + then + echo "kill agent failed" + fi + sleep 25 + num=`ps -ef | grep agent.py | grep -v grep | wc -l` + if [ $num -ne 0 ] + then + echo "agent exit failed" + echo $num + clean_pid && exit 1 + fi + num=`ps -ef | grep master.py | grep -v grep | wc -l` + if [ $num -ne 1 ] + then + echo "master start failed" + echo $num + clean_pid && exit 1 + fi + num=`grep -E 'Recv Pong Time Out from' worker.log | wc -l` + if [ $num -ne 8 ] + then + echo "catch agent exit failed" + echo $num + clean_pid && exit 1 + fi +} + +test_agent_fault_model() +{ + start_master + start_worker + start_agent + kill_agent + clean_pid +} + +echo "-----serving start-----" +rm -rf serving *.log *.mindir *.dat ${CURRPATH}/matmul ${CURRPATH}/kernel_meta +rm -rf client.py *.json export_model master_with_worker.py master.py worker.py agent.py +cp -r ../../../example/matmul_distributed/* . 
+prepare_model +test_agent_fault_model +echo "### end to serving test ###" diff --git a/tests/st/distribute_agent_fault/test_distribute_agent_fault.py b/tests/st/distribute_agent_fault/test_distribute_agent_fault.py new file mode 100644 index 0000000..7e1584d --- /dev/null +++ b/tests/st/distribute_agent_fault/test_distribute_agent_fault.py @@ -0,0 +1,43 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +import os +import sys +import pytest +import numpy as np + + +@pytest.mark.level0 +@pytest.mark.platform_arm_ascend_training +@pytest.mark.env_single +def test_distribute_agent_fault(): + """test_serving""" + sh_path = os.path.split(os.path.realpath(__file__))[0] + python_path_folders = [] + for python_path in sys.path: + if os.path.isdir(python_path): + python_path_folders += [python_path] + folders = [] + for folder in python_path_folders: + folders += [os.path.join(folder, x) for x in os.listdir(folder) \ + if os.path.isdir(os.path.join(folder, x)) and \ + '/site-packages/mindspore' in os.path.join(folder, x)] + ret = os.system(f"sh {sh_path}/kill_15_agent.sh {folders[-1].split('mindspore', 1)[0] + 'mindspore'}") + assert np.allclose(ret, 0) + ret = os.system(f"sh {sh_path}/kill_9_agent.sh {folders[-1].split('mindspore', 1)[0] + 'mindspore'}") + assert np.allclose(ret, 0) + +if __name__ == '__main__': + test_distribute_agent_fault() diff 
--git a/tests/st/distribute_master_fault/__init__.py b/tests/st/distribute_master_fault/__init__.py new file mode 100644 index 0000000..919b057 --- /dev/null +++ b/tests/st/distribute_master_fault/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ diff --git a/tests/st/distribute_master_fault/kill_15_master.sh b/tests/st/distribute_master_fault/kill_15_master.sh new file mode 100644 index 0000000..4738729 --- /dev/null +++ b/tests/st/distribute_master_fault/kill_15_master.sh @@ -0,0 +1,214 @@ +#!/bin/bash + +export GLOG_v=1 +export DEVICE_ID=1 + +MINDSPORE_INSTALL_PATH=$1 +ENV_DEVICE_ID=$DEVICE_ID +CURRPATH=$(cd "$(dirname $0)" || exit; pwd) +CURRUSER=$(whoami) +PROJECT_PATH=${CURRPATH}/../../../ +echo "MINDSPORE_INSTALL_PATH:" ${MINDSPORE_INSTALL_PATH} +echo "ENV_DEVICE_ID:" ${ENV_DEVICE_ID} +echo "CURRPATH:" ${CURRPATH} +echo "CURRUSER:" ${CURRUSER} +echo "PROJECT_PATH:" ${PROJECT_PATH} + +export LD_LIBRARY_PATH=${MINDSPORE_INSTALL_PATH}/lib:${LD_LIBRARY_PATH} +#export PYTHONPATH=${MINDSPORE_INSTALL_PATH}/:${PYTHONPATH} + +echo "LD_LIBRARY_PATH: " ${LD_LIBRARY_PATH} +echo "PYTHONPATH: " ${PYTHONPATH} +echo "-------------show MINDSPORE_INSTALL_PATH----------------" +ls -l ${MINDSPORE_INSTALL_PATH} +echo "------------------show /usr/lib64/----------------------" +ls -l /usr/local/python/python375/lib/ + +clean_pid() +{ 
+ num=`ps -ef | grep master.py | grep -v grep | wc -l` + if [ $num -ne 0 ] + then + ps aux | grep 'master.py' | grep ${CURRUSER} | grep -v grep | awk '{print $2}' | xargs kill -9 + sleep 6 + fi + num=`ps -ef | grep worker.py | grep -v grep | wc -l` + if [ $num -ne 0 ] + then + ps aux | grep 'worker.py' | grep ${CURRUSER} | grep -v grep | awk '{print $2}' | xargs kill -9 + sleep 6 + fi + num=`ps -ef | grep agent.py | grep -v grep | wc -l` + if [ $num -ne 0 ] + then + ps aux | grep 'agent.py' | grep ${CURRUSER} | grep -v grep | awk '{print $2}' | xargs kill -9 + sleep 6 + fi +} + +prepare_model() +{ + echo "### begin to generate mode for serving test ###" + cd export_model + sh export_model.sh &> model.log + echo "### end to generate mode for serving test ###" + result=`find . -name matmul.mindir | wc -l` + if [ ${result} -ne 8 ] + then + cat model.log + echo "### generate model for serving test failed ###" && exit 1 + clean_pid + cd - + fi + cd - +} + +start_master() +{ + echo "### start serving master ###" + unset http_proxy https_proxy + python3 master.py > master.log 2>&1 & + if [ $? -ne 0 ] + then + echo "server master failed to start." + fi + + result=`grep -E 'Serving gRPC server start success, listening on 127.0.0.1:5500' master.log | wc -l` + count=0 + while [[ ${result} -ne 1 && ${count} -lt 50 ]] + do + sleep 1 + count=$(($count+1)) + result=`grep -E 'Serving gRPC server start success, listening on 127.0.0.1:5500' master.log | wc -l` + done + + if [ ${count} -eq 50 ] + then + clean_pid + cat master.log + echo "start serving master failed!" && exit 1 + fi + echo "### start serving master end ###" +} +start_worker() +{ + echo "### start serving worker ###" + unset http_proxy https_proxy + python3 worker.py > worker.log 2>&1 & + if [ $? -ne 0 ] + then + echo "server worker failed to start." 
+ fi + + result=`grep -E 'Begin waiting ready of all agents' worker.log | wc -l` + count=0 + while [[ ${result} -ne 1 && ${count} -lt 100 ]] + do + sleep 1 + count=$(($count+1)) + result=`grep -E 'Begin waiting ready of all agents' worker.log | wc -l` + done + + if [ ${count} -eq 100 ] + then + clean_pid + cat worker.log + echo "start serving worker failed!" && exit 1 + fi + echo "### start serving worker end ###" +} +start_agent() +{ + echo "### start serving agent ###" + unset http_proxy https_proxy + python3 agent.py > agent.log 2>&1 & + if [ $? -ne 0 ] + then + echo "server agent failed to start." + fi + + result=`grep -E 'Child 0: Receive success' agent.log | wc -l` + count=0 + while [[ ${result} -ne 1 && ${count} -lt 100 ]] + do + sleep 1 + count=$(($count+1)) + result=`grep -E 'Child 0: Receive success' agent.log | wc -l` + done + + if [ ${count} -eq 100 ] + then + clean_pid + cat agent.log + echo "start serving agent failed!" && exit 1 + fi + echo "### start serving agent end ###" +} +kill_master() +{ + num=`ps -ef | grep master.py | grep -v grep | wc -l` + if [ $num -ne 1 ] + then + echo "master start failed" + echo $num + clean_pid && exit 1 + fi + num=`ps -ef | grep worker.py | grep -v grep | wc -l` + if [ $num -ne 1 ] + then + echo "worker start failed" + echo $num + clean_pid && exit 1 + fi + num=`ps -ef | grep agent.py | grep -v grep | wc -l` + if [ $num -ne 9 ] + then + echo "agent start failed" + echo $num + clean_pid && exit 1 + fi + ps aux | grep 'master.py' | grep ${CURRUSER} | grep -v grep | awk '{print $2}' | xargs kill -15 + if [ $? 
-ne 0 ] + then + echo "kill master failed" + fi + sleep 15 + num=`ps -ef | grep master.py | grep -v grep | wc -l` + if [ $num -ne 0 ] + then + echo "master exit failed" + echo $num + clean_pid && exit 1 + fi + num=`ps -ef | grep worker.py | grep -v grep | wc -l` + if [ $num -ne 0 ] + then + echo "worker exit failed" + echo $num + clean_pid && exit 1 + fi + num=`ps -ef | grep agent.py | grep -v grep | wc -l` + if [ $num -ne 0 ] + then + echo "agent exit failed" + echo $num + clean_pid && exit 1 + fi +} + +test_master_fault_model() +{ + start_master + start_worker + start_agent + kill_master + clean_pid +} + +echo "-----serving start-----" +rm -rf serving *.log *.mindir *.dat ${CURRPATH}/matmul ${CURRPATH}/kernel_meta +rm -rf client.py *.json export_model master_with_worker.py master.py worker.py agent.py +cp -r ../../../example/matmul_distributed/* . +prepare_model +test_master_fault_model +echo "### end to serving test ###" diff --git a/tests/st/distribute_master_fault/kill_9_master.sh b/tests/st/distribute_master_fault/kill_9_master.sh new file mode 100644 index 0000000..eb7e651 --- /dev/null +++ b/tests/st/distribute_master_fault/kill_9_master.sh @@ -0,0 +1,214 @@ +#!/bin/bash + +export GLOG_v=1 +export DEVICE_ID=1 + +MINDSPORE_INSTALL_PATH=$1 +ENV_DEVICE_ID=$DEVICE_ID +CURRPATH=$(cd "$(dirname $0)" || exit; pwd) +CURRUSER=$(whoami) +PROJECT_PATH=${CURRPATH}/../../../ +echo "MINDSPORE_INSTALL_PATH:" ${MINDSPORE_INSTALL_PATH} +echo "ENV_DEVICE_ID:" ${ENV_DEVICE_ID} +echo "CURRPATH:" ${CURRPATH} +echo "CURRUSER:" ${CURRUSER} +echo "PROJECT_PATH:" ${PROJECT_PATH} + +export LD_LIBRARY_PATH=${MINDSPORE_INSTALL_PATH}/lib:${LD_LIBRARY_PATH} +#export PYTHONPATH=${MINDSPORE_INSTALL_PATH}/:${PYTHONPATH} + +echo "LD_LIBRARY_PATH: " ${LD_LIBRARY_PATH} +echo "PYTHONPATH: " ${PYTHONPATH} +echo "-------------show MINDSPORE_INSTALL_PATH----------------" +ls -l ${MINDSPORE_INSTALL_PATH} +echo "------------------show /usr/lib64/----------------------" +ls -l 
/usr/local/python/python375/lib/ + +clean_pid() +{ + num=`ps -ef | grep master.py | grep -v grep | wc -l` + if [ $num -ne 0 ] + then + ps aux | grep 'master.py' | grep ${CURRUSER} | grep -v grep | awk '{print $2}' | xargs kill -9 + sleep 6 + fi + num=`ps -ef | grep worker.py | grep -v grep | wc -l` + if [ $num -ne 0 ] + then + ps aux | grep 'worker.py' | grep ${CURRUSER} | grep -v grep | awk '{print $2}' | xargs kill -9 + sleep 6 + fi + num=`ps -ef | grep agent.py | grep -v grep | wc -l` + if [ $num -ne 0 ] + then + ps aux | grep 'agent.py' | grep ${CURRUSER} | grep -v grep | awk '{print $2}' | xargs kill -9 + sleep 6 + fi +} + +prepare_model() +{ + echo "### begin to generate mode for serving test ###" + cd export_model + sh export_model.sh &> model.log + echo "### end to generate mode for serving test ###" + result=`find . -name matmul.mindir | wc -l` + if [ ${result} -ne 8 ] + then + cat model.log + echo "### generate model for serving test failed ###" && exit 1 + clean_pid + cd - + fi + cd - +} + +start_master() +{ + echo "### start serving master ###" + unset http_proxy https_proxy + python3 master.py > master.log 2>&1 & + if [ $? -ne 0 ] + then + echo "server master failed to start." + fi + + result=`grep -E 'Serving gRPC server start success, listening on 127.0.0.1:5500' master.log | wc -l` + count=0 + while [[ ${result} -ne 1 && ${count} -lt 50 ]] + do + sleep 1 + count=$(($count+1)) + result=`grep -E 'Serving gRPC server start success, listening on 127.0.0.1:5500' master.log | wc -l` + done + + if [ ${count} -eq 50 ] + then + clean_pid + cat master.log + echo "start serving master failed!" && exit 1 + fi + echo "### start serving master end ###" +} +start_worker() +{ + echo "### start serving worker ###" + unset http_proxy https_proxy + python3 worker.py > worker.log 2>&1 & + if [ $? -ne 0 ] + then + echo "server worker failed to start." 
+ fi + + result=`grep -E 'Begin waiting ready of all agents' worker.log | wc -l` + count=0 + while [[ ${result} -ne 1 && ${count} -lt 100 ]] + do + sleep 1 + count=$(($count+1)) + result=`grep -E 'Begin waiting ready of all agents' worker.log | wc -l` + done + + if [ ${count} -eq 100 ] + then + clean_pid + cat worker.log + echo "start serving worker failed!" && exit 1 + fi + echo "### start serving worker end ###" +} +start_agent() +{ + echo "### start serving agent ###" + unset http_proxy https_proxy + python3 agent.py > agent.log 2>&1 & + if [ $? -ne 0 ] + then + echo "server agent failed to start." + fi + + result=`grep -E 'Child 0: Receive success' agent.log | wc -l` + count=0 + while [[ ${result} -ne 1 && ${count} -lt 100 ]] + do + sleep 1 + count=$(($count+1)) + result=`grep -E 'Child 0: Receive success' agent.log | wc -l` + done + + if [ ${count} -eq 100 ] + then + clean_pid + cat agent.log + echo "start serving agent failed!" && exit 1 + fi + echo "### start serving agent end ###" +} +kill_master() +{ + num=`ps -ef | grep master.py | grep -v grep | wc -l` + if [ $num -ne 1 ] + then + echo "master start failed" + echo $num + clean_pid && exit 1 + fi + num=`ps -ef | grep worker.py | grep -v grep | wc -l` + if [ $num -ne 1 ] + then + echo "worker start failed" + echo $num + clean_pid && exit 1 + fi + num=`ps -ef | grep agent.py | grep -v grep | wc -l` + if [ $num -ne 9 ] + then + echo "agent start failed" + echo $num + clean_pid && exit 1 + fi + num=`grep -E 'Recv Ping Time Out from' worker.log | wc -l` + if [ $num -ne 0 ] + then + echo "worker has exited" + echo $num + clean_pid && exit 1 + fi + ps aux | grep 'master.py' | grep ${CURRUSER} | grep -v grep | awk '{print $2}' | xargs kill -9 + if [ $? 
-ne 0 ] + then + echo "kill master failed" + fi + sleep 25 + num=`ps -ef | grep master.py | grep -v grep | wc -l` + if [ $num -ne 0 ] + then + echo "master exit failed" + echo $num + clean_pid && exit 1 + fi + num=`grep -E 'Recv Ping Time Out from' worker.log | wc -l` + if [ $num -ne 1 ] + then + echo "catch master exit failed" + echo $num + clean_pid && exit 1 + fi +} + +test_master_fault_model() +{ + start_master + start_worker + start_agent + kill_master + clean_pid +} + +echo "-----serving start-----" +rm -rf serving *.log *.mindir *.dat ${CURRPATH}/matmul ${CURRPATH}/kernel_meta +rm -rf client.py *.json export_model master_with_worker.py master.py worker.py agent.py +cp -r ../../../example/matmul_distributed/* . +prepare_model +test_master_fault_model +echo "### end to serving test ###" diff --git a/tests/st/distribute_master_fault/test_distribute_master_fault.py b/tests/st/distribute_master_fault/test_distribute_master_fault.py new file mode 100644 index 0000000..777e50f --- /dev/null +++ b/tests/st/distribute_master_fault/test_distribute_master_fault.py @@ -0,0 +1,43 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +import os +import sys +import pytest +import numpy as np + + +@pytest.mark.level0 +@pytest.mark.platform_arm_ascend_training +@pytest.mark.env_single +def test_distribute_master_fault(): + """test_serving""" + sh_path = os.path.split(os.path.realpath(__file__))[0] + python_path_folders = [] + for python_path in sys.path: + if os.path.isdir(python_path): + python_path_folders += [python_path] + folders = [] + for folder in python_path_folders: + folders += [os.path.join(folder, x) for x in os.listdir(folder) \ + if os.path.isdir(os.path.join(folder, x)) and \ + '/site-packages/mindspore' in os.path.join(folder, x)] + ret = os.system(f"sh {sh_path}/kill_15_master.sh {folders[0].split('mindspore', 1)[0] + 'mindspore'}") + assert np.allclose(ret, 0) + ret = os.system(f"sh {sh_path}/kill_9_master.sh {folders[0].split('mindspore', 1)[0] + 'mindspore'}") + assert np.allclose(ret, 0) + +if __name__ == '__main__': + test_distribute_master_fault() diff --git a/tests/st/distribute_worker_fault/__init__.py b/tests/st/distribute_worker_fault/__init__.py new file mode 100644 index 0000000..919b057 --- /dev/null +++ b/tests/st/distribute_worker_fault/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================
diff --git a/tests/st/distribute_worker_fault/kill_15_worker.sh b/tests/st/distribute_worker_fault/kill_15_worker.sh
new file mode 100644
index 0000000..8759a58
--- /dev/null
+++ b/tests/st/distribute_worker_fault/kill_15_worker.sh
@@ -0,0 +1,161 @@
+#!/bin/bash
+# System test: send SIGTERM (kill -15) to the distributed worker, then check that
+# the worker and every agent have exited while the master is still running.
+# Usage: sh kill_15_worker.sh <MINDSPORE_INSTALL_PATH>
+
+export GLOG_v=1
+export DEVICE_ID=1
+
+MINDSPORE_INSTALL_PATH=$1
+ENV_DEVICE_ID=$DEVICE_ID
+CURRPATH=$(cd "$(dirname "$0")" || exit; pwd)
+CURRUSER=$(whoami)
+PROJECT_PATH=${CURRPATH}/../../../
+echo "MINDSPORE_INSTALL_PATH:" ${MINDSPORE_INSTALL_PATH}
+echo "ENV_DEVICE_ID:" ${ENV_DEVICE_ID}
+echo "CURRPATH:" ${CURRPATH}
+echo "CURRUSER:" ${CURRUSER}
+echo "PROJECT_PATH:" ${PROJECT_PATH}
+
+export LD_LIBRARY_PATH=${MINDSPORE_INSTALL_PATH}/lib:${LD_LIBRARY_PATH}
+#export PYTHONPATH=${MINDSPORE_INSTALL_PATH}/:${PYTHONPATH}
+
+echo "LD_LIBRARY_PATH: " ${LD_LIBRARY_PATH}
+echo "PYTHONPATH: " ${PYTHONPATH}
+echo "-------------show MINDSPORE_INSTALL_PATH----------------"
+ls -l ${MINDSPORE_INSTALL_PATH}
+echo "------------------show /usr/lib64/----------------------"
+ls -l /usr/local/python/python375/lib/
+
+# Force-kill every master/worker/agent process still owned by the current user.
+clean_pid()
+{
+  for script in master.py worker.py agent.py
+  do
+    num=`ps -ef | grep ${script} | grep -v grep | wc -l`
+    if [ $num -ne 0 ]
+    then
+      ps aux | grep "${script}" | grep ${CURRUSER} | grep -v grep | awk '{print $2}' | xargs kill -9
+      sleep 6
+    fi
+  done
+}
+
+# Export the distributed matmul model; abort unless 8 matmul.mindir files are produced.
+prepare_model()
+{
+  echo "### begin to generate mode for serving test ###"
+  cd export_model || exit 1
+  # Plain redirections: the test runs this script with `sh`, where `&>` is not portable.
+  sh export_model.sh > model.log 2>&1
+  echo "### end to generate mode for serving test ###"
+  result=`find . -name matmul.mindir | wc -l`
+  if [ ${result} -ne 8 ]
+  then
+    cat model.log
+    echo "### generate model for serving test failed ###"
+    # Clean up before exiting; nothing placed after `exit` would ever run.
+    clean_pid
+    exit 1
+  fi
+  cd -
+}
+
+# Poll log file $2 once per second until exactly one line matches the extended
+# regex $1, for at most $3 polls. Returns 0 when matched, non-zero on timeout.
+wait_for_log()
+{
+  count=0
+  result=`grep -E "$1" "$2" | wc -l`
+  while [ ${result} -ne 1 ] && [ ${count} -lt $3 ]
+  do
+    sleep 1
+    count=$(($count+1))
+    result=`grep -E "$1" "$2" | wc -l`
+  done
+  # Judge by the match itself, so a match found on the final poll still counts.
+  [ ${result} -eq 1 ]
+}
+
+# Launch python script $1 in the background (output to $2) and wait for regex $3
+# to appear in that log, polling at most $4 times; on timeout dump the log and
+# fail the test. $5 is the component name used in the progress messages.
+start_component()
+{
+  echo "### start serving $5 ###"
+  unset http_proxy https_proxy
+  # `$?` after `&` is always 0, so readiness is judged from the log only.
+  python3 $1 > $2 2>&1 &
+  if ! wait_for_log "$3" $2 $4
+  then
+    clean_pid
+    cat $2
+    echo "start serving $5 failed!" && exit 1
+  fi
+  echo "### start serving $5 end ###"
+}
+
+start_master()
+{
+  start_component master.py master.log 'Serving gRPC server start success, listening on 127.0.0.1:5500' 50 master
+}
+
+start_worker()
+{
+  start_component worker.py worker.log 'Begin waiting ready of all agents' 100 worker
+}
+
+start_agent()
+{
+  start_component agent.py agent.log 'Child 0: Receive success' 100 agent
+}
+
+# Fail the test unless exactly $2 running processes mention $1; $3 is the message.
+expect_proc_num()
+{
+  num=`ps -ef | grep $1 | grep -v grep | wc -l`
+  if [ $num -ne $2 ]
+  then
+    echo "$3"
+    echo $num
+    clean_pid
+    exit 1
+  fi
+}
+
+# Check the cluster is fully up, SIGTERM the worker, then verify which processes remain.
+kill_worker()
+{
+  expect_proc_num master.py 1 "master start failed"
+  expect_proc_num worker.py 1 "worker start failed"
+  # NOTE(review): 9 is presumably the agent launcher plus 8 children - confirm.
+  expect_proc_num agent.py 9 "agent start failed"
+  ps aux | grep 'worker.py' | grep ${CURRUSER} | grep -v grep | awk '{print $2}' | xargs kill -15
+  if [ $? -ne 0 ]
+  then
+    echo "kill worker failed"
+  fi
+  # Wait before counting processes so the shutdown has time to complete.
+  sleep 25
+  expect_proc_num master.py 1 "master exited unexpectedly"
+  expect_proc_num worker.py 0 "worker exit failed"
+  expect_proc_num agent.py 0 "agent exit failed"
+}
+
+# Name kept as-is because the script's entry point calls it.
+test_agent_fault_model()
+{
+  start_master
+  start_worker
+  start_agent
+  kill_worker
+  clean_pid
+}
+
+echo "-----serving start-----"
+rm -rf serving *.log *.mindir *.dat ${CURRPATH}/matmul ${CURRPATH}/kernel_meta
+rm -rf client.py *.json export_model master_with_worker.py master.py worker.py agent.py
+cp -r ../../../example/matmul_distributed/* .
+prepare_model
+test_agent_fault_model
+echo "### end to serving test ###"
diff --git a/tests/st/distribute_worker_fault/kill_9_worker.sh b/tests/st/distribute_worker_fault/kill_9_worker.sh
new file mode 100644
index 0000000..a16c550
--- /dev/null
+++ b/tests/st/distribute_worker_fault/kill_9_worker.sh
@@ -0,0 +1,177 @@
+#!/bin/bash
+# System test: send SIGKILL (kill -9) to the distributed worker, then check that the
+# master logs one 'Recv Pong Time Out' and the agents log eight 'Recv Ping Time Out'.
+# Usage: sh kill_9_worker.sh <MINDSPORE_INSTALL_PATH>
+
+export GLOG_v=1
+export DEVICE_ID=1
+
+MINDSPORE_INSTALL_PATH=$1
+ENV_DEVICE_ID=$DEVICE_ID
+CURRPATH=$(cd "$(dirname "$0")" || exit; pwd)
+CURRUSER=$(whoami)
+PROJECT_PATH=${CURRPATH}/../../../
+echo "MINDSPORE_INSTALL_PATH:" ${MINDSPORE_INSTALL_PATH}
+echo "ENV_DEVICE_ID:" ${ENV_DEVICE_ID}
+echo "CURRPATH:" ${CURRPATH}
+echo "CURRUSER:" ${CURRUSER}
+echo "PROJECT_PATH:" ${PROJECT_PATH}
+
+export LD_LIBRARY_PATH=${MINDSPORE_INSTALL_PATH}/lib:${LD_LIBRARY_PATH}
+#export PYTHONPATH=${MINDSPORE_INSTALL_PATH}/:${PYTHONPATH}
+
+echo "LD_LIBRARY_PATH: " ${LD_LIBRARY_PATH}
+echo "PYTHONPATH: " ${PYTHONPATH}
+echo "-------------show MINDSPORE_INSTALL_PATH----------------"
+ls -l ${MINDSPORE_INSTALL_PATH}
+echo "------------------show /usr/lib64/----------------------"
+ls -l /usr/local/python/python375/lib/
+
+# Force-kill every master/worker/agent process still owned by the current user.
+clean_pid()
+{
+  for script in master.py worker.py agent.py
+  do
+    num=`ps -ef | grep ${script} | grep -v grep | wc -l`
+    if [ $num -ne 0 ]
+    then
+      ps aux | grep "${script}" | grep ${CURRUSER} | grep -v grep | awk '{print $2}' | xargs kill -9
+      sleep 6
+    fi
+  done
+}
+
+# Export the distributed matmul model; abort unless 8 matmul.mindir files are produced.
+prepare_model()
+{
+  echo "### begin to generate mode for serving test ###"
+  cd export_model || exit 1
+  # Plain redirections: the test runs this script with `sh`, where `&>` is not portable.
+  sh export_model.sh > model.log 2>&1
+  echo "### end to generate mode for serving test ###"
+  result=`find . -name matmul.mindir | wc -l`
+  if [ ${result} -ne 8 ]
+  then
+    cat model.log
+    echo "### generate model for serving test failed ###"
+    # Clean up before exiting; nothing placed after `exit` would ever run.
+    clean_pid
+    exit 1
+  fi
+  cd -
+}
+
+# Poll log file $2 once per second until exactly one line matches the extended
+# regex $1, for at most $3 polls. Returns 0 when matched, non-zero on timeout.
+wait_for_log()
+{
+  count=0
+  result=`grep -E "$1" "$2" | wc -l`
+  while [ ${result} -ne 1 ] && [ ${count} -lt $3 ]
+  do
+    sleep 1
+    count=$(($count+1))
+    result=`grep -E "$1" "$2" | wc -l`
+  done
+  # Judge by the match itself, so a match found on the final poll still counts.
+  [ ${result} -eq 1 ]
+}
+
+# Launch python script $1 in the background (output to $2) and wait for regex $3
+# to appear in that log, polling at most $4 times; on timeout dump the log and
+# fail the test. $5 is the component name used in the progress messages.
+start_component()
+{
+  echo "### start serving $5 ###"
+  unset http_proxy https_proxy
+  # `$?` after `&` is always 0, so readiness is judged from the log only.
+  python3 $1 > $2 2>&1 &
+  if ! wait_for_log "$3" $2 $4
+  then
+    clean_pid
+    cat $2
+    echo "start serving $5 failed!" && exit 1
+  fi
+  echo "### start serving $5 end ###"
+}
+
+start_master()
+{
+  start_component master.py master.log 'Serving gRPC server start success, listening on 127.0.0.1:5500' 50 master
+}
+
+start_worker()
+{
+  start_component worker.py worker.log 'Begin waiting ready of all agents' 100 worker
+}
+
+start_agent()
+{
+  start_component agent.py agent.log 'Child 0: Receive success' 100 agent
+}
+
+# Fail the test unless exactly $2 running processes mention $1; $3 is the message.
+expect_proc_num()
+{
+  num=`ps -ef | grep $1 | grep -v grep | wc -l`
+  if [ $num -ne $2 ]
+  then
+    echo "$3"
+    echo $num
+    clean_pid
+    exit 1
+  fi
+}
+
+# Fail the test unless exactly $3 lines of log file $2 match regex $1; $4 is the message.
+expect_log_num()
+{
+  num=`grep -E "$1" $2 | wc -l`
+  if [ $num -ne $3 ]
+  then
+    echo "$4"
+    echo $num
+    clean_pid
+    exit 1
+  fi
+}
+
+# Check the cluster is fully up, SIGKILL the worker, then verify the peers noticed.
+kill_worker()
+{
+  expect_proc_num master.py 1 "master start failed"
+  expect_proc_num worker.py 1 "worker start failed"
+  # NOTE(review): 9 is presumably the agent launcher plus 8 children - confirm.
+  expect_proc_num agent.py 9 "agent start failed"
+  # No timeout may be logged before the kill, otherwise the later counts prove nothing.
+  expect_log_num 'Recv Pong Time Out from' master.log 0 "worker has exited"
+  expect_log_num 'Recv Ping Time Out from' agent.log 0 "worker has exited"
+  ps aux | grep 'worker.py' | grep ${CURRUSER} | grep -v grep | awk '{print $2}' | xargs kill -9
+  if [ $? -ne 0 ]
+  then
+    echo "kill worker failed"
+  fi
+  # Wait before inspecting processes and logs so the timeouts have time to be logged.
+  sleep 25
+  expect_proc_num worker.py 0 "worker exit failed"
+  expect_log_num 'Recv Pong Time Out from' master.log 1 "master catch worker exit failed"
+  # NOTE(review): 8 presumably means one timeout line per agent child - confirm.
+  expect_log_num 'Recv Ping Time Out from' agent.log 8 "agent catch worker exit failed"
+}
+
+test_worker_fault_model()
+{
+  start_master
+  start_worker
+  start_agent
+  kill_worker
+  clean_pid
+}
+
+echo "-----serving start-----"
+rm -rf serving *.log *.mindir *.dat ${CURRPATH}/matmul ${CURRPATH}/kernel_meta
+rm -rf client.py *.json export_model master_with_worker.py master.py worker.py agent.py
+cp -r ../../../example/matmul_distributed/* .
+prepare_model
+test_worker_fault_model
+echo "### end to serving test ###"
diff --git a/tests/st/distribute_worker_fault/test_distribute_worker_fault.py b/tests/st/distribute_worker_fault/test_distribute_worker_fault.py
new file mode 100644
index 0000000..bd780ad
--- /dev/null
+++ b/tests/st/distribute_worker_fault/test_distribute_worker_fault.py
@@ -0,0 +1,43 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+import os
+import sys
+import pytest
+import numpy as np
+
+
+@pytest.mark.level0
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.env_single
+def test_distribute_worker_fault():
+    """Run the kill_15 and kill_9 worker-fault scripts; each must exit with status 0."""
+    sh_path = os.path.split(os.path.realpath(__file__))[0]
+    # Children of sys.path directories whose path contains '/site-packages/mindspore'.
+    folders = []
+    for python_path in sys.path:
+        if not os.path.isdir(python_path):
+            continue
+        folders += [os.path.join(python_path, x) for x in os.listdir(python_path)
+                    if os.path.isdir(os.path.join(python_path, x)) and
+                    '/site-packages/mindspore' in os.path.join(python_path, x)]
+    assert folders, "no '/site-packages/mindspore*' directory found on sys.path"
+    install_path = folders[-1].split('mindspore', 1)[0] + 'mindspore'
+    for script in ("kill_15_worker.sh", "kill_9_worker.sh"):
+        ret = os.system(f"sh {sh_path}/{script} {install_path}")
+        assert np.allclose(ret, 0), f"{script} returned status {ret}"
+
+if __name__ == '__main__':
+    test_distribute_worker_fault()
diff --git a/tests/st/master_fault/__init__.py b/tests/st/master_fault/__init__.py
index 47b43a6..919b057 100644
--- a/tests/st/master_fault/__init__.py
+++ b/tests/st/master_fault/__init__.py
@@ -1,4 +1,4 @@
-# Copyright 2020 Huawei Technologies Co., Ltd
+# Copyright 2021 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/tests/st/worker_fault/__init__.py b/tests/st/worker_fault/__init__.py
index 47b43a6..919b057 100644
--- a/tests/st/worker_fault/__init__.py
+++ b/tests/st/worker_fault/__init__.py
@@ -1,4 +1,4 @@
-# Copyright 2020 Huawei Technologies Co., Ltd
+# Copyright 2021 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.