diff --git a/tests/st/master_fault/__init__.py b/tests/st/master_fault/__init__.py
new file mode 100644
index 0000000..47b43a6
--- /dev/null
+++ b/tests/st/master_fault/__init__.py
@@ -0,0 +1,14 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
diff --git a/tests/st/master_fault/kill_15_master.sh b/tests/st/master_fault/kill_15_master.sh
new file mode 100644
index 0000000..6b6e85a
--- /dev/null
+++ b/tests/st/master_fault/kill_15_master.sh
@@ -0,0 +1,175 @@
+#!/bin/bash
+# Fault test: start the serving master and worker, send SIGTERM (kill -15) to the
+# master, then verify that both the master and the worker processes are gone.
+# Usage: kill_15_master.sh <MINDSPORE_INSTALL_PATH>
+
+export GLOG_v=1
+export DEVICE_ID=1
+
+MINDSPORE_INSTALL_PATH=$1
+ENV_DEVICE_ID=$DEVICE_ID
+CURRPATH=$(cd "$(dirname "$0")" || exit; pwd)
+CURRUSER=$(whoami)
+PROJECT_PATH=${CURRPATH}/../../../
+echo "MINDSPORE_INSTALL_PATH:" ${MINDSPORE_INSTALL_PATH}
+echo "ENV_DEVICE_ID:" ${ENV_DEVICE_ID}
+echo "CURRPATH:" ${CURRPATH}
+echo "CURRUSER:" ${CURRUSER}
+echo "PROJECT_PATH:" ${PROJECT_PATH}
+
+export LD_LIBRARY_PATH=${MINDSPORE_INSTALL_PATH}/lib:${LD_LIBRARY_PATH}
+#export PYTHONPATH=${MINDSPORE_INSTALL_PATH}/:${PYTHONPATH}
+
+echo "LD_LIBRARY_PATH: " ${LD_LIBRARY_PATH}
+echo "PYTHONPATH: " ${PYTHONPATH}
+echo "-------------show MINDSPORE_INSTALL_PATH----------------"
+ls -l ${MINDSPORE_INSTALL_PATH}
+echo "--------show /usr/local/python/python375/lib/-----------"
+ls -l /usr/local/python/python375/lib/
+
+# Force-kill (kill -9) every master.py and worker.py process owned by the current user.
+clean_pid()
+{
+  for script in master.py worker.py
+  do
+    pids=$(ps aux | grep "${script}" | grep ${CURRUSER} | grep -v grep | awk '{print $2}')
+    if [ -n "${pids}" ]
+    then
+      # Guard on the user-filtered PID list itself so kill is never run without a PID.
+      kill -9 ${pids}
+      sleep 6
+    fi
+  done
+}
+
+# Run export_model/add_model.py and require exactly one tensor_add.mindir afterwards.
+prepare_model()
+{
+  echo "### begin to generate mode for serving test ###"
+  cd export_model || exit 1
+  python3 add_model.py &> add_model.log
+  echo "### end to generate mode for serving test ###"
+  result=$(find . -name tensor_add.mindir | wc -l)
+  if [ ${result} -ne 1 ]
+  then
+    cat add_model.log
+    echo "### generate model for serving test failed ###"
+    # Clean up before exiting; anything placed after `exit` would never run.
+    clean_pid
+    exit 1
+  fi
+  cd -
+}
+
+# Launch master.py in the background and poll master.log once a second (at most 50 times)
+start_master()
+{
+  echo "### start serving master ###"
+  unset http_proxy https_proxy
+  # No `$?` check here: for a background job it only reports that the job was launched.
+  python3 master.py > master.log 2>&1 &
+
+  result=$(grep -E 'Serving gRPC server start success, listening on 127.0.0.1:5500' master.log | wc -l)
+  count=0
+  while [[ ${result} -ne 1 && ${count} -lt 50 ]]
+  do
+    sleep 1
+    count=$(($count+1))
+    result=$(grep -E 'Serving gRPC server start success, listening on 127.0.0.1:5500' master.log | wc -l)
+  done
+
+  # Decide on the log result, not the counter, so success on the last poll is not a failure.
+  if [ ${result} -ne 1 ]
+  then
+    clean_pid
+    cat master.log
+    echo "start serving master failed!" && exit 1
+  fi
+  echo "### start serving master end ###"
+}
+
+# Launch worker.py in the background and poll worker.log once a second (at most 100 times)
+start_worker()
+{
+  echo "### start serving worker ###"
+  unset http_proxy https_proxy
+  # No `$?` check here: for a background job it only reports that the job was launched.
+  python3 worker.py > worker.log 2>&1 &
+
+  result=$(grep -E 'Begin to send pong' worker.log | wc -l)
+  count=0
+  while [[ ${result} -ne 1 && ${count} -lt 100 ]]
+  do
+    sleep 1
+    count=$(($count+1))
+    result=$(grep -E 'Begin to send pong' worker.log | wc -l)
+  done
+
+  # Decide on the log result, not the counter, so success on the last poll is not a failure.
+  if [ ${result} -ne 1 ]
+  then
+    clean_pid
+    cat worker.log
+    echo "start serving worker failed!" && exit 1
+  fi
+  echo "### start serving worker end ###"
+}
+
+# Require one master and one worker, SIGTERM the master, then require both to be gone.
+kill_master()
+{
+  num=$(ps -ef | grep master.py | grep -v grep | wc -l)
+  if [ $num -ne 1 ]
+  then
+    echo "master start failed"
+    echo $num
+    clean_pid; exit 1
+  fi
+  num=$(ps -ef | grep worker.py | grep -v grep | wc -l)
+  if [ $num -ne 1 ]
+  then
+    echo "worker start failed"
+    echo $num
+    clean_pid; exit 1
+  fi
+  ps aux | grep 'master.py' | grep ${CURRUSER} | grep -v grep | awk '{print $2}' | xargs kill -15
+  # $? is the status of xargs, the last command of the pipeline above.
+  if [ $? -ne 0 ]
+  then
+    echo "kill master failed"
+  fi
+  sleep 5
+  num=$(ps -ef | grep master.py | grep -v grep | wc -l)
+  if [ $num -ne 0 ]
+  then
+    echo "master exit failed"
+    echo $num
+    clean_pid; exit 1
+  fi
+  num=$(ps -ef | grep worker.py | grep -v grep | wc -l)
+  if [ $num -ne 0 ]
+  then
+    echo "worker exit failed"
+    echo $num
+    clean_pid; exit 1
+  fi
+}
+
+# Full scenario: start master and worker, kill the master, then remove any leftovers.
+test_master_fault_model()
+{
+  start_master
+  start_worker
+  kill_master
+  clean_pid
+}
+
+echo "-----serving start-----"
+# The relative paths below assume the script directory is the working directory.
+cd "${CURRPATH}" || exit 1
+rm -rf serving *.log *.mindir *.dat ${CURRPATH}/add ${CURRPATH}/kernel_meta
+rm -rf add client.py client_mul_process.py export_model master_with_worker.py master.py worker.py
+cp -r ../../../example/add/* .
+prepare_model
+test_master_fault_model
+echo "### end to serving test ###"
diff --git a/tests/st/master_fault/kill_9_master.sh b/tests/st/master_fault/kill_9_master.sh
new file mode 100644
index 0000000..af07298
--- /dev/null
+++ b/tests/st/master_fault/kill_9_master.sh
@@ -0,0 +1,182 @@
+#!/bin/bash
+# Fault test: start the serving master and worker, send SIGKILL (kill -9) to the
+# master, then verify that worker.log records 'Recv Ping Time Out from' exactly once.
+# Usage: kill_9_master.sh <MINDSPORE_INSTALL_PATH>
+
+export GLOG_v=1
+export DEVICE_ID=1
+
+MINDSPORE_INSTALL_PATH=$1
+ENV_DEVICE_ID=$DEVICE_ID
+CURRPATH=$(cd "$(dirname "$0")" || exit; pwd)
+CURRUSER=$(whoami)
+PROJECT_PATH=${CURRPATH}/../../../
+echo "MINDSPORE_INSTALL_PATH:" ${MINDSPORE_INSTALL_PATH}
+echo "ENV_DEVICE_ID:" ${ENV_DEVICE_ID}
+echo "CURRPATH:" ${CURRPATH}
+echo "CURRUSER:" ${CURRUSER}
+echo "PROJECT_PATH:" ${PROJECT_PATH}
+
+export LD_LIBRARY_PATH=${MINDSPORE_INSTALL_PATH}/lib:${LD_LIBRARY_PATH}
+#export PYTHONPATH=${MINDSPORE_INSTALL_PATH}/:${PYTHONPATH}
+
+echo "LD_LIBRARY_PATH: " ${LD_LIBRARY_PATH}
+echo "PYTHONPATH: " ${PYTHONPATH}
+echo "-------------show MINDSPORE_INSTALL_PATH----------------"
+ls -l ${MINDSPORE_INSTALL_PATH}
+echo "--------show /usr/local/python/python375/lib/-----------"
+ls -l /usr/local/python/python375/lib/
+
+# Force-kill (kill -9) every master.py and worker.py process owned by the current user.
+clean_pid()
+{
+  for script in master.py worker.py
+  do
+    pids=$(ps aux | grep "${script}" | grep ${CURRUSER} | grep -v grep | awk '{print $2}')
+    if [ -n "${pids}" ]
+    then
+      # Guard on the user-filtered PID list itself so kill is never run without a PID.
+      kill -9 ${pids}
+      sleep 6
+    fi
+  done
+}
+
+# Run export_model/add_model.py and require exactly one tensor_add.mindir afterwards.
+prepare_model()
+{
+  echo "### begin to generate mode for serving test ###"
+  cd export_model || exit 1
+  python3 add_model.py &> add_model.log
+  echo "### end to generate mode for serving test ###"
+  result=$(find . -name tensor_add.mindir | wc -l)
+  if [ ${result} -ne 1 ]
+  then
+    cat add_model.log
+    echo "### generate model for serving test failed ###"
+    # Clean up before exiting; anything placed after `exit` would never run.
+    clean_pid
+    exit 1
+  fi
+  cd -
+}
+
+# Launch master.py in the background and poll master.log once a second (at most 50 times)
+start_master()
+{
+  echo "### start serving master ###"
+  unset http_proxy https_proxy
+  # No `$?` check here: for a background job it only reports that the job was launched.
+  python3 master.py > master.log 2>&1 &
+
+  result=$(grep -E 'Serving gRPC server start success, listening on 127.0.0.1:5500' master.log | wc -l)
+  count=0
+  while [[ ${result} -ne 1 && ${count} -lt 50 ]]
+  do
+    sleep 1
+    count=$(($count+1))
+    result=$(grep -E 'Serving gRPC server start success, listening on 127.0.0.1:5500' master.log | wc -l)
+  done
+
+  # Decide on the log result, not the counter, so success on the last poll is not a failure.
+  if [ ${result} -ne 1 ]
+  then
+    clean_pid
+    cat master.log
+    echo "start serving master failed!" && exit 1
+  fi
+  echo "### start serving master end ###"
+}
+
+# Launch worker.py in the background and poll worker.log once a second (at most 100 times)
+start_worker()
+{
+  echo "### start serving worker ###"
+  unset http_proxy https_proxy
+  # No `$?` check here: for a background job it only reports that the job was launched.
+  python3 worker.py > worker.log 2>&1 &
+
+  result=$(grep -E 'Begin to send pong' worker.log | wc -l)
+  count=0
+  while [[ ${result} -ne 1 && ${count} -lt 100 ]]
+  do
+    sleep 1
+    count=$(($count+1))
+    result=$(grep -E 'Begin to send pong' worker.log | wc -l)
+  done
+
+  # Decide on the log result, not the counter, so success on the last poll is not a failure.
+  if [ ${result} -ne 1 ]
+  then
+    clean_pid
+    cat worker.log
+    echo "start serving worker failed!" && exit 1
+  fi
+  echo "### start serving worker end ###"
+}
+
+# Require one master and one worker, SIGKILL the master, then require the worker to log the ping time-out once.
+kill_master()
+{
+  num=$(ps -ef | grep master.py | grep -v grep | wc -l)
+  if [ $num -ne 1 ]
+  then
+    echo "master start failed"
+    echo $num
+    clean_pid; exit 1
+  fi
+  num=$(ps -ef | grep worker.py | grep -v grep | wc -l)
+  if [ $num -ne 1 ]
+  then
+    echo "worker start failed"
+    echo $num
+    clean_pid; exit 1
+  fi
+  num=$(grep -E 'Recv Ping Time Out from' worker.log | wc -l)
+  if [ $num -ne 0 ]
+  then
+    echo "master has exited"
+    echo $num
+    clean_pid; exit 1
+  fi
+  ps aux | grep 'master.py' | grep ${CURRUSER} | grep -v grep | awk '{print $2}' | xargs kill -9
+  # $? is the status of xargs, the last command of the pipeline above.
+  if [ $? -ne 0 ]
+  then
+    echo "kill master failed"
+  fi
+  sleep 25
+  num=$(ps -ef | grep master.py | grep -v grep | wc -l)
+  if [ $num -ne 0 ]
+  then
+    echo "master exit failed"
+    echo $num
+    clean_pid; exit 1
+  fi
+  num=$(grep -E 'Recv Ping Time Out from' worker.log | wc -l)
+  if [ $num -ne 1 ]
+  then
+    echo "catch master exit failed"
+    echo $num
+    clean_pid; exit 1
+  fi
+}
+
+# Full scenario: start master and worker, kill the master, then remove any leftovers.
+test_master_fault_model()
+{
+  start_master
+  start_worker
+  kill_master
+  clean_pid
+}
+
+echo "-----serving start-----"
+# The relative paths below assume the script directory is the working directory.
+cd "${CURRPATH}" || exit 1
+rm -rf serving *.log *.mindir *.dat ${CURRPATH}/add ${CURRPATH}/kernel_meta
+rm -rf add client.py client_mul_process.py export_model master_with_worker.py master.py worker.py
+cp -r ../../../example/add/* .
+prepare_model
+test_master_fault_model
+echo "### end to serving test ###"
diff --git a/tests/st/master_fault/test_master_fault.py b/tests/st/master_fault/test_master_fault.py
new file mode 100644
index 0000000..4a78163
--- /dev/null
+++ b/tests/st/master_fault/test_master_fault.py
@@ -0,0 +1,43 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+import os
+import sys
+import pytest
+import numpy as np
+
+
+@pytest.mark.level0
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.env_single
+def test_master_fault():
+    """Run the SIGTERM and SIGKILL master-fault scripts; each must exit with status 0."""
+    sh_path = os.path.dirname(os.path.realpath(__file__))
+    # Every directory under a sys.path entry whose path contains '/site-packages/mindspore'.
+    folders = [
+        os.path.join(folder, entry)
+        for folder in sys.path if os.path.isdir(folder)
+        for entry in os.listdir(folder)
+        if os.path.isdir(os.path.join(folder, entry))
+        and '/site-packages/mindspore' in os.path.join(folder, entry)
+    ]
+    assert folders, "mindspore is not installed under any site-packages directory in sys.path"
+    install_path = folders[0].split('mindspore', 1)[0] + 'mindspore'
+    for script in ("kill_15_master.sh", "kill_9_master.sh"):
+        ret = os.system(f"bash {sh_path}/{script} {install_path}")  # scripts use bash-only syntax
+        assert ret == 0, f"{script} exited with status {ret}"
+
+if __name__ == '__main__':
+    test_master_fault()
diff --git a/tests/st/resnet/resnet.sh b/tests/st/resnet/resnet.sh
index 95d68cb..9a83a3d 100644
--- a/tests/st/resnet/resnet.sh
+++ b/tests/st/resnet/resnet.sh
@@ -58,7 +58,7 @@ start_service()
   python3 master_with_worker.py > service.log 2>&1 &
   if [ $? -ne 0 ]
   then
-    echo "server faile to start."
+    echo "server failed to start."
   fi
 
   result=`grep -E 'Serving gRPC server start success, listening on 127.0.0.1:5500' service.log | wc -l`
@@ -88,7 +88,7 @@ pytest_serving()
   then
     clean_pid
     cat client.log
-    echo "client faile to start." && exit 1
+    echo "client failed to start." && exit 1
   fi
   echo "### client end ###"
 }
@@ -107,3 +107,4 @@ rm -rf client.py export_model master_with_worker.py resnet50 test_image
 cp -r ../../../example/resnet/* .
 prepare_model
 test_renet_model
+echo "### end to serving test ###"
diff --git a/tests/st/worker_fault/__init__.py b/tests/st/worker_fault/__init__.py
new file mode 100644
index 0000000..47b43a6
--- /dev/null
+++ b/tests/st/worker_fault/__init__.py
@@ -0,0 +1,14 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
diff --git a/tests/st/worker_fault/kill_15_worker.sh b/tests/st/worker_fault/kill_15_worker.sh
new file mode 100644
index 0000000..296a23b
--- /dev/null
+++ b/tests/st/worker_fault/kill_15_worker.sh
@@ -0,0 +1,190 @@
+#!/bin/bash
+# Fault test: start the serving master and worker, send SIGTERM (kill -15) to the worker,
+# then verify the worker is gone, the master still runs and master.log has one 'Worker Exit'.
+# Usage: kill_15_worker.sh <MINDSPORE_INSTALL_PATH>
+
+export GLOG_v=1
+export DEVICE_ID=1
+
+MINDSPORE_INSTALL_PATH=$1
+ENV_DEVICE_ID=$DEVICE_ID
+CURRPATH=$(cd "$(dirname "$0")" || exit; pwd)
+CURRUSER=$(whoami)
+PROJECT_PATH=${CURRPATH}/../../../
+echo "MINDSPORE_INSTALL_PATH:" ${MINDSPORE_INSTALL_PATH}
+echo "ENV_DEVICE_ID:" ${ENV_DEVICE_ID}
+echo "CURRPATH:" ${CURRPATH}
+echo "CURRUSER:" ${CURRUSER}
+echo "PROJECT_PATH:" ${PROJECT_PATH}
+
+export LD_LIBRARY_PATH=${MINDSPORE_INSTALL_PATH}/lib:${LD_LIBRARY_PATH}
+#export PYTHONPATH=${MINDSPORE_INSTALL_PATH}/:${PYTHONPATH}
+
+echo "LD_LIBRARY_PATH: " ${LD_LIBRARY_PATH}
+echo "PYTHONPATH: " ${PYTHONPATH}
+echo "-------------show MINDSPORE_INSTALL_PATH----------------"
+ls -l ${MINDSPORE_INSTALL_PATH}
+echo "--------show /usr/local/python/python375/lib/-----------"
+ls -l /usr/local/python/python375/lib/
+
+# Force-kill (kill -9) every master.py and worker.py process owned by the current user.
+clean_pid()
+{
+  for script in master.py worker.py
+  do
+    pids=$(ps aux | grep "${script}" | grep ${CURRUSER} | grep -v grep | awk '{print $2}')
+    if [ -n "${pids}" ]
+    then
+      # Guard on the user-filtered PID list itself so kill is never run without a PID.
+      kill -9 ${pids}
+      sleep 6
+    fi
+  done
+}
+
+# Run export_model/add_model.py and require exactly one tensor_add.mindir afterwards.
+prepare_model()
+{
+  echo "### begin to generate mode for serving test ###"
+  cd export_model || exit 1
+  python3 add_model.py &> add_model.log
+  echo "### end to generate mode for serving test ###"
+  result=$(find . -name tensor_add.mindir | wc -l)
+  if [ ${result} -ne 1 ]
+  then
+    cat add_model.log
+    echo "### generate model for serving test failed ###"
+    # Clean up before exiting; anything placed after `exit` would never run.
+    clean_pid
+    exit 1
+  fi
+  cd -
+}
+
+# Launch master.py in the background and poll master.log once a second (at most 50 times)
+start_master()
+{
+  echo "### start serving master ###"
+  unset http_proxy https_proxy
+  # No `$?` check here: for a background job it only reports that the job was launched.
+  python3 master.py > master.log 2>&1 &
+
+  result=$(grep -E 'Serving gRPC server start success, listening on 127.0.0.1:5500' master.log | wc -l)
+  count=0
+  while [[ ${result} -ne 1 && ${count} -lt 50 ]]
+  do
+    sleep 1
+    count=$(($count+1))
+    result=$(grep -E 'Serving gRPC server start success, listening on 127.0.0.1:5500' master.log | wc -l)
+  done
+
+  # Decide on the log result, not the counter, so success on the last poll is not a failure.
+  if [ ${result} -ne 1 ]
+  then
+    clean_pid
+    cat master.log
+    echo "start serving master failed!" && exit 1
+  fi
+  echo "### start serving master end ###"
+}
+
+# Launch worker.py in the background and poll worker.log once a second (at most 100 times)
+start_worker()
+{
+  echo "### start serving worker ###"
+  unset http_proxy https_proxy
+  # No `$?` check here: for a background job it only reports that the job was launched.
+  python3 worker.py > worker.log 2>&1 &
+
+  result=$(grep -E 'Begin to send pong' worker.log | wc -l)
+  count=0
+  while [[ ${result} -ne 1 && ${count} -lt 100 ]]
+  do
+    sleep 1
+    count=$(($count+1))
+    result=$(grep -E 'Begin to send pong' worker.log | wc -l)
+  done
+
+  # Decide on the log result, not the counter, so success on the last poll is not a failure.
+  if [ ${result} -ne 1 ]
+  then
+    clean_pid
+    cat worker.log
+    echo "start serving worker failed!" && exit 1
+  fi
+  echo "### start serving worker end ###"
+}
+
+# Require one master and one worker, SIGTERM the worker, then check processes and master.log.
+kill_worker()
+{
+  num=$(ps -ef | grep master.py | grep -v grep | wc -l)
+  if [ $num -ne 1 ]
+  then
+    echo "master start failed"
+    echo $num
+    clean_pid; exit 1
+  fi
+  num=$(grep -E 'Worker Exit' master.log | wc -l)
+  if [ $num -ne 0 ]
+  then
+    echo "worker has exited"
+    echo $num
+    clean_pid; exit 1
+  fi
+  num=$(ps -ef | grep worker.py | grep -v grep | wc -l)
+  if [ $num -ne 1 ]
+  then
+    echo "worker start failed"
+    echo $num
+    clean_pid; exit 1
+  fi
+
+  ps aux | grep 'worker.py' | grep ${CURRUSER} | grep -v grep | awk '{print $2}' | xargs kill -15
+  # $? is the status of xargs, the last command of the pipeline above.
+  if [ $? -ne 0 ]
+  then
+    echo "kill worker failed"
+  fi
+  sleep 5
+  num=$(ps -ef | grep master.py | grep -v grep | wc -l)
+  if [ $num -ne 1 ]
+  then
+    echo "master exited unexpectedly"
+    echo $num
+    clean_pid; exit 1
+  fi
+  num=$(ps -ef | grep worker.py | grep -v grep | wc -l)
+  if [ $num -ne 0 ]
+  then
+    echo "worker exit failed"
+    echo $num
+    clean_pid; exit 1
+  fi
+  num=$(grep -E 'Worker Exit' master.log | wc -l)
+  if [ $num -ne 1 ]
+  then
+    echo "catch worker exit failed"
+    echo $num
+    clean_pid; exit 1
+  fi
+}
+
+# Full scenario: start master and worker, kill the worker, then remove any leftovers.
+test_worker_fault_model()
+{
+  start_master
+  start_worker
+  kill_worker
+  clean_pid
+}
+
+echo "-----serving start-----"
+# The relative paths below assume the script directory is the working directory.
+cd "${CURRPATH}" || exit 1
+rm -rf serving *.log *.mindir *.dat ${CURRPATH}/add ${CURRPATH}/kernel_meta
+rm -rf add client.py client_mul_process.py export_model master_with_worker.py master.py worker.py
+cp -r ../../../example/add/* .
+prepare_model
+test_worker_fault_model
+echo "### end to serving test ###"
diff --git a/tests/st/worker_fault/kill_9_worker.sh b/tests/st/worker_fault/kill_9_worker.sh
new file mode 100644
index 0000000..af59fbe
--- /dev/null
+++ b/tests/st/worker_fault/kill_9_worker.sh
@@ -0,0 +1,183 @@
+#!/bin/bash
+# Fault test: start the serving master and worker, send SIGKILL (kill -9) to the worker,
+# then verify the master still runs and master.log has one 'Recv Pong Time Out from' line.
+# Usage: kill_9_worker.sh <MINDSPORE_INSTALL_PATH>
+
+export GLOG_v=1
+export DEVICE_ID=1
+
+MINDSPORE_INSTALL_PATH=$1
+ENV_DEVICE_ID=$DEVICE_ID
+CURRPATH=$(cd "$(dirname "$0")" || exit; pwd)
+CURRUSER=$(whoami)
+PROJECT_PATH=${CURRPATH}/../../../
+echo "MINDSPORE_INSTALL_PATH:" ${MINDSPORE_INSTALL_PATH}
+echo "ENV_DEVICE_ID:" ${ENV_DEVICE_ID}
+echo "CURRPATH:" ${CURRPATH}
+echo "CURRUSER:" ${CURRUSER}
+echo "PROJECT_PATH:" ${PROJECT_PATH}
+
+export LD_LIBRARY_PATH=${MINDSPORE_INSTALL_PATH}/lib:${LD_LIBRARY_PATH}
+#export PYTHONPATH=${MINDSPORE_INSTALL_PATH}/:${PYTHONPATH}
+
+echo "LD_LIBRARY_PATH: " ${LD_LIBRARY_PATH}
+echo "PYTHONPATH: " ${PYTHONPATH}
+echo "-------------show MINDSPORE_INSTALL_PATH----------------"
+ls -l ${MINDSPORE_INSTALL_PATH}
+echo "--------show /usr/local/python/python375/lib/-----------"
+ls -l /usr/local/python/python375/lib/
+
+# Force-kill (kill -9) every master.py and worker.py process owned by the current user.
+clean_pid()
+{
+  for script in master.py worker.py
+  do
+    pids=$(ps aux | grep "${script}" | grep ${CURRUSER} | grep -v grep | awk '{print $2}')
+    if [ -n "${pids}" ]
+    then
+      # Guard on the user-filtered PID list itself so kill is never run without a PID.
+      kill -9 ${pids}
+      sleep 6
+    fi
+  done
+}
+
+# Run export_model/add_model.py and require exactly one tensor_add.mindir afterwards.
+prepare_model()
+{
+  echo "### begin to generate mode for serving test ###"
+  cd export_model || exit 1
+  python3 add_model.py &> add_model.log
+  echo "### end to generate mode for serving test ###"
+  result=$(find . -name tensor_add.mindir | wc -l)
+  if [ ${result} -ne 1 ]
+  then
+    cat add_model.log
+    echo "### generate model for serving test failed ###"
+    # Clean up before exiting; anything placed after `exit` would never run.
+    clean_pid
+    exit 1
+  fi
+  cd -
+}
+
+# Launch master.py in the background and poll master.log once a second (at most 50 times)
+start_master()
+{
+  echo "### start serving master ###"
+  unset http_proxy https_proxy
+  # No `$?` check here: for a background job it only reports that the job was launched.
+  python3 master.py > master.log 2>&1 &
+
+  result=$(grep -E 'Serving gRPC server start success, listening on 127.0.0.1:5500' master.log | wc -l)
+  count=0
+  while [[ ${result} -ne 1 && ${count} -lt 50 ]]
+  do
+    sleep 1
+    count=$(($count+1))
+    result=$(grep -E 'Serving gRPC server start success, listening on 127.0.0.1:5500' master.log | wc -l)
+  done
+
+  # Decide on the log result, not the counter, so success on the last poll is not a failure.
+  if [ ${result} -ne 1 ]
+  then
+    clean_pid
+    cat master.log
+    echo "start serving master failed!" && exit 1
+  fi
+  echo "### start serving master end ###"
+}
+
+# Launch worker.py in the background and poll worker.log once a second (at most 100 times)
+start_worker()
+{
+  echo "### start serving worker ###"
+  unset http_proxy https_proxy
+  # No `$?` check here: for a background job it only reports that the job was launched.
+  python3 worker.py > worker.log 2>&1 &
+
+  result=$(grep -E 'Begin to send pong' worker.log | wc -l)
+  count=0
+  while [[ ${result} -ne 1 && ${count} -lt 100 ]]
+  do
+    sleep 1
+    count=$(($count+1))
+    result=$(grep -E 'Begin to send pong' worker.log | wc -l)
+  done
+
+  # Decide on the log result, not the counter, so success on the last poll is not a failure.
+  if [ ${result} -ne 1 ]
+  then
+    clean_pid
+    cat worker.log
+    echo "start serving worker failed!" && exit 1
+  fi
+  echo "### start serving worker end ###"
+}
+
+# Require one master and one worker, SIGKILL the worker, then require the master to log the pong time-out once.
+kill_worker()
+{
+  num=$(ps -ef | grep master.py | grep -v grep | wc -l)
+  if [ $num -ne 1 ]
+  then
+    echo "master start failed"
+    echo $num
+    clean_pid; exit 1
+  fi
+  num=$(grep -E 'Recv Pong Time Out from' master.log | wc -l)
+  if [ $num -ne 0 ]
+  then
+    echo "worker has exited"
+    echo $num
+    clean_pid; exit 1
+  fi
+  num=$(ps -ef | grep worker.py | grep -v grep | wc -l)
+  if [ $num -ne 1 ]
+  then
+    echo "worker start failed"
+    echo $num
+    clean_pid; exit 1
+  fi
+
+  ps aux | grep 'worker.py' | grep ${CURRUSER} | grep -v grep | awk '{print $2}' | xargs kill -9
+  # $? is the status of xargs, the last command of the pipeline above.
+  if [ $? -ne 0 ]
+  then
+    echo "kill worker failed"
+  fi
+  sleep 25
+  num=$(ps -ef | grep master.py | grep -v grep | wc -l)
+  if [ $num -ne 1 ]
+  then
+    echo "master exited unexpectedly"
+    echo $num
+    clean_pid; exit 1
+  fi
+  num=$(grep -E 'Recv Pong Time Out from' master.log | wc -l)
+  if [ $num -ne 1 ]
+  then
+    echo "catch worker exit failed"
+    echo $num
+    clean_pid; exit 1
+  fi
+}
+
+# Full scenario: start master and worker, kill the worker, then remove any leftovers.
+test_worker_fault_model()
+{
+  start_master
+  start_worker
+  kill_worker
+  clean_pid
+}
+
+echo "-----serving start-----"
+# The relative paths below assume the script directory is the working directory.
+cd "${CURRPATH}" || exit 1
+rm -rf serving *.log *.mindir *.dat ${CURRPATH}/add ${CURRPATH}/kernel_meta
+rm -rf add client.py client_mul_process.py export_model master_with_worker.py master.py worker.py
+cp -r ../../../example/add/* .
+prepare_model
+test_worker_fault_model
+echo "### end to serving test ###"
diff --git a/tests/st/worker_fault/test_worker_fault.py b/tests/st/worker_fault/test_worker_fault.py
new file mode 100644
index 0000000..e02425b
--- /dev/null
+++ b/tests/st/worker_fault/test_worker_fault.py
@@ -0,0 +1,43 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+import os
+import sys
+import pytest
+import numpy as np
+
+
+@pytest.mark.level0
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.env_single
+def test_worker_fault():
+    """Run the SIGTERM and SIGKILL worker-fault scripts; each must exit with status 0."""
+    sh_path = os.path.dirname(os.path.realpath(__file__))
+    # Every directory under a sys.path entry whose path contains '/site-packages/mindspore'.
+    folders = [
+        os.path.join(folder, entry)
+        for folder in sys.path if os.path.isdir(folder)
+        for entry in os.listdir(folder)
+        if os.path.isdir(os.path.join(folder, entry))
+        and '/site-packages/mindspore' in os.path.join(folder, entry)
+    ]
+    assert folders, "mindspore is not installed under any site-packages directory in sys.path"
+    install_path = folders[0].split('mindspore', 1)[0] + 'mindspore'  # first match in sys.path order
+    for script in ("kill_15_worker.sh", "kill_9_worker.sh"):
+        ret = os.system(f"bash {sh_path}/{script} {install_path}")  # scripts use bash-only syntax
+        assert ret == 0, f"{script} exited with status {ret}"
+
+if __name__ == '__main__':
+    test_worker_fault()