From: @zhangyinxia Reviewed-by: @zhoufeng54,@xu-yfei Signed-off-by: @xu-yfei tags/v1.2.0
| @@ -0,0 +1,26 @@ | |||
| # Copyright 2021 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| """Start Distributed Servable matmul""" | |||
| from mindspore_serving import master | |||
def start():
    """Start the Serving gRPC server, then the master server, on localhost.

    The gRPC server is started on 127.0.0.1:5500 and the master server on
    127.0.0.1:6500, in that order.
    """
    host = "127.0.0.1"
    grpc_port, master_port = 5500, 6500
    master.start_grpc_server(host, grpc_port)
    master.start_master_server(host, master_port)


if __name__ == "__main__":
    start()
| @@ -0,0 +1,31 @@ | |||
| # Copyright 2021 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| """Start Distributed Servable matmul""" | |||
| import os | |||
| import sys | |||
| from mindspore_serving.worker import distributed | |||
def start():
    """Start the distributed servable "matmul" (version 1) from this script's directory.

    The servable directory is the directory containing the launched script
    (resolved from ``sys.argv[0]``). The worker address is 127.0.0.1:6200 and
    the master address passed along is 127.0.0.1:6500.
    """
    script_file = os.path.realpath(sys.argv[0])
    servable_dir = os.path.dirname(script_file)
    distributed.start_distributed_servable(
        servable_dir,
        "matmul",
        rank_table_json_file="rank_table_8pcs.json",
        version_number=1,
        worker_ip="127.0.0.1",
        worker_port=6200,
        master_ip="127.0.0.1",
        master_port=6500,
    )


if __name__ == "__main__":
    start()
| @@ -0,0 +1,14 @@ | |||
| # Copyright 2021 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| @@ -0,0 +1,214 @@ | |||
#!/bin/bash
# Environment setup and diagnostics for the agent-fault (SIGTERM) serving test.
# Usage: <this script> MINDSPORE_INSTALL_PATH
export GLOG_v=1   # NOTE(review): presumably INFO level, needed for the log lines grepped below - confirm
export DEVICE_ID=1
MINDSPORE_INSTALL_PATH=$1
ENV_DEVICE_ID=$DEVICE_ID
# "$0" is quoted so a script path containing spaces still resolves.
CURRPATH=$(cd "$(dirname "$0")" || exit; pwd)
CURRUSER=$(whoami)
PROJECT_PATH=${CURRPATH}/../../../
echo "MINDSPORE_INSTALL_PATH:" "${MINDSPORE_INSTALL_PATH}"
echo "ENV_DEVICE_ID:" "${ENV_DEVICE_ID}"
echo "CURRPATH:" "${CURRPATH}"
echo "CURRUSER:" "${CURRUSER}"
echo "PROJECT_PATH:" "${PROJECT_PATH}"
export LD_LIBRARY_PATH=${MINDSPORE_INSTALL_PATH}/lib:${LD_LIBRARY_PATH}
#export PYTHONPATH=${MINDSPORE_INSTALL_PATH}/:${PYTHONPATH}
echo "LD_LIBRARY_PATH: " "${LD_LIBRARY_PATH}"
echo "PYTHONPATH: " "${PYTHONPATH}"
echo "-------------show MINDSPORE_INSTALL_PATH----------------"
ls -l "${MINDSPORE_INSTALL_PATH}"
# The banner now names the directory that is actually listed.
echo "----------show /usr/local/python/python375/lib/----------"
ls -l /usr/local/python/python375/lib/
# Force-kill (SIGKILL) the master.py, worker.py and agent.py processes owned by
# ${CURRUSER}. After each script name that had any matching process, wait
# 6 seconds before moving on.
clean_pid()
{
    for clean_script in master.py worker.py agent.py
    do
        # Detection is unchanged: any process (of any user) mentioning the script.
        num=`ps -ef | grep "${clean_script}" | grep -v grep | wc -l`
        if [ "${num}" -ne 0 ]
        then
            clean_pids=`ps aux | grep "${clean_script}" | grep "${CURRUSER}" | grep -v grep | awk '{print $2}'`
            # Only kill when the current user owns a match; the old
            # `... | xargs kill -9` ran kill with no PID in that case.
            if [ -n "${clean_pids}" ]
            then
                kill -9 ${clean_pids}
            fi
            sleep 6
        fi
    done
}
# Run export_model/export_model.sh and require exactly 8 matmul.mindir files
# under export_model (presumably one per device of the 8-device rank table -
# confirm). On failure the export log is printed, leftover processes are
# cleaned up and the script exits with status 1.
prepare_model()
{
    echo "### begin to generate mode for serving test ###"
    cd export_model || { echo "### cannot enter export_model directory ###"; exit 1; }
    # Portable form of `&>`, which plain sh parses as "run in background".
    sh export_model.sh > model.log 2>&1
    echo "### end to generate mode for serving test ###"
    result=`find . -name matmul.mindir | wc -l`
    if [ "${result}" -ne 8 ]
    then
        cat model.log
        echo "### generate model for serving test failed ###"
        # Cleanup must happen before exit; it used to sit after `exit 1`
        # and was never reached.
        clean_pid
        cd -
        exit 1
    fi
    cd -
}
# Launch master.py in the background and poll master.log once per second
# (at most 50 times) until the gRPC "start success" line appears exactly once.
# If it never does, clean up, print the log and exit with status 1.
start_master()
{
    echo "### start serving master ###"
    unset http_proxy https_proxy
    python3 master.py > master.log 2>&1 &
    # $? after `&` only says the job was forked (always 0), so readiness is
    # judged from the log alone.
    master_ready='Serving gRPC server start success, listening on 127.0.0.1:5500'
    count=0
    # -F: match the message literally (the dots in the address are not wildcards).
    result=`grep -F "${master_ready}" master.log | wc -l`
    while [ "${result}" -ne 1 ] && [ "${count}" -lt 50 ]
    do
        sleep 1
        count=$((count+1))
        result=`grep -F "${master_ready}" master.log | wc -l`
    done
    # Decide on the grep result, not on the counter: a success seen on the
    # final poll used to be reported as a failure.
    if [ "${result}" -ne 1 ]
    then
        clean_pid
        cat master.log
        echo "start serving master failed!"
        exit 1
    fi
    echo "### start serving master end ###"
}
# Launch worker.py in the background and poll worker.log once per second
# (at most 100 times) until "Begin waiting ready of all agents" appears exactly
# once. If it never does, clean up, print the log and exit with status 1.
start_worker()
{
    echo "### start serving worker ###"
    unset http_proxy https_proxy
    python3 worker.py > worker.log 2>&1 &
    # $? after `&` only says the job was forked (always 0), so readiness is
    # judged from the log alone.
    worker_ready='Begin waiting ready of all agents'
    count=0
    result=`grep -E "${worker_ready}" worker.log | wc -l`
    while [ "${result}" -ne 1 ] && [ "${count}" -lt 100 ]
    do
        sleep 1
        count=$((count+1))
        result=`grep -E "${worker_ready}" worker.log | wc -l`
    done
    # Decide on the grep result, not on the counter: a success seen on the
    # final poll used to be reported as a failure.
    if [ "${result}" -ne 1 ]
    then
        clean_pid
        cat worker.log
        echo "start serving worker failed!"
        exit 1
    fi
    echo "### start serving worker end ###"
}
# Launch agent.py in the background and poll agent.log once per second
# (at most 100 times) until "Child 0: Receive success" appears exactly once.
# If it never does, clean up, print the log and exit with status 1.
start_agent()
{
    echo "### start serving agent ###"
    unset http_proxy https_proxy
    python3 agent.py > agent.log 2>&1 &
    # $? after `&` only says the job was forked (always 0), so readiness is
    # judged from the log alone.
    agent_ready='Child 0: Receive success'
    count=0
    result=`grep -E "${agent_ready}" agent.log | wc -l`
    while [ "${result}" -ne 1 ] && [ "${count}" -lt 100 ]
    do
        sleep 1
        count=$((count+1))
        result=`grep -E "${agent_ready}" agent.log | wc -l`
    done
    # Decide on the grep result, not on the counter: a success seen on the
    # final poll used to be reported as a failure.
    if [ "${result}" -ne 1 ]
    then
        clean_pid
        cat agent.log
        echo "start serving agent failed!"
        exit 1
    fi
    echo "### start serving agent end ###"
}
# Helper: require exactly $2 processes whose command line mentions $1.
# On mismatch print message $3 and the observed count, clean up and exit 1.
_expect_proc_count()
{
    num=`ps -ef | grep "$1" | grep -v grep | wc -l`
    if [ "${num}" -ne "$2" ]
    then
        echo "$3"
        echo $num
        # Exit regardless of the cleanup status (was `clean_pid && exit 1`).
        clean_pid
        exit 1
    fi
}

# Send SIGTERM to every agent.py process of ${CURRUSER}, wait 25 seconds, then
# require that the master is still running while worker and agents are gone.
kill_agent()
{
    # Preconditions: 1 master, 1 worker and 9 agent.py processes
    # (presumably 1 parent + 8 children - confirm).
    _expect_proc_count master.py 1 "master start failed"
    _expect_proc_count worker.py 1 "worker start failed"
    _expect_proc_count agent.py 9 "agent start failed"
    ps aux | grep 'agent.py' | grep "${CURRUSER}" | grep -v grep | awk '{print $2}' | xargs kill -15
    if [ $? -ne 0 ]
    then
        echo "kill agent failed"
    fi
    sleep 25
    # The master must survive the agent shutdown.
    _expect_proc_count master.py 1 "master start failed"
    _expect_proc_count worker.py 0 "worker exit failed"
    _expect_proc_count agent.py 0 "agent exit failed"
}
# Scenario driver: bring up master, worker and agents, run the agent-kill
# checks, then clean up any remaining processes.
test_agent_fault_model()
{
    for fault_step in start_master start_worker start_agent kill_agent clean_pid
    do
        "${fault_step}"
    done
}
echo "-----serving start-----"
# Every path below is relative to the script's own directory, so run from
# there; otherwise the rm -rf globs hit whatever directory the caller was in
# and the relative cp source does not exist.
cd "${CURRPATH}" || exit 1
rm -rf serving *.log *.mindir *.dat "${CURRPATH}/matmul" "${CURRPATH}/kernel_meta"
rm -rf client.py *.json export_model master_with_worker.py master.py worker.py agent.py
cp -r ../../../example/matmul_distributed/* .
prepare_model
test_agent_fault_model
echo "### end to serving test ###"
| @@ -0,0 +1,221 @@ | |||
#!/bin/bash
# Environment setup and diagnostics for the agent-fault (SIGKILL) serving test.
# Usage: <this script> MINDSPORE_INSTALL_PATH
export GLOG_v=1   # NOTE(review): presumably INFO level, needed for the log lines grepped below - confirm
export DEVICE_ID=1
MINDSPORE_INSTALL_PATH=$1
ENV_DEVICE_ID=$DEVICE_ID
# "$0" is quoted so a script path containing spaces still resolves.
CURRPATH=$(cd "$(dirname "$0")" || exit; pwd)
CURRUSER=$(whoami)
PROJECT_PATH=${CURRPATH}/../../../
echo "MINDSPORE_INSTALL_PATH:" "${MINDSPORE_INSTALL_PATH}"
echo "ENV_DEVICE_ID:" "${ENV_DEVICE_ID}"
echo "CURRPATH:" "${CURRPATH}"
echo "CURRUSER:" "${CURRUSER}"
echo "PROJECT_PATH:" "${PROJECT_PATH}"
export LD_LIBRARY_PATH=${MINDSPORE_INSTALL_PATH}/lib:${LD_LIBRARY_PATH}
#export PYTHONPATH=${MINDSPORE_INSTALL_PATH}/:${PYTHONPATH}
echo "LD_LIBRARY_PATH: " "${LD_LIBRARY_PATH}"
echo "PYTHONPATH: " "${PYTHONPATH}"
echo "-------------show MINDSPORE_INSTALL_PATH----------------"
ls -l "${MINDSPORE_INSTALL_PATH}"
# The banner now names the directory that is actually listed.
echo "----------show /usr/local/python/python375/lib/----------"
ls -l /usr/local/python/python375/lib/
# Force-kill (SIGKILL) the master.py, worker.py and agent.py processes owned by
# ${CURRUSER}. After each script name that had any matching process, wait
# 6 seconds before moving on.
clean_pid()
{
    for clean_script in master.py worker.py agent.py
    do
        # Detection is unchanged: any process (of any user) mentioning the script.
        num=`ps -ef | grep "${clean_script}" | grep -v grep | wc -l`
        if [ "${num}" -ne 0 ]
        then
            clean_pids=`ps aux | grep "${clean_script}" | grep "${CURRUSER}" | grep -v grep | awk '{print $2}'`
            # Only kill when the current user owns a match; the old
            # `... | xargs kill -9` ran kill with no PID in that case.
            if [ -n "${clean_pids}" ]
            then
                kill -9 ${clean_pids}
            fi
            sleep 6
        fi
    done
}
# Run export_model/export_model.sh and require exactly 8 matmul.mindir files
# under export_model (presumably one per device of the 8-device rank table -
# confirm). On failure the export log is printed, leftover processes are
# cleaned up and the script exits with status 1.
prepare_model()
{
    echo "### begin to generate mode for serving test ###"
    cd export_model || { echo "### cannot enter export_model directory ###"; exit 1; }
    # Portable form of `&>`, which plain sh parses as "run in background".
    sh export_model.sh > model.log 2>&1
    echo "### end to generate mode for serving test ###"
    result=`find . -name matmul.mindir | wc -l`
    if [ "${result}" -ne 8 ]
    then
        cat model.log
        echo "### generate model for serving test failed ###"
        # Cleanup must happen before exit; it used to sit after `exit 1`
        # and was never reached.
        clean_pid
        cd -
        exit 1
    fi
    cd -
}
# Launch master.py in the background and poll master.log once per second
# (at most 50 times) until the gRPC "start success" line appears exactly once.
# If it never does, clean up, print the log and exit with status 1.
start_master()
{
    echo "### start serving master ###"
    unset http_proxy https_proxy
    python3 master.py > master.log 2>&1 &
    # $? after `&` only says the job was forked (always 0), so readiness is
    # judged from the log alone.
    master_ready='Serving gRPC server start success, listening on 127.0.0.1:5500'
    count=0
    # -F: match the message literally (the dots in the address are not wildcards).
    result=`grep -F "${master_ready}" master.log | wc -l`
    while [ "${result}" -ne 1 ] && [ "${count}" -lt 50 ]
    do
        sleep 1
        count=$((count+1))
        result=`grep -F "${master_ready}" master.log | wc -l`
    done
    # Decide on the grep result, not on the counter: a success seen on the
    # final poll used to be reported as a failure.
    if [ "${result}" -ne 1 ]
    then
        clean_pid
        cat master.log
        echo "start serving master failed!"
        exit 1
    fi
    echo "### start serving master end ###"
}
# Launch worker.py in the background and poll worker.log once per second
# (at most 100 times) until "Begin waiting ready of all agents" appears exactly
# once. If it never does, clean up, print the log and exit with status 1.
start_worker()
{
    echo "### start serving worker ###"
    unset http_proxy https_proxy
    python3 worker.py > worker.log 2>&1 &
    # $? after `&` only says the job was forked (always 0), so readiness is
    # judged from the log alone.
    worker_ready='Begin waiting ready of all agents'
    count=0
    result=`grep -E "${worker_ready}" worker.log | wc -l`
    while [ "${result}" -ne 1 ] && [ "${count}" -lt 100 ]
    do
        sleep 1
        count=$((count+1))
        result=`grep -E "${worker_ready}" worker.log | wc -l`
    done
    # Decide on the grep result, not on the counter: a success seen on the
    # final poll used to be reported as a failure.
    if [ "${result}" -ne 1 ]
    then
        clean_pid
        cat worker.log
        echo "start serving worker failed!"
        exit 1
    fi
    echo "### start serving worker end ###"
}
# Launch agent.py in the background and poll agent.log once per second
# (at most 100 times) until "Child 0: Receive success" appears exactly once.
# If it never does, clean up, print the log and exit with status 1.
start_agent()
{
    echo "### start serving agent ###"
    unset http_proxy https_proxy
    python3 agent.py > agent.log 2>&1 &
    # $? after `&` only says the job was forked (always 0), so readiness is
    # judged from the log alone.
    agent_ready='Child 0: Receive success'
    count=0
    result=`grep -E "${agent_ready}" agent.log | wc -l`
    while [ "${result}" -ne 1 ] && [ "${count}" -lt 100 ]
    do
        sleep 1
        count=$((count+1))
        result=`grep -E "${agent_ready}" agent.log | wc -l`
    done
    # Decide on the grep result, not on the counter: a success seen on the
    # final poll used to be reported as a failure.
    if [ "${result}" -ne 1 ]
    then
        clean_pid
        cat agent.log
        echo "start serving agent failed!"
        exit 1
    fi
    echo "### start serving agent end ###"
}
# Helper: require exactly $2 processes whose command line mentions $1.
# On mismatch print message $3 and the observed count, clean up and exit 1.
_expect_proc_count()
{
    num=`ps -ef | grep "$1" | grep -v grep | wc -l`
    if [ "${num}" -ne "$2" ]
    then
        echo "$3"
        echo $num
        # Exit regardless of the cleanup status (was `clean_pid && exit 1`).
        clean_pid
        exit 1
    fi
}

# Helper: require exactly $2 lines of worker.log matching the regex $1.
# On mismatch print message $3 and the observed count, clean up and exit 1.
_expect_worker_log_count()
{
    num=`grep -E "$1" worker.log | wc -l`
    if [ "${num}" -ne "$2" ]
    then
        echo "$3"
        echo $num
        clean_pid
        exit 1
    fi
}

# Send SIGKILL to every agent.py process of ${CURRUSER}, wait 25 seconds, then
# require that the agents are gone, the master is still running, and
# worker.log contains 8 "Recv Pong Time Out from" lines.
kill_agent()
{
    # Preconditions: 1 master, 1 worker, 9 agent.py processes (presumably
    # 1 parent + 8 children - confirm) and no pong timeout logged yet.
    _expect_proc_count master.py 1 "master start failed"
    _expect_proc_count worker.py 1 "worker start failed"
    _expect_proc_count agent.py 9 "agent start failed"
    _expect_worker_log_count 'Recv Pong Time Out from' 0 "worker has exited"
    ps aux | grep 'agent.py' | grep "${CURRUSER}" | grep -v grep | awk '{print $2}' | xargs kill -9
    if [ $? -ne 0 ]
    then
        echo "kill agent failed"
    fi
    sleep 25
    _expect_proc_count agent.py 0 "agent exit failed"
    # The master must survive the agent kill.
    _expect_proc_count master.py 1 "master start failed"
    _expect_worker_log_count 'Recv Pong Time Out from' 8 "catch agent exit failed"
}
# Scenario driver: bring up master, worker and agents, run the agent-kill
# checks, then clean up any remaining processes.
test_agent_fault_model()
{
    for fault_step in start_master start_worker start_agent kill_agent clean_pid
    do
        "${fault_step}"
    done
}
echo "-----serving start-----"
# Every path below is relative to the script's own directory, so run from
# there; otherwise the rm -rf globs hit whatever directory the caller was in
# and the relative cp source does not exist.
cd "${CURRPATH}" || exit 1
rm -rf serving *.log *.mindir *.dat "${CURRPATH}/matmul" "${CURRPATH}/kernel_meta"
rm -rf client.py *.json export_model master_with_worker.py master.py worker.py agent.py
cp -r ../../../example/matmul_distributed/* .
prepare_model
test_agent_fault_model
echo "### end to serving test ###"
| @@ -0,0 +1,43 @@ | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| import os | |||
| import sys | |||
| import pytest | |||
| import numpy as np | |||
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.env_single
def test_distribute_agent_fault():
    """Run the SIGTERM and SIGKILL agent-fault scripts; both must exit with 0.

    The MindSpore install directory (the ``.../site-packages/mindspore`` path
    derived from the last matching folder found through ``sys.path``) is
    passed to each script as its only argument.
    """
    import subprocess  # local import: only this test needs it

    sh_path = os.path.split(os.path.realpath(__file__))[0]
    folders = []
    for python_path in sys.path:
        if not os.path.isdir(python_path):
            continue
        for entry in os.listdir(python_path):
            candidate = os.path.join(python_path, entry)
            if os.path.isdir(candidate) and '/site-packages/mindspore' in candidate:
                folders.append(candidate)
    # Fail with a readable message instead of an IndexError when no install is found.
    assert folders, "no '/site-packages/mindspore' directory found on sys.path"
    mindspore_path = folders[-1].split('mindspore', 1)[0] + 'mindspore'

    for script in ("kill_15_agent.sh", "kill_9_agent.sh"):
        # The scripts use bash-only syntax ([[ ]], &>), so run them with bash
        # explicitly; an argument list avoids shell quoting issues in the paths.
        ret = subprocess.run(["bash", os.path.join(sh_path, script), mindspore_path]).returncode
        assert ret == 0, f"{script} exited with status {ret}"


if __name__ == '__main__':
    test_distribute_agent_fault()
| @@ -0,0 +1,14 @@ | |||
| # Copyright 2021 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| @@ -0,0 +1,214 @@ | |||
#!/bin/bash
# Environment setup and diagnostics for the master-fault (SIGTERM) serving test.
# Usage: <this script> MINDSPORE_INSTALL_PATH
export GLOG_v=1   # NOTE(review): presumably INFO level, needed for the log lines grepped below - confirm
export DEVICE_ID=1
MINDSPORE_INSTALL_PATH=$1
ENV_DEVICE_ID=$DEVICE_ID
# "$0" is quoted so a script path containing spaces still resolves.
CURRPATH=$(cd "$(dirname "$0")" || exit; pwd)
CURRUSER=$(whoami)
PROJECT_PATH=${CURRPATH}/../../../
echo "MINDSPORE_INSTALL_PATH:" "${MINDSPORE_INSTALL_PATH}"
echo "ENV_DEVICE_ID:" "${ENV_DEVICE_ID}"
echo "CURRPATH:" "${CURRPATH}"
echo "CURRUSER:" "${CURRUSER}"
echo "PROJECT_PATH:" "${PROJECT_PATH}"
export LD_LIBRARY_PATH=${MINDSPORE_INSTALL_PATH}/lib:${LD_LIBRARY_PATH}
#export PYTHONPATH=${MINDSPORE_INSTALL_PATH}/:${PYTHONPATH}
echo "LD_LIBRARY_PATH: " "${LD_LIBRARY_PATH}"
echo "PYTHONPATH: " "${PYTHONPATH}"
echo "-------------show MINDSPORE_INSTALL_PATH----------------"
ls -l "${MINDSPORE_INSTALL_PATH}"
# The banner now names the directory that is actually listed.
echo "----------show /usr/local/python/python375/lib/----------"
ls -l /usr/local/python/python375/lib/
# Force-kill (SIGKILL) the master.py, worker.py and agent.py processes owned by
# ${CURRUSER}. After each script name that had any matching process, wait
# 6 seconds before moving on.
clean_pid()
{
    for clean_script in master.py worker.py agent.py
    do
        # Detection is unchanged: any process (of any user) mentioning the script.
        num=`ps -ef | grep "${clean_script}" | grep -v grep | wc -l`
        if [ "${num}" -ne 0 ]
        then
            clean_pids=`ps aux | grep "${clean_script}" | grep "${CURRUSER}" | grep -v grep | awk '{print $2}'`
            # Only kill when the current user owns a match; the old
            # `... | xargs kill -9` ran kill with no PID in that case.
            if [ -n "${clean_pids}" ]
            then
                kill -9 ${clean_pids}
            fi
            sleep 6
        fi
    done
}
# Run export_model/export_model.sh and require exactly 8 matmul.mindir files
# under export_model (presumably one per device of the 8-device rank table -
# confirm). On failure the export log is printed, leftover processes are
# cleaned up and the script exits with status 1.
prepare_model()
{
    echo "### begin to generate mode for serving test ###"
    cd export_model || { echo "### cannot enter export_model directory ###"; exit 1; }
    # Portable form of `&>`, which plain sh parses as "run in background".
    sh export_model.sh > model.log 2>&1
    echo "### end to generate mode for serving test ###"
    result=`find . -name matmul.mindir | wc -l`
    if [ "${result}" -ne 8 ]
    then
        cat model.log
        echo "### generate model for serving test failed ###"
        # Cleanup must happen before exit; it used to sit after `exit 1`
        # and was never reached.
        clean_pid
        cd -
        exit 1
    fi
    cd -
}
# Launch master.py in the background and poll master.log once per second
# (at most 50 times) until the gRPC "start success" line appears exactly once.
# If it never does, clean up, print the log and exit with status 1.
start_master()
{
    echo "### start serving master ###"
    unset http_proxy https_proxy
    python3 master.py > master.log 2>&1 &
    # $? after `&` only says the job was forked (always 0), so readiness is
    # judged from the log alone.
    master_ready='Serving gRPC server start success, listening on 127.0.0.1:5500'
    count=0
    # -F: match the message literally (the dots in the address are not wildcards).
    result=`grep -F "${master_ready}" master.log | wc -l`
    while [ "${result}" -ne 1 ] && [ "${count}" -lt 50 ]
    do
        sleep 1
        count=$((count+1))
        result=`grep -F "${master_ready}" master.log | wc -l`
    done
    # Decide on the grep result, not on the counter: a success seen on the
    # final poll used to be reported as a failure.
    if [ "${result}" -ne 1 ]
    then
        clean_pid
        cat master.log
        echo "start serving master failed!"
        exit 1
    fi
    echo "### start serving master end ###"
}
# Launch worker.py in the background and poll worker.log once per second
# (at most 100 times) until "Begin waiting ready of all agents" appears exactly
# once. If it never does, clean up, print the log and exit with status 1.
start_worker()
{
    echo "### start serving worker ###"
    unset http_proxy https_proxy
    python3 worker.py > worker.log 2>&1 &
    # $? after `&` only says the job was forked (always 0), so readiness is
    # judged from the log alone.
    worker_ready='Begin waiting ready of all agents'
    count=0
    result=`grep -E "${worker_ready}" worker.log | wc -l`
    while [ "${result}" -ne 1 ] && [ "${count}" -lt 100 ]
    do
        sleep 1
        count=$((count+1))
        result=`grep -E "${worker_ready}" worker.log | wc -l`
    done
    # Decide on the grep result, not on the counter: a success seen on the
    # final poll used to be reported as a failure.
    if [ "${result}" -ne 1 ]
    then
        clean_pid
        cat worker.log
        echo "start serving worker failed!"
        exit 1
    fi
    echo "### start serving worker end ###"
}
# Launch agent.py in the background and poll agent.log once per second
# (at most 100 times) until "Child 0: Receive success" appears exactly once.
# If it never does, clean up, print the log and exit with status 1.
start_agent()
{
    echo "### start serving agent ###"
    unset http_proxy https_proxy
    python3 agent.py > agent.log 2>&1 &
    # $? after `&` only says the job was forked (always 0), so readiness is
    # judged from the log alone.
    agent_ready='Child 0: Receive success'
    count=0
    result=`grep -E "${agent_ready}" agent.log | wc -l`
    while [ "${result}" -ne 1 ] && [ "${count}" -lt 100 ]
    do
        sleep 1
        count=$((count+1))
        result=`grep -E "${agent_ready}" agent.log | wc -l`
    done
    # Decide on the grep result, not on the counter: a success seen on the
    # final poll used to be reported as a failure.
    if [ "${result}" -ne 1 ]
    then
        clean_pid
        cat agent.log
        echo "start serving agent failed!"
        exit 1
    fi
    echo "### start serving agent end ###"
}
# Helper: require exactly $2 processes whose command line mentions $1.
# On mismatch print message $3 and the observed count, clean up and exit 1.
_expect_proc_count()
{
    num=`ps -ef | grep "$1" | grep -v grep | wc -l`
    if [ "${num}" -ne "$2" ]
    then
        echo "$3"
        echo $num
        # Exit regardless of the cleanup status (was `clean_pid && exit 1`).
        clean_pid
        exit 1
    fi
}

# Send SIGTERM to every master.py process of ${CURRUSER}, wait 15 seconds, then
# require that master, worker and agent processes have all exited.
kill_master()
{
    # Preconditions: 1 master, 1 worker and 9 agent.py processes
    # (presumably 1 parent + 8 children - confirm).
    _expect_proc_count master.py 1 "master start failed"
    _expect_proc_count worker.py 1 "worker start failed"
    _expect_proc_count agent.py 9 "agent start failed"
    ps aux | grep 'master.py' | grep "${CURRUSER}" | grep -v grep | awk '{print $2}' | xargs kill -15
    if [ $? -ne 0 ]
    then
        echo "kill master failed"
    fi
    sleep 15
    _expect_proc_count master.py 0 "master exit failed"
    _expect_proc_count worker.py 0 "worker exit failed"
    _expect_proc_count agent.py 0 "agent exit failed"
}
# Scenario driver: bring up master, worker and agents, run the master-kill
# checks, then clean up any remaining processes.
test_master_fault_model()
{
    for fault_step in start_master start_worker start_agent kill_master clean_pid
    do
        "${fault_step}"
    done
}
echo "-----serving start-----"
# Every path below is relative to the script's own directory, so run from
# there; otherwise the rm -rf globs hit whatever directory the caller was in
# and the relative cp source does not exist.
cd "${CURRPATH}" || exit 1
rm -rf serving *.log *.mindir *.dat "${CURRPATH}/matmul" "${CURRPATH}/kernel_meta"
rm -rf client.py *.json export_model master_with_worker.py master.py worker.py agent.py
cp -r ../../../example/matmul_distributed/* .
prepare_model
test_master_fault_model
echo "### end to serving test ###"
| @@ -0,0 +1,214 @@ | |||
#!/bin/bash
# Environment setup and diagnostics for the master-fault (SIGKILL) serving test.
# Usage: <this script> MINDSPORE_INSTALL_PATH
export GLOG_v=1   # NOTE(review): presumably INFO level, needed for the log lines grepped below - confirm
export DEVICE_ID=1
MINDSPORE_INSTALL_PATH=$1
ENV_DEVICE_ID=$DEVICE_ID
# "$0" is quoted so a script path containing spaces still resolves.
CURRPATH=$(cd "$(dirname "$0")" || exit; pwd)
CURRUSER=$(whoami)
PROJECT_PATH=${CURRPATH}/../../../
echo "MINDSPORE_INSTALL_PATH:" "${MINDSPORE_INSTALL_PATH}"
echo "ENV_DEVICE_ID:" "${ENV_DEVICE_ID}"
echo "CURRPATH:" "${CURRPATH}"
echo "CURRUSER:" "${CURRUSER}"
echo "PROJECT_PATH:" "${PROJECT_PATH}"
export LD_LIBRARY_PATH=${MINDSPORE_INSTALL_PATH}/lib:${LD_LIBRARY_PATH}
#export PYTHONPATH=${MINDSPORE_INSTALL_PATH}/:${PYTHONPATH}
echo "LD_LIBRARY_PATH: " "${LD_LIBRARY_PATH}"
echo "PYTHONPATH: " "${PYTHONPATH}"
echo "-------------show MINDSPORE_INSTALL_PATH----------------"
ls -l "${MINDSPORE_INSTALL_PATH}"
# The banner now names the directory that is actually listed.
echo "----------show /usr/local/python/python375/lib/----------"
ls -l /usr/local/python/python375/lib/
# Force-kill (SIGKILL) the master.py, worker.py and agent.py processes owned by
# ${CURRUSER}. After each script name that had any matching process, wait
# 6 seconds before moving on.
clean_pid()
{
    for clean_script in master.py worker.py agent.py
    do
        # Detection is unchanged: any process (of any user) mentioning the script.
        num=`ps -ef | grep "${clean_script}" | grep -v grep | wc -l`
        if [ "${num}" -ne 0 ]
        then
            clean_pids=`ps aux | grep "${clean_script}" | grep "${CURRUSER}" | grep -v grep | awk '{print $2}'`
            # Only kill when the current user owns a match; the old
            # `... | xargs kill -9` ran kill with no PID in that case.
            if [ -n "${clean_pids}" ]
            then
                kill -9 ${clean_pids}
            fi
            sleep 6
        fi
    done
}
# Run export_model/export_model.sh and require exactly 8 matmul.mindir files
# under export_model (presumably one per device of the 8-device rank table -
# confirm). On failure the export log is printed, leftover processes are
# cleaned up and the script exits with status 1.
prepare_model()
{
    echo "### begin to generate mode for serving test ###"
    cd export_model || { echo "### cannot enter export_model directory ###"; exit 1; }
    # Portable form of `&>`, which plain sh parses as "run in background".
    sh export_model.sh > model.log 2>&1
    echo "### end to generate mode for serving test ###"
    result=`find . -name matmul.mindir | wc -l`
    if [ "${result}" -ne 8 ]
    then
        cat model.log
        echo "### generate model for serving test failed ###"
        # Cleanup must happen before exit; it used to sit after `exit 1`
        # and was never reached.
        clean_pid
        cd -
        exit 1
    fi
    cd -
}
# Launch master.py in the background and poll master.log once per second
# (at most 50 times) until the gRPC "start success" line appears exactly once.
# If it never does, clean up, print the log and exit with status 1.
start_master()
{
    echo "### start serving master ###"
    unset http_proxy https_proxy
    python3 master.py > master.log 2>&1 &
    # $? after `&` only says the job was forked (always 0), so readiness is
    # judged from the log alone.
    master_ready='Serving gRPC server start success, listening on 127.0.0.1:5500'
    count=0
    # -F: match the message literally (the dots in the address are not wildcards).
    result=`grep -F "${master_ready}" master.log | wc -l`
    while [ "${result}" -ne 1 ] && [ "${count}" -lt 50 ]
    do
        sleep 1
        count=$((count+1))
        result=`grep -F "${master_ready}" master.log | wc -l`
    done
    # Decide on the grep result, not on the counter: a success seen on the
    # final poll used to be reported as a failure.
    if [ "${result}" -ne 1 ]
    then
        clean_pid
        cat master.log
        echo "start serving master failed!"
        exit 1
    fi
    echo "### start serving master end ###"
}
# Launch worker.py in the background and poll worker.log once per second
# (at most 100 times) until "Begin waiting ready of all agents" appears exactly
# once. If it never does, clean up, print the log and exit with status 1.
start_worker()
{
    echo "### start serving worker ###"
    unset http_proxy https_proxy
    python3 worker.py > worker.log 2>&1 &
    # $? after `&` only says the job was forked (always 0), so readiness is
    # judged from the log alone.
    worker_ready='Begin waiting ready of all agents'
    count=0
    result=`grep -E "${worker_ready}" worker.log | wc -l`
    while [ "${result}" -ne 1 ] && [ "${count}" -lt 100 ]
    do
        sleep 1
        count=$((count+1))
        result=`grep -E "${worker_ready}" worker.log | wc -l`
    done
    # Decide on the grep result, not on the counter: a success seen on the
    # final poll used to be reported as a failure.
    if [ "${result}" -ne 1 ]
    then
        clean_pid
        cat worker.log
        echo "start serving worker failed!"
        exit 1
    fi
    echo "### start serving worker end ###"
}
# Launch agent.py in the background and poll agent.log once per second
# (at most 100 times) until "Child 0: Receive success" appears exactly once.
# If it never does, clean up, print the log and exit with status 1.
start_agent()
{
    echo "### start serving agent ###"
    unset http_proxy https_proxy
    python3 agent.py > agent.log 2>&1 &
    # $? after `&` only says the job was forked (always 0), so readiness is
    # judged from the log alone.
    agent_ready='Child 0: Receive success'
    count=0
    result=`grep -E "${agent_ready}" agent.log | wc -l`
    while [ "${result}" -ne 1 ] && [ "${count}" -lt 100 ]
    do
        sleep 1
        count=$((count+1))
        result=`grep -E "${agent_ready}" agent.log | wc -l`
    done
    # Decide on the grep result, not on the counter: a success seen on the
    # final poll used to be reported as a failure.
    if [ "${result}" -ne 1 ]
    then
        clean_pid
        cat agent.log
        echo "start serving agent failed!"
        exit 1
    fi
    echo "### start serving agent end ###"
}
# Helper: require exactly $2 processes whose command line mentions $1.
# On mismatch print message $3 and the observed count, clean up and exit 1.
_expect_proc_count()
{
    num=`ps -ef | grep "$1" | grep -v grep | wc -l`
    if [ "${num}" -ne "$2" ]
    then
        echo "$3"
        echo $num
        # Exit regardless of the cleanup status (was `clean_pid && exit 1`).
        clean_pid
        exit 1
    fi
}

# Helper: require exactly $2 lines of worker.log matching the regex $1.
# On mismatch print message $3 and the observed count, clean up and exit 1.
_expect_worker_log_count()
{
    num=`grep -E "$1" worker.log | wc -l`
    if [ "${num}" -ne "$2" ]
    then
        echo "$3"
        echo $num
        clean_pid
        exit 1
    fi
}

# Send SIGKILL to every master.py process of ${CURRUSER}, wait 25 seconds, then
# require that the master is gone and worker.log contains exactly one
# "Recv Ping Time Out from" line.
kill_master()
{
    # Preconditions: 1 master, 1 worker, 9 agent.py processes (presumably
    # 1 parent + 8 children - confirm) and no ping timeout logged yet.
    _expect_proc_count master.py 1 "master start failed"
    _expect_proc_count worker.py 1 "worker start failed"
    _expect_proc_count agent.py 9 "agent start failed"
    _expect_worker_log_count 'Recv Ping Time Out from' 0 "worker has exited"
    ps aux | grep 'master.py' | grep "${CURRUSER}" | grep -v grep | awk '{print $2}' | xargs kill -9
    if [ $? -ne 0 ]
    then
        echo "kill master failed"
    fi
    sleep 25
    _expect_proc_count master.py 0 "master exit failed"
    _expect_worker_log_count 'Recv Ping Time Out from' 1 "catch master exit failed"
}
# Scenario driver: bring up master, worker and agents, run the master-kill
# checks, then clean up any remaining processes.
test_master_fault_model()
{
    for fault_step in start_master start_worker start_agent kill_master clean_pid
    do
        "${fault_step}"
    done
}
echo "-----serving start-----"
# Every path below is relative to the script's own directory, so run from
# there; otherwise the rm -rf globs hit whatever directory the caller was in
# and the relative cp source does not exist.
cd "${CURRPATH}" || exit 1
rm -rf serving *.log *.mindir *.dat "${CURRPATH}/matmul" "${CURRPATH}/kernel_meta"
rm -rf client.py *.json export_model master_with_worker.py master.py worker.py agent.py
cp -r ../../../example/matmul_distributed/* .
prepare_model
test_master_fault_model
echo "### end to serving test ###"
| @@ -0,0 +1,43 @@ | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| import os | |||
| import sys | |||
| import pytest | |||
| import numpy as np | |||
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.env_single
def test_distribute_master_fault():
    """Run the master fault-injection scripts and expect each to exit with 0.

    The MindSpore install directory is discovered by scanning every directory
    on ``sys.path`` for a sub-directory whose path contains
    ``/site-packages/mindspore``. The path up to and including the first
    ``mindspore`` is passed to the scripts as their only argument.
    """
    sh_path = os.path.dirname(os.path.realpath(__file__))

    folders = []
    for python_path in sys.path:
        if not os.path.isdir(python_path):
            continue
        for entry in os.listdir(python_path):
            candidate = os.path.join(python_path, entry)
            if os.path.isdir(candidate) and '/site-packages/mindspore' in candidate:
                folders.append(candidate)

    # Fail with a readable message instead of an IndexError when MindSpore
    # is not installed in any directory on sys.path.
    assert folders, "no '/site-packages/mindspore*' directory found on sys.path"
    install_path = folders[0].split('mindspore', 1)[0] + 'mindspore'

    for script in ("kill_15_master.sh", "kill_9_master.sh"):
        ret = os.system(f"sh {sh_path}/{script} {install_path}")
        assert np.allclose(ret, 0), f"{script} exited with status {ret}"


if __name__ == '__main__':
    test_distribute_master_fault()
| @@ -0,0 +1,14 @@ | |||
| # Copyright 2021 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| @@ -0,0 +1,214 @@ | |||
#!/bin/bash
# Environment setup for the worker fault-injection scenario.
# Usage: <script> MINDSPORE_INSTALL_PATH
export GLOG_v=1
export DEVICE_ID=1
MINDSPORE_INSTALL_PATH=$1
ENV_DEVICE_ID=$DEVICE_ID
# Quote $0 so a script path containing spaces still resolves.
CURRPATH=$(cd "$(dirname "$0")" || exit; pwd)
CURRUSER=$(whoami)
PROJECT_PATH=${CURRPATH}/../../../
echo "MINDSPORE_INSTALL_PATH:" "${MINDSPORE_INSTALL_PATH}"
echo "ENV_DEVICE_ID:" "${ENV_DEVICE_ID}"
echo "CURRPATH:" "${CURRPATH}"
echo "CURRUSER:" "${CURRUSER}"
echo "PROJECT_PATH:" "${PROJECT_PATH}"
# Append the previous value only when it is non-empty: a trailing ':' would
# add an empty entry, which the dynamic loader treats as the current directory.
export LD_LIBRARY_PATH=${MINDSPORE_INSTALL_PATH}/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
#export PYTHONPATH=${MINDSPORE_INSTALL_PATH}/:${PYTHONPATH}
echo "LD_LIBRARY_PATH: " "${LD_LIBRARY_PATH}"
echo "PYTHONPATH: " "${PYTHONPATH}"
echo "-------------show MINDSPORE_INSTALL_PATH----------------"
ls -l "${MINDSPORE_INSTALL_PATH}"
# The banner names the directory that is actually listed below.
echo "------------------show /usr/local/python/python375/lib/----------------------"
ls -l /usr/local/python/python375/lib/
# Kill -9 any leftover master.py, worker.py and agent.py processes.
#
# For each script in turn: if any matching process exists, kill those owned
# by ${CURRUSER} and wait six seconds before moving on.
clean_pid()
{
    local script alive
    for script in master.py worker.py agent.py; do
        alive=$(ps -ef | grep ${script} | grep -v grep | wc -l)
        if [ $alive -ne 0 ]; then
            ps aux | grep "${script}" | grep ${CURRUSER} | grep -v grep | awk '{print $2}' | xargs kill -9
            sleep 6
        fi
    done
}
# Export the distributed matmul model and verify the result.
#
# Runs export_model.sh inside ./export_model (output captured in model.log)
# and expects exactly 8 matmul.mindir files to exist afterwards. On failure
# the log is printed, leftover processes are cleaned up and the script exits
# with status 1.
prepare_model()
{
    echo "### begin to generate mode for serving test ###"
    # Stop here if the directory is missing rather than running the export
    # from the wrong place.
    cd export_model || { echo "### enter export_model directory failed ###"; exit 1; }
    sh export_model.sh &> model.log
    echo "### end to generate mode for serving test ###"
    result=$(find . -name matmul.mindir | wc -l)
    if [ ${result} -ne 8 ]
    then
        cat model.log
        echo "### generate model for serving test failed ###"
        # Clean up BEFORE exiting; placed after "exit 1" these steps could
        # never run.
        cd -
        clean_pid
        exit 1
    fi
    cd -
}
# Launch master.py in the background and wait for its gRPC server to start.
#
# Output goes to master.log, which is polled once per second (at most 50
# times) for the start-success line. If the line never shows up exactly once,
# leftover processes are cleaned up, the log is printed and the script exits
# with status 1.
start_master()
{
    local ready_line='Serving gRPC server start success, listening on 127.0.0.1:5500'

    echo "### start serving master ###"
    unset http_proxy https_proxy
    # No "$?" check here: for a command started with "&" it is always 0, so
    # start-up can only be judged from the log polled below.
    python3 master.py > master.log 2>&1 &
    result=$(grep -E "${ready_line}" master.log | wc -l)
    count=0
    while [[ ${result} -ne 1 && ${count} -lt 50 ]]
    do
        sleep 1
        count=$(($count+1))
        result=$(grep -E "${ready_line}" master.log | wc -l)
    done
    # Judge by what was observed, not by the counter: the line may appear on
    # the very last poll, which must not be reported as a failure.
    if [ ${result} -ne 1 ]
    then
        clean_pid
        cat master.log
        echo "start serving master failed!" && exit 1
    fi
    echo "### start serving master end ###"
}
# Launch worker.py in the background and wait until it waits for the agents.
#
# Output goes to worker.log, which is polled once per second (at most 100
# times) for the "Begin waiting ready of all agents" line. If the line never
# shows up exactly once, leftover processes are cleaned up, the log is
# printed and the script exits with status 1.
start_worker()
{
    local ready_line='Begin waiting ready of all agents'

    echo "### start serving worker ###"
    unset http_proxy https_proxy
    # No "$?" check here: for a command started with "&" it is always 0, so
    # start-up can only be judged from the log polled below.
    python3 worker.py > worker.log 2>&1 &
    result=$(grep -E "${ready_line}" worker.log | wc -l)
    count=0
    while [[ ${result} -ne 1 && ${count} -lt 100 ]]
    do
        sleep 1
        count=$(($count+1))
        result=$(grep -E "${ready_line}" worker.log | wc -l)
    done
    # Judge by what was observed, not by the counter: the line may appear on
    # the very last poll, which must not be reported as a failure.
    if [ ${result} -ne 1 ]
    then
        clean_pid
        cat worker.log
        echo "start serving worker failed!" && exit 1
    fi
    echo "### start serving worker end ###"
}
# Launch agent.py in the background and wait for its first success report.
#
# Output goes to agent.log, which is polled once per second (at most 100
# times) for the "Child 0: Receive success" line. If the line never shows up
# exactly once, leftover processes are cleaned up, the log is printed and the
# script exits with status 1.
start_agent()
{
    local ready_line='Child 0: Receive success'

    echo "### start serving agent ###"
    unset http_proxy https_proxy
    # No "$?" check here: for a command started with "&" it is always 0, so
    # start-up can only be judged from the log polled below.
    python3 agent.py > agent.log 2>&1 &
    result=$(grep -E "${ready_line}" agent.log | wc -l)
    count=0
    while [[ ${result} -ne 1 && ${count} -lt 100 ]]
    do
        sleep 1
        count=$(($count+1))
        result=$(grep -E "${ready_line}" agent.log | wc -l)
    done
    # Judge by what was observed, not by the counter: the line may appear on
    # the very last poll, which must not be reported as a failure.
    if [ ${result} -ne 1 ]
    then
        clean_pid
        cat agent.log
        echo "start serving agent failed!" && exit 1
    fi
    echo "### start serving agent end ###"
}
# Verify the cluster is up, kill -15 the worker, then check who is left.
#
# NOTE: despite its name this function signals worker.py, not agent.py.
# Any failed check prints a diagnostic and the offending count, runs
# clean_pid and exits the script with status 1.
kill_agent()
{
    local spec script expected verdict num

    # Before the fault: one master.py, one worker.py and nine agent.py
    # processes must be running.
    for spec in master:1 worker:1 agent:9; do
        script=${spec%%:*}
        expected=${spec##*:}
        num=$(ps -ef | grep ${script}.py | grep -v grep | wc -l)
        if [ $num -ne $expected ]; then
            echo "${script} start failed"
            echo $num
            clean_pid && exit 1
        fi
    done

    # Fault injection: kill -15 every worker.py process of the current user.
    if ! ps aux | grep 'worker.py' | grep ${CURRUSER} | grep -v grep | awk '{print $2}' | xargs kill -15; then
        echo "kill worker failed"
    fi
    sleep 25

    # After the fault: the master is still the only master.py process, and
    # no worker.py or agent.py process remains. Each entry reads
    # <script>:<expected count>:<word used in the failure message>.
    for spec in master:1:start worker:0:exit agent:0:exit; do
        script=${spec%%:*}
        verdict=${spec##*:}
        expected=${spec#*:}
        expected=${expected%%:*}
        num=$(ps -ef | grep ${script}.py | grep -v grep | wc -l)
        if [ $num -ne $expected ]; then
            echo "${script} ${verdict} failed"
            echo $num
            clean_pid && exit 1
        fi
    done
}
# Scenario driver: start master, worker and agents, run the kill_agent fault
# step, then clean up whatever processes are left.
test_agent_fault_model()
{
    local step
    for step in start_master start_worker start_agent kill_agent clean_pid; do
        ${step}
    done
}
# Script entry: remove artefacts of earlier runs, copy the distributed matmul
# example into the working directory, export the model and run the scenario.
printf '%s\n' "-----serving start-----"
rm -rf serving *.log *.mindir *.dat ${CURRPATH}/matmul ${CURRPATH}/kernel_meta \
    client.py *.json export_model master_with_worker.py master.py worker.py agent.py
cp -r ../../../example/matmul_distributed/* .
prepare_model
test_agent_fault_model
printf '%s\n' "### end to serving test ###"
| @@ -0,0 +1,228 @@ | |||
#!/bin/bash
# Environment setup for the worker fault-injection scenario.
# Usage: <script> MINDSPORE_INSTALL_PATH
export GLOG_v=1
export DEVICE_ID=1
MINDSPORE_INSTALL_PATH=$1
ENV_DEVICE_ID=$DEVICE_ID
# Quote $0 so a script path containing spaces still resolves.
CURRPATH=$(cd "$(dirname "$0")" || exit; pwd)
CURRUSER=$(whoami)
PROJECT_PATH=${CURRPATH}/../../../
echo "MINDSPORE_INSTALL_PATH:" "${MINDSPORE_INSTALL_PATH}"
echo "ENV_DEVICE_ID:" "${ENV_DEVICE_ID}"
echo "CURRPATH:" "${CURRPATH}"
echo "CURRUSER:" "${CURRUSER}"
echo "PROJECT_PATH:" "${PROJECT_PATH}"
# Append the previous value only when it is non-empty: a trailing ':' would
# add an empty entry, which the dynamic loader treats as the current directory.
export LD_LIBRARY_PATH=${MINDSPORE_INSTALL_PATH}/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
#export PYTHONPATH=${MINDSPORE_INSTALL_PATH}/:${PYTHONPATH}
echo "LD_LIBRARY_PATH: " "${LD_LIBRARY_PATH}"
echo "PYTHONPATH: " "${PYTHONPATH}"
echo "-------------show MINDSPORE_INSTALL_PATH----------------"
ls -l "${MINDSPORE_INSTALL_PATH}"
# The banner names the directory that is actually listed below.
echo "------------------show /usr/local/python/python375/lib/----------------------"
ls -l /usr/local/python/python375/lib/
# Kill -9 any leftover master.py, worker.py and agent.py processes.
#
# For each script in turn: if any matching process exists, kill those owned
# by ${CURRUSER} and wait six seconds before moving on.
clean_pid()
{
    local script alive
    for script in master.py worker.py agent.py; do
        alive=$(ps -ef | grep ${script} | grep -v grep | wc -l)
        if [ $alive -ne 0 ]; then
            ps aux | grep "${script}" | grep ${CURRUSER} | grep -v grep | awk '{print $2}' | xargs kill -9
            sleep 6
        fi
    done
}
# Export the distributed matmul model and verify the result.
#
# Runs export_model.sh inside ./export_model (output captured in model.log)
# and expects exactly 8 matmul.mindir files to exist afterwards. On failure
# the log is printed, leftover processes are cleaned up and the script exits
# with status 1.
prepare_model()
{
    echo "### begin to generate mode for serving test ###"
    # Stop here if the directory is missing rather than running the export
    # from the wrong place.
    cd export_model || { echo "### enter export_model directory failed ###"; exit 1; }
    sh export_model.sh &> model.log
    echo "### end to generate mode for serving test ###"
    result=$(find . -name matmul.mindir | wc -l)
    if [ ${result} -ne 8 ]
    then
        cat model.log
        echo "### generate model for serving test failed ###"
        # Clean up BEFORE exiting; placed after "exit 1" these steps could
        # never run.
        cd -
        clean_pid
        exit 1
    fi
    cd -
}
# Launch master.py in the background and wait for its gRPC server to start.
#
# Output goes to master.log, which is polled once per second (at most 50
# times) for the start-success line. If the line never shows up exactly once,
# leftover processes are cleaned up, the log is printed and the script exits
# with status 1.
start_master()
{
    local ready_line='Serving gRPC server start success, listening on 127.0.0.1:5500'

    echo "### start serving master ###"
    unset http_proxy https_proxy
    # No "$?" check here: for a command started with "&" it is always 0, so
    # start-up can only be judged from the log polled below.
    python3 master.py > master.log 2>&1 &
    result=$(grep -E "${ready_line}" master.log | wc -l)
    count=0
    while [[ ${result} -ne 1 && ${count} -lt 50 ]]
    do
        sleep 1
        count=$(($count+1))
        result=$(grep -E "${ready_line}" master.log | wc -l)
    done
    # Judge by what was observed, not by the counter: the line may appear on
    # the very last poll, which must not be reported as a failure.
    if [ ${result} -ne 1 ]
    then
        clean_pid
        cat master.log
        echo "start serving master failed!" && exit 1
    fi
    echo "### start serving master end ###"
}
# Launch worker.py in the background and wait until it waits for the agents.
#
# Output goes to worker.log, which is polled once per second (at most 100
# times) for the "Begin waiting ready of all agents" line. If the line never
# shows up exactly once, leftover processes are cleaned up, the log is
# printed and the script exits with status 1.
start_worker()
{
    local ready_line='Begin waiting ready of all agents'

    echo "### start serving worker ###"
    unset http_proxy https_proxy
    # No "$?" check here: for a command started with "&" it is always 0, so
    # start-up can only be judged from the log polled below.
    python3 worker.py > worker.log 2>&1 &
    result=$(grep -E "${ready_line}" worker.log | wc -l)
    count=0
    while [[ ${result} -ne 1 && ${count} -lt 100 ]]
    do
        sleep 1
        count=$(($count+1))
        result=$(grep -E "${ready_line}" worker.log | wc -l)
    done
    # Judge by what was observed, not by the counter: the line may appear on
    # the very last poll, which must not be reported as a failure.
    if [ ${result} -ne 1 ]
    then
        clean_pid
        cat worker.log
        echo "start serving worker failed!" && exit 1
    fi
    echo "### start serving worker end ###"
}
# Launch agent.py in the background and wait for its first success report.
#
# Output goes to agent.log, which is polled once per second (at most 100
# times) for the "Child 0: Receive success" line. If the line never shows up
# exactly once, leftover processes are cleaned up, the log is printed and the
# script exits with status 1.
start_agent()
{
    local ready_line='Child 0: Receive success'

    echo "### start serving agent ###"
    unset http_proxy https_proxy
    # No "$?" check here: for a command started with "&" it is always 0, so
    # start-up can only be judged from the log polled below.
    python3 agent.py > agent.log 2>&1 &
    result=$(grep -E "${ready_line}" agent.log | wc -l)
    count=0
    while [[ ${result} -ne 1 && ${count} -lt 100 ]]
    do
        sleep 1
        count=$(($count+1))
        result=$(grep -E "${ready_line}" agent.log | wc -l)
    done
    # Judge by what was observed, not by the counter: the line may appear on
    # the very last poll, which must not be reported as a failure.
    if [ ${result} -ne 1 ]
    then
        clean_pid
        cat agent.log
        echo "start serving agent failed!" && exit 1
    fi
    echo "### start serving agent end ###"
}
# Verify the cluster is up, kill -9 the worker, then check that both the
# master and the agents logged the loss.
#
# Any failed check prints a diagnostic and the offending count, runs
# clean_pid and exits the script with status 1.
kill_worker()
{
    local spec script expected num

    # Before the fault: one master.py, one worker.py and nine agent.py
    # processes must be running.
    for spec in master:1 worker:1 agent:9; do
        script=${spec%%:*}
        expected=${spec##*:}
        num=$(ps -ef | grep ${script}.py | grep -v grep | wc -l)
        if [ $num -ne $expected ]; then
            echo "${script} start failed"
            echo $num
            clean_pid && exit 1
        fi
    done

    # Neither the master log nor the agent log may contain a timeout yet.
    num=$(grep -E 'Recv Pong Time Out from' master.log | wc -l)
    if [ $num -ne 0 ]; then
        echo "worker has exited"
        echo $num
        clean_pid && exit 1
    fi
    num=$(grep -E 'Recv Ping Time Out from' agent.log | wc -l)
    if [ $num -ne 0 ]; then
        echo "worker has exited"
        echo $num
        clean_pid && exit 1
    fi

    # Fault injection: kill -9 every worker.py process of the current user.
    if ! ps aux | grep 'worker.py' | grep ${CURRUSER} | grep -v grep | awk '{print $2}' | xargs kill -9; then
        echo "kill worker failed"
    fi
    sleep 25

    # After the fault: no worker.py process may be left ...
    num=$(ps -ef | grep worker.py | grep -v grep | wc -l)
    if [ $num -ne 0 ]; then
        echo "worker exit failed"
        echo $num
        clean_pid && exit 1
    fi

    # ... the master log must contain exactly one pong timeout ...
    num=$(grep -E 'Recv Pong Time Out from' master.log | wc -l)
    if [ $num -ne 1 ]; then
        echo "master catch worker exit failed"
        echo $num
        clean_pid && exit 1
    fi

    # ... and the agent log exactly eight ping timeouts.
    num=$(grep -E 'Recv Ping Time Out from' agent.log | wc -l)
    if [ $num -ne 8 ]; then
        echo "agent catch worker exit failed"
        echo $num
        clean_pid && exit 1
    fi
}
# Scenario driver: start master, worker and agents, inject the worker fault,
# then clean up whatever processes are left.
test_worker_fault_model()
{
    local step
    for step in start_master start_worker start_agent kill_worker clean_pid; do
        ${step}
    done
}
# Script entry: remove artefacts of earlier runs, copy the distributed matmul
# example into the working directory, export the model and run the scenario.
printf '%s\n' "-----serving start-----"
rm -rf serving *.log *.mindir *.dat ${CURRPATH}/matmul ${CURRPATH}/kernel_meta \
    client.py *.json export_model master_with_worker.py master.py worker.py agent.py
cp -r ../../../example/matmul_distributed/* .
prepare_model
test_worker_fault_model
printf '%s\n' "### end to serving test ###"
| @@ -0,0 +1,43 @@ | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| import os | |||
| import sys | |||
| import pytest | |||
| import numpy as np | |||
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.env_single
def test_distribute_worker_fault():
    """Run the worker fault-injection scripts and expect each to exit with 0.

    The MindSpore install directory is discovered by scanning every directory
    on ``sys.path`` for a sub-directory whose path contains
    ``/site-packages/mindspore``. The last match is used, and its path up to
    and including the first ``mindspore`` is passed to the scripts as their
    only argument.
    """
    sh_path = os.path.dirname(os.path.realpath(__file__))

    folders = []
    for python_path in sys.path:
        if not os.path.isdir(python_path):
            continue
        for entry in os.listdir(python_path):
            candidate = os.path.join(python_path, entry)
            if os.path.isdir(candidate) and '/site-packages/mindspore' in candidate:
                folders.append(candidate)

    # Fail with a readable message instead of an IndexError when MindSpore
    # is not installed in any directory on sys.path.
    assert folders, "no '/site-packages/mindspore*' directory found on sys.path"
    install_path = folders[-1].split('mindspore', 1)[0] + 'mindspore'

    for script in ("kill_15_worker.sh", "kill_9_worker.sh"):
        ret = os.system(f"sh {sh_path}/{script} {install_path}")
        assert np.allclose(ret, 0), f"{script} exited with status {ret}"


if __name__ == '__main__':
    test_distribute_worker_fault()
| @@ -1,4 +1,4 @@ | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # Copyright 2021 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| @@ -1,4 +1,4 @@ | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # Copyright 2021 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||