|
- #!/bin/bash
- #
- # Test driver for the distributed matmul serving example: it copies the
- # example into the current directory, exports the model, starts the
- # master/worker/agent servers, runs the client and cleans the processes up.
- #
- # Usage: bash <this script> MINDSPORE_INSTALL_PATH
- #   $1 - install directory whose lib/ subdirectory is prepended to
- #        LD_LIBRARY_PATH.
-
- export GLOG_v=1
- export DEVICE_ID=1
-
- MINDSPORE_INSTALL_PATH=$1
- ENV_DEVICE_ID=$DEVICE_ID
- # Absolute directory of this script. "$0" is quoted so that a path
- # containing spaces is not split into several dirname arguments.
- CURRPATH=$(cd "$(dirname "$0")" || exit; pwd)
- CURRUSER=$(whoami)
- PROJECT_PATH=${CURRPATH}/../../../
- echo "MINDSPORE_INSTALL_PATH:" "${MINDSPORE_INSTALL_PATH}"
- echo "ENV_DEVICE_ID:" "${ENV_DEVICE_ID}"
- echo "CURRPATH:" "${CURRPATH}"
- echo "CURRUSER:" "${CURRUSER}"
- echo "PROJECT_PATH:" "${PROJECT_PATH}"
-
- export LD_LIBRARY_PATH=${MINDSPORE_INSTALL_PATH}/lib:${LD_LIBRARY_PATH}
- #export PYTHONPATH=${MINDSPORE_INSTALL_PATH}/:${PYTHONPATH}
-
- echo "LD_LIBRARY_PATH: " "${LD_LIBRARY_PATH}"
- echo "PYTHONPATH: " "${PYTHONPATH}"
- echo "-------------show MINDSPORE_INSTALL_PATH----------------"
- # Falls back to "." when no argument was given, which is what the unquoted
- # original listed in that case.
- ls -l -- "${MINDSPORE_INSTALL_PATH:-.}"
- # The banner now names the directory that is actually listed.
- echo "---------show /usr/local/python/python375/lib/----------"
- ls -l /usr/local/python/python375/lib/
-
- # Print the "ps aux" lines of CURRUSER's processes that match pattern $1.
- # Returns 0 when at least one line matched, non-zero otherwise.
- _master_list_procs()
- {
-   ps aux | grep "$1" | grep "${CURRUSER}" | grep -v grep
- }
-
- # Send signal number $2 to every process listed by _master_list_procs "$1".
- # Returns the exit status of the final xargs/kill stage.
- _master_kill_procs()
- {
-   _master_list_procs "$1" | awk '{print $2}' | xargs kill "-$2"
- }
-
- # Stop the master.py processes of CURRUSER and make sure nothing is left.
- #
- # Sends SIGTERM to master.py, polls up to 10 seconds for all agent.py
- # processes to disappear, then SIGKILLs any master.py / worker.py / agent.py
- # process of CURRUSER that is still alive.
- #
- # Globals: CURRUSER (read)
- # Returns: 0 when no master was running or everything exited on SIGTERM.
- # Exits:   1 when agents did not exit in time or a SIGKILL was needed.
- clean_master_pid()
- {
-   local num count failed=0
-
-   # No master running: nothing to clean.
-   _master_list_procs 'master.py' || return 0
-
-   if ! _master_kill_procs 'master.py' 15
-   then
-     echo "clean master pip failed"
-   fi
-
-   # Poll once per second, at most 10 times, until no agent.py is left.
-   num=$(ps -ef | grep agent.py | grep -v grep | wc -l)
-   count=0
-   while [[ ${num} -ne 0 && ${count} -lt 10 ]]
-   do
-     sleep 1
-     count=$((count + 1))
-     num=$(ps -ef | grep agent.py | grep -v grep | wc -l)
-   done
-
-   # Test the agent count rather than the loop counter, so agents that exit
-   # on the very last poll are not reported as a failure.
-   if [[ ${num} -ne 0 ]]
-   then
-     echo "agent exit failed"
-     echo "${num}"
-     ps -ef | grep agent.py | grep -v grep
-     echo "------------------------------ agent failed log begin: "
-     cat agent.log
-     echo "------------------------------ agent failed log end"
-     # Fall through to the forced cleanup below, then exit 1. The original
-     # called an undefined "clean_pid" here, so its "exit 1" never ran.
-     failed=1
-   fi
-   sleep 1
-
-   # Force-kill every kind of leftover before failing, so that no process
-   # outlives the test run.
-   if _master_list_procs 'master.py'
-   then
-     _master_kill_procs 'master.py' 9
-     echo "### master pid exist, clean master pip failed ###"
-     failed=1
-   fi
-   if _master_list_procs 'worker.py'
-   then
-     _master_kill_procs 'worker.py' 9
-     echo "### master pid is killed but worker pid exist ###"
-     failed=1
-   fi
-   if _master_list_procs 'agent.py'
-   then
-     _master_kill_procs 'agent.py' 9
-     echo "### worker pid is killed but agent pid exist ###"
-     failed=1
-   fi
-
-   if [[ ${failed} -ne 0 ]]
-   then
-     exit 1
-   fi
- }
-
- # Stop the worker.py processes of CURRUSER.
- #
- # Sends SIGTERM, waits 6 seconds, then SIGKILLs any worker.py / agent.py
- # process of CURRUSER that is still alive.
- #
- # Globals: CURRUSER (read)
- # Returns: 0 when no worker was running or everything exited on SIGTERM.
- # Exits:   1 when a SIGKILL was needed.
- clean_worker_pid()
- {
-   local failed=0
-
-   # No worker running: nothing to clean.
-   if ! ps aux | grep 'worker.py' | grep "${CURRUSER}" | grep -v grep
-   then
-     return 0
-   fi
-
-   if ! ps aux | grep 'worker.py' | grep "${CURRUSER}" | grep -v grep | awk '{print $2}' | xargs kill -15
-   then
-     echo "clean worker pip failed"
-   fi
-   sleep 6
-
-   # Kill both kinds of leftovers before exiting. The original exited on the
-   # first hit and used "echo ... & exit 1", which backgrounds the echo.
-   if ps aux | grep 'worker.py' | grep "${CURRUSER}" | grep -v grep
-   then
-     ps aux | grep 'worker.py' | grep "${CURRUSER}" | grep -v grep | awk '{print $2}' | xargs kill -9
-     echo "### worker pid exist, clean worker pip failed ###"
-     failed=1
-   fi
-   if ps aux | grep 'agent.py' | grep "${CURRUSER}" | grep -v grep
-   then
-     ps aux | grep 'agent.py' | grep "${CURRUSER}" | grep -v grep | awk '{print $2}' | xargs kill -9
-     echo "### worker pid is killed but agent pid exist ###"
-     failed=1
-   fi
-
-   if [[ ${failed} -ne 0 ]]
-   then
-     exit 1
-   fi
- }
-
- # SIGKILL every agent.py process of CURRUSER, if any is running.
- #
- # Globals: CURRUSER (read)
- # Returns: 0 always; a failing kill is only reported on stdout.
- clean_agent_pid()
- {
-   if ps aux | grep 'agent.py' | grep "${CURRUSER}" | grep -v grep
-   then
-     # Report a failure only when the kill pipeline itself failed; the
-     # original tested "-eq 0" and so printed this message on success.
-     if ! ps aux | grep 'agent.py' | grep "${CURRUSER}" | grep -v grep | awk '{print $2}' | xargs kill -9
-     then
-       echo "clean agent pid failed"
-     fi
-   fi
- }
-
- # Generate the model by running export_model/export_model.sh, with its
- # output captured in export_model/export_model.log.
- #
- # On failure the log and a failure message are printed, any running
- # master/worker/agent processes are cleaned up, and the script exits 1.
- prepare_model()
- {
-   echo "### begin to generate mode for serving matmul distribute test ###"
-   if ! cd export_model
-   then
-     echo "### generate model for serving matmul distribute test failed ###"
-     exit 1
-   fi
-
-   if ! bash export_model.sh &> export_model.log
-   then
-     cat export_model.log
-     echo "### generate model for serving matmul distribute test failed ###"
-     # Return to the starting directory and clean up before exiting. The
-     # original exited right after the message, leaving these unreachable.
-     cd -
-     clean_master_pid
-     clean_worker_pid
-     clean_agent_pid
-     exit 1
-   fi
-   cd - || exit 1
- }
-
- # Start master.py, worker.py and agent.py in the background, in that order,
- # waiting after each launch until its log reports a successful start.
- #
- # Logs:    start_master.log, start_worker.log, start_agent.log
- # Waits:   up to 50 s for master, 50 s for worker, 150 s for the agents
- #          (8 success lines on ports 7000-7007 are expected).
- # Exits:   1 when a start message does not show up in time. The log and a
- #          failure message are printed first, then the processes started
- #          so far are cleaned up.
- start_service()
- {
-   local result count
-
-   echo "### start serving service ###"
-   unset http_proxy https_proxy
-
-   # "$?" after "cmd &" is always 0, so a launch failure cannot be detected
-   # there; it is detected by polling the log for the success message.
-   python3 master.py > start_master.log 2>&1 &
-
-   result=$(grep -c -E 'Serving gRPC server start success, listening on 127.0.0.1:5500' start_master.log)
-   count=0
-   while [[ ${result} -ne 1 && ${count} -lt 50 ]]
-   do
-     sleep 1
-     count=$((count + 1))
-     result=$(grep -c -E 'Serving gRPC server start success, listening on 127.0.0.1:5500' start_master.log)
-   done
-
-   # Decide on the grep result, not on the loop counter, so a success seen
-   # on the final poll is not reported as a timeout.
-   if [[ ${result} -ne 1 ]]
-   then
-     # Print diagnostics first: the cleanup functions may exit on their own.
-     cat start_master.log
-     echo "start serving service failed!"
-     clean_master_pid
-     exit 1
-   fi
-
-   echo "### start serving master service end ###"
-
-   python3 worker.py > start_worker.log 2>&1 &
-
-   result=$(grep -c -E 'gRPC server start success, listening on 127.0.0.1:6200' start_worker.log)
-   count=0
-   while [[ ${result} -ne 1 && ${count} -lt 50 ]]
-   do
-     sleep 1
-     count=$((count + 1))
-     result=$(grep -c -E 'gRPC server start success, listening on 127.0.0.1:6200' start_worker.log)
-   done
-
-   if [[ ${result} -ne 1 ]]
-   then
-     cat start_worker.log
-     echo "start worker service failed!"
-     clean_master_pid
-     clean_worker_pid
-     exit 1
-   fi
-
-   echo "### start worker service end ###"
-
-   python3 agent.py > start_agent.log 2>&1 &
-
-   result=$(grep -E 'Agent server start success, listening on 127.0.0.1:' start_agent.log | grep -c -E '7000|7001|7002|7003|7004|7005|7006|7007')
-   count=0
-   while [[ ${result} -ne 8 && ${count} -lt 150 ]]
-   do
-     sleep 1
-     count=$((count + 1))
-     result=$(grep -E 'Agent server start success, listening on 127.0.0.1:' start_agent.log | grep -c -E '7000|7001|7002|7003|7004|7005|7006|7007')
-   done
-
-   if [[ ${result} -ne 8 ]]
-   then
-     cat start_agent.log
-     echo "start agent service failed!"
-     clean_master_pid
-     clean_worker_pid
-     clean_agent_pid
-     exit 1
-   fi
-
-   echo "### start agent service end ###"
- }
-
- # Run client.py against the running servers, capturing its output in
- # client.log.
- #
- # Exits: 1 when client.py returns non-zero. The log and the failure message
- #        are printed before cleanup, because the cleanup functions may exit
- #        the script themselves and would otherwise hide the diagnostics.
- pytest_serving()
- {
-   unset http_proxy https_proxy
-   echo "### client start ###"
-   if ! python3 client.py > client.log 2>&1
-   then
-     cat client.log
-     echo "client failed to start."
-     clean_master_pid
-     clean_worker_pid
-     clean_agent_pid
-     exit 1
-   fi
-   echo "### client end ###"
- }
-
- # End-to-end case: start the servers, run the client, print client.log and
- # then run the three cleanup functions in master, worker, agent order.
- test_add_model() {
-   local cleanup_step
-
-   start_service
-   pytest_serving
-   cat client.log
-
-   for cleanup_step in clean_master_pid clean_worker_pid clean_agent_pid; do
-     "${cleanup_step}"
-   done
- }
-
- echo "-----serving start-----"
- # Remove artifacts of earlier runs. "${CURRPATH:?}" aborts instead of
- # expanding to "/matmul", "/model", ... if CURRPATH is ever empty.
- rm -rf serving *.log *.dat "${CURRPATH:?}/matmul" "${CURRPATH:?}/model" "${CURRPATH:?}/kernel_meta" "${CURRPATH:?}/somas_meta"
- rm -rf client.py export_model temp_rank_table master.py worker.py agent.py master_with_worker.py rank_table_8pcs.json
- # Fail fast when the example cannot be copied; every later step depends on
- # the copied files.
- if ! cp -r ../../../example/matmul_distributed/* .
- then
-   echo "copy example/matmul_distributed failed"
-   exit 1
- fi
- prepare_model
- test_add_model
|