You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

matmul_distribute.sh 6.9 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247
  #!/bin/bash
  # Test driver for the matmul_distributed serving example.
  #
  # Usage: matmul_distribute.sh <MINDSPORE_INSTALL_PATH>
  #   $1 - installation directory whose lib/ sub-directory is prepended to
  #        LD_LIBRARY_PATH.
  #
  # This preamble exports the logging/device settings, derives the paths used
  # by the rest of the script and prints the environment for the test log.
  export GLOG_v=1
  export DEVICE_ID=1
  MINDSPORE_INSTALL_PATH=$1
  ENV_DEVICE_ID=${DEVICE_ID}
  # Absolute directory of this script, independent of the caller's cwd.
  CURRPATH=$(cd "$(dirname "$0")" || exit; pwd)
  CURRUSER=$(whoami)
  PROJECT_PATH=${CURRPATH}/../../../
  echo "MINDSPORE_INSTALL_PATH:" "${MINDSPORE_INSTALL_PATH}"
  echo "ENV_DEVICE_ID:" "${ENV_DEVICE_ID}"
  echo "CURRPATH:" "${CURRPATH}"
  echo "CURRUSER:" "${CURRUSER}"
  echo "PROJECT_PATH:" "${PROJECT_PATH}"
  # Append the previous value only when it is non-empty: a trailing ':' would
  # add the current directory to the library search path.
  export LD_LIBRARY_PATH=${MINDSPORE_INSTALL_PATH}/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
  #export PYTHONPATH=${MINDSPORE_INSTALL_PATH}/:${PYTHONPATH}
  echo "LD_LIBRARY_PATH: " "${LD_LIBRARY_PATH}"
  echo "PYTHONPATH: " "${PYTHONPATH}"
  echo "-------------show MINDSPORE_INSTALL_PATH----------------"
  ls -l "${MINDSPORE_INSTALL_PATH}"
  # The banner names the directory that is actually listed below.
  echo "------------------show /usr/local/python/python375/lib/----------------------"
  ls -l /usr/local/python/python375/lib/
  #######################################
  # Stop the serving master and verify that the whole service went down.
  #
  # Sends SIGTERM to every `master.py` process whose `ps aux` line mentions
  # ${CURRUSER}, waits up to 10 seconds for all `agent.py` processes (any
  # user, as listed by `ps -ef`) to disappear, then checks that no master,
  # worker or agent process of ${CURRUSER} is left. Leftovers are killed with
  # SIGKILL and the script exits with status 1.
  # Does nothing when no master process is found.
  # Globals:  CURRUSER (read)
  # Outputs:  matching `ps` lines and diagnostics on stdout; on agent timeout
  #           also the content of agent.log
  # Returns:  0 when nothing is left running; exits the script with 1 otherwise
  #######################################
  clean_master_pid() {
    local num count script

    # Listing the matches doubles as the "is a master running?" test.
    if ! ps aux | grep 'master.py' | grep "${CURRUSER}" | grep -v grep; then
      return 0
    fi

    if ! ps aux | grep 'master.py' | grep "${CURRUSER}" | grep -v grep \
        | awk '{print $2}' | xargs kill -15; then
      echo "clean master pid failed"
    fi

    # Give the agents up to 10 seconds to exit after the master was stopped.
    num=$(ps -ef | grep agent.py | grep -v -c grep)
    count=0
    while [[ ${num} -ne 0 && ${count} -lt 10 ]]; do
      sleep 1
      count=$((count + 1))
      num=$(ps -ef | grep agent.py | grep -v -c grep)
    done

    # Judge by the last poll, not by the counter: agents that vanish exactly
    # on the tenth poll are a success, not a timeout.
    if [[ ${num} -ne 0 ]]; then
      echo "agent exit failed"
      echo "${num}"
      ps -ef | grep agent.py | grep -v grep
      echo "------------------------------ agent failed log begin: "
      cat agent.log
      echo "------------------------------ agent failed log end"
      # Force-kill whatever is left, then fail. (An undefined `clean_pid` was
      # called here before, so this branch never actually exited.)
      for script in master.py worker.py agent.py; do
        ps aux | grep "${script}" | grep "${CURRUSER}" | grep -v grep \
          | awk '{print $2}' | xargs -r kill -9
      done
      exit 1
    fi

    sleep 1
    if ps aux | grep 'master.py' | grep "${CURRUSER}" | grep -v grep; then
      ps aux | grep 'master.py' | grep "${CURRUSER}" | grep -v grep \
        | awk '{print $2}' | xargs kill -9
      echo "### master pid exist, clean master pid failed ###"
      exit 1
    fi
    if ps aux | grep 'worker.py' | grep "${CURRUSER}" | grep -v grep; then
      ps aux | grep 'worker.py' | grep "${CURRUSER}" | grep -v grep \
        | awk '{print $2}' | xargs kill -9
      echo "### master pid is killed but worker pid exist ###"
      exit 1
    fi
    if ps aux | grep 'agent.py' | grep "${CURRUSER}" | grep -v grep; then
      ps aux | grep 'agent.py' | grep "${CURRUSER}" | grep -v grep \
        | awk '{print $2}' | xargs kill -9
      echo "### worker pid is killed but agent pid exist ###"
      exit 1
    fi
    return 0
  }
  #######################################
  # Stop the serving worker and verify that worker and agents went down.
  #
  # Sends SIGTERM to every `worker.py` process whose `ps aux` line mentions
  # ${CURRUSER}, waits 6 seconds, then checks for leftover worker or agent
  # processes. Leftovers are killed with SIGKILL and the script exits with 1.
  # Does nothing when no worker process is found.
  # Globals:  CURRUSER (read)
  # Outputs:  matching `ps` lines and diagnostics on stdout
  # Returns:  0 when nothing is left running; exits the script with 1 otherwise
  #######################################
  clean_worker_pid() {
    # Listing the matches doubles as the "is a worker running?" test.
    if ! ps aux | grep 'worker.py' | grep "${CURRUSER}" | grep -v grep; then
      return 0
    fi

    if ! ps aux | grep 'worker.py' | grep "${CURRUSER}" | grep -v grep \
        | awk '{print $2}' | xargs kill -15; then
      echo "clean worker pid failed"
    fi

    # Grace period for the worker to shut down after SIGTERM.
    sleep 6

    if ps aux | grep 'worker.py' | grep "${CURRUSER}" | grep -v grep; then
      ps aux | grep 'worker.py' | grep "${CURRUSER}" | grep -v grep \
        | awk '{print $2}' | xargs kill -9
      # Print synchronously before exiting so the message is never lost.
      echo "### worker pid exist, clean worker pid failed ###"
      exit 1
    fi
    if ps aux | grep 'agent.py' | grep "${CURRUSER}" | grep -v grep; then
      ps aux | grep 'agent.py' | grep "${CURRUSER}" | grep -v grep \
        | awk '{print $2}' | xargs kill -9
      echo "### worker pid is killed but agent pid exist ###"
      exit 1
    fi
    return 0
  }
  #######################################
  # Kill (SIGKILL) every `agent.py` process whose `ps aux` line mentions
  # ${CURRUSER}. Does nothing when no such process is found.
  # Globals:  CURRUSER (read)
  # Outputs:  matching `ps` lines on stdout; "clean agent pid failed" when
  #           the kill command reports an error
  # Returns:  0
  #######################################
  clean_agent_pid() {
    # Listing the matches doubles as the "is an agent running?" test.
    if ! ps aux | grep 'agent.py' | grep "${CURRUSER}" | grep -v grep; then
      return 0
    fi
    # Report a failure only when the kill pipeline actually failed (the check
    # used to be inverted and complained on success).
    if ! ps aux | grep 'agent.py' | grep "${CURRUSER}" | grep -v grep \
        | awk '{print $2}' | xargs kill -9; then
      echo "clean agent pid failed"
    fi
    return 0
  }
  #######################################
  # Generate the model used by the test by running export_model.sh inside
  # the export_model directory; its output goes to export_model/export_model.log.
  # On failure the log is printed, the working directory is restored, the
  # clean_*_pid helpers are run and the script exits with status 1.
  # Outputs:  progress messages on stdout (plus the path printed by `cd -`)
  # Returns:  0 on success; exits the script with 1 on failure
  #######################################
  prepare_model() {
    echo "### begin to generate mode for serving matmul distribute test ###"
    # Without this check the export would run in, and `cd -` would later
    # leave, the wrong directory.
    if ! cd export_model; then
      echo "### generate model for serving matmul distribute test failed ###"
      exit 1
    fi
    if ! bash export_model.sh &> export_model.log; then
      cat export_model.log
      echo "### generate model for serving matmul distribute test failed ###"
      # Restore the directory and clean up BEFORE exiting; these steps were
      # unreachable when the exit came first.
      cd -
      clean_master_pid
      clean_worker_pid
      clean_agent_pid
      exit 1
    fi
    cd -
  }
  #######################################
  # Poll a log file once per second until it contains exactly the expected
  # number of matching lines.
  # Arguments: $1 - log file
  #            $2 - grep -E pattern a line must match
  #            $3 - expected number of matching lines
  #            $4 - maximum number of one-second waits
  #            $5 - optional second grep -E pattern applied to the matches
  # Returns:   0 once the count matches, 1 when the waits are used up
  #######################################
  _start_service_wait_for_log() {
    local log_file=$1 pattern=$2 expected=$3 max_wait=$4 filter=${5:-}
    local found waited=0
    while :; do
      if [[ -n "${filter}" ]]; then
        found=$(grep -E -- "${pattern}" "${log_file}" 2> /dev/null \
          | grep -E -c -- "${filter}")
      else
        found=$(grep -E -c -- "${pattern}" "${log_file}" 2> /dev/null)
      fi
      # Decide on the poll result itself, so a match found on the very last
      # poll still counts as success.
      if [[ ${found:-0} -eq ${expected} ]]; then
        return 0
      fi
      if [[ ${waited} -ge ${max_wait} ]]; then
        return 1
      fi
      sleep 1
      waited=$((waited + 1))
    done
  }

  #######################################
  # Start master.py, worker.py and agent.py in the background, in that
  # order, waiting after each launch for its success line(s) to appear in
  # start_master.log / start_worker.log / start_agent.log (up to 50, 50 and
  # 150 seconds respectively; the agent log must report ports 7000-7007).
  # On a timeout the log is printed first, then the services started so far
  # are cleaned up and the script exits with status 1.
  # Outputs:  progress messages on stdout, the failing log on error
  # Returns:  0 when all three services are up; exits the script with 1
  #           otherwise
  #######################################
  start_service() {
    echo "### start serving service ###"
    unset http_proxy https_proxy

    # The status of `cmd &` is always 0, so start-up success can only be
    # judged from the log output.
    python3 master.py > start_master.log 2>&1 &
    if ! _start_service_wait_for_log start_master.log \
        'Serving gRPC server start success, listening on 127.0.0.1:5500' 1 50; then
      # Print the log before cleaning up: the cleanup helpers may exit.
      cat start_master.log
      echo "start serving service failed!"
      clean_master_pid
      exit 1
    fi
    echo "### start serving master service end ###"

    python3 worker.py > start_worker.log 2>&1 &
    if ! _start_service_wait_for_log start_worker.log \
        'gRPC server start success, listening on 127.0.0.1:6200' 1 50; then
      cat start_worker.log
      echo "start worker service failed!"
      clean_master_pid
      clean_worker_pid
      exit 1
    fi
    echo "### start worker service end ###"

    python3 agent.py > start_agent.log 2>&1 &
    if ! _start_service_wait_for_log start_agent.log \
        'Agent server start success, listening on 127.0.0.1:' 8 150 \
        '7000|7001|7002|7003|7004|7005|7006|7007'; then
      cat start_agent.log
      echo "start agent service failed!"
      clean_master_pid
      clean_worker_pid
      clean_agent_pid
      exit 1
    fi
    echo "### start agent service end ###"
  }
  #######################################
  # Run client.py against the started service, capturing its output in
  # client.log. When the client fails, the log and a failure message are
  # printed first, then the services are cleaned up and the script exits
  # with status 1.
  # Outputs:  progress messages on stdout; client.log content on failure
  # Returns:  0 when the client succeeds; exits the script with 1 otherwise
  #######################################
  pytest_serving() {
    unset http_proxy https_proxy
    echo "### client start ###"
    if ! python3 client.py > client.log 2>&1; then
      # Show the diagnostics before cleaning up: the cleanup helpers may
      # exit, which would otherwise hide the client output.
      cat client.log
      echo "client failed to start."
      clean_master_pid
      clean_worker_pid
      clean_agent_pid
      exit 1
    fi
    echo "### client end ###"
  }
  #######################################
  # End-to-end scenario: start the services, run the client, print
  # client.log, then stop master, worker and agents in that order.
  # Returns:  status of the last cleanup step
  #######################################
  test_add_model() {
    local cleanup_step

    start_service
    pytest_serving
    cat client.log

    for cleanup_step in clean_master_pid clean_worker_pid clean_agent_pid; do
      "${cleanup_step}"
    done
  }
  # Entry point: remove artefacts of earlier runs, copy in a fresh
  # matmul_distributed example, export the model and run the test.
  echo "-----serving start-----"
  # ${CURRPATH:?} aborts instead of expanding to "/matmul", "/model", ... when
  # CURRPATH is empty; "./" keeps odd file names from being read as options.
  rm -rf serving ./*.log ./*.dat \
    "${CURRPATH:?}/matmul" "${CURRPATH:?}/model" \
    "${CURRPATH:?}/kernel_meta" "${CURRPATH:?}/somas_meta"
  rm -rf client.py export_model temp_rank_table master.py worker.py agent.py master_with_worker.py rank_table_8pcs.json
  # Everything below needs the example files, so stop when the copy fails.
  if ! cp -r ../../../example/matmul_distributed/* .; then
    echo "### copy matmul_distributed example failed ###"
    exit 1
  fi
  prepare_model
  test_add_model

A lightweight and high-performance service module that helps MindSpore developers efficiently deploy online inference services in the production environment.