From c7fd49e2f0b112e1e5de7bf036a73bec6b83b694 Mon Sep 17 00:00:00 2001
From: zhoufeng
Date: Fri, 5 Feb 2021 20:48:15 +0800
Subject: [PATCH] terminate tbe process pool in separate thread, kill child of fork in model converter

Signed-off-by: zhoufeng
---
Review notes (text in this section is ignored when the patch is applied):
 - multi_process.cc: the forked child now blocks in pause() instead of spinning
   in an empty `while (1)` loop while it waits for the parent's SIGKILL.
 - tbe_process.py: close_pool() resets the pool attribute to None instead of
   deleting it, and tolerates being run when the pool is already closed.
 - Hunk headers and the diffstat were recomputed for the added lines; the
   post-image blob ids on the `index` lines still refer to the original revision.

 .../tbe_compiler/tbe_process.py               | 17 ++++++++---
 .../model_converter_utils/multi_process.cc    | 28 ++++++++++++++++---
 2 files changed, 38 insertions(+), 7 deletions(-)

diff --git a/mindspore/_extends/parallel_compile/tbe_compiler/tbe_process.py b/mindspore/_extends/parallel_compile/tbe_compiler/tbe_process.py
index a3528dddb1..a0a299f718 100644
--- a/mindspore/_extends/parallel_compile/tbe_compiler/tbe_process.py
+++ b/mindspore/_extends/parallel_compile/tbe_compiler/tbe_process.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 # ============================================================================
 """tbe process"""
+import threading
 import traceback
 import multiprocessing
 import subprocess
@@ -137,11 +138,21 @@ class TbeProcess:
             res = "TBEException", "ERROR: [MS_BUILD_PROCESS_NUM] type should be a int num, but got :" + process_num
         return res
 
+    def close_pool(self):
+        """Terminate and join the worker pool, then mark it as closed."""
+        pool = self.__pool
+        if pool is not None:
+            pool.terminate()
+            pool.join()
+            # Reset instead of `del` so later `is not None` checks keep working.
+            self.__pool = None
+
     def exit(self):
+        """Shut the worker pool down on a daemon thread so the caller is not blocked."""
         if self.__pool is not None:
-            self.__pool.terminate()
-            self.__pool.join()
-            del self.__pool
+            stop_thread = threading.Thread(target=self.close_pool)
+            stop_thread.daemon = True
+            stop_thread.start()
 
     def start_compile_op(self, op_json):
         """
diff --git a/mindspore/ccsrc/cxx_api/model/model_converter_utils/multi_process.cc b/mindspore/ccsrc/cxx_api/model/model_converter_utils/multi_process.cc
index 50ca5477a8..58f70c3c77 100644
--- a/mindspore/ccsrc/cxx_api/model/model_converter_utils/multi_process.cc
+++ b/mindspore/ccsrc/cxx_api/model/model_converter_utils/multi_process.cc
@@ -65,13 +65,33 @@ Status MultiProcess::MainProcess(ProcessFuncCall parent_process, ProcessFuncCall
   if (pid == 0) {
     ChildProcess(child_process);
     shared_memory.Detach();
-    MS_LOG_INFO << "Model converter: child process exit";
-    exit(0);
+    MS_LOG_INFO << "Model converter: child process sleep waiting for exit signal.";
+    // Do not exit here: block until the parent sends SIGKILL. pause() suspends the process
+    // instead of spinning, and is re-armed if a handled signal wakes it up.
+    while (true) {
+      (void)pause();
+    }
   } else {  // parent process
     ret = ParentProcess(parent_process);
     shared_memory.Detach();
-    int status;
-    wait(&status);
+
+    MS_LOG_INFO << "Model converter: parent process kills child of fork.";
+    (void)kill(pid, SIGKILL);
+    // Reap the child without blocking indefinitely: poll up to kMaxLoopCount times, one second apart.
+    constexpr uint32_t kMaxLoopCount = 5;
+    bool child_exited = false;
+    for (uint32_t i = 0; i < kMaxLoopCount; ++i) {
+      int status;
+      if (waitpid(pid, &status, WNOHANG) == pid) {
+        MS_LOG(INFO) << "Child process " << pid << " exits success.";
+        child_exited = true;
+        break;
+      }
+      sleep(1);
+    }
+    if (!child_exited) {
+      MS_LOG(WARNING) << "Child process " << pid << " has been killed but waitpid failed.";
+    }
     shared_memory.Destroy();
   }
   return ret;