Browse Source

!1024 verify mindinsight state by post_worker_init log

From: @liangyongxiong1024
Reviewed-by: @lilongfei15,@wenkai_dist
Signed-off-by: @wenkai_dist
tags/v1.1.0
mindspore-ci-bot Gitee 5 years ago
parent
commit
c8ea2d73df
2 changed files with 26 additions and 17 deletions
  1. +6
    -7
      mindinsight/backend/config/gunicorn_conf.py
  2. +20
    -10
      mindinsight/backend/run.py

+ 6
- 7
mindinsight/backend/config/gunicorn_conf.py View File

@@ -49,15 +49,14 @@ def on_starting(server):
threading.Thread(target=hook.on_startup, args=(server.log,)).start()


def post_fork(server, worker):
def post_worker_init(worker):
"""
Launch a process to listen worker after gunicorn fork worker.
Launch a process to listen to the worker after the gunicorn worker is initialized.

Child processes of the gunicorn worker should be killed when the worker has been killed,
because the gunicorn master may murder the worker for reasons such as a worker timeout.

Args:
server (Arbiter): gunicorn server instance.
worker (ThreadWorker): worker instance.
"""
def murder_worker_children_processes():
@@ -72,19 +71,19 @@ def post_fork(server, worker):
if os.getppid() != worker.pid:
current_worker_pid = os.getppid()
for proc in processes_to_kill:
server.log.info("Original worker pid: %d, current worker pid: %d, stop process %d",
worker.log.info("Original worker pid: %d, current worker pid: %d, stop process %d",
worker.pid, current_worker_pid, proc.pid)
try:
proc.send_signal(signal.SIGKILL)
except psutil.NoSuchProcess:
continue
except psutil.Error as ex:
server.log.error("Stop process %d failed. Detail: %s.", proc.pid, str(ex))
server.log.info("%d processes have been killed.", len(processes_to_kill))
worker.log.error("Stop process %d failed. Detail: %s.", proc.pid, str(ex))
worker.log.info("%d processes have been killed.", len(processes_to_kill))
break
time.sleep(1)

listen_process = multiprocessing.Process(target=murder_worker_children_processes,
name="murder_worker_children_processes")
listen_process.start()
server.log.info("Server pid: %d, start to listening.", server.pid)
worker.log.info("Server pid: %d, start to listening.", worker.ppid)

+ 20
- 10
mindinsight/backend/run.py View File

@@ -78,19 +78,23 @@ def _is_match_one(sub_string_list, src_string):
return False


def _check_stat_from_log(log_info):
def _check_stat_from_log(pid, log_info):
"""
Determine the service startup status based on the log information.

Args:
pid (int): The gunicorn process ID.
log_info (str): The output log of service startup.

Returns:
str, the state value, which is one of the following: "unknown", "failed" and "success".
"""
server_state = ServerStateEnum.UNKNOWN.value
match_success_info = "Listening at: http://%s:%d" % \
(settings.HOST, int(settings.PORT))

# must be kept in sync with the startup log message in the gunicorn post_worker_init hook;
# refer to mindinsight/backend/config/gunicorn_conf.py
match_success_info = "Server pid: %d, start to listening." % pid

common_failed_info_list = [
"[ERROR] Retrying in 1 second",
"[INFO] Reason: App failed to load",
@@ -130,11 +134,12 @@ def _get_access_log_path():
return access_log_path


def _check_state_from_log(log_abspath, start_pos=0):
def _check_state_from_log(pid, log_abspath, start_pos=0):
"""
Check the service startup status based on the log file.

Args:
pid (int): The gunicorn process ID.
log_abspath (str): Absolute path of the log file.
start_pos (int): Offset position of the log file.

@@ -157,7 +162,7 @@ def _check_state_from_log(log_abspath, start_pos=0):
server_is_start = True
continue
if server_is_start:
log_result = _check_stat_from_log(line)
log_result = _check_stat_from_log(pid, line)
# ignore "unknown" result
if log_result != ServerStateEnum.UNKNOWN.value:
state_result["state"] = log_result
@@ -175,11 +180,12 @@ def _check_state_from_log(log_abspath, start_pos=0):
return state_result


def _check_server_start_stat(log_abspath, start_pos=None):
def _check_server_start_stat(pid, log_abspath, start_pos=None):
"""
Check the server startup status.

Args:
pid (int): The gunicorn process ID.
log_abspath (str): The log file path.
start_pos (int): The log file start position.

@@ -193,15 +199,19 @@ def _check_server_start_stat(log_abspath, start_pos=None):
if not log_abspath:
return state_result

# sleep 1 second for gunicorn master to be ready
time.sleep(1)

log_pos = _get_file_size(log_abspath) if start_pos is None else start_pos
try_cnt = 0
try_cnt_max = 2

while try_cnt < try_cnt_max:
try_cnt += 1
time.sleep(1)
if _get_file_size(log_abspath) > log_pos:
state_result.update(_check_state_from_log(log_abspath, log_pos))
try_cnt += 1
file_size = _get_file_size(log_abspath)
if file_size > log_pos:
state_result.update(_check_state_from_log(pid, log_abspath, log_pos))
break

if not state_result['prompt_message']:
@@ -274,7 +284,7 @@ def start():
console.error("Start MindInsight failed. See log for details, log path: %s.", error_log_abspath)
sys.exit(1)
else:
state_result = _check_server_start_stat(error_log_abspath, log_size)
state_result = _check_server_start_stat(process.pid, error_log_abspath, log_size)
# print gunicorn start state to stdout
label = 'Web address:'
format_args = label, settings.HOST, str(settings.PORT), settings.URL_PATH_PREFIX


Loading…
Cancel
Save