From 2926a416c564d92c8ad804efe2fb32cb80eb2b2a Mon Sep 17 00:00:00 2001 From: Tudor Brindus Date: Mon, 25 Dec 2023 16:15:57 -0500 Subject: [PATCH] judge: be more graceful around worker timeouts - Increase the timeout to 60s to avoid killing the worker when a user has aborted a long-running compilation. - Rather than display `EOFError` when we kill the worker, display a more informative `TimeoutError` instead. --- dmoj/judge.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/dmoj/judge.py b/dmoj/judge.py index b3b4f71ff..cc615c31f 100644 --- a/dmoj/judge.py +++ b/dmoj/judge.py @@ -46,7 +46,10 @@ class IPC(Enum): REQUEST_ABORT = 'REQUEST-ABORT' -IPC_TEARDOWN_TIMEOUT = 5 # seconds +# This needs to be at least as large as the timeout for the largest compiler time limit, but we don't enforce that here. +# (Otherwise, aborting during a compilation that exceeds this time limit would result in a `TimeoutError` IE instead of +# a `CompileError`.) +IPC_TIMEOUT = 60 # seconds logger = logging.getLogger(__name__) @@ -302,6 +305,7 @@ class JudgeWorker: def __init__(self, submission: Submission) -> None: self.submission = submission self._abort_requested = False + self._sent_sigkill_to_worker_process = False # FIXME(tbrindus): marked Any pending grader cleanups. self.grader: Any = None @@ -326,6 +330,10 @@ def communicate(self) -> Generator[Tuple[IPC, tuple], None, None]: logger.error('Worker has not sent a message in %d seconds, assuming dead and killing.', recv_timeout) self.worker_process.kill() raise + except EOFError: + if self._sent_sigkill_to_worker_process: + raise TimeoutError('worker did not exit in %d seconds, so it was killed' % IPC_TIMEOUT) + raise except Exception: logger.error('Failed to read IPC message from worker!') raise @@ -336,16 +344,17 @@ def communicate(self) -> Generator[Tuple[IPC, tuple], None, None]: else: yield ipc_type, data - def wait_with_timeout(self, timeout=IPC_TEARDOWN_TIMEOUT) -> None: + def wait_with_timeout(self) -> None: if self.worker_process and self.worker_process.is_alive(): # Might be None if run was never called, or failed. try: - self.worker_process.join(timeout=timeout) + self.worker_process.join(timeout=IPC_TIMEOUT) except OSError: logger.exception('Exception while waiting for worker to shut down, ignoring...') finally: if self.worker_process.is_alive(): logger.error('Worker is still alive, sending SIGKILL!') + self._sent_sigkill_to_worker_process = True self.worker_process.kill() def request_abort_grading(self) -> None: @@ -426,7 +435,7 @@ def _report_unhandled_exception() -> None: # We may have failed before sending the IPC.BYE down the connection, in which case the judge will never # close its side of the connection -- so `ipc_recv_thread` will never exit. But we can't wait forever in # this case, since we're blocking the main judge from proceeding. - ipc_recv_thread.join(timeout=IPC_TEARDOWN_TIMEOUT) + ipc_recv_thread.join(timeout=IPC_TIMEOUT) if ipc_recv_thread.is_alive(): logger.error('Judge IPC recv thread is still alive after timeout, shutting worker down anyway!')