Skip to content

Commit

Permalink
judge: be more graceful around worker timeouts
Browse files Browse the repository at this point in the history
- Increase the timeout to 60s to avoid killing the worker when a user has
  aborted a long-running compilation.
- Rather than display `EOFError` when we kill the worker, display a more
  informative `TimeoutError` instead.
  • Loading branch information
Xyene committed Dec 25, 2023
1 parent e44898f commit 2926a41
Showing 1 changed file with 13 additions and 4 deletions.
17 changes: 13 additions & 4 deletions dmoj/judge.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,10 @@ class IPC(Enum):
REQUEST_ABORT = 'REQUEST-ABORT'


IPC_TEARDOWN_TIMEOUT = 5 # seconds
# This needs to be at least as large as the timeout for the largest compiler time limit, but we don't enforce that here.
# (Otherwise, aborting during a compilation that exceeds this time limit would result in a `TimeoutError` IE instead of
# a `CompileError`.)
IPC_TIMEOUT = 60 # seconds


logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -302,6 +305,7 @@ class JudgeWorker:
def __init__(self, submission: Submission) -> None:
self.submission = submission
self._abort_requested = False
self._sent_sigkill_to_worker_process = False
# FIXME(tbrindus): marked Any pending grader cleanups.
self.grader: Any = None

Expand All @@ -326,6 +330,10 @@ def communicate(self) -> Generator[Tuple[IPC, tuple], None, None]:
logger.error('Worker has not sent a message in %d seconds, assuming dead and killing.', recv_timeout)
self.worker_process.kill()
raise
except EOFError:
if self._sent_sigkill_to_worker_process:
raise TimeoutError('worker did not exit in %d seconds, so it was killed' % IPC_TIMEOUT)
raise
except Exception:
logger.error('Failed to read IPC message from worker!')
raise
Expand All @@ -336,16 +344,17 @@ def communicate(self) -> Generator[Tuple[IPC, tuple], None, None]:
else:
yield ipc_type, data

def wait_with_timeout(self, timeout=IPC_TEARDOWN_TIMEOUT) -> None:
def wait_with_timeout(self) -> None:
if self.worker_process and self.worker_process.is_alive():
# Might be None if run was never called, or failed.
try:
self.worker_process.join(timeout=timeout)
self.worker_process.join(timeout=IPC_TIMEOUT)
except OSError:
logger.exception('Exception while waiting for worker to shut down, ignoring...')
finally:
if self.worker_process.is_alive():
logger.error('Worker is still alive, sending SIGKILL!')
self._sent_sigkill_to_worker_process = True
self.worker_process.kill()

def request_abort_grading(self) -> None:
Expand Down Expand Up @@ -426,7 +435,7 @@ def _report_unhandled_exception() -> None:
# We may have failed before sending the IPC.BYE down the connection, in which case the judge will never
# close its side of the connection -- so `ipc_recv_thread` will never exit. But we can't wait forever in
# this case, since we're blocking the main judge from proceeding.
ipc_recv_thread.join(timeout=IPC_TEARDOWN_TIMEOUT)
ipc_recv_thread.join(timeout=IPC_TIMEOUT)
if ipc_recv_thread.is_alive():
logger.error('Judge IPC recv thread is still alive after timeout, shutting worker down anyway!')

Expand Down

0 comments on commit 2926a41

Please sign in to comment.