From 670abbe305341e8c160418e7a80c3b6b396e8486 Mon Sep 17 00:00:00 2001
From: Avery Pennarun
Date: Mon, 15 Jun 2020 02:17:25 -0400
Subject: [PATCH] jobserver.py: _try_read()'s alarm timeout needs to throw an exception.

In python3, os.read() automatically retries after EINTR, which breaks
our ability to interrupt on SIGALRM. Instead, throw an exception from
the SIGALRM handler, which should work on both python2 and python3.

This fixes a rare deadlock during parallel builds on python3.

For background:
https://www.python.org/dev/peps/pep-0475/#backward-compatibility

"Applications relying on the fact that system calls are interrupted
with InterruptedError will hang. The authors of this PEP don't think
that such applications exist [...]"

Well, apparently they were mistaken :)
---
 redo/jobserver.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/redo/jobserver.py b/redo/jobserver.py
index 800f5d3..86eea3c 100644
--- a/redo/jobserver.py
+++ b/redo/jobserver.py
@@ -142,8 +142,10 @@ def release_mine():
     _release(1)
 
 
+class TimeoutError(Exception): pass
+
 def _timeout(sig, frame):
-    pass
+    raise TimeoutError()
 
 
 # We make the pipes use the first available fd numbers starting at startfd.
@@ -171,11 +173,13 @@ def _try_read(fd, n):
         return None # try again
     # ok, the socket is readable - but some other process might get there
     # first. We have to set an alarm() in case our read() gets stuck.
-    oldh = signal.signal(signal.SIGALRM, _timeout)
     try:
+        oldh = signal.signal(signal.SIGALRM, _timeout)
         signal.setitimer(signal.ITIMER_REAL, 0.01, 0.01) # emergency fallback
         try:
             b = os.read(fd, 1)
+        except TimeoutError:
+            return None # try again
         except OSError as e:
             if e.errno in (errno.EAGAIN, errno.EINTR):
                 # interrupted or it was nonblocking