From 670abbe305341e8c160418e7a80c3b6b396e8486 Mon Sep 17 00:00:00 2001
From: Avery Pennarun <apenwarr@tailscale.com>
Date: Mon, 15 Jun 2020 02:17:25 -0400
Subject: [PATCH] jobserver.py: _try_read()'s alarm timeout needs to throw an
 exception.

In python3 (3.5 and later, per PEP 475), os.read() automatically retries
after EINTR when the signal handler returns without raising, which breaks
our ability to interrupt it with SIGALRM.

Instead, throw an exception from the SIGALRM handler, which should work
on both python2 and python3.

This fixes a rare deadlock during parallel builds on python3.

For background:
https://www.python.org/dev/peps/pep-0475/#backward-compatibility

"Applications relying on the fact that system calls are interrupted
with InterruptedError will hang. The authors of this PEP don't think
that such applications exist [...]"

Well, apparently they were mistaken :)
---
 redo/jobserver.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/redo/jobserver.py b/redo/jobserver.py
index 800f5d3..86eea3c 100644
--- a/redo/jobserver.py
+++ b/redo/jobserver.py
@@ -142,8 +142,10 @@ def release_mine():
     _release(1)
 
 
+class TimeoutError(Exception): pass
+
 def _timeout(sig, frame):
-    pass
+    raise TimeoutError()
 
 
 # We make the pipes use the first available fd numbers starting at startfd.
@@ -171,11 +173,13 @@ def _try_read(fd, n):
         return None  # try again
     # ok, the socket is readable - but some other process might get there
     # first.  We have to set an alarm() in case our read() gets stuck.
-    oldh = signal.signal(signal.SIGALRM, _timeout)
     try:
+        oldh = signal.signal(signal.SIGALRM, _timeout)
         signal.setitimer(signal.ITIMER_REAL, 0.01, 0.01)  # emergency fallback
         try:
             b = os.read(fd, 1)
+        except TimeoutError:
+            return None  # try again
         except OSError as e:
             if e.errno in (errno.EAGAIN, errno.EINTR):
                 # interrupted or it was nonblocking