Merge pull request #17 from Autodesk/fix_unicode
Fix unicode handling
Showing 11 changed files with 292 additions and 89 deletions.

NOTICES (new file, 12 lines added):

```
This project incorporates source code from the following sources:

Python
--------------------------------
Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
2011, 2012, 2013, 2014, 2015, 2016, 2017 Python Software Foundation; All Rights
Reserved
SOURCE CODE: pyccc/backports.py
DESCRIPTION: source code backported from Python 3 standard libraries; see source
             files for details.
LICENSE: https://docs.python.org/3/license.html
WEBSITE: https://www.python.org
```

pyccc/backports.py (new file, 217 lines added):

```python
from __future__ import print_function, absolute_import, division
from future.builtins import *
from future import standard_library, builtins
standard_library.install_aliases()

# Copyright 2017 Autodesk Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from future.utils import PY2

from codecs import BOM_UTF8, lookup
import re
import inspect


if PY2:
    # This is only necessary for Python 2, it's taken care of in Python 3

    cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
    blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)

    def detect_encoding(readline):
        """
        The detect_encoding() function is used to detect the encoding that should
        be used to decode a Python source file. It requires one argument, readline,
        in the same way as the tokenize() generator.
        It will call readline a maximum of twice, and return the encoding used
        (as a string) and a list of any lines (left as bytes) it has read in.
        It detects the encoding from the presence of a utf-8 bom or an encoding
        cookie as specified in pep-0263. If both a bom and a cookie are present,
        but disagree, a SyntaxError will be raised. If the encoding cookie is an
        invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found,
        'utf-8-sig' is returned.
        If no encoding is specified, then the default of 'utf-8' will be returned.
        Note:
            Copied without modification from Python 3.6.1 tokenize standard library module
            Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
            2011, 2012, 2013, 2014, 2015, 2016, 2017 Python Software Foundation; All Rights
            Reserved"
            See also py-cloud-compute-cannon/NOTICES.
        """
        try:
            filename = readline.__self__.name
        except AttributeError:
            filename = None
        bom_found = False
        encoding = None
        default = 'utf-8'
        def read_or_stop():
            try:
                return readline()
            except StopIteration:
                return b''

        def find_cookie(line):
            try:
                # Decode as UTF-8. Either the line is an encoding declaration,
                # in which case it should be pure ASCII, or it must be UTF-8
                # per default encoding.
                line_string = line.decode('utf-8')
            except UnicodeDecodeError:
                msg = "invalid or missing encoding declaration"
                if filename is not None:
                    msg = '{} for {!r}'.format(msg, filename)
                raise SyntaxError(msg)

            match = cookie_re.match(line_string)
            if not match:
                return None
            encoding = _get_normal_name(match.group(1))
            try:
                codec = lookup(encoding)
            except LookupError:
                # This behaviour mimics the Python interpreter
                if filename is None:
                    msg = "unknown encoding: " + encoding
                else:
                    msg = "unknown encoding for {!r}: {}".format(filename,
                                                                 encoding)
                raise SyntaxError(msg)

            if bom_found:
                if encoding != 'utf-8':
                    # This behaviour mimics the Python interpreter
                    if filename is None:
                        msg = 'encoding problem: utf-8'
                    else:
                        msg = 'encoding problem for {!r}: utf-8'.format(filename)
                    raise SyntaxError(msg)
                encoding += '-sig'
            return encoding

        first = read_or_stop()
        if first.startswith(BOM_UTF8):
            bom_found = True
            first = first[3:]
            default = 'utf-8-sig'
        if not first:
            return default, []

        encoding = find_cookie(first)
        if encoding:
            return encoding, [first]
        if not blank_re.match(first):
            return default, [first]

        second = read_or_stop()
        if not second:
            return default, [first]

        encoding = find_cookie(second)
        if encoding:
            return encoding, [first, second]

        return default, [first, second]

    def _get_normal_name(orig_enc):
        """Imitates get_normal_name in tokenizer.c.
        Note:
            Copied without modification from Python 3.6.1 tokenize standard library module
            Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
            2011, 2012, 2013, 2014, 2015, 2016, 2017 Python Software Foundation; All Rights
            Reserved"
            See also py-cloud-compute-cannon/NOTICES.
        """
        # Only care about the first 12 characters.
        enc = orig_enc[:12].lower().replace("_", "-")
        if enc == "utf-8" or enc.startswith("utf-8-"):
            return "utf-8"
        if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
           enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
            return "iso-8859-1"
        return orig_enc


def getclosurevars(func):
    """
    Get the mapping of free variables to their current values.
    Returns a named tuple of dicts mapping the current nonlocal, global
    and builtin references as seen by the body of the function. A final
    set of unbound names that could not be resolved is also provided.
    Note:
        Modified function from the Python 3.5 inspect standard library module
        Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
        2011, 2012, 2013, 2014, 2015, 2016, 2017 Python Software Foundation; All Rights
        Reserved"
        See also py-cloud-compute-cannon/NOTICES.
    """
    if inspect.ismethod(func):
        func = func.__func__

    elif not inspect.isroutine(func):
        raise TypeError("'{!r}' is not a Python function".format(func))

    # AMVMOD: deal with python 2 builtins that don't define these
    code = getattr(func, '__code__', None)
    closure = getattr(func, '__closure__', None)
    co_names = getattr(code, 'co_names', ())
    glb = getattr(func, '__globals__', {})

    # Nonlocal references are named in co_freevars and resolved
    # by looking them up in __closure__ by positional index
    if closure is None:
        nonlocal_vars = {}
    else:
        nonlocal_vars = {var: cell.cell_contents
                         for var, cell in zip(code.co_freevars, func.__closure__)}

    # Global and builtin references are named in co_names and resolved
    # by looking them up in __globals__ or __builtins__
    global_ns = glb
    builtin_ns = global_ns.get("__builtins__", builtins.__dict__)
    if inspect.ismodule(builtin_ns):
        builtin_ns = builtin_ns.__dict__
    global_vars = {}
    builtin_vars = {}
    unbound_names = set()
    for name in co_names:
        if name in ("None", "True", "False"):
            # Because these used to be builtins instead of keywords, they
            # may still show up as name references. We ignore them.
            continue
        try:
            global_vars[name] = global_ns[name]
        except KeyError:
            try:
                builtin_vars[name] = builtin_ns[name]
            except KeyError:
                unbound_names.add(name)

    return {'nonlocal': nonlocal_vars,
            'global': global_vars,
            'builtin': builtin_vars,
            'unbound': unbound_names}
```
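
For context, here is roughly how the backported detect_encoding is meant to be called. This is a minimal sketch rather than part of the diff: the sample byte strings and the io.BytesIO wrapper are illustrative, and the fallback import assumes that Python 3's stdlib tokenize.detect_encoding, from which the backport was copied, behaves the same way.

```python
import io

try:
    # Under Python 2, the backport above provides detect_encoding.
    from pyccc.backports import detect_encoding
except ImportError:
    # Under Python 3, the stdlib function it was copied from is equivalent.
    from tokenize import detect_encoding

# An explicit PEP 263 coding cookie on the first line wins; the returned
# name is normalized by _get_normal_name ('latin-1' -> 'iso-8859-1').
src = b"# -*- coding: latin-1 -*-\nprint('hi')\n"
encoding, lines_read = detect_encoding(io.BytesIO(src).readline)
print(encoding)    # iso-8859-1
print(lines_read)  # [b'# -*- coding: latin-1 -*-\n']

# A UTF-8 byte-order mark with no cookie yields 'utf-8-sig'.
encoding, _ = detect_encoding(io.BytesIO(b'\xef\xbb\xbfx = 1\n').readline)
print(encoding)    # utf-8-sig
```

If a UTF-8 BOM and a conflicting cookie are both present, the function raises SyntaxError, as described in its docstring.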
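
The modified getclosurevars can be sketched the same way (again illustrative and assuming pyccc is importable; make_adder, GLOBAL_SCALE and missing_helper are made-up names for the example). Unlike inspect.getclosurevars, which returns a named tuple and raises TypeError for C builtins, this version degrades gracefully via the getattr defaults and returns a plain dict keyed by 'nonlocal', 'global', 'builtin' and 'unbound'.

```python
from pyccc.backports import getclosurevars  # module added in this diff

GLOBAL_SCALE = 10

def make_adder(offset):
    def adder(x):
        # one nonlocal, one global, two builtins, one unresolvable name
        return len(str(x)) + offset + GLOBAL_SCALE + missing_helper
    return adder

seen = getclosurevars(make_adder(3))
print(seen['nonlocal'])         # {'offset': 3}
print(seen['global'])           # {'GLOBAL_SCALE': 10}
print(sorted(seen['builtin']))  # ['len', 'str']
print(seen['unbound'])          # set containing 'missing_helper'

# C builtins such as len have no __code__/__closure__/__globals__;
# the getattr defaults turn them into empty results instead of an error.
print(getclosurevars(len))
# {'nonlocal': {}, 'global': {}, 'builtin': {}, 'unbound': set()}
```

Returning a plain dict rather than inspect's named tuple presumably keeps the result shape uniform across Python 2 and Python 3 callers.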