[PATCH 3 of 3 py3] py3: use 'surrogatepass' error handler to process U+DCxx transparently

Yuya Nishihara yuya at tcha.org
Sat Sep 16 10:32:52 EDT 2017


# HG changeset patch
# User Yuya Nishihara <yuya at tcha.org>
# Date 1505570148 -32400
#      Sat Sep 16 22:55:48 2017 +0900
# Node ID 5b12a6c1ee08f4a09d73947eee88505f46ee0856
# Parent  320bcaa9820502e56d8b54a5b470ecc8e1fda463
py3: use 'surrogatepass' error handler to process U+DCxx transparently

Encoding or decoding surrogate code points such as U+DCxx is disallowed by
the default 'strict' error handler on Python 3.

https://docs.python.org/3/library/codecs.html#error-handlers

diff --git a/mercurial/encoding.py b/mercurial/encoding.py
--- a/mercurial/encoding.py
+++ b/mercurial/encoding.py
@@ -448,6 +448,13 @@ def jsonescape(s, paranoid=False):
         pass
     return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
 
+# We need to decode/encode U+DCxx codes transparently since invalid UTF-8
+# bytes are mapped to that range.
+if pycompat.ispy3:
+    _utf8strict = r'surrogatepass'
+else:
+    _utf8strict = r'strict'
+
 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
 
 def getutf8char(s, pos):
@@ -464,7 +471,7 @@ def getutf8char(s, pos):
 
     c = s[pos:pos + l]
     # validate with attempted decode
-    c.decode("utf-8")
+    c.decode("utf-8", _utf8strict)
     return c
 
 def toutf8b(s):
@@ -503,7 +510,7 @@ def toutf8b(s):
         if isinstance(s, localstr):
             return s._utf8
         try:
-            s.decode('utf-8')
+            s.decode('utf-8', _utf8strict)
             return s
         except UnicodeDecodeError:
             pass
@@ -517,12 +524,12 @@ def toutf8b(s):
             c = getutf8char(s, pos)
             if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
                 # have to re-escape existing U+DCxx characters
-                c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
+                c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict)
                 pos += 1
             else:
                 pos += len(c)
         except UnicodeDecodeError:
-            c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
+            c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict)
             pos += 1
         r += c
     return r
@@ -570,7 +577,7 @@ def fromutf8b(s):
         pos += len(c)
         # unescape U+DCxx characters
         if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
-            c = chr(ord(c.decode("utf-8")) & 0xff)
+            c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xff)
         r += c
     return r
 
diff --git a/mercurial/pure/charencode.py b/mercurial/pure/charencode.py
--- a/mercurial/pure/charencode.py
+++ b/mercurial/pure/charencode.py
@@ -64,6 +64,11 @@ def jsonescapeu8fast(u8chars, paranoid):
     except IndexError:
         raise ValueError
 
+if pycompat.ispy3:
+    _utf8strict = r'surrogatepass'
+else:
+    _utf8strict = r'strict'
+
 def jsonescapeu8fallback(u8chars, paranoid):
     """Convert a UTF-8 byte string to JSON-escaped form (slow path)
 
@@ -74,6 +79,7 @@ def jsonescapeu8fallback(u8chars, parano
     else:
         jm = _jsonmap
     # non-BMP char is represented as UTF-16 surrogate pair
-    u16codes = array.array(r'H', u8chars.decode('utf-8').encode('utf-16'))
+    u16b = u8chars.decode('utf-8', _utf8strict).encode('utf-16', _utf8strict)
+    u16codes = array.array(r'H', u16b)
     u16codes.pop(0)  # drop BOM
     return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes)
diff --git a/tests/test-doctest.py b/tests/test-doctest.py
--- a/tests/test-doctest.py
+++ b/tests/test-doctest.py
@@ -50,7 +50,7 @@ testmod('mercurial.config')
 testmod('mercurial.context')
 testmod('mercurial.dagparser', optionflags=doctest.NORMALIZE_WHITESPACE)
 testmod('mercurial.dispatch')
-testmod('mercurial.encoding', py3=False)  # py3: multiple encoding issues
+testmod('mercurial.encoding')
 testmod('mercurial.formatter', py3=False)  # py3: write bytes to stdout
 testmod('mercurial.hg')
 testmod('mercurial.hgweb.hgwebdir_mod', py3=False)  # py3: repr(bytes) ?


More information about the Mercurial-devel mailing list