[PATCH] encoding: handle UTF-16 internal limit with fromutf8b (issue5033)

Matt Mackall mpm at selenic.com
Thu Jan 7 21:01:56 UTC 2016


# HG changeset patch
# User Matt Mackall <mpm at selenic.com>
# Date 1452200277 21600
#      Thu Jan 07 14:57:57 2016 -0600
# Node ID 7aa1dbfbd7a0966ae0e241b16b72fde6df2cb94a
# Parent  b8405d739149cdd6d8d9bd5e3dd2ad8487b1f09a
encoding: handle UTF-16 internal limit with fromutf8b (issue5033)

Default builds of Python store Unicode strings internally as UTF-16
rather than as full Unicode code points, so iterating over a string
may yield surrogate halves instead of complete characters. Since our
UTF-8b escaping hack uses the U+DCxx range, which overlaps with the
range UTF-16 reserves for surrogates, this gets extra complicated.
This changes the code to work on a list of integer code points rather
than "characters", and adds a path to unpack full Unicode code points
in the UTF-16 case.

diff -r b8405d739149 -r 7aa1dbfbd7a0 mercurial/encoding.py
--- a/mercurial/encoding.py	Sat Jan 02 02:13:56 2016 +0100
+++ b/mercurial/encoding.py	Thu Jan 07 14:57:57 2016 -0600
@@ -9,6 +9,8 @@
 
 import locale
 import os
+import struct
+import sys
 import unicodedata
 
 from . import (
@@ -516,6 +518,8 @@
     True
     >>> roundtrip("\\xef\\xef\\xbf\\xbd")
     True
+    >>> roundtrip("\\xf1\\x80\\x80\\x80\\x80")
+    True
     '''
 
     # fast path - look for uDxxx prefixes in s
@@ -523,10 +527,23 @@
         return s
 
     u = s.decode("utf-8")
+    if sys.maxunicode > 65535:
+        # Our Python build is sane and stores UTF-32 internally, so
+        # iterating over the string returns full Unicode characters
+        cpl = [ord(c) for c in u]
+    else:
+        # Our Python stores UTF-16 internally (default build) and will
+        # return surrogate pairs for characters > U+FFFF, thus
+        # defeating the point of having a Unicode string type.
+        # We need to unpack as UCS-4.
+        a = u.encode("utf-32-be")
+        cpl = struct.unpack('>%dL' % (len(a) / 4), a)
+
     r = ""
-    for c in u:
-        if ord(c) & 0xffff00 == 0xdc00:
-            r += chr(ord(c) & 0xff)
+
+    for cp in cpl:
+        if cp & 0xffff00 == 0xdc00:
+            r += chr(cp & 0xff)
         else:
-            r += c.encode("utf-8")
+            r += unichr(cp).encode("utf-8")
     return r


More information about the Mercurial-devel mailing list