[PATCH] encoding: handle UTF-16 internal limit with fromutf8b (issue5033)
Matt Mackall
mpm at selenic.com
Thu Jan 7 21:01:56 UTC 2016
# HG changeset patch
# User Matt Mackall <mpm at selenic.com>
# Date 1452200277 21600
# Thu Jan 07 14:57:57 2016 -0600
# Node ID 7aa1dbfbd7a0966ae0e241b16b72fde6df2cb94a
# Parent b8405d739149cdd6d8d9bd5e3dd2ad8487b1f09a
encoding: handle UTF-16 internal limit with fromutf8b (issue5033)
Default builds of Python have a Unicode type that isn't actually full
Unicode but UTF-16, so iterating over a string may yield surrogate
halves rather than whole characters. Since our UTF-8b escaping hack
uses a code point range (U+DCxx) that overlaps with the UTF-16
surrogate range, this gets extra complicated. This changes the code
to work on a list of integer code points rather than "characters",
and adds a path to unpack full Unicode code points in the UTF-16 case.
diff -r b8405d739149 -r 7aa1dbfbd7a0 mercurial/encoding.py
--- a/mercurial/encoding.py Sat Jan 02 02:13:56 2016 +0100
+++ b/mercurial/encoding.py Thu Jan 07 14:57:57 2016 -0600
@@ -9,6 +9,8 @@
import locale
import os
+import struct
+import sys
import unicodedata
from . import (
@@ -516,6 +518,8 @@
True
>>> roundtrip("\\xef\\xef\\xbf\\xbd")
True
+ >>> roundtrip("\\xf1\\x80\\x80\\x80\\x80")
+ True
'''
# fast path - look for uDxxx prefixes in s
@@ -523,10 +527,23 @@
return s
u = s.decode("utf-8")
+ if sys.maxunicode > 65535:
+ # Our Python build is sane and stores UTF-32 internally, will
+ # return full Unicode characters when iterating
+ cpl = [ord(c) for c in u]
+ else:
+ # Our Python stores UTF-16 internally (default build) and will
+ # return surrogate pairs for characters > U+FFFF, thus
+ # defeating the point of having a Unicode string type.
+ # We need to unpack as UCS-4.
+ a = u.encode("utf-32-be")
+ cpl = struct.unpack('>%dL' % (len(a) / 4), a)
+
r = ""
- for c in u:
- if ord(c) & 0xffff00 == 0xdc00:
- r += chr(ord(c) & 0xff)
+
+ for cp in cpl:
+ if cp & 0xffff00 == 0xdc00:
+ r += chr(cp & 0xff)
else:
- r += c.encode("utf-8")
+ r += unichr(cp).encode("utf-8")
return r
More information about the Mercurial-devel
mailing list