[PATCH 2 of 3] encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara
yuya at tcha.org
Sat Jan 16 06:33:54 CST 2016
# HG changeset patch
# User Yuya Nishihara <yuya at tcha.org>
# Date 1451212114 -32400
# Sun Dec 27 19:28:34 2015 +0900
# Node ID b3b1bef76d54a4755a1b221a36b00253eefefd9a
# Parent a3c68ee9fac119b70c0b515081f161fb708deca7
encoding: add option to escape non-ascii characters in JSON
This is necessary for hgweb to embed JSON data in HTML. It must be possible
to embed JSON data in a non-UTF-8 HTML page as long as the page encoding is
compatible with ASCII.
According to RFC 7159, a non-BMP character is represented as a UTF-16
surrogate pair. This function first splits an input string into an array of
UTF-16 code units.
https://tools.ietf.org/html/rfc7159.html#section-7
diff --git a/mercurial/encoding.py b/mercurial/encoding.py
--- a/mercurial/encoding.py
+++ b/mercurial/encoding.py
@@ -7,6 +7,7 @@
from __future__ import absolute_import
+import array
import locale
import os
import unicodedata
@@ -379,8 +380,9 @@ class normcasespecs(object):
other = 0
_jsonmap = {}
+_paranoidjsonmap = {}
-def jsonescape(s):
+def jsonescape(s, paranoid=False):
'''returns a string suitable for JSON
JSON is problematic for us because it doesn't support non-Unicode
@@ -405,12 +407,24 @@ def jsonescape(s):
'utf-8: caf\\xc3\\xa9'
>>> jsonescape('')
''
+
+ If paranoid, non-ascii characters are also escaped. This is suitable for
+ web output.
+
+ >>> jsonescape('escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
+ 'escape boundary: ~ \\\\u007f \\\\u0080'
+ >>> jsonescape('a weird byte: \\xdd', paranoid=True)
+ 'a weird byte: \\\\udcdd'
+ >>> jsonescape('utf-8: caf\\xc3\\xa9', paranoid=True)
+ 'utf-8: caf\\\\u00e9'
+ >>> jsonescape('non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
+ 'non-BMP: \\\\ud834\\\\udd1e'
'''
if not _jsonmap:
for x in xrange(32):
_jsonmap[chr(x)] = "\\u%04x" % x
- for x in xrange(32, 256):
+ for x in xrange(32, 127):
c = chr(x)
_jsonmap[c] = c
_jsonmap['\x7f'] = '\\u007f'
@@ -421,8 +435,27 @@ def jsonescape(s):
_jsonmap['\b'] = '\\b'
_jsonmap['\f'] = '\\f'
_jsonmap['\r'] = '\\r'
+ _paranoidjsonmap.update(_jsonmap)
+ for x in xrange(128, 256):
+ c = chr(x)
+ _jsonmap[c] = c
- return ''.join(_jsonmap[c] for c in toutf8b(s))
+ if paranoid:
+ jm = _paranoidjsonmap
+ else:
+ jm = _jsonmap
+
+ u8chars = toutf8b(s)
+ try:
+ return ''.join(jm[c] for c in u8chars) # fast path
+ except KeyError:
+ pass
+ # non-BMP char is represented as UTF-16 surrogate pair
+ u16codes = array.array('H')
+ u16codes.fromstring(u8chars.decode('utf-8').encode('utf-16'))
+ u16codes.pop(0) # drop BOM
+ return ''.join(jm[chr(x)] if x < 128 else '\\u%04x' % x
+ for x in u16codes)
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
More information about the Mercurial-devel
mailing list