[PATCH 1 of 5] encoding: add getutf8char helper
Matt Mackall
mpm at selenic.com
Fri Nov 6 21:48:46 UTC 2015
# HG changeset patch
# User Matt Mackall <mpm at selenic.com>
# Date 1446763726 21600
# Thu Nov 05 16:48:46 2015 -0600
# Node ID 9939f038b1679e2729efe87f8e67af7ce2885a73
# Parent f9984f76fd90e439221425d751e29bae17bec995
encoding: add getutf8char helper
This allows us to find character boundaries in byte strings when
trying to do custom encodings.
diff -r f9984f76fd90 -r 9939f038b167 mercurial/encoding.py
--- a/mercurial/encoding.py Wed Nov 04 15:17:52 2015 -0600
+++ b/mercurial/encoding.py Thu Nov 05 16:48:46 2015 -0600
@@ -414,6 +414,25 @@
return ''.join(_jsonmap[c] for c in toutf8b(s))
# Number of bytes to try decoding, indexed by the high nibble of the lead byte:
#   0x0-0x7  -> 0: plain ASCII, returned as-is without decoding
#   0x8-0xb  -> 1: continuation byte (10xxxxxx); decoding that single byte
#                  fails, which is how "not a character start" gets reported
#   0xc-0xd  -> 2, 0xe -> 3, 0xf -> 4: lead byte of a multi-byte sequence
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]


def getutf8char(s, pos):
    '''get the next full utf-8 character in the given string, starting at pos

    s is a byte string and pos an index into it. The return value is the
    slice of s holding the one character that starts at pos, so it has the
    same type as s.

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character (this includes a sequence cut short by the end of s).
    Raises an IndexError if pos is outside s.
    '''

    # Plain indexing keeps the IndexError for an out-of-range pos.
    first = s[pos]
    # Indexing yields a 1-char string for a Python 2 str but an int for
    # Python 3 bytes and for bytearray; accept both.
    if not isinstance(first, int):
        first = ord(first)

    # pos is known to be valid here; make it non-negative so that the
    # slices below cannot wrap around (e.g. s[-2:0] would be empty).
    if pos < 0:
        pos += len(s)

    # find how many bytes to attempt decoding from first nibble
    l = _utf8len[first >> 4]
    if not l:  # ascii
        return s[pos:pos + 1]

    c = s[pos:pos + l]
    # validate with attempted decode; the decoded value itself is not needed
    c.decode("utf-8")
    return c
+
def toutf8b(s):
'''convert a local, possibly-binary string into UTF-8b
More information about the Mercurial-devel
mailing list