[PATCH 1 of 2] encoding: make utf8b encoder more robust (issue4927)
Matt Mackall
mpm at selenic.com
Mon Nov 2 23:27:19 UTC 2015
# HG changeset patch
# User Matt Mackall <mpm at selenic.com>
# Date 1446506176 21600
# Mon Nov 02 17:16:16 2015 -0600
# Node ID 6bee6f327de32755da038193f453aa6bed6810c7
# Parent 859f453e8b4e2b42b6b6552b79c5c5e7e2fc1cf7
encoding: make utf8b encoder more robust (issue4927)
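The old encoder decoded with 'ignore' and could lose sync with the source
if it saw a dropped character. The new code decodes with 'replace' and
explicitly looks for a replacement character sequence (U+FFFD) that appears
in the decoded copy but not in the source. This requires rewriting the loop
to allow lookahead on the source so that we can see whether the replacement
sequence is present on both sides.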
diff -r 859f453e8b4e -r 6bee6f327de3 mercurial/encoding.py
--- a/mercurial/encoding.py Mon Nov 02 12:12:24 2015 -0800
+++ b/mercurial/encoding.py Mon Nov 02 17:16:16 2015 -0600
@@ -452,15 +452,24 @@
return s
except UnicodeDecodeError:
# surrogate-encode any characters that don't round-trip
- s2 = s.decode('utf-8', 'ignore').encode('utf-8')
+ s2 = s.decode('utf-8', 'replace').encode('utf-8')
r = ""
- pos = 0
- for c in s:
- if s2[pos:pos + 1] == c:
- r += c
- pos += 1
+ pos1 = 0
+ pos2 = 0
+ l = len(s)
+ while pos1 < l:
+ if (s2[pos2] == "\xef" and
+ s2[pos2:pos2 + 3] == "\xef\xbf\xbd" and
+ s[pos1:pos1 + 3] != "\xef\xbf\xbd"):
+ # character got replaced by U+fffd, add surrogate
+ r += unichr(0xdc00 + ord(s[pos1])).encode('utf-8')
+ # skip over replacement character
+ pos1 += 1
+ pos2 += 3
else:
- r += unichr(0xdc00 + ord(c)).encode('utf-8')
+ r += s[pos1]
+ pos1 += 1
+ pos2 += 1
return r
def fromutf8b(s):
More information about the Mercurial-devel
mailing list