[PATCH 1 of 2] encoding: make utf8b encoder more robust (issue4927)

Mon Nov 2 23:27:19 UTC 2015

# HG changeset patch
# User Matt Mackall <mpm at selenic.com>
# Date 1446506176 21600
#      Mon Nov 02 17:16:16 2015 -0600
# Node ID 6bee6f327de32755da038193f453aa6bed6810c7
# Parent  859f453e8b4e2b42b6b6552b79c5c5e7e2fc1cf7
encoding: make utf8b encoder more robust (issue4927)

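The old encoder could lose sync between the input string and its decoded
copy whenever the decoder dropped an invalid character. The new code
decodes with 'replace' instead of 'ignore' and explicitly looks for a
replacement character sequence (U+FFFD, "\xef\xbf\xbd" in UTF-8) that
appears in the decoded copy but not at the same position in the source;
such a byte is emitted as a surrogate escape. This requires rewriting the
loop to allow lookahead on the source, so that we can tell whether the
replacement sequence is present on both sides (i.e. it was a literal
U+FFFD in the input rather than a substitution made by the decoder).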

diff -r 859f453e8b4e -r 6bee6f327de3 mercurial/encoding.py

--- a/mercurial/encoding.py	Mon Nov 02 12:12:24 2015 -0800
+++ b/mercurial/encoding.py	Mon Nov 02 17:16:16 2015 -0600
@@ -452,15 +452,24 @@
         return s
     except UnicodeDecodeError:
         # surrogate-encode any characters that don't round-trip
-        s2 = s.decode('utf-8', 'ignore').encode('utf-8')
+        s2 = s.decode('utf-8', 'replace').encode('utf-8')
         r = ""
-        pos = 0
-        for c in s:
-            if s2[pos:pos + 1] == c:
-                r += c
-                pos += 1
+        pos1 = 0
+        pos2 = 0
+        l = len(s)
+        while pos1 < l:
+            if (s2[pos2] == "\xef" and
+                s2[pos2:pos2 + 3] == "\xef\xbf\xbd" and
+                s[pos1:pos1 + 3] != "\xef\xbf\xbd"):
+                # character got replaced by U+fffd, add surrogate
+                r += unichr(0xdc00 + ord(s[pos1])).encode('utf-8')
+                # skip over replacement character
+                pos1 += 1
+                pos2 += 3
             else:
-                r += unichr(0xdc00 + ord(c)).encode('utf-8')
+                r += s[pos1]
+                pos1 += 1
+                pos2 += 1
         return r
 
 def fromutf8b(s):