[PATCH 1 of 2] encoding: add BOM (byte-order-mark) handling function (issue2162)

Yuya Nishihara yuya at tcha.org
Sat Jun 19 08:56:34 CDT 2010


# HG changeset patch
# User Yuya Nishihara <yuya at tcha.org>
# Date 1276951076 -32400
# Node ID 19dc05994d6cc2727efbde32c53bcbb1dfbdc83b
# Parent  e5a2134c083b223bd2998b6694e430e4999caee3
encoding: add BOM (byte-order-mark) handling function (issue2162)

diff --git a/mercurial/encoding.py b/mercurial/encoding.py
--- a/mercurial/encoding.py
+++ b/mercurial/encoding.py
@@ -6,7 +6,7 @@
 # GNU General Public License version 2 or any later version.
 
 import error
-import sys, unicodedata, locale, os
+import sys, unicodedata, locale, os, codecs
 
 _encodingfixup = {'646': 'ascii', 'ANSI_X3.4-1968': 'ascii'}
 
@@ -75,3 +75,39 @@ def colwidth(s):
         return sum([w(c) in 'WFA' and 2 or 1 for c in d])
     return len(d)
 
+_bommap = [
+    ('utf_8', codecs.BOM_UTF8),
+    ('utf_32_le', codecs.BOM_UTF32_LE), ('utf_32_be', codecs.BOM_UTF32_BE),
+    # UTF-16 must come after UTF-32: BOM_UTF32_LE begins with the bytes
+    # of BOM_UTF16_LE, so the longer mark has to be tried first.
+    ('utf_16_le', codecs.BOM_UTF16_LE), ('utf_16_be', codecs.BOM_UTF16_BE)]
+
+def detectbom(s):
+    """Split a leading BOM off s; return (rest, codec name or None)"""
+    # Quick exit: s is empty or starts with a non-NUL 7-bit byte.
+    if not s or 0x00 < ord(s[0]) < 0x80:
+        return s, None
+
+    for codecname, mark in _bommap:
+        if s[:len(mark)] == mark:
+            return s[len(mark):], codecname
+    return s, None
+
+def bomtolocal(s):
+    """Convert a BOM-prefixed string to the local encoding
+
+    If s starts with a UTF-8/16/32 BOM (byte-order-mark), the rest is
+    decoded with the matching codec and re-encoded to the local encoding.
+    Without a BOM, or if decoding fails, s is returned unchanged.
+    Raises error.Abort if a codec lookup fails (LookupError).
+    """
+    stripped, enc = detectbom(s)
+    if not enc:
+        return s
+
+    try:
+        return stripped.decode(enc).encode(encoding, 'replace')
+    except LookupError, k:
+        raise error.Abort("%s, please check your locale settings" % k)
+    except UnicodeDecodeError:
+        return s  # only looked like a BOM; give back the input untouched


More information about the Mercurial-devel mailing list