[PATCH] highlight: fixes garbled text in non-UTF-8 environment

Yuya Nishihara youjah at gmail.com
Wed Sep 2 05:56:13 CDT 2009


# HG changeset patch
# User Yuya Nishihara <yuya at tcha.org>
# Date 1251527055 -32400
# Node ID 54e7217e12558be85f8ae410f1a168b58b966bae
# Parent  37042e8b3b342b2e380d8be3e3f7692584c92d33
highlight: fixes garbled text in non-UTF-8 environment

This patch treats all files in the repository as being encoded in the
locale's encoding when they are highlighted with Pygments.

We can assume that most files are written in the locale's encoding,
but the current implementation treats them as UTF-8.
There is currently no way to specify the encoding of files.

Current implementation, db7557359636 (issue1341):
1. Convert the original `text`, which is treated as UTF-8, to the locale's
   encoding. `encoding.tolocal()` is the function that converts from the
   internal UTF-8 to the local encoding.
   If the original `text` is not UTF-8, e.g. Japanese EUC-JP, some characters
   become garbled here.
2. Highlight with Pygments, with no UnicodeDecodeError.

This patch:
1. Convert the original `text`, which is treated as being in the locale's
   encoding, to unicode.
   Pygments prefers unicode objects to raw str. [1]_
   If the original `text` is not encoded in the locale's encoding, some
   characters become garbled here.
2. Highlight with Pygments, also with no UnicodeDecodeError :)
3. Convert the unicode result back to raw str, encoded in the locale's
   encoding.

.. [1] http://pygments.org/docs/unicode/

diff --git a/hgext/highlight/highlight.py b/hgext/highlight/highlight.py
--- a/hgext/highlight/highlight.py
+++ b/hgext/highlight/highlight.py
@@ -33,25 +33,30 @@
         return
 
     # avoid UnicodeDecodeError in pygments
-    text = encoding.tolocal(text)
+    # they say "the best way is to pass Pygments unicode objects."
+    # <http://pygments.org/docs/unicode/>
+    text = unicode(text, encoding.encoding, 'replace')
+                # assumes text's encoding is same as locale's.
 
     # To get multi-line strings right, we can't format line-by-line
     try:
-        lexer = guess_lexer_for_filename(fctx.path(), text[:1024],
-                                         encoding=encoding.encoding)
+        lexer = guess_lexer_for_filename(fctx.path(), text[:1024])
     except (ClassNotFound, ValueError):
         try:
-            lexer = guess_lexer(text[:1024], encoding=encoding.encoding)
+            lexer = guess_lexer(text[:1024])
         except (ClassNotFound, ValueError):
-            lexer = TextLexer(encoding=encoding.encoding)
+            lexer = TextLexer()
 
-    formatter = HtmlFormatter(style=style, encoding=encoding.encoding)
+    formatter = HtmlFormatter(style=style)
 
     colorized = highlight(text, lexer, formatter)
     # strip wrapping div
     colorized = colorized[:colorized.find('\n</pre>')]
     colorized = colorized[colorized.find('<pre>')+5:]
-    coloriter = iter(colorized.splitlines())
+
+    coloriter = iter([s.encode(encoding.encoding, 'replace')
+                                # convert back to raw str
+                                for s in colorized.splitlines()])
 
     tmpl.filters['colorize'] = lambda x: coloriter.next()
 
diff --git a/tests/test-highlight-nonutf8 b/tests/test-highlight-nonutf8
new file mode 100755
--- /dev/null
+++ b/tests/test-highlight-nonutf8
@@ -0,0 +1,38 @@
+#!/bin/sh
+# Test for garble-less highlight of non-UTF-8 file contents.
+
+"$TESTDIR/hghave" pygments || exit 80
+
+cat <<EOF >> $HGRCPATH
+[extensions]
+hgext.highlight =
+[web]
+pygments_style = friendly
+EOF
+
+hg init test
+cd test
+
+printf '\265\376\n' >> eucjp.txt  # Japanese kanji "Kyo"
+
+hg ci -Ama
+
+hg_serve_get () {
+    echo % HGENCODING="$1" hg serve
+    HGENCODING="$1" hg serve -p $HGPORT -d -n test --pid-file=hg.pid -A access.log -E errors.log
+    cat hg.pid >> $DAEMON_PIDS
+
+    echo % hgweb filerevision, html
+    ("$TESTDIR/get-with-headers.py" localhost:$HGPORT "/file/tip/$2") \
+        | awk '/<div class="parity0 source">/ { print; }' \
+        | python -c 'import sys; print "".join([ord(c) < 0x80 and c or "\\x%02x" % ord(c) for c in sys.stdin.read()])'
+            # convert characters >= 0x80 to \xXX
+
+    echo % errors encountered
+    cat errors.log
+    "$TESTDIR/killdaemons.py"
+}
+
+hg_serve_get euc-jp eucjp.txt
+hg_serve_get utf-8 eucjp.txt
+hg_serve_get us-ascii eucjp.txt
diff --git a/tests/test-highlight-nonutf8.out b/tests/test-highlight-nonutf8.out
new file mode 100644
--- /dev/null
+++ b/tests/test-highlight-nonutf8.out
@@ -0,0 +1,16 @@
+adding eucjp.txt
+% HGENCODING=euc-jp hg serve
+% hgweb filerevision, html
+<div class="parity0 source"><a href="#l1" id="l1">     1</a> \xb5\xfe</div>
+
+% errors encountered
+% HGENCODING=utf-8 hg serve
+% hgweb filerevision, html
+<div class="parity0 source"><a href="#l1" id="l1">     1</a> \xef\xbf\xbd\xef\xbf\xbd</div>
+
+% errors encountered
+% HGENCODING=us-ascii hg serve
+% hgweb filerevision, html
+<div class="parity0 source"><a href="#l1" id="l1">     1</a> ??</div>
+
+% errors encountered


More information about the Mercurial-devel mailing list