[PATCH 1 of 8] use UTF-8 to encode/decode log text

Andrey grooz-work at gorodok.net
Mon Nov 20 11:43:04 CST 2006


On 20 November 2006 (Mon) 22:55, Matt Mackall wrote:
> This is still going to throw exceptions on existing repos, right?
> That's absolutely not acceptable.
>
> Again, this should go in util and be robust. And don't bother with
> making a CHANGELOG_ENCODING variable, please.

What about this?

Please note that the safe_decode function should ideally use
ui.encodings['default'] instead of locale.getpreferredencoding(), but passing a
ui object to this function every time is clearly unacceptable. That is why I
suggested moving all config-related stuff from ui into a separate module. :)

# HG changeset patch
# User Andrey <grooz-work at gorodok.net>
# Date 1164043562 -21600
# Node ID 360befe49f4979f6ec8b2988c2884feac3eea2ed
# Parent  1dba5b1038d2c5d9bf494dcf64508c2c5047ef78
added safe_decode function

diff -r 1dba5b1038d2 -r 360befe49f49 mercurial/util.py
--- a/mercurial/util.py Tue Nov 14 10:29:30 2006 +0600
+++ b/mercurial/util.py Mon Nov 20 23:26:02 2006 +0600
@@ -15,7 +15,7 @@ from i18n import gettext as _
 from i18n import gettext as _
 from demandload import *
 demandload(globals(), "cStringIO errno getpass popen2 re shutil sys tempfile")
-demandload(globals(), "os threading time calendar ConfigParser")
+demandload(globals(), "os threading time calendar ConfigParser locale")

 # used by parsedate
 defaultdateformats = ('%Y-%m-%d %H:%M:%S', '%Y-%m-%d %H:%M',
@@ -1083,3 +1083,13 @@ def drop_scheme(scheme, path):
         if path.startswith('//'):
             path = path[2:]
     return path
+
+def safe_decode(text, encoding):
+    try:
+        return unicode(text, encoding)
+    except UnicodeDecodeError:
+        try:
+            return unicode(text, locale.getpreferredencoding())
+        except UnicodeDecodeError:
+            return unicode(text, 'ISO-8859-1') # can't fail
+
# HG changeset patch
# User Andrey <grooz-work at gorodok.net>
# Date 1164043746 -21600
# Node ID 49ae4f2cfc6fc5c4f54040bc321116dfeff27410
# Parent  360befe49f4979f6ec8b2988c2884feac3eea2ed
use UTF-8 to encode/decode log text

diff -r 360befe49f49 -r 49ae4f2cfc6f mercurial/changelog.py
--- a/mercurial/changelog.py    Mon Nov 20 23:26:02 2006 +0600
+++ b/mercurial/changelog.py    Mon Nov 20 23:29:06 2006 +0600
@@ -60,6 +60,7 @@ class changelog(revlog):
         """
         if not text:
             return (nullid, "", (0, 0), [], "", {})
+        text = util.safe_decode(text, 'UTF-8')
         last = text.index("\n\n")
         desc = text[last + 2:]
         l = text[:last].split('\n')
@@ -98,4 +99,4 @@ class changelog(revlog):
         list.sort()
         l = [hex(manifest), user, parseddate] + list + ["", desc]
         text = "\n".join(l)
-        return self.addrevision(text, transaction, self.count(), p1, p2)
+        return self.addrevision(text.encode('UTF-8'), transaction, self.count(), p1, p2)
-------------- next part --------------
A non-text attachment was scrubbed...
Name: changelog_utf8.diff
Type: text/x-diff
Size: 1011 bytes
Desc: not available
Url : http://www.selenic.com/pipermail/mercurial-devel/attachments/20061120/d38f42da/changelog_utf8.bin
-------------- next part --------------
A non-text attachment was scrubbed...
Name: safe_decode.diff
Type: text/x-diff
Size: 1220 bytes
Desc: not available
Url : http://www.selenic.com/pipermail/mercurial-devel/attachments/20061120/d38f42da/safe_decode.bin


More information about the Mercurial-devel mailing list