[PATCH 1 of 4] convert/bzr: expect unicode metadata, encode in UTF-8 (issue3232)

Thu Feb 2 03:17:13 CST 2012

# HG changeset patch
# User Patrick Mezard <pmezard at gmail.com>
# Date 1328174104 -3600
# Branch stable
# Node ID d4716c94801ad7d0495c8395bbc94bf6d5c86716
# Parent  0620421044a2bcaafd054a6ee454614888699de8
convert/bzr: expect unicode metadata, encode in UTF-8 (issue3232)

Before this patch, metadata and file names were interpreted like:
- unicode objects were converted to UTF-8
- non unicode objects were left unchanged

Looking at the code and bzr being known for transcoding filenames, we expect
everything to be returned as unicode objects, and we want to encode them in
UTF-8, like the subversion source does. To do that, we just remove the custom
implementation of .recode().

diff --git a/hgext/convert/bzr.py b/hgext/convert/bzr.py
--- a/hgext/convert/bzr.py
+++ b/hgext/convert/bzr.py
@@ -143,7 +143,6 @@
         return commit(parents=parents,
                 date='%d %d' % (rev.timestamp, -rev.timezone),
                 author=self.recode(rev.committer),
-                # bzr returns bytestrings or unicode, depending on the content
                 desc=self.recode(rev.message),
                 rev=version)
 
@@ -231,7 +230,11 @@
                 continue
 
             # we got unicode paths, need to convert them
-            path, topath = [self.recode(part) for part in paths]
+            path, topath = paths
+            if path is not None:
+                path = self.recode(path)
+            if topath is not None:
+                topath = self.recode(topath)
             seen.add(path or topath)
 
             if topath is None:
@@ -260,19 +263,3 @@
         parentmap = self.sourcerepo.get_parent_map(ids)
         parents = tuple([parent for parent in ids if parent in parentmap])
         return parents
-
-    def recode(self, s, encoding=None):
-        """This version of recode tries to encode unicode to bytecode,
-        and preferably using the UTF-8 codec.
-        Other types than Unicode are silently returned, this is by
-        intention, e.g. the None-type is not going to be encoded but instead
-        just passed through
-        """
-        if not encoding:
-            encoding = self.encoding or 'utf-8'
-
-        if isinstance(s, unicode):
-            return s.encode(encoding)
-        else:
-            # leave it alone
-            return s