[PATCH 2 of 2 STABLE] i18n: fix case folding problem in some problematic encodings

FUJIWARA Katsunori foozy at lares.dti.ne.jp
Thu Nov 24 00:19:20 CST 2011


# HG changeset patch
# User FUJIWARA Katsunori <foozy at lares.dti.ne.jp>
# Date 1322114737 -32400
# Branch stable
# Node ID ccae15e928bfd34266aca2ecf257207597af832a
# Parent  8b3cfad9c07307cabb8a49861563bc57ddf37fbd
i18n: fix case folding problem in some problematic encodings

changeset 28e98a8b173d for case folding problem with ambiguous
encoding was not enough.

this patch covers up a fault of fix in it.

this patch switches:

  - from "str.lower()" to "encoding.lower()"

    "str.lower()" on byte sequence may break string in some character
    encoding (e.g.: cp932 for Japanese)

    this patch also add comments to lines where "str.lower()" on
    filenames are kept because it is enough, for future maintenance.

  - from "os.path.normcase()" to "util.normcase()"

    in changeset 6eff984d8e76 and b2fd4746414a, some of
    "os.path.normcase()" invocations are replaced by
    "util.normcase()", but not all.

    for consistency, this patch replace all "os.path.normcase()"
    invocations by "util.normcase()" other than ones in "windows.py".

patch hunk to catch LookupError in "encoding.lower()" is for passing
test-encoding.t, because switching to "encoding.lower()" causes
configuration error detection here.

diff -r 8b3cfad9c073 -r ccae15e928bf hgext/win32mbcs.py
--- a/hgext/win32mbcs.py	Thu Nov 24 14:59:26 2011 +0900
+++ b/hgext/win32mbcs.py	Thu Nov 24 15:05:37 2011 +0900
@@ -130,7 +130,8 @@
  os.path.splitunc os.path.normpath os.path.normcase os.makedirs
  mercurial.util.endswithsep mercurial.util.splitpath mercurial.util.checkcase
  mercurial.util.fspath mercurial.util.pconvert mercurial.util.normpath
- mercurial.util.checkwinfilename mercurial.util.checkosfilename'''
+ mercurial.util.checkwinfilename mercurial.util.checkosfilename
+ mercurial.util.normcase'''
 
 # codec and alias names of sjis and big5 to be faked.
 problematic_encodings = '''big5 big5-tw csbig5 big5hkscs big5-hkscs
diff -r 8b3cfad9c073 -r ccae15e928bf mercurial/encoding.py
--- a/mercurial/encoding.py	Thu Nov 24 14:59:26 2011 +0900
+++ b/mercurial/encoding.py	Thu Nov 24 15:05:37 2011 +0900
@@ -171,3 +171,5 @@
         return lu.encode(encoding)
     except UnicodeError:
         return s.lower() # we don't know how to fold this except in ASCII
+    except LookupError, k:
+        raise error.Abort("%s, please check your locale settings" % k)
diff -r 8b3cfad9c073 -r ccae15e928bf mercurial/posix.py
--- a/mercurial/posix.py	Thu Nov 24 14:59:26 2011 +0900
+++ b/mercurial/posix.py	Thu Nov 24 15:05:37 2011 +0900
@@ -6,6 +6,7 @@
 # GNU General Public License version 2 or any later version.
 
 from i18n import _
+import encoding
 import os, sys, errno, stat, getpass, pwd, grp, tempfile, unicodedata
 
 posixfile = open
@@ -166,7 +167,8 @@
 
 # os.path.normcase is a no-op, which doesn't help us on non-native filesystems
 def normcase(path):
-    return path.lower()
+    # 'path' may be byte sequence in problematic character encoding
+    return encoding.lower(path)
 
 if sys.platform == 'darwin':
     import fcntl # only needed on darwin, missing on jython
diff -r 8b3cfad9c073 -r ccae15e928bf mercurial/scmutil.py
--- a/mercurial/scmutil.py	Thu Nov 24 14:59:26 2011 +0900
+++ b/mercurial/scmutil.py	Thu Nov 24 15:05:37 2011 +0900
@@ -86,8 +86,11 @@
         # AIX ignores "/" at end of path, others raise EISDIR.
         if util.endswithsep(path):
             raise util.Abort(_("path ends in directory separator: %s") % path)
-        normpath = os.path.normcase(path)
+        normpath = util.normcase(path)
         parts = util.splitpath(normpath)
+        # below 'lower()'s are for comparison only with ASCII strings,
+        # so encoding.lower() is not needed
+        # even if encoding of 'path' is problematic one
         if (os.path.splitdrive(path)[0]
             or parts[0].lower() in ('.hg', '.hg.', '')
             or os.pardir in parts):
@@ -451,6 +454,9 @@
             return rcpath
         value = value.replace('/', os.sep)
         for p in value.split(os.pathsep):
+            # below 'lower()' is for comparison only with ASCII string,
+            # so encoding.lower() is not needed
+            # even if encoding of 'path' is problematic one
             if p.lower().endswith('mercurial.ini'):
                 rcpath.append(p)
             elif os.path.isdir(p):
diff -r 8b3cfad9c073 -r ccae15e928bf mercurial/util.py
--- a/mercurial/util.py	Thu Nov 24 14:59:26 2011 +0900
+++ b/mercurial/util.py	Thu Nov 24 15:05:37 2011 +0900
@@ -593,6 +593,10 @@
     Requires a path (like /foo/.hg) ending with a foldable final
     directory component.
     """
+    # this is invoked with 'path' of which last element consists of
+    # only ASCII characters (e.g.: '.hg'),
+    # so upper()/lower() never cause problem,
+    # even if encoding of 'path' is problematic one
     s1 = os.stat(path)
     d, b = os.path.split(path)
     p2 = os.path.join(d, b.upper())
@@ -614,8 +618,17 @@
     with root. Note that this function is unnecessary, and should not be
     called, for case-sensitive filesystems (simply because it's expensive).
     '''
+
+    # 'name' may be byte sequence in problematic character encoding,
+    # because win32mbcs is enabled only in Windows native environment.
+    def lower(x):
+        if isinstance(x, unicode): # win32mbcs enabled
+            return x.lower()
+        else:
+            return encoding.lower(x)
+
     # If name is absolute, make it relative
-    if name.lower().startswith(root.lower()):
+    if lower(name).startswith(lower(root)):
         l = len(root)
         if name[l] == os.sep or name[l] == os.altsep:
             l = l + 1
@@ -630,8 +643,19 @@
     # Protect backslashes. This gets silly very quickly.
     seps.replace('\\','\\\\')
     pattern = re.compile(r'([^%s]+)|([%s]+)' % (seps, seps))
-    dir = os.path.normcase(os.path.normpath(root))
+    dir = normcase(os.path.normpath(root))
     result = []
+
+    # encoding.fromlocal() on 'name' is not needed because:
+    #
+    #   - Windows ('\\' as path separator) + problematic encoding
+    #     should enable win32mbcs which wraps this function for
+    #     unicode-nize of 'name'
+    #
+    #   - other posix environment (e.g.: cygwin) uses '/' as path
+    #     separator, so re.findall() does not break byte sequence,
+    #     even though it uses problematic encoding
+
     for part, sep in pattern.findall(name):
         if sep:
             result.append(sep)
@@ -641,10 +665,10 @@
             _fspathcache[dir] = os.listdir(dir)
         contents = _fspathcache[dir]
 
-        lpart = part.lower()
+        lpart = lower(part)
         lenp = len(part)
         for n in contents:
-            if lenp == len(n) and n.lower() == lpart:
+            if lenp == len(n) and lower(n) == lpart:
                 result.append(n)
                 break
         else:


More information about the Mercurial-devel mailing list