[PATCH STABLE V2] i18n: fix case folding problem with problematic encodings

Tue Nov 29 14:24:49 CST 2011

# HG changeset patch
# User FUJIWARA Katsunori <foozy at lares.dti.ne.jp>
# Date 1322598040 -32400
# Branch stable
# Node ID 5bf954f0303aefbcbfc2eefbefc5d7e9f95b98a7
# Parent  e387e760b207383c961ed8accd35583791a33bb0
i18n: fix case folding problem with problematic encodings

changeset 28e98a8b173d for case folding problem with problematic
encoding was not enough.

this patch fixes what that changeset left incomplete.

this patch does:

  - switch from "os.path.normcase()" to "util.normcase()"

    in changeset 6eff984d8e76 and b2fd4746414a, some of
    "os.path.normcase()" invocations are replaced by
    "util.normcase()", but not all.

    for consistency, this patch replaces all "os.path.normcase()"
    invocations by "util.normcase()" other than ones in "windows.py".

    and for optimization, this patch avoids lower-ing each path
    component separately, by using the normcase-ed (= lower-ed) path
    computed at the beginning of the function.

  - switch internal format from str to unicode for "util.fspath()"

    for safety against problematic encodings and for efficiency, this
    patch switches the internal format of this function from str (as a
    byte sequence) to unicode.

    this switch also removes "util.fspath()" from the wrapping target
    list of win32mbcs ("util.normcase()" is wrapped in its place)

  - switch from "str.lower()" to "encoding.lower()"

    for safety against problematic encodings.

    this also means configuration errors (e.g. an unknown encoding) are
    now detected in "encoding.lower()", so this patch has hunks for:

      - "encoding.lower()" for catching LookupError
      - "test-encoding.t" for message with hint

  - move "normcase()" definition for posix to "util.py"

    for preventing cyclic dependency on "encoding.py".

  - add "test-casefolding-cp932.t"

    this also introduces cp932 aware filesystem check (cp932fs) to
    hghave, to prevent this test from running on cp932 un-aware
    filesystem.

diff -r e387e760b207 -r 5bf954f0303a hgext/win32mbcs.py

--- a/hgext/win32mbcs.py	Fri Nov 25 02:09:48 2011 +0100
+++ b/hgext/win32mbcs.py	Wed Nov 30 05:20:40 2011 +0900
@@ -129,7 +129,7 @@
 funcs = '''os.path.join os.path.split os.path.splitext
  os.path.splitunc os.path.normpath os.path.normcase os.makedirs
  mercurial.util.endswithsep mercurial.util.splitpath mercurial.util.checkcase
- mercurial.util.fspath mercurial.util.pconvert mercurial.util.normpath
+ mercurial.util.normcase mercurial.util.pconvert mercurial.util.normpath
  mercurial.util.checkwinfilename mercurial.util.checkosfilename'''
 
 # codec and alias names of sjis and big5 to be faked.
diff -r e387e760b207 -r 5bf954f0303a mercurial/encoding.py
--- a/mercurial/encoding.py	Fri Nov 25 02:09:48 2011 +0100
+++ b/mercurial/encoding.py	Wed Nov 30 05:20:40 2011 +0900
@@ -171,3 +171,5 @@
         return lu.encode(encoding)
     except UnicodeError:
         return s.lower() # we don't know how to fold this except in ASCII
+    except LookupError, k:
+        raise error.Abort(k, hint="please check your locale settings")
diff -r e387e760b207 -r 5bf954f0303a mercurial/posix.py
--- a/mercurial/posix.py	Fri Nov 25 02:09:48 2011 +0100
+++ b/mercurial/posix.py	Wed Nov 30 05:20:40 2011 +0900
@@ -164,10 +164,6 @@
     st2 = os.lstat(fpath2)
     return st1.st_dev == st2.st_dev
 
-# os.path.normcase is a no-op, which doesn't help us on non-native filesystems
-def normcase(path):
-    return path.lower()
-
 if sys.platform == 'darwin':
     import fcntl # only needed on darwin, missing on jython
 
diff -r e387e760b207 -r 5bf954f0303a mercurial/scmutil.py
--- a/mercurial/scmutil.py	Fri Nov 25 02:09:48 2011 +0100
+++ b/mercurial/scmutil.py	Wed Nov 30 05:20:40 2011 +0900
@@ -86,17 +86,16 @@
         # AIX ignores "/" at end of path, others raise EISDIR.
         if util.endswithsep(path):
             raise util.Abort(_("path ends in directory separator: %s") % path)
-        normpath = os.path.normcase(path)
+        normpath = util.normcase(path)
         parts = util.splitpath(normpath)
         if (os.path.splitdrive(path)[0]
-            or parts[0].lower() in ('.hg', '.hg.', '')
+            or parts[0] in ('.hg', '.hg.', '')
             or os.pardir in parts):
             raise util.Abort(_("path contains illegal component: %s") % path)
-        if '.hg' in path.lower():
-            lparts = [p.lower() for p in parts]
+        if '.hg' in normpath:
             for p in '.hg', '.hg.':
-                if p in lparts[1:]:
-                    pos = lparts.index(p)
+                if p in parts[1:]:
+                    pos = parts.index(p)
                     base = os.path.join(*parts[:pos])
                     raise util.Abort(_('path %r is inside nested repo %r')
                                      % (path, base))
@@ -451,7 +450,7 @@
             return rcpath
         value = value.replace('/', os.sep)
         for p in value.split(os.pathsep):
-            if p.lower().endswith('mercurial.ini'):
+            if encoding.lower(p).endswith('mercurial.ini'):
                 rcpath.append(p)
             elif os.path.isdir(p):
                 for f, kind in osutil.listdir(p):
diff -r e387e760b207 -r 5bf954f0303a mercurial/util.py
--- a/mercurial/util.py	Fri Nov 25 02:09:48 2011 +0100
+++ b/mercurial/util.py	Wed Nov 30 05:20:40 2011 +0900
@@ -44,7 +44,6 @@
 makedir = platform.makedir
 nlinks = platform.nlinks
 normpath = platform.normpath
-normcase = platform.normcase
 nulldev = platform.nulldev
 openhardlinks = platform.openhardlinks
 oslink = platform.oslink
@@ -548,8 +547,12 @@
 
 if os.name == 'nt':
     checkosfilename = checkwinfilename
+    normcase = platform.normcase
 else:
     checkosfilename = platform.checkosfilename
+    # os.path.normcase is a no-op, which doesn't help us on non-native
+    # filesystems
+    normcase = encoding.lower
 
 def makelock(info, pathname):
     try:
@@ -614,14 +617,18 @@
     with root. Note that this function is unnecessary, and should not be
     called, for case-sensitive filesystems (simply because it's expensive).
     '''
+    # use unicode object for problematic encodings
+    lname = normcase(name).decode(encoding.encoding)
+    lroot = normcase(root).decode(encoding.encoding)
+
     # If name is absolute, make it relative
-    if name.lower().startswith(root.lower()):
-        l = len(root)
-        if name[l] == os.sep or name[l] == os.altsep:
+    if lname.startswith(lroot):
+        l = len(lroot)
+        if lname[l] == os.sep or lname[l] == os.altsep:
             l = l + 1
-        name = name[l:]
+        lname = lname[l:]
 
-    if not os.path.lexists(os.path.join(root, name)):
+    if not os.path.lexists(os.path.join(lroot, lname)):
         return None
 
     seps = os.sep
@@ -630,9 +637,9 @@
     # Protect backslashes. This gets silly very quickly.
     seps.replace('\\','\\\\')
     pattern = re.compile(r'([^%s]+)|([%s]+)' % (seps, seps))
-    dir = os.path.normcase(os.path.normpath(root))
+    dir = os.path.normpath(lroot)
     result = []
-    for part, sep in pattern.findall(name):
+    for lpart, sep in pattern.findall(lname):
         if sep:
             result.append(sep)
             continue
@@ -641,18 +648,17 @@
             _fspathcache[dir] = os.listdir(dir)
         contents = _fspathcache[dir]
 
-        lpart = part.lower()
-        lenp = len(part)
+        lenp = len(lpart)
         for n in contents:
             if lenp == len(n) and n.lower() == lpart:
                 result.append(n)
                 break
         else:
             # Cannot happen, as the file exists!
-            result.append(part)
+            result.append(lpart)
         dir = os.path.join(dir, lpart)
 
-    return ''.join(result)
+    return u''.join(result).encode(encoding.encoding)
 
 def checknlink(testfile):
     '''check whether hardlink count reporting works properly'''
diff -r e387e760b207 -r 5bf954f0303a tests/hghave
--- a/tests/hghave	Fri Nov 25 02:09:48 2011 +0100
+++ b/tests/hghave	Wed Nov 30 05:20:40 2011 +0900
@@ -91,6 +91,33 @@
     finally:
         os.remove(path)
 
+def has_cp932fs():
+    d = tempfile.mkdtemp(prefix=tempprefix, dir=".")
+    try:
+        names = [ "\x83\x41", "\x83\x61" ]
+        for name in names:
+            f = open(os.path.join(d, name), "w")
+            try:
+                f.write(name)
+            finally:
+                f.close()
+        for name in names:
+            f = open(os.path.join(d, name), "r")
+            try:
+                content = f.read()
+                if name != content:
+                    return False
+            finally:
+                f.close()
+        return True
+    finally:
+        for name in names:
+            try:
+                os.remove(os.path.join(d, name))
+            except Exception, e:
+                pass # ignore
+        os.rmdir(d)
+
 def has_inotify():
     try:
         import hgext.inotify.linux.watcher
@@ -224,6 +251,7 @@
     "git": (has_git, "git command line client"),
     "gpg": (has_gpg, "gpg client"),
     "icasefs": (has_icasefs, "case insensitive file system"),
+    "cp932fs": (has_cp932fs, "cp932 encoding aware file system"),
     "inotify": (has_inotify, "inotify extension support"),
     "lsprof": (has_lsprof, "python lsprof module"),
     "mtn": (has_mtn, "monotone client (>= 1.0)"),
diff -r e387e760b207 -r 5bf954f0303a tests/test-casefolding-cp932.t
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/test-casefolding-cp932.t	Wed Nov 30 05:20:40 2011 +0900
@@ -0,0 +1,99 @@
+run only on case-insensitive filesystems, because collision check at
+"hg update" is done only on case-insensitive filesystems
+
+  $ "$TESTDIR/hghave" icasefs || exit 80
+  $ LC_ALL=
+  $ export LC_ALL
+  $ LC_CTYPE=ja_JP.cp932
+  $ export LC_CTYPE
+  $ "$TESTDIR/hghave" cp932fs || exit 80
+
+setup repository, and target files
+
+  $ HGENCODING=cp932
+  $ export HGENCODING
+  $ hg init t
+  $ cd t
+  $ python << EOF
+  > import os
+  > names = ["\x83\x41", # cp932(0x83, 0x41='A'), UNICODE(0x30a2)
+  >          "\x83\x5A", # cp932(0x83, 0x5A='Z'), UNICODE(0x30bb)
+  >          "\x83\x61", # cp932(0x83, 0x61='a'), UNICODE(0x30c2)
+  >          "\x83\x7A", # cp932(0x83, 0x7A='z'), UNICODE(0x30db)
+  >         ]
+  > for num, name in zip(range(len(names)), names):
+  >     # file for getting target filename of "hg add"
+  >     f = file(str(num), 'w'); f.write(name); f.close()
+  >     # target file of "hg add"
+  >     f = file(name, 'w'); f.write(name); f.write("\n"); f.close()
+  > 
+  >     # under subdirectory
+  >     dn = '%s_' % name
+  >     fn = os.path.join(dn, name)
+  >     # file for getting target filename of "hg add"
+  >     f = file('%d_' % num, 'w'); f.write(fn); f.close()
+  >     os.mkdir(dn)
+  >     # target file of "hg add"
+  >     f = file(fn, 'w'); f.write(dn); f.write("\n"); f.close()
+  > EOF
+
+test filename collision check at "hg add"
+
+  $ hg add --config ui.portablefilenames=abort `cat 0`
+  $ hg add --config ui.portablefilenames=abort `cat 0_`
+  $ hg add --config ui.portablefilenames=abort `cat 1`
+  $ hg add --config ui.portablefilenames=abort `cat 1_`
+  $ hg add --config ui.portablefilenames=abort `cat 2`
+  $ hg add --config ui.portablefilenames=abort `cat 2_`
+  $ hg add --config ui.portablefilenames=abort `cat 3`
+  $ hg add --config ui.portablefilenames=abort `cat 3_`
+  $ hg status -a
+  A \x83A (esc)
+  A \x83A_/\x83A (esc)
+  A \x83Z (esc)
+  A \x83Z_/\x83Z (esc)
+  A \x83a (esc)
+  A \x83a_/\x83a (esc)
+  A \x83z (esc)
+  A \x83z_/\x83z (esc)
+
+test filename collision check at "hg update"
+
+  $ hg commit -m 'revision 0'
+  $ hg update null
+  0 files updated, 0 files merged, 8 files removed, 0 files unresolved
+  $ hg update tip
+  8 files updated, 0 files merged, 0 files removed, 0 files unresolved
+  $ hg status -A
+  ? 0
+  ? 0_
+  ? 1
+  ? 1_
+  ? 2
+  ? 2_
+  ? 3
+  ? 3_
+  C \x83A (esc)
+  C \x83A_/\x83A (esc)
+  C \x83Z (esc)
+  C \x83Z_/\x83Z (esc)
+  C \x83a (esc)
+  C \x83a_/\x83a (esc)
+  C \x83z (esc)
+  C \x83z_/\x83z (esc)
+  $ cat `cat 0`
+  \x83A (esc)
+  $ cat `cat 0_`
+  \x83A_ (esc)
+  $ cat `cat 1`
+  \x83Z (esc)
+  $ cat `cat 1_`
+  \x83Z_ (esc)
+  $ cat `cat 2`
+  \x83a (esc)
+  $ cat `cat 2_`
+  \x83a_ (esc)
+  $ cat `cat 3`
+  \x83z (esc)
+  $ cat `cat 3_`
+  \x83z_ (esc)
diff -r e387e760b207 -r 5bf954f0303a tests/test-encoding-align.t
--- a/tests/test-encoding-align.t	Fri Nov 25 02:09:48 2011 +0100
+++ b/tests/test-encoding-align.t	Wed Nov 30 05:20:40 2011 +0900
@@ -1,5 +1,9 @@
 Test alignment of multibyte characters
 
+  $ LC_ALL=
+  $ export LC_ALL
+  $ LC_CTYPE=en_US.utf-8
+  $ export LC_CTYPE
   $ HGENCODING=utf-8
   $ export HGENCODING
   $ hg init t
diff -r e387e760b207 -r 5bf954f0303a tests/test-encoding.t
--- a/tests/test-encoding.t	Fri Nov 25 02:09:48 2011 +0100
+++ b/tests/test-encoding.t	Wed Nov 30 05:20:40 2011 +0900
@@ -234,7 +234,8 @@
 hg log (dolphin)
 
   $ HGENCODING=dolphin hg log
-  abort: unknown encoding: dolphin, please check your locale settings
+  abort: unknown encoding: dolphin
+  (please check your locale settings)
   [255]
   $ HGENCODING=ascii hg branch `cat latin-1-tag`
   abort: decoding near '\xe9': 'ascii' codec can't decode byte 0xe9 in position 0: ordinal not in range(128)! (esc)