[PATCH 6 of 6 STABLE V3] i18n: use encoding.lower/upper for encoding aware case folding

Mon Dec 12 07:25:42 CST 2011

# HG changeset patch
# User FUJIWARA Katsunori <foozy at lares.dti.ne.jp>
# Date 1323677419 -32400
# Branch stable
# Node ID b337a7d848538a5179ea74cb4ec3d23169b6116c
# Parent  22ae1ea19de2966fe4b13a049a25403cdf1bd8ab
i18n: use encoding.lower/upper for encoding aware case folding

this patch uses encoding.lower/upper for case folding, because the
lower()/upper() methods of str cannot fold the case of non-ASCII
characters correctly.

to avoid a cyclic dependency, this patch introduces
encodinglower/encodingupper in both the posix and windows specific
files; util.py sets them to encoding.lower/upper after importing the
platform module.

this patch also adds the tests shown below:

  - test-casefolding-cp932.t:

    test for case folding in problematic encoding cp932.

    this runs only in a cygwin environment on Japanese Windows,
    because NTFS case folding on other Windows versions cannot handle
    byte sequences in cp932 correctly.

  - test-casefolding-geo.t:

    test for case folding of Georgian alphabet in utf-8 encoding.

    WinXP (NTFS) and Mac OS X (HFS+) do not treat u'\u10a0' and
    u'\u2d00' as case variants of the same letter, but Win7 (NTFS) and
    Python's unicode implementation do.

some "normcase()" functions are removed from the function wrap list in
hgext/win32mbcs.py, because they become encoding aware with this patch.

diff -r 22ae1ea19de2 -r b337a7d84853 hgext/win32mbcs.py

--- a/hgext/win32mbcs.py	Mon Dec 12 17:10:19 2011 +0900
+++ b/hgext/win32mbcs.py	Mon Dec 12 17:10:19 2011 +0900
@@ -128,8 +128,6 @@
 #       they use result of os.path.split()
 funcs = '''os.path.join os.path.split os.path.splitext
  os.path.splitunc os.path.normpath os.makedirs
- mercurial.windows.normcase
- mercurial.util.normcase
  mercurial.util.endswithsep mercurial.util.splitpath mercurial.util.checkcase
  mercurial.util.fspath mercurial.util.pconvert mercurial.util.normpath
  mercurial.util.checkwinfilename mercurial.util.checkosfilename'''
diff -r 22ae1ea19de2 -r b337a7d84853 mercurial/posix.py
--- a/mercurial/posix.py	Mon Dec 12 17:10:19 2011 +0900
+++ b/mercurial/posix.py	Mon Dec 12 17:10:19 2011 +0900
@@ -164,9 +164,12 @@
     st2 = os.lstat(fpath2)
     return st1.st_dev == st2.st_dev
 
+encodinglower = None
+encodingupper = None
+
 # os.path.normcase is a no-op, which doesn't help us on non-native filesystems
 def normcase(path):
-    return path.upper()
+    return encodingupper(path)
 
 if sys.platform == 'darwin':
     import fcntl # only needed on darwin, missing on jython
diff -r 22ae1ea19de2 -r b337a7d84853 mercurial/util.py
--- a/mercurial/util.py	Mon Dec 12 17:10:19 2011 +0900
+++ b/mercurial/util.py	Mon Dec 12 17:10:19 2011 +0900
@@ -24,6 +24,9 @@
 else:
     import posix as platform
 
+platform.encodinglower = encoding.lower
+platform.encodingupper = encoding.upper
+
 cachestat = platform.cachestat
 checkexec = platform.checkexec
 checklink = platform.checklink
@@ -595,9 +598,9 @@
     """
     s1 = os.stat(path)
     d, b = os.path.split(path)
-    b2 = b.upper()
+    b2 = encoding.upper(b)
     if b == b2:
-        b2 = b.lower()
+        b2 = encoding.lower(b)
         if b == b2:
             return True # no evidence against case sensitivity
     p2 = os.path.join(d, b2)
diff -r 22ae1ea19de2 -r b337a7d84853 mercurial/windows.py
--- a/mercurial/windows.py	Mon Dec 12 17:10:19 2011 +0900
+++ b/mercurial/windows.py	Mon Dec 12 17:10:19 2011 +0900
@@ -131,8 +131,11 @@
 def normpath(path):
     return pconvert(os.path.normpath(path))
 
+encodinglower = None
+encodingupper = None
+
 def normcase(path):
-    return path.upper()
+    return encodingupper(path)
 
 def realpath(path):
     '''
diff -r 22ae1ea19de2 -r b337a7d84853 tests/hghave
--- a/tests/hghave	Mon Dec 12 17:10:19 2011 +0900
+++ b/tests/hghave	Mon Dec 12 17:10:19 2011 +0900
@@ -106,6 +106,52 @@
     finally:
         os.rmdir(d)
 
+def has_geofs():
+    # this assumes case insensitivity of file system
+    d = tempfile.mkdtemp(prefix=tempprefix, dir=".")
+    try:
+        p1 = os.path.join(d, u'\u10a0'.encode('utf-8'))
+        p2 = os.path.join(d, u'\u2d00'.encode('utf-8'))
+        f = file(p1, 'w'); f.write(p1); f.close()
+        try:
+            return os.path.lexists(p2)
+        except OSError, e:
+            return False
+        finally:
+            os.remove(p1)
+    finally:
+        os.rmdir(d)
+
+def has_cp932fs():
+    # in cp932 encoding, u'\u30a2' and u'\u30c2' have ASCII 'A'
+    # as second byte, but in different case.
+    # so cp932 un-aware case insensitive filesystems treat them as same.
+    d = tempfile.mkdtemp(prefix=tempprefix, dir=".")
+    names = [n.encode('cp932') for n in [ u'\u30a2', u'\u30c2' ]]
+    try:
+        for name in names:
+            f = open(os.path.join(d, name), "w")
+            try:
+                f.write(name)
+            finally:
+                f.close()
+        for name in names:
+            f = open(os.path.join(d, name), "r")
+            try:
+                content = f.read()
+                if name != content:
+                    return False
+            finally:
+                f.close()
+        return True
+    finally:
+        for name in names:
+            try:
+                os.remove(os.path.join(d, name))
+            except Exception, e:
+                pass # ignore
+        os.rmdir(d)
+
 def has_inotify():
     try:
         import hgext.inotify.linux.watcher
@@ -240,6 +286,8 @@
     "gpg": (has_gpg, "gpg client"),
     "icasefs": (has_icasefs, "case insensitive file system"),
     "uninfdfs": (has_uninfdfs, "unicode NFD file system"),
+    "geofs": (has_geofs, "Georgian lower Khutsuri sensitive file system"),
+    "cp932fs": (has_cp932fs, "cp932 encoding aware file system"),
     "inotify": (has_inotify, "inotify extension support"),
     "lsprof": (has_lsprof, "python lsprof module"),
     "mtn": (has_mtn, "monotone client (>= 1.0)"),
diff -r 22ae1ea19de2 -r b337a7d84853 tests/test-casefolding-cp932.t
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/test-casefolding-cp932.t	Mon Dec 12 17:10:19 2011 +0900
@@ -0,0 +1,112 @@
+run only on case-insensitive filesystems, because collision check at
+"hg update" is done only on case-insensitive filesystems
+
+  $ "$TESTDIR/hghave" icasefs || exit 80
+
+run only on cp932 encoding aware filesystems
+
+  $ LC_ALL=
+  $ export LC_ALL
+  $ LC_CTYPE=ja_JP.cp932
+  $ export LC_CTYPE
+  $ HGENCODING=cp932
+  $ export HGENCODING
+  $ "$TESTDIR/hghave" cp932fs || exit 80
+
+setup repository, and target files
+
+  $ hg init repo1
+  $ cd repo1
+  $ python << EOF
+  > import os
+  > names = [n.encode('cp932') for n in [
+  >              u'\u30a2', # \x83 \x41(='A') in cp932
+  >              u'\u30bb', # \x83 \x5A(='Z')
+  >              u'\u30c2', # \x83 \x61(='a')
+  >              u'\u30db', # \x83 \x7A(='z')
+  >         ]]
+  > for num, name in zip(range(len(names)), names):
+  >     # file for getting target filename of "hg add"
+  >     f = file('%d' % num, 'w'); f.write(name); f.close()
+  >     # target file of "hg add"
+  >     f = file(name, 'w'); f.write('%s\n' % name); f.close()
+  > 
+  >     # under subdirectory
+  >     dn = '%s_' % name
+  >     os.mkdir(dn)
+  >     fn = os.path.join(dn, name)
+  > 
+  >     # file for getting target filename of "hg add"
+  >     f = file('%d_' % num, 'w'); f.write(fn); f.close()
+  >     # target file of "hg add"
+  >     f = file(fn, 'w'); f.write('%s\n' % dn); f.close()
+  > EOF
+
+test filename collision check at "hg add"
+
+  $ hg add --config ui.portablefilenames=abort `cat 0`
+  $ hg add --config ui.portablefilenames=abort `cat 0_`
+  $ hg add --config ui.portablefilenames=abort `cat 1`
+  $ hg add --config ui.portablefilenames=abort `cat 1_`
+  $ hg add --config ui.portablefilenames=abort `cat 2`
+  $ hg add --config ui.portablefilenames=abort `cat 2_`
+  $ hg add --config ui.portablefilenames=abort `cat 3`
+  $ hg add --config ui.portablefilenames=abort `cat 3_`
+  $ hg status -a
+  A \x83A (esc)
+  A \x83A_/\x83A (esc)
+  A \x83Z (esc)
+  A \x83Z_/\x83Z (esc)
+  A \x83a (esc)
+  A \x83a_/\x83a (esc)
+  A \x83z (esc)
+  A \x83z_/\x83z (esc)
+
+test filename collision check at "hg update"
+
+  $ hg commit -m 'revision 0'
+  $ hg update null
+  0 files updated, 0 files merged, 8 files removed, 0 files unresolved
+  $ hg update tip
+  8 files updated, 0 files merged, 0 files removed, 0 files unresolved
+
+check status of working directory
+
+  $ hg status -A
+  ? 0
+  ? 0_
+  ? 1
+  ? 1_
+  ? 2
+  ? 2_
+  ? 3
+  ? 3_
+  C \x83A (esc)
+  C \x83A_/\x83A (esc)
+  C \x83Z (esc)
+  C \x83Z_/\x83Z (esc)
+  C \x83a (esc)
+  C \x83a_/\x83a (esc)
+  C \x83z (esc)
+  C \x83z_/\x83z (esc)
+
+check contents of files
+
+  $ cat `cat 0`
+  \x83A (esc)
+  $ cat `cat 0_`
+  \x83A_ (esc)
+  $ cat `cat 1`
+  \x83Z (esc)
+  $ cat `cat 1_`
+  \x83Z_ (esc)
+  $ cat `cat 2`
+  \x83a (esc)
+  $ cat `cat 2_`
+  \x83a_ (esc)
+  $ cat `cat 3`
+  \x83z (esc)
+  $ cat `cat 3_`
+  \x83z_ (esc)
+
+  $ cd ..
diff -r 22ae1ea19de2 -r b337a7d84853 tests/test-casefolding-geo.t
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/test-casefolding-geo.t	Mon Dec 12 17:10:19 2011 +0900
@@ -0,0 +1,43 @@
+run only on case-insensitive filesystems
+
+  $ "$TESTDIR/hghave" icasefs || exit 80
+
+run only on Georgian lower Khutsuri sensitive file systems
+
+  $ "$TESTDIR/hghave" geofs || exit 80
+
+  $ LC_ALL=
+  $ export LC_ALL
+  $ LC_CTYPE=en_US.UTF-8
+  $ export LC_CTYPE
+  $ HGENCODING=utf-8
+  $ export HGENCODING
+
+  $ hg init repo
+  $ cd repo
+
+create file named as u'\u10a0' (Georgian upper Khutsuri)
+
+  $ python <<EOF
+  > names = [ (u'\u10a0', True), (u'\u2d00', False) ]
+  > for i, (name, create)  in zip(range(len(names)), names):
+  >     encname = name.encode('utf-8')
+  >     # file for getting target filename of "hg add"
+  >     f = file(str(i), 'w'); f.write(encname); f.close()
+  >     # target file of "hg add"
+  >     if create:
+  >         f = file(encname, 'w'); f.write(encname); f.close()
+  > EOF
+
+add target file by u'\u2d00' (lower of u'\u10a0')
+
+  $ hg add `cat 1`
+  adding \xe1\x82\xa0 (esc)
+  $ hg status
+  A \xe1\x82\xa0 (esc)
+  ? 0
+  ? 1
+  $ hg ci -m 'add upper case by lower case'
+  $ hg manifest
+  \xe1\x82\xa0 (esc)
+  $ cd ..
diff -r 22ae1ea19de2 -r b337a7d84853 tests/test-encoding.t
--- a/tests/test-encoding.t	Mon Dec 12 17:10:19 2011 +0900
+++ b/tests/test-encoding.t	Mon Dec 12 17:10:19 2011 +0900
@@ -234,7 +234,8 @@
 hg log (dolphin)
 
   $ HGENCODING=dolphin hg log
-  abort: unknown encoding: dolphin, please check your locale settings
+  abort: unknown encoding: dolphin
+  (please check your locale settings)
   [255]
   $ HGENCODING=ascii hg branch `cat latin-1-tag`
   abort: decoding near '\xe9': 'ascii' codec can't decode byte 0xe9 in position 0: ordinal not in range(128)! (esc)