[PATCH 2 of 2 STABLE] i18n: fix case folding problem in some problematic encodings

Thu Nov 24 04:37:03 CST 2011

On 11/24/2011 07:19 AM, FUJIWARA Katsunori wrote:
> # HG changeset patch
> # User FUJIWARA Katsunori<foozy at lares.dti.ne.jp>
> # Date 1322114737 -32400
> # Branch stable
> # Node ID ccae15e928bfd34266aca2ecf257207597af832a
> # Parent  8b3cfad9c07307cabb8a49861563bc57ddf37fbd
> i18n: fix case folding problem in some problematic encodings
>
> changeset 28e98a8b173d for case folding problem with ambiguous
> encoding was not enough.
>
> this patch covers up a fault of fix in it.
>
> this patch switches:
>
>    - from "str.lower()" to "encoding.lower()"
>
>      "str.lower()" on byte sequence may break string in some character
>      encoding (e.g.: cp932 for Japanese)
>
>      this patch also add comments to lines where "str.lower()" on
>      filenames are kept because it is enough, for future maintenance.
>
>    - from "os.path.normcase()" to "util.normcase()"
>
>      in changeset 6eff984d8e76 and b2fd4746414a, some of
>      "os.path.normcase()" invocations are replaced by
>      "util.normcase()", but not all.
>
>      for consistency, this patch replace all "os.path.normcase()"
>      invocations by "util.normcase()" other than ones in "windows.py".
>
> patch hunk to catch LookupError in "encoding.lower()" is for passing
> test-encoding.t, because switching to "encoding.lower()" causes
> configuration error detection here.
>
> diff -r 8b3cfad9c073 -r ccae15e928bf hgext/win32mbcs.py
> --- a/hgext/win32mbcs.py	Thu Nov 24 14:59:26 2011 +0900
> +++ b/hgext/win32mbcs.py	Thu Nov 24 15:05:37 2011 +0900
> @@ -130,7 +130,8 @@
>    os.path.splitunc os.path.normpath os.path.normcase os.makedirs
>    mercurial.util.endswithsep mercurial.util.splitpath mercurial.util.checkcase
>    mercurial.util.fspath mercurial.util.pconvert mercurial.util.normpath
> - mercurial.util.checkwinfilename mercurial.util.checkosfilename'''
> + mercurial.util.checkwinfilename mercurial.util.checkosfilename
> + mercurial.util.normcase'''
>
>   # codec and alias names of sjis and big5 to be faked.
>   problematic_encodings = '''big5 big5-tw csbig5 big5hkscs big5-hkscs
> diff -r 8b3cfad9c073 -r ccae15e928bf mercurial/encoding.py
> --- a/mercurial/encoding.py	Thu Nov 24 14:59:26 2011 +0900
> +++ b/mercurial/encoding.py	Thu Nov 24 15:05:37 2011 +0900
> @@ -171,3 +171,5 @@
>           return lu.encode(encoding)
>       except UnicodeError:
>           return s.lower() # we don't know how to fold this except in ASCII
> +    except LookupError, k:
> +        raise error.Abort("%s, please check your locale settings" % k)

We have hint=... for hints.

> diff -r 8b3cfad9c073 -r ccae15e928bf mercurial/posix.py
> --- a/mercurial/posix.py	Thu Nov 24 14:59:26 2011 +0900
> +++ b/mercurial/posix.py	Thu Nov 24 15:05:37 2011 +0900
> @@ -6,6 +6,7 @@
>   # GNU General Public License version 2 or any later version.
>
>   from i18n import _
> +import encoding
>   import os, sys, errno, stat, getpass, pwd, grp, tempfile, unicodedata
>
>   posixfile = open
> @@ -166,7 +167,8 @@
>
>   # os.path.normcase is a no-op, which doesn't help us on non-native filesystems
>   def normcase(path):
> -    return path.lower()
> +    # 'path' may be byte sequence in problematic character encoding
> +    return encoding.lower(path)
>

That introduces a cyclic dependency.

But wouldn't the windows normcase have the same need for using 
encoding.lower as the posix one?

Perhaps normcase should move to util and be a wrapper around a platform 
specific helper function (that just should normalize slashes).

Here you are changing non-windows code. Would it be possible to add a 
non-windows test for this?

> diff -r 8b3cfad9c073 -r ccae15e928bf mercurial/scmutil.py
> --- a/mercurial/scmutil.py	Thu Nov 24 14:59:26 2011 +0900
> +++ b/mercurial/scmutil.py	Thu Nov 24 15:05:37 2011 +0900
> @@ -86,8 +86,11 @@
>           # AIX ignores "/" at end of path, others raise EISDIR.
>           if util.endswithsep(path):
>               raise util.Abort(_("path ends in directory separator: %s") % path)
> -        normpath = os.path.normcase(path)
> +        normpath = util.normcase(path)
>           parts = util.splitpath(normpath)
> +        # below 'lower()'s are for comparison only with ASCII strings,
> +        # so encoding.lower() is not needed
> +        # even if encoding of 'path' is problematic one

Wouldn't it be better to do the "obviously correct" thing instead of 
adding comments describing a clever optimization?

>           if (os.path.splitdrive(path)[0]
>               or parts[0].lower() in ('.hg', '.hg.', '')
>               or os.pardir in parts):
> @@ -451,6 +454,9 @@
>               return rcpath
>           value = value.replace('/', os.sep)
>           for p in value.split(os.pathsep):
> +            # below 'lower()' is for comparison only with ASCII string,
> +            # so encoding.lower() is not needed
> +            # even if encoding of 'path' is problematic one
>               if p.lower().endswith('mercurial.ini'):
>                   rcpath.append(p)
>               elif os.path.isdir(p):
> diff -r 8b3cfad9c073 -r ccae15e928bf mercurial/util.py
> --- a/mercurial/util.py	Thu Nov 24 14:59:26 2011 +0900
> +++ b/mercurial/util.py	Thu Nov 24 15:05:37 2011 +0900
> @@ -593,6 +593,10 @@
>       Requires a path (like /foo/.hg) ending with a foldable final
>       directory component.
>       """
> +    # this is invoked with 'path' of which last element consists of
> +    # only ASCII characters (e.g.: '.hg'),
> +    # so upper()/lower() never cause problem,
> +    # even if encoding of 'path' is problematic one
>       s1 = os.stat(path)
>       d, b = os.path.split(path)
>       p2 = os.path.join(d, b.upper())
> @@ -614,8 +618,17 @@
>       with root. Note that this function is unnecessary, and should not be
>       called, for case-sensitive filesystems (simply because it's expensive).
>       '''
> +
> +    # 'name' may be byte sequence in problematic character encoding,
> +    # because win32mbcs is enabled only in Windows native environment.
> +    def lower(x):
> +        if isinstance(x, unicode): # win32mbcs enabled
> +            return x.lower()
> +        else:
> +            return encoding.lower(x)

I think win32mbcs specific code or knowledge should stay in the 
win32mbcs extension.

>       # If name is absolute, make it relative
> -    if name.lower().startswith(root.lower()):
> +    if lower(name).startswith(lower(root)):
>           l = len(root)
>           if name[l] == os.sep or name[l] == os.altsep:
>               l = l + 1
> @@ -630,8 +643,19 @@
>       # Protect backslashes. This gets silly very quickly.
>       seps.replace('\\','\\\\')
>       pattern = re.compile(r'([^%s]+)|([%s]+)' % (seps, seps))
> -    dir = os.path.normcase(os.path.normpath(root))
> +    dir = normcase(os.path.normpath(root))
>       result = []
> +
> +    # encoding.fromlocal() on 'name' is not needed because:
> +    #
> +    #   - Windows ('\\' as path separator) + problematic encoding
> +    #     should enable win32mbcs which wraps this function for
> +    #     unicode-nize of 'name'
> +    #
> +    #   - other posix environment (e.g.: cygwin) uses '/' as path
> +    #     separator, so re.findall() does not break byte sequence,
> +    #     even though it uses problematic encoding

FWIW: I don't understand that comment.

However: name has already temporarily been converted to lower, and later 
all parts of it is lowered again. Why not just lower name once in the 
beginning of the function?

>       for part, sep in pattern.findall(name):
>           if sep:
>               result.append(sep)
> @@ -641,10 +665,10 @@
>               _fspathcache[dir] = os.listdir(dir)
>           contents = _fspathcache[dir]
>
> -        lpart = part.lower()
> +        lpart = lower(part)
>           lenp = len(part)
>           for n in contents:
> -            if lenp == len(n) and n.lower() == lpart:
> +            if lenp == len(n) and lower(n) == lpart:
>                   result.append(n)
>                   break
>           else:

(And Matt just dropped test-casecollision-i18n.t - you would have to add 
it again if you think anybody else is ever going to run it.)

/Mads