[PATCH 2 of 4 STABLE] i18n: use "encoding.lower()" to normalize specified keywords for log searching

Sun Dec 25 05:38:08 CST 2011

# HG changeset patch
# User FUJIWARA Katsunori <foozy at lares.dti.ne.jp>
# Date 1324812916 -32400
# Branch stable
# Node ID baccb82b6bc3ea34bd3ce8de939bb177bc8a4927
# Parent  f26c74b9cc9f393e7aacec50c3260c8042b995db
i18n: use "encoding.lower()" to normalize specified keywords for log searching

some problematic encoding (e.g.: cp932) uses ASCII alphabet characters
in byte sequence of multi byte characters.

"str.lower()" on such byte sequence may treat distinct characters as
same one, and cause unexpected log matching.

this patch uses "encoding.lower()" instead of "str.lower()" to
normalize strings for compare.

diff -r f26c74b9cc9f -r baccb82b6bc3 mercurial/commands.py

--- a/mercurial/commands.py	Sun Dec 25 20:32:48 2011 +0900
+++ b/mercurial/commands.py	Sun Dec 25 20:35:16 2011 +0900
@@ -3865,14 +3865,21 @@
             return
         if df and not df(ctx.date()[0]):
             return
-        if opts['user'] and not [k for k in opts['user']
-                                 if k.lower() in ctx.user().lower()]:
-            return
+
+        lower = encoding.lower
+        if opts.get('user'):
+            luser = lower(ctx.user())
+            for k in [lower(x) for x in opts['user']]:
+                if (k in luser):
+                    break
+            else:
+                return
         if opts.get('keyword'):
-            for k in [kw.lower() for kw in opts['keyword']]:
-                if (k in ctx.user().lower() or
-                    k in ctx.description().lower() or
-                    k in " ".join(ctx.files()).lower()):
+            luser = lower(ctx.user())
+            ldesc = lower(ctx.description())
+            lfiles = lower(" ".join(ctx.files()))
+            for k in [lower(x) for x in opts['keyword']]:
+                if (k in luser or k in ldesc or k in lfiles):
                     break
             else:
                 return
diff -r f26c74b9cc9f -r baccb82b6bc3 tests/test-log.t
--- a/tests/test-log.t	Sun Dec 25 20:32:48 2011 +0900
+++ b/tests/test-log.t	Sun Dec 25 20:35:16 2011 +0900
@@ -1157,3 +1157,56 @@
   $ hg log --template='{rev}:{node}\n' --hidden
   1:a765632148dc55d38c35c4f247c618701886cb2f
   0:9f758d63dcde62d547ebfb08e1e7ee96535f2b05
+
+clear extensions configuration
+  $ echo '[extensions]' >> $HGRCPATH
+  $ echo "hidden=!" >> $HGRCPATH
+  $ cd ..
+
+test -u/-k for problematic encoding
+# unicode: cp932:
+# u30A2    0x83 0x41(= 'A')
+# u30C2    0x83 0x61(= 'a')
+
+  $ hg init problematicencoding
+  $ cd problematicencoding
+
+  $ python > setup.sh <<EOF
+  > print u'''
+  > echo a > text
+  > hg add text
+  > hg --encoding utf-8 commit -u '\u30A2' -m none
+  > echo b > text
+  > hg --encoding utf-8 commit -u '\u30C2' -m none
+  > echo c > text
+  > hg --encoding utf-8 commit -u none -m '\u30A2'
+  > echo d > text
+  > hg --encoding utf-8 commit -u none -m '\u30C2'
+  > '''.encode('utf-8')
+  > EOF
+  $ sh < setup.sh
+
+test in problematic encoding
+  $ python > test.sh <<EOF
+  > print u'''
+  > hg --encoding cp932 log --template '{rev}\\n' -u '\u30A2'
+  > echo ====
+  > hg --encoding cp932 log --template '{rev}\\n' -u '\u30C2'
+  > echo ====
+  > hg --encoding cp932 log --template '{rev}\\n' -k '\u30A2'
+  > echo ====
+  > hg --encoding cp932 log --template '{rev}\\n' -k '\u30C2'
+  > '''.encode('cp932')
+  > EOF
+  $ sh < test.sh
+  0
+  ====
+  1
+  ====
+  2
+  0
+  ====
+  3
+  1
+
+  $ cd ..