[PATCH 3 of 4 STABLE] i18n: use "encoding.lower()" to normalize specified string for revset

Sat Dec 24 08:31:13 CST 2011

# HG changeset patch
# User FUJIWARA Katsunori <foozy at lares.dti.ne.jp>
# Date 1324735129 -32400
# Branch stable
# Node ID f51cde5621601b93245da68c3ba3723730bac188
# Parent  4e856321cb9978bd2d00685b489276d127e0df4c
i18n: use "encoding.lower()" to normalize specified string for revset

Some problematic encodings (e.g. cp932) use ASCII alphabetic characters
within the byte sequences of multi-byte characters.

"str.lower()" on such a byte sequence may treat distinct characters as
the same one, and cause unexpected log matching.

This patch uses "encoding.lower()" instead of "str.lower()" to
normalize strings for comparison.

diff -r 4e856321cb99 -r f51cde562160 mercurial/revset.py

--- a/mercurial/revset.py	Sat Dec 24 22:58:49 2011 +0900
+++ b/mercurial/revset.py	Sat Dec 24 22:58:49 2011 +0900
@@ -11,6 +11,7 @@
 import bookmarks as bookmarksmod
 import match as matchmod
 from i18n import _
+import encoding
 
 elements = {
     "(": (20, ("group", 1, ")"), ("func", 1, ")")),
@@ -233,8 +234,8 @@
     Alias for ``user(string)``.
     """
     # i18n: "author" is a keyword
-    n = getstring(x, _("author requires a string")).lower()
-    return [r for r in subset if n in repo[r].user().lower()]
+    n = encoding.lower(getstring(x, _("author requires a string")))
+    return [r for r in subset if n in encoding.lower(repo[r].user())]
 
 def bisect(repo, subset, x):
     """``bisect(string)``
@@ -376,11 +377,11 @@
     Search commit message for string. The match is case-insensitive.
     """
     # i18n: "desc" is a keyword
-    ds = getstring(x, _("desc requires a string")).lower()
+    ds = encoding.lower(getstring(x, _("desc requires a string")))
     l = []
     for r in subset:
         c = repo[r]
-        if ds in c.description().lower():
+        if ds in encoding.lower(c.description()):
             l.append(r)
     return l
 
@@ -522,12 +523,12 @@
     string. The match is case-insensitive.
     """
     # i18n: "keyword" is a keyword
-    kw = getstring(x, _("keyword requires a string")).lower()
+    kw = encoding.lower(getstring(x, _("keyword requires a string")))
     l = []
     for r in subset:
         c = repo[r]
         t = " ".join(c.files() + [c.user(), c.description()])
-        if kw in t.lower():
+        if kw in encoding.lower(t):
             l.append(r)
     return l
 
diff -r 4e856321cb99 -r f51cde562160 tests/test-revset-i18n.t
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/test-revset-i18n.t	Sat Dec 24 22:58:49 2011 +0900
@@ -0,0 +1,46 @@
+  $ hg init repo1
+  $ cd repo1
+
+  $ HGENCODING=utf-8
+  $ export HGENCODING
+
+# unicode: cp932:
+# u30A2    0x83 0x41(= 'A')
+# u30C2    0x83 0x61(= 'a')
+
+create rev #0
+  $ echo a > text
+  $ hg add text
+  $ python -c "print u'hg commit -u \'\u30A2\' -m none'.encode('utf-8')" | sh
+
+create rev #1
+  $ echo b > text
+  $ python -c "print u'hg commit -u \'\u30C2\' -m none'.encode('utf-8')" | sh
+
+create rev #2
+  $ echo c > text
+  $ python -c "print u'hg commit -u none -m \'\u30A2\''.encode('utf-8')" | sh
+
+create rev #3
+  $ echo d > text
+  $ python -c "print u'hg commit -u none -m \'\u30C2\''.encode('utf-8')" | sh
+
+change encoding to cp932
+  $ HGENCODING=cp932
+
+  $ python -c "print u'hg log -r \'author(\u30A2)\' --template \'{rev}\\n\''.encode('cp932')" | sh
+  0
+  $ python -c "print u'hg log -r \'author(\u30C2)\' --template \'{rev}\\n\''.encode('cp932')" | sh
+  1
+
+  $ python -c "print u'hg log -r \'desc(\u30A2)\' --template \'{rev}\\n\''.encode('cp932')" | sh
+  2
+  $ python -c "print u'hg log -r \'desc(\u30C2)\' --template \'{rev}\\n\''.encode('cp932')" | sh
+  3
+
+  $ python -c "print u'hg log -r \'keyword(\u30A2)\' --template \'{rev}\\n\''.encode('cp932')" | sh
+  0
+  2
+  $ python -c "print u'hg log -r \'keyword(\u30C2)\' --template \'{rev}\\n\''.encode('cp932')" | sh
+  1
+  3