[PATCH 3 of 4 RFC] i18n: add hook point to make tokenizing process encoding-safe

FUJIWARA Katsunori foozy at lares.dti.ne.jp
Thu May 24 12:04:27 CDT 2012


# HG changeset patch
# User FUJIWARA Katsunori <foozy at lares.dti.ne.jp>
# Date 1337873761 -32400
# Branch stable
# Node ID 1d5a60c7f44f106af3c0d56139a1b6d0fd3d0b3c
# Parent  a85e6240d0ab23191d158390095dd48852dcdc39
i18n: add hook point to make tokenizing process encoding-safe

The added hook point is "tokenize()" in "mercurial/encoding.py".

When win32mbcs is enabled, "tokenize()" is replaced with a specific
implementation that does the following:

    1. convert the specified byte string to a unicode object
    2. invoke the tokenizer with the unicode string to get a generator from it
    3. "pos" of each returned value is a position in the unicode string, so
       recalculate it as a position in the byte sequence
    4. convert "token" and "value" from unicode back to byte sequences, and
       yield them

Step (3) above is required because the last "pos" value should be equal to
the length of the specified byte sequence; otherwise an exception is raised.

This affects the following invocations:

    - mercurial.fileset.tokenize()
    - mercurial.revset.tokenize()
    - mercurial.templater.tokenize()
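
As a minimal sketch (not part of the patch), a caller-side view of the hook
point, using revset.tokenize and an arbitrary ASCII revset as an example:

    from mercurial import encoding, revset

    spec = 'branch(default)'
    # the parser goes through the hook point instead of calling the
    # tokenizer directly; with win32mbcs enabled this transparently
    # becomes safetokenize(revset.tokenize, spec)
    for token, value, pos in encoding.tokenize(revset.tokenize, spec):
        if token == 'end':
            # the invariant preserved by step (3): the final "pos"
            # equals the length of the original byte string
            assert pos == len(spec)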

diff -r a85e6240d0ab -r 1d5a60c7f44f hgext/win32mbcs.py
--- a/hgext/win32mbcs.py	Fri May 25 00:27:13 2012 +0900
+++ b/hgext/win32mbcs.py	Fri May 25 00:36:01 2012 +0900
@@ -164,6 +164,23 @@
         raise util.Abort(_("[win32mbcs] conversion in filtering failed with"
                          " %s encoding\n") % (_encoding))
 
+def _tokenize(tokenizer, s):
+    try:
+        us = decode(s)
+        for token, value, pos in tokenizer(us):
+            # re-calculate position in MBCS string
+            pos = len(encode(us[:pos]))
+            yield encode((token, value, pos))
+    except UnicodeError:
+        raise util.Abort(_("[win32mbcs] conversion in tokenizing failed with"
+                         " %s encoding\n") % (_encoding))
+
+def safetokenize(tokenizer, s):
+    if isinstance(s, unicode):
+        return tokenizer(s)
+    else:
+        return _tokenize(tokenizer, s)
+
 def replacename(name, replacement):
     module, name = name.rsplit('.', 1)
     module = sys.modules[module]
@@ -211,6 +228,7 @@
         wrapname("mercurial.osutil.listdir", wrapperforlistdir)
         replacename("mercurial.encoding.escape", safeescape)
         replacename("mercurial.encoding.filter", safefilter)
+        replacename("mercurial.encoding.tokenize", safetokenize)
         # Check sys.args manually instead of using ui.debug() because
         # command line options is not yet applied when
         # extensions.loadall() is called.
diff -r a85e6240d0ab -r 1d5a60c7f44f mercurial/encoding.py
--- a/mercurial/encoding.py	Fri May 25 00:27:13 2012 +0900
+++ b/mercurial/encoding.py	Fri May 25 00:36:01 2012 +0900
@@ -223,6 +223,12 @@
     """
     return filter(s)
 
+def tokenize(tokenizer, s):
+    """Hook point to ensure that the tokenizer parses specified string safely
+    in current encoding.
+    """
+    return tokenizer(s)
+
 def toutf8b(s):
     '''convert a local, possibly-binary string into UTF-8b
 
diff -r a85e6240d0ab -r 1d5a60c7f44f mercurial/parser.py
--- a/mercurial/parser.py	Fri May 25 00:27:13 2012 +0900
+++ b/mercurial/parser.py	Fri May 25 00:36:01 2012 +0900
@@ -15,7 +15,7 @@
 # an action is a tree node name, a tree label, and an optional match
 # __call__(program) parses program into a labelled tree
 
-import error
+import error, encoding
 from i18n import _
 
 class parser(object):
@@ -77,7 +77,7 @@
         return expr
     def parse(self, message):
         'generate a parse tree from a message'
-        self._iter = self._tokenizer(message)
+        self._iter = encoding.tokenize(self._tokenizer, message)
         self._advance()
         res = self._parse()
         token, value, pos = self.current

