[PATCH 4 of 7] templater: introduce one-pass parsing of nested template strings

Sun Jul 12 09:54:59 CDT 2015

# HG changeset patch
# User Yuya Nishihara <yuya at tcha.org>
# Date 1434377495 -32400
#      Mon Jun 15 23:11:35 2015 +0900
# Node ID 2651605c4d75484687faf5c97653c24580dce6fa
# Parent  04e18b2b2519bac75601f659c0f5cea7ea610a4c
templater: introduce one-pass parsing of nested template strings

Instead of re-parsing quoted strings as templates, the tokenizer can delegate
the parsing of nested template strings to the parser. It has two benefits:

 1. syntax errors can be reported with absolute positions
 2. nested templates can use quotes just like in a shell: "{"{rev}"}"

It doesn't sound nice that the tokenizer recurses into the parser. We could
instead make the tokenizer itself recursive, but it would be much more
complicated because we would have to adjust binding strengths carefully and
put dummy infix operators to concatenate template fragments.

Now the "string" token (a quoted string without the r"" prefix) is never
emitted by the tokenizer. It will be removed by the next patch.

diff --git a/mercurial/templater.py b/mercurial/templater.py
--- a/mercurial/templater.py
+++ b/mercurial/templater.py
@@ -24,6 +24,7 @@ elements = {
     "symbol": (0, ("symbol",), None),
     "string": (0, ("template",), None),
     "rawstring": (0, ("rawstring",), None),
+    "template": (0, ("template",), None),
     "end": (0, None, None),
 }
 
@@ -35,6 +36,11 @@ def tokenize(program, start, end):
             pass
         elif c in "(,)%|": # handle simple operators
             yield (c, None, pos)
+        elif c in '"\'': # handle quoted templates
+            s = pos + 1
+            data, pos = _parsetemplate(program, s, end, c)
+            yield ('template', data, s)
+            pos -= 1
         elif (c in '"\'' or c == 'r' and
               program[pos:pos + 2] in ("r'", 'r"')): # handle quoted strings
             if c == 'r':
@@ -89,7 +95,7 @@ def tokenize(program, start, end):
                 pos += 1
                 token = 'rawstring'
             else:
-                token = 'string'
+                token = 'template'
             quote = program[pos:pos + 2]
             s = pos = pos + 2
             while pos < end: # find closing escaped quote
@@ -102,6 +108,8 @@ def tokenize(program, start, end):
                         data = program[s:pos].decode('string-escape')
                     except ValueError: # unbalanced escapes
                         raise error.ParseError(_("syntax error"), s)
+                    if token == 'template':
+                        data = _parsetemplate(data, 0, len(data))[0]
                     yield (token, data, s)
                     pos += 1
                     break
@@ -127,27 +135,47 @@ def tokenize(program, start, end):
         pos += 1
     raise error.ParseError(_("unterminated template expansion"), start)
 
-def _parsetemplate(tmpl, start, stop):
+def _parsetemplate(tmpl, start, stop, quote=''):
+    r"""
+    >>> _parsetemplate('foo{bar}"baz', 0, 12)
+    ([('string', 'foo'), ('symbol', 'bar'), ('string', '"baz')], 12)
+    >>> _parsetemplate('foo{bar}"baz', 0, 12, quote='"')
+    ([('string', 'foo'), ('symbol', 'bar')], 9)
+    >>> _parsetemplate('foo"{bar}', 0, 9, quote='"')
+    ([('string', 'foo')], 4)
+    >>> _parsetemplate(r'foo\"bar"baz', 0, 12, quote='"')
+    ([('string', 'foo"'), ('string', 'bar')], 9)
+    >>> _parsetemplate(r'foo\\"bar', 0, 10, quote='"')
+    ([('string', 'foo\\\\')], 6)
+    """
     parsed = []
+    sepchars = '{' + quote
     pos = start
     p = parser.parser(elements)
     while pos < stop:
-        n = tmpl.find('{', pos, stop)
+        n = min((tmpl.find(c, pos, stop) for c in sepchars),
+                key=lambda n: (n < 0, n))
         if n < 0:
             parsed.append(('string', tmpl[pos:stop]))
             pos = stop
             break
+        c = tmpl[n]
         bs = (n - pos) - len(tmpl[pos:n].rstrip('\\'))
         if bs % 2 == 1:
             # escaped (e.g. '\{', '\\\{', but not '\\{')
-            parsed.append(('string', (tmpl[pos:n - 1] + "{")))
+            parsed.append(('string', (tmpl[pos:n - 1] + c)))
             pos = n + 1
             continue
         if n > pos:
             parsed.append(('string', tmpl[pos:n]))
+        if c == quote:
+            return parsed, n + 1
 
         parseres, pos = p.parse(tokenize(tmpl, n + 1, stop))
         parsed.append(parseres)
+
+    if quote:
+        raise error.ParseError(_("unterminated string"), start)
     return parsed, pos
 
 def compiletemplate(tmpl, context):
@@ -182,7 +210,7 @@ def getfilter(exp, context):
 
 def gettemplate(exp, context):
     if exp[0] == 'template':
-        return compiletemplate(exp[1], context)
+        return [compileexp(e, context, methods) for e in exp[1]]
     if exp[0] == 'symbol':
         # unlike runsymbol(), here 'symbol' is always taken as template name
         # even if it exists in mapping. this allows us to override mapping
@@ -215,7 +243,7 @@ def runsymbol(context, mapping, key):
     return v
 
 def buildtemplate(exp, context):
-    ctmpl = compiletemplate(exp[1], context)
+    ctmpl = [compileexp(e, context, methods) for e in exp[1]]
     if len(ctmpl) == 1:
         return ctmpl[0]  # fast path for string with no template fragment
     return (runtemplate, ctmpl)
diff --git a/tests/test-command-template.t b/tests/test-command-template.t
--- a/tests/test-command-template.t
+++ b/tests/test-command-template.t
@@ -2541,6 +2541,16 @@ Behind the scenes, this will throw Value
   abort: template filter 'datefilter' is not compatible with keyword 'author'
   [255]
 
+Error in nested template:
+
+  $ hg log -T '{"date'
+  hg: parse error at 2: unterminated string
+  [255]
+
+  $ hg log -T '{"foo{date|=}"}'
+  hg: parse error at 11: syntax error
+  [255]
+
 Thrown an error if a template function doesn't exist
 
   $ hg tip --template '{foo()}\n'
@@ -2952,7 +2962,7 @@ escaped single quotes and errors:
   $ hg log -r 2 -T "{if(rev, '{if(rev, r\'foo\')}')}"'\n'
   foo
   $ hg log -r 2 -T '{if(rev, "{if(rev, \")}")}\n'
-  hg: parse error at 11: unterminated string
+  hg: parse error at 21: unterminated string
   [255]
   $ hg log -r 2 -T '{if(rev, \"\\"")}\n'
   hg: parse error at 11: syntax error
@@ -3069,6 +3079,14 @@ Test string escaping in nested expressio
   3:\x6eo user, \x6eo domai\x6e
   4:\x5c\x786eew bra\x5c\x786ech
 
+Test quotes in nested expression are evaluated just like a $(command)
+substitution in POSIX shells:
+
+  $ hg log -R a -r 8 -T '{"{"{rev}:{node|short}"}"}\n'
+  8:95c24699272e
+  $ hg log -R a -r 8 -T '{"{"\{{rev}} \"{node|short}\""}"}\n'
+  {8} "95c24699272e"
+
 Test recursive evaluation:
 
   $ hg init r
diff --git a/tests/test-doctest.py b/tests/test-doctest.py
--- a/tests/test-doctest.py
+++ b/tests/test-doctest.py
@@ -26,6 +26,7 @@ testmod('mercurial.revset')
 testmod('mercurial.store')
 testmod('mercurial.subrepo')
 testmod('mercurial.templatefilters')
+testmod('mercurial.templater')
 testmod('mercurial.ui')
 testmod('mercurial.url')
 testmod('mercurial.util')