[PATCH 2 of 3 RFC] lfs: convert threshold testing to use a matcher

Sat Dec 30 17:36:01 EST 2017

Excerpts from Matt Harbison's message of 2017-12-29 16:43:23 -0500:
> lfs: convert threshold testing to use a matcher
> [...] 
> --- a/mercurial/fileset.py
> +++ b/mercurial/fileset.py
> @@ -380,7 +380,14 @@
>      else:
>          raise error.ParseError(_("couldn't parse size: %s") % expr)
>  
> -    return [f for f in mctx.existing() if m(mctx.ctx[f].size())]
> +    # This explodes when converting largefiles -> lfs.  It looks like the first
> +    # commit converts fine, and then when converting the second commit,
> +    # 'lfs.txt' is one of the files tested here, even though it wasn't modified
> +    # in this commit (but it was added in the parent).
> +
> +    # XXX: 'f in mctx.ctx and ...' doesn't short circuit None.size(), so
> +    # something appears to be wrong with the ctx for conversions.
> +    return [f for f in mctx.existing() if mctx.ctx[f] and m(mctx.ctx[f].size())]

As you can see here, this scans all existing files to do the "size(> x)"
test. The old code is O(1) to do the same test. In a repo with millions of
files, this will basically make things unusable.

>  @predicate('encoding(name)', callexisting=True)
>  def encoding(mctx, x):

FWIW, I had an internal patch to make the filter more flexible while still
maintain good performance. It was accepted but didn't land because the
feature was not urgently needed and there was concern about conflicting with
the fileset synatx.

# HG changeset patch
# User Jun Wu <quark at fb.com>
# Date 1493767928 25200
#      Tue May 02 16:32:08 2017 -0700
# Node ID bc949f9b315e3c3967853f209b7d8f8a5e81a6ce
# Parent  c95721badfb48141e376d428d0621784563688b4
lfs: add a small language to filter files

We want a way to specify what files to be converted to LFS at commit time.
And per discussion, we also want to specify what files to skip text diff or
merge in another config option. The current `lfs.threshold` config option
could not satisfy complex needs.

This diff adds a small language for that. It's self-explained, and deals
with both simple and complex cases. For example:

  always                 # everything
  >20MB                  # larger than 20MB
  !.txt                  # except for .txt files
  .zip | .tar.gz | .7z   # some types of compressed files
  /bin                   # files under "bin" in the project root
  (.php & >2MB) | (.js & >5MB) | .tar.gz | (/bin & !/bin/README) | >1GB

diff --git a/hgext3rd/lfs/filterlang.py b/hgext3rd/lfs/filterlang.py
new file mode 100644
--- /dev/null
+++ b/hgext3rd/lfs/filterlang.py
@@ -0,0 +1,95 @@
+# filterlang.py - a simple language to select files
+#
+# Copyright 2017 Facebook, Inc.
+#
+# This software may be used and distributed according to the terms of the
+# GNU General Public License version 2 or any later version.
+
+from __future__ import absolute_import
+
+import itertools
+from mercurial import (
+    error,
+    parser,
+    util,
+)
+
+elements = {
+    # copied from mercurial/revsetlang.py, simplified - only boolean operations
+    # and parentheses are interesting here.
+    # token-type: binding-strength, primary, prefix, infix, suffix
+    "(":      (21, None,     ("group", 1, ")"), None,       None),
+    "!":      (10, None,     ("not", 10),       None,       None),
+    "&":      (5,  None,     None,              ("and", 5), None),
+    "|":      (4,  None,     None,              ("or", 4),  None),
+    ")":      (0,  None,     None,              None,       None),
+    "symbol": (0,  "symbol", None,              None,       None),
+    "end":    (0,  None,     None,              None,       None),
+}
+
+def _tokenize(text):
+    text = memoryview(text) # make slice zero-copy
+    special = ' ()&|!'
+    pos = 0
+    l = len(text)
+    while pos < l:
+        symbol = ''.join(itertools.takewhile(lambda ch: ch not in special,
+                                             text[pos:]))
+        if symbol:
+            yield ('symbol', symbol, pos)
+            pos += len(symbol)
+        else: # special char
+            if text[pos] != ' ': # ignore space silently
+                yield (text[pos], None, pos)
+            pos += 1
+    yield ('end', None, pos)
+
+def _parse(text):
+    tree, pos = parser.parser(elements).parse(_tokenize(text))
+    if pos != len(text):
+        raise error.ParseError('invalid token', pos)
+    return tree
+
+def _compile(tree):
+    op = tree[0]
+    if op == 'symbol':
+        name = tree[1]
+        op = name[0]
+        if op == '>': # size greater than test, ex. ">4M"
+            size = util.sizetoint(name[1:])
+            return lambda n, s: s > size
+        elif op == '.': # file extension test, ex. ".tar.gz"
+            return lambda n, s: n.endswith(name)
+        elif op == '/': # directory or full path test
+            p = name[1:].rstrip('/') # prefix
+            pl = len(p)
+            f = lambda n, s: n.startswith(p) and (len(n) == pl or n[pl] == '/')
+            return f
+        elif name == 'always': # always true
+            return lambda n, s: True
+        else:
+            raise error.ParseError('invalid symbol', name)
+    elif op in ['or', 'and']:
+        funcs = [_compile(t) for t in tree[1:]]
+        summary = {'or': any, 'and': all}[op]
+        return lambda n, s: summary(f(n, s) for f in funcs)
+    elif op == 'not':
+        return lambda n, s: not _compile(tree[1])(n, s)
+    elif op == 'group':
+        return _compile(tree[1])
+    else:
+        raise error.ProgrammingError('illegal tree: %r' % tree)
+
+def compile(text):
+    """generate a function (path, size) -> bool from filter specification.
+
+    "text" could contain "&", "|", "()", "!" for common logic operations,
+    ".extname" for file extension test, ">size" for size test, "/dir/subdir"
+    for directory test. The text could also be "always" or "!always" if no test
+    is wanted.
+
+    For example, "(.php & >10MB) | .zip | (/bin & !/bin/README)" will catch all
+    php files whose size is greater than 10 MB, all files whose name ends with
+    ".zip", and all files under "bin" in the repo root except for "bin/README".
+    """
+    return _compile(_parse(text))
diff --git a/tests/test-lfs-filterlang.py b/tests/test-lfs-filterlang.py
new file mode 100644
--- /dev/null
+++ b/tests/test-lfs-filterlang.py
@@ -0,0 +1,30 @@
+import os
+import sys
+
+# make it runnable directly without run-tests.py
+sys.path[0:0] = [os.path.join(os.path.dirname(__file__), '..')]
+
+from lfs import filterlang
+
+def check(text, truecases, falsecases):
+    f = filterlang.compile(text)
+    for args in truecases:
+        if not f(*args):
+            print('unexpected: %r should include %r' % (text, args))
+    for args in falsecases:
+        if f(*args):
+            print('unexpected: %r should exclude %r' % (text, args))
+
+check('always', [('a.php', 123), ('b.txt', 0)], [])
+check('!!!!((!(!!always)))', [], [('a.php', 123), ('b.txt', 0)])
+
+check('/a & (.b | .c)', [('a/b.b', 0), ('a/c.c', 0)], [('b/c.c', 0)])
+check('(/a & .b) | .c', [('a/b.b', 0), ('a/c.c', 0), ('b/c.c', 0)], [])
+
+check('!!.bin | >20B | /bin | !>10 | !always',
+      [('a.bin', 11), ('b.txt', 21), ('bin/abc', 11)],
+      [('a.notbin', 11), ('b.txt', 11), ('bin2/abc', 11)])
+
+check('(.php & >10KB) | .zip | (/bin & !/bin/README) | >1M',
+      [('a.php', 15000), ('a.zip', 0), ('bin/a', 0), ('bin/README', 1e7)],
+      [('a.php', 5000), ('b.zip2', 0), ('t/bin/a', 0), ('bin/README', 1)])