[PATCH 2 of 3] lfs: add a small language to filter files

Thu Jan 4 23:58:55 EST 2018

# HG changeset patch
# User Matt Harbison <matt_harbison at yahoo.com>
# Date 1514704880 18000
#      Sun Dec 31 02:21:20 2017 -0500
# Node ID 8c20ade835ce43441c61e56e63d9bf92deaacd55
# Parent  2798cb4faacdae2db46e84ba0f3beaf506848915
lfs: add a small language to filter files

This patch was authored by Jun Wu for the fb-experimental repo, to avoid using
matcher for efficiency[1].  All I've changed here is the package (hgext3rd ->
hgext), and fixed up the imports in the test file (use absolute_import,
print_function, and 'from lfs import ...' -> 'from hgext.lfs import...').

We want a way to specify which files are converted to LFS at commit time.
And per discussion, we also want to specify, in another config option, which
files should skip text diff or merge. The current `lfs.threshold` config
option cannot satisfy complex needs.

This diff adds a small language for that. It's self-explanatory, and deals
with both simple and complex cases. For example:

  always                 # everything
  >20MB                  # larger than 20MB
  !.txt                  # except for .txt files
  .zip | .tar.gz | .7z   # some types of compressed files
  /bin                   # files under "bin" in the project root
  (.php & >2MB) | (.js & >5MB) | .tar.gz | (/bin & !/bin/README) | >1GB

[1] https://www.mercurial-scm.org/pipermail/mercurial-devel/2017-December/109387.html

diff --git a/hgext/lfs/filterlang.py b/hgext/lfs/filterlang.py
new file mode 100644
--- /dev/null
+++ b/hgext/lfs/filterlang.py
@@ -0,0 +1,95 @@
+# filterlang.py - a simple language to select files
+#
+# Copyright 2017 Facebook, Inc.
+#
+# This software may be used and distributed according to the terms of the
+# GNU General Public License version 2 or any later version.
+
+from __future__ import absolute_import
+
+import itertools
+from mercurial import (
+    error,
+    parser,
+    util,
+)
+
+elements = {
+    # adapted from mercurial/revsetlang.py and reduced to what this language
+    # needs: boolean operators, parentheses, plain symbols and end-of-input.
+    # token-type: binding-strength, primary, prefix, infix, suffix
+    "(":      (21, None,     ("group", 1, ")"), None,       None),
+    "!":      (10, None,     ("not", 10),       None,       None),
+    "&":      (5,  None,     None,              ("and", 5), None),
+    "|":      (4,  None,     None,              ("or", 4),  None),
+    ")":      (0,  None,     None,              None,       None),
+    "symbol": (0,  "symbol", None,              None,       None),
+    "end":    (0,  None,     None,              None,       None),
+}
+
+def _tokenize(text):
+    text = memoryview(text) # a view: text[pos:] below does not copy
+    special = ' ()&|!' # characters that end a symbol
+    pos = 0
+    l = len(text)
+    while pos < l:
+        symbol = ''.join(itertools.takewhile(lambda ch: ch not in special,
+                                             text[pos:]))
+        if symbol:
+            yield ('symbol', symbol, pos) # (type, value, position)
+            pos += len(symbol)
+        else: # text[pos] is a special char
+            if text[pos] != ' ': # spaces only separate tokens
+                yield (text[pos], None, pos) # the char is the token type
+            pos += 1
+    yield ('end', None, pos) # here pos == len(text)
+
+def _parse(text):
+    tree, end = parser.parser(elements).parse(_tokenize(text))
+    if end == len(text): # parsing stopped at the very end of text
+        return tree
+    raise error.ParseError('invalid token', end)
+
+def _compile(tree):
+    """compile a parsed tree into a function (path, size) -> bool"""
+    op = tree[0]
+    if op == 'symbol':
+        name = tree[1]
+        op = name[0]
+        if op == '>': # size greater than test, ex. ">4M"
+            size = util.sizetoint(name[1:])
+            return lambda n, s: s > size
+        elif op == '.': # file extension test, ex. ".tar.gz"
+            return lambda n, s: n.endswith(name)
+        elif op == '/': # directory or full path test
+            p = name[1:].rstrip('/') # prefix
+            pl = len(p)
+            f = lambda n, s: n.startswith(p) and (len(n) == pl or n[pl] == '/')
+            return f
+        elif name == 'always': # always true
+            return lambda n, s: True
+        raise error.ParseError('invalid symbol', name)
+    elif op in ['or', 'and']:
+        funcs = [_compile(t) for t in tree[1:]]
+        summary = {'or': any, 'and': all}[op]
+        return lambda n, s: summary(f(n, s) for f in funcs)
+    elif op == 'not':
+        func = _compile(tree[1]) # once, so bad symbols are reported now
+        return lambda n, s: not func(n, s)
+    elif op == 'group':
+        return _compile(tree[1])
+    raise error.ProgrammingError('illegal tree: %r' % tree)
+
+def compile(text):
+    """generate a function (path, size) -> bool from filter specification.
+
+    "text" may combine tests with "&" (and), "|" (or), "!" (not) and "()"
+    (grouping).  The tests are ".extname" (path ends with the extension),
+    ">size" (size is strictly greater; parsed by util.sizetoint) and
+    "/dir/subdir" (path is that entry or inside it).  "always" matches all.
+
+    For example, "(.php & >10MB) | .zip | (/bin & !/bin/README)" will catch all
+    php files whose size is greater than 10 MB, all files whose name ends with
+    ".zip", and all files under "bin" in the repo root except for "bin/README".
+    """
+    return _compile(_parse(text))
diff --git a/tests/test-lfs-filterlang.py b/tests/test-lfs-filterlang.py
new file mode 100644
--- /dev/null
+++ b/tests/test-lfs-filterlang.py
@@ -0,0 +1,33 @@
+from __future__ import absolute_import
+from __future__ import print_function
+
+import os
+import sys
+
+# make it runnable directly without run-tests.py
+sys.path[0:0] = [os.path.join(os.path.dirname(__file__), '..')]
+
+from hgext.lfs import filterlang
+
+def check(text, truecases, falsecases):
+    f = filterlang.compile(text) # f(path, size) -> bool
+    for args in truecases: # each case is a (path, size) tuple
+        if not f(*args):
+            print('unexpected: %r should include %r' % (text, args))
+    for args in falsecases: # these must not match
+        if f(*args):
+            print('unexpected: %r should exclude %r' % (text, args))
+
+check('always', [('a.php', 123), ('b.txt', 0)], []) # matches everything
+check('!!!!((!(!!always)))', [], [('a.php', 123), ('b.txt', 0)]) # 7 negations
+
+check('/a & (.b | .c)', [('a/b.b', 0), ('a/c.c', 0)], [('b/c.c', 0)])
+check('(/a & .b) | .c', [('a/b.b', 0), ('a/c.c', 0), ('b/c.c', 0)], [])
+
+check('!!.bin | >20B | /bin | !>10 | !always',
+      [('a.bin', 11), ('b.txt', 21), ('bin/abc', 11)], # via .bin, >20B, /bin
+      [('a.notbin', 11), ('b.txt', 11), ('bin2/abc', 11)]) # 11 fails "!>10"
+
+check('(.php & >10KB) | .zip | (/bin & !/bin/README) | >1M',
+      [('a.php', 15000), ('a.zip', 0), ('bin/a', 0), ('bin/README', 1e7)],
+      [('a.php', 5000), ('b.zip2', 0), ('t/bin/a', 0), ('bin/README', 1)])