[PATCH 1 of 2 v2] matcher: use re2 bindings if available

Bryan O'Sullivan bos at serpentine.com
Fri Jun 1 17:35:16 CDT 2012


# HG changeset patch
# User Bryan O'Sullivan <bryano at fb.com>
# Date 1338589580 25200
# Node ID 45de1c920fcaea462542a2eaee90c3ac0a83162e
# Parent  17d31c0fe181484e513832afa1c7a1095685dc25
matcher: use re2 bindings if available

There are two sets of Python re2 bindings available on the internet;
this code works with both.

Using re2 can greatly improve "hg status" performance when a .hgignore
file becomes even modestly complex.

Example: "hg status" on a clean tree with 134K files, where "hg
debugignore" reports a regexp 4256 bytes in size.

  no .hgignore: 1.76 sec
  Python re:    2.79
  re2:          1.82

The overhead of regexp matching drops from 1.03 seconds with stock
re to 0.06 with re2.

(For comparison, a git repo with the same contents and .gitignore
file runs "git status -s" in 1.71 seconds, i.e. only slightly faster
than hg with re2.)

diff --git a/mercurial/match.py b/mercurial/match.py
--- a/mercurial/match.py
+++ b/mercurial/match.py
@@ -9,6 +9,14 @@ import re
 import scmutil, util, fileset
 from i18n import _
 
+def _rematcher(pat):
+    m = util.compilere(pat)
+    try:
+        # slightly faster, provided by facebook's re2 bindings
+        return m.test_match
+    except AttributeError:
+        return m.match
+
 def _expandsets(pats, ctx):
     '''convert set: patterns into a list of files in the given context'''
     fset = set()
@@ -280,7 +288,7 @@ def _buildregexmatch(pats, tail):
         pat = '(?:%s)' % '|'.join([_regex(k, p, tail) for (k, p) in pats])
         if len(pat) > 20000:
             raise OverflowError
-        return pat, re.compile(pat).match
+        return pat, _rematcher(pat)
     except OverflowError:
         # We're using a Python with a tiny regex engine and we
         # made it explode, so we'll divide the pattern list in two
@@ -294,7 +302,7 @@ def _buildregexmatch(pats, tail):
     except re.error:
         for k, p in pats:
             try:
-                re.compile('(?:%s)' % _regex(k, p, tail))
+                _rematcher('(?:%s)' % _regex(k, p, tail))
             except re.error:
                 raise util.Abort(_("invalid pattern (%s): %s") % (k, p))
         raise util.Abort(_("invalid pattern"))
diff --git a/mercurial/util.py b/mercurial/util.py
--- a/mercurial/util.py
+++ b/mercurial/util.py
@@ -617,6 +617,30 @@ def checkcase(path):
     except OSError:
         return True
 
+try:
+    import re2
+    _re2 = None
+except ImportError:
+    _re2 = False
+
+def compilere(pat):
+    '''Compile a regular expression, using re2 if possible
+
+    For best performance, use only re2-compatible regexp features.'''
+    global _re2
+    if _re2 is None:
+        try:
+            re2.compile
+            _re2 = True
+        except ImportError:
+            _re2 = False
+    if _re2:
+        try:
+            return re2.compile(pat)
+        except re2.error:
+            pass
+    return re.compile(pat)
+
 _fspathcache = {}
 def fspath(name, root):
     '''Get name in the case stored in the filesystem


More information about the Mercurial-devel mailing list