[PATCH 3 of 3 STABLE] revset: more optimally handle __sub__ (issue4352)

Gregory Szorc gregory.szorc at gmail.com
Sun Sep 7 20:44:36 CDT 2014


# HG changeset patch
# User Gregory Szorc <gregory.szorc at gmail.com>
# Date 1410111760 25200
#      Sun Sep 07 10:42:40 2014 -0700
# Node ID e2d2918c41fb0fe878afc3a2796f5a1b68939f46
# Parent  5857865f46f0275786b68ae8dc64c579edcddabb
revset: more optimally handle __sub__ (issue4352)

This patch introduces the _diffset class. This class is used to more
optimally compute set differences (__sub__).

The initial implementation in this patch is not yet fully optimal.
If the set on the right hand side of the "-" operator is a lazy set,
it will be evaluated fully before results are returned. In the
future, this set should be iterated much like _addset does it.

The revset benchmarks run against a clone of the Firefox repository
show the following statistically significant changes (before
followed by after):

revset #6: roots(0::tip)
wall 0.712861 comb 0.710000 user 0.700000 sys 0.010000 (best of 14)
wall 0.745027 comb 0.750000 user 0.750000 sys 0.000000 (best of 13)

revset #9: author(lmoscovicz) or author(mpm)
wall 39.410163 comb 39.140000 user 38.340000 sys 0.800000 (best of 3)
wall 45.392911 comb 45.030000 user 43.740000 sys 1.290000 (best of 3)

revset #10: author(mpm) or author(lmoscovicz)
wall 39.231088 comb 38.950000 user 38.180000 sys 0.770000 (best of 3)
wall 45.568156 comb 45.170000 user 43.820000 sys 1.350000 (best of 3)

revset #16: roots((0::) - (0::tip))
wall 459.033609 comb 458.880000 user 458.850000 sys 0.030000 (best of 3)
wall  10.230451 comb  10.220000 user  10.200000 sys 0.020000 (best of 3)

revset #22: max(::(tip~20) - obsolete())
wall 0.888049 comb 0.890000 user 0.890000 sys 0.000000 (best of 11)
wall 1.040373 comb 1.030000 user 1.030000 sys 0.000000 (best of 10)

revset #24: (not public() - obsolete())
wall 0.377076 comb 0.380000 user 0.380000 sys 0.000000 (best of 26)
wall 0.191621 comb 0.190000 user 0.190000 sys 0.000000 (best of 50)

While there are a handful of small regressions, there is a major win
in revset #16. Since a version of this revset runs at clone time and
makes cloning the Firefox repository many minutes slower than it
should be, the author believes the temporary and minor regressions
are justified.

diff --git a/mercurial/revset.py b/mercurial/revset.py
--- a/mercurial/revset.py
+++ b/mercurial/revset.py
@@ -2350,9 +2350,9 @@ class lazyset(object):
     def __and__(self, x):
         return lazyset(self, x.__contains__)
 
     def __sub__(self, x):
-        return lazyset(self, lambda r: r not in x)
+        return _diffset(self, x)
 
     def __add__(self, x):
         return _addset(self, x)
 
@@ -2414,10 +2414,14 @@ class orderedlazyset(_orderedsetmixin, l
         return orderedlazyset(self, x.__contains__,
                 ascending=self._ascending)
 
     def __sub__(self, x):
-        return orderedlazyset(self, lambda r: r not in x,
-                ascending=self._ascending)
+        kwargs = {}
+        if self.isascending() and x.isascending():
+            kwargs['ascending'] = True
+        if self.isdescending() and x.isdescending():
+            kwargs['ascending'] = False
+        return _diffset(self, x, **kwargs)
 
     def __add__(self, x):
         kwargs = {}
         if self.isascending() and x.isascending():
@@ -2500,12 +2504,15 @@ class _lazysetiteratormixin(_orderedsetm
                 kwargs['ascending'] = False
         return _addset(self, other, **kwargs)
 
     def __sub__(self, other):
-        filterfunc = lambda r: r not in other
+        kwargs = {}
         if self._ascending is not None:
-            return orderedlazyset(self, filterfunc, ascending=self._ascending)
-        return lazyset(self, filterfunc)
+            if self.isascending() and other.isascending():
+                kwargs['ascending'] = True
+            if self.isdescending() and other.isdescending():
+                kwargs['ascending'] = False
+        return _diffset(self, other, **kwargs)
 
     def __iter__(self):
         if self._genlist:
             return iter(self._genlist)
@@ -2612,8 +2619,32 @@ class _addset(_lazysetiteratormixin):
 
     def __contains__(self, x):
         return x in self._r1 or x in self._r2
 
+class _diffset(_lazysetiteratormixin):
+    """Represents the difference between two sets.
+
+    This class is similar to _addset. It exists to optimally compute the
+    difference between two sets. Its impact is most felt when operating
+    on two lazy sets.
+    """
+    def __contains__(self, x):
+        return x in self._list
+
+    def _iterator(self):
+        if self._iter is None:
+            def gen():
+                # We could further optimize this if r2 is ordered. See
+                # _addset._iterator for inspiration.
+                s = self._r2.set()
+                for r in self._r1:
+                    if r not in s:
+                        yield r
+
+            self._iter = _generatorset(gen())
+
+        return self._iter
+
 class _generatorset(object):
     """Wrap a generator for lazy iteration
 
     Wrapper structure for generators that provides lazy membership and can
@@ -2790,14 +2821,14 @@ class spanset(_orderedsetmixin):
         else:
             return orderedlazyset(self, x.__contains__, ascending=False)
 
     def __sub__(self, x):
-        if isinstance(x, baseset):
-            x = x.set()
-        if self._start <= self._end:
-            return orderedlazyset(self, lambda r: r not in x)
-        else:
-            return orderedlazyset(self, lambda r: r not in x, ascending=False)
+        kwargs = {}
+        if self.isascending() and x.isascending():
+            kwargs['ascending'] = True
+        if self.isdescending() and x.isdescending():
+            kwargs['ascending'] = False
+        return _diffset(self, x, **kwargs)
 
     def __add__(self, x):
         kwargs = {}
         if self.isascending() and x.isascending():


More information about the Mercurial-devel mailing list