[PATCH] revset: improve performance of _generatorset.__contains__ (issue 4201)

Gregory Szorc gregory.szorc at gmail.com
Mon Mar 24 22:01:09 CDT 2014


# HG changeset patch
# User Gregory Szorc <gregory.szorc at gmail.com>
# Date 1395716418 25200
#      Mon Mar 24 20:00:18 2014 -0700
# Node ID 6f64e244c57706cfb123c32c4fadef63690eed40
# Parent  3879ac3858ffd9bb46e19fcc3a2b31d7bb2b54c5
revset: improve performance of _generatorset.__contains__ (issue 4201)

_generatorset.__contains__ and __contains__ from child classes were
calling into __iter__ to look for values. Since all
previously-encountered values from the generator were cached and checked
in __contains__ before this iteration, __contains__ was effectively
performing iteration busy work which could lead to an explosion of
redundant work.

This patch changes __contains__ to be more intelligent. Instead of
looking at all values via __iter__, __contains__ will instead go
straight to "new" values from the underlying generator.

On a clone of the Firefox repository with around 200,000 changesets,
this patch decreases the execution time of the revset '::(200067)::'
from ~100s to ~4s on the author's machine. Rebase operations (which use
the aforementioned revset), speed up accordingly.

diff --git a/mercurial/revset.py b/mercurial/revset.py
--- a/mercurial/revset.py
+++ b/mercurial/revset.py
@@ -2625,42 +2625,43 @@ class _generatorset(object):
         self._genlist = baseset([])
         self._iterated = False
         self._finished = False
 
     def __contains__(self, x):
         if x in self._cache:
             return self._cache[x]
 
-        # Use __iter__ which caches values and stores them into self._genlist
-        for l in self:
+        # Use new values only, as existing values would be cached.
+        for l in self._consumegen():
             if l == x:
                 return True
 
         self._finished = True
         self._cache[x] = False
         return False
 
     def __iter__(self):
         if self._iterated:
             # At least a part of the list should be cached if iteration has
             # started over the generatorset.
             for l in self._genlist:
                 yield l
-        else:
-            # Starting iteration over the generatorset.
-            self._iterated = True
+
+        for item in self._consumegen():
+            yield item
+
+    def _consumegen(self):
+        self._iterated = True
 
         for item in self._gen:
             self._cache[item] = True
             self._genlist.append(item)
             yield item
 
-        # Iteration over the generator has finished. Whole value list should be
-        # cached in self._genlist
         self._finished = True
 
     def set(self):
         return self
 
     def sort(self, reverse=False):
         if not self._finished:
             for i in self:
@@ -2675,17 +2676,18 @@ class _ascgeneratorset(_generatorset):
 
     This class does not duck-type baseset and it's only supposed to be used
     internally
     """
     def __contains__(self, x):
         if x in self._cache:
             return self._cache[x]
 
-        for l in self:
+        # Use new values only, as existing values would be cached.
+        for l in self._consumegen():
             if l == x:
                 return True
             if l > x:
                 break
 
         self._cache[x] = False
         return False
 
@@ -2697,17 +2699,18 @@ class _descgeneratorset(_generatorset):
 
     This class does not duck-type baseset and it's only supposed to be used
     internally
     """
     def __contains__(self, x):
         if x in self._cache:
             return self._cache[x]
 
-        for l in self:
+        # Use new values only, as existing values would be cached.
+        for l in self._consumegen():
             if l == x:
                 return True
             if l < x:
                 break
 
         self._cache[x] = False
         return False
 


More information about the Mercurial-devel mailing list