[PATCH] localrepo: Strip now incrementally updates the branchheads cache

Joshua Redstone joshua.redstone at fb.com
Sun May 13 08:59:43 CDT 2012


# HG changeset patch
# User Joshua Redstone <joshua.redstone at fb.com>
# Date 1336757754 25200
# Node ID 29779e7bd0c99a0cb8559e82ecef037eba59f535
# Parent  ddd4996740c785cc187766249977ea3ece8c17ab
localrepo: Strip now incrementally updates the branchheads cache

Destroying history via strip used to invalidate the branchheads cache, causing
it to be regenerated the next time it is read. This is expensive in large repos.
This change converts strip to pass info to localrepo.destroyed() to enable to it
to incrementally update the cache, improving the performance of strip and other
operations that depend on it (e.g., rebase).

This change also strengthens a bit the integrity checking of the branchheads
cache when it is read, by rejecting the cache if it has nodes in it that no
longer exist.

Feel free to offer any suggestions for improving the python.  I wasn't sure if
there's a better way of doing the elimination of duplicates from arrays.

diff -r ddd4996740c7 -r 29779e7bd0c9 mercurial/localrepo.py
--- a/mercurial/localrepo.py	Tue May 08 12:05:45 2012 -0500
+++ b/mercurial/localrepo.py	Fri May 11 10:35:54 2012 -0700
@@ -546,6 +546,9 @@
                     continue
                 node, label = l.split(" ", 1)
                 label = encoding.tolocal(label.strip())
+                if not node in self:
+                    raise ValueError('invalidating branch cache because node '+
+                                     '%s does not exist' % node)
                 partial.setdefault(label, []).append(bin(node))
         except KeyboardInterrupt:
             raise
@@ -567,6 +570,10 @@
             pass
 
     def _updatebranchcache(self, partial, ctxgen):
+        """Given a branchhead cache, partial, that may have extra nodes or be
+        missing heads, and a generator of nodes that are at least a superset of
+        heads missing, this function updates partial to be correct.
+        """
         # collect new branch entries
         newbranches = {}
         for c in ctxgen:
@@ -577,21 +584,42 @@
         for branch, newnodes in newbranches.iteritems():
             bheads = partial.setdefault(branch, [])
             bheads.extend(newnodes)
-            if len(bheads) <= 1:
-                continue
-            bheads = sorted(bheads, key=lambda x: self[x].rev())
-            # starting from tip means fewer passes over reachable
-            while newnodes:
-                latest = newnodes.pop()
-                if latest not in bheads:
-                    continue
-                minbhrev = self[bheads[0]].node()
-                reachable = self.changelog.reachable(latest, minbhrev)
-                reachable.remove(latest)
-                if reachable:
-                    bheads = [b for b in bheads if b not in reachable]
+            # Remove duplicates - nodes that are in newnodes and are already in
+            # bheads.  This can happen if you strip a node and its parent was
+            # already a head (because they're on different branches).
+            bheads = list(set(bheads))
+
+            # Remove candidate heads that no longer are in the repo (e.g., as
+            # the result of a strip that just happened).
+            # avoid using 'bhead in self' here because that dives down into
+            # branchcache code somewhat recrusively.
+            bheads = [bhead for bhead in bheads \
+                          if self.changelog.hasnode(bhead)]
+            if len(bheads) > 1:
+                bheads = sorted(bheads, key=lambda x: self[x].rev())
+                # starting from tip means fewer passes over reachable
+                while newnodes:
+                    latest = newnodes.pop()
+                    if latest not in bheads:
+                        continue
+                    minbhrev = self[bheads[0]].node()
+                    reachable = self.changelog.reachable(latest, minbhrev)
+                    reachable.remove(latest)
+                    if reachable:
+                        bheads = [b for b in bheads if b not in reachable]
             partial[branch] = bheads
 
+        # There may be branches that cease to exist when the last commit in the
+        # branch was stripped.  This code filters them out.  Note that the
+        # branch that ceased to exist may not be in newbranches because
+        # newbranches is the set of candidate heads, which when you strip the
+        # last commit in a branch will be the parent branch.
+        for branch in partial.keys():
+            nodes = [head for head in partial[branch] \
+                         if self.changelog.hasnode(head)]
+            if len(nodes) < 1:
+                del partial[branch]
+
     def lookup(self, key):
         return self[key].node()
 
@@ -853,6 +881,9 @@
             else:
                 ui.status(_('working directory now based on '
                             'revision %d\n') % parents)
+        # TODO: if we know which new heads may result from this rollback, pass
+        # them to destroy(), which will prevent the branchhead cache from being
+        # invalidated.
         self.destroyed()
         return 0
 
@@ -1293,12 +1324,27 @@
                 tr.release()
             lock.release()
 
-    def destroyed(self):
+    def destroyed(self, newheadrevs=None):
         '''Inform the repository that nodes have been destroyed.
         Intended for use by strip and rollback, so there's a common
-        place for anything that has to be done after destroying history.'''
-        # XXX it might be nice if we could take the list of destroyed
-        # nodes, but I don't see an easy way for rollback() to do that
+        place for anything that has to be done after destroying history.
+
+        If you know the branchheadcache was uptodate before nodes were removed
+        and you also know the set of candidate set of new heads that may have
+        resulted from the destruction, you can set newheadrevs.  This will
+        enable the code to update the branchheads cache, rather than having
+        future code decide it's invalid and regenrating it.
+        '''
+        if newheadrevs:
+            tiprev = len(self) - 1
+            ctxgen = (self[rev] for rev in newheadrevs)
+            self._updatebranchcache(self._branchcache, ctxgen)
+            self._writebranchcache(self._branchcache, self.changelog.tip(),
+                                   tiprev)
+        else:
+            # No info to update the cache.  If nodes were destroyed, the cache
+            # is stale and this will be caught the next time it is read.
+            pass
 
         # Ensure the persistent tag cache is updated.  Doing it now
         # means that the tag cache only has to worry about destroyed
@@ -1694,7 +1740,7 @@
                     # * missingheads part of comon (::commonheads)
                     common = set(outgoing.common)
                     cheads = [node for node in revs if node in common]
-                    # and 
+                    # and
                     # * commonheads parents on missing
                     revset = self.set('%ln and parents(roots(%ln))',
                                      outgoing.commonheads,
diff -r ddd4996740c7 -r 29779e7bd0c9 mercurial/repair.py
--- a/mercurial/repair.py	Tue May 08 12:05:45 2012 -0500
+++ b/mercurial/repair.py	Fri May 11 10:35:54 2012 -0700
@@ -56,6 +56,11 @@
     return s
 
 def strip(ui, repo, nodelist, backup="all", topic='backup'):
+    # It simplifies the logic around updating the branchheads cache if we only
+    # have to consider the effect of the stripped revisions and not revisions
+    # missing because the cache is out-of-date.
+    repo.updatebranchcache()
+
     cl = repo.changelog
     # TODO handle undo of merge sets
     if isinstance(nodelist, str):
@@ -63,6 +68,16 @@
     striplist = [cl.rev(node) for node in nodelist]
     striprev = min(striplist)
 
+    # Set of potential new heads resulting from the strip.  The parents of any
+    # node removed could be a new head because the node to be removed could have
+    # been the only child of the parent.
+    # Do a list->set->list conversion to remove duplicates.
+    stringstriplist = [str(rev) for rev in striplist]
+    newheadrevs = list(set(repo.revs("parents(%lr::) - %lr::", stringstriplist,
+                                     stringstriplist)))
+    #for rev in newheadrevs:
+    #    print "Got candidate rev ", rev
+
     keeppartialbundle = backup == 'strip'
 
     # Some revisions with rev > striprev may not be descendants of striprev.
@@ -169,7 +184,7 @@
                     % chgrpfile)
         raise
 
-    repo.destroyed()
+    repo.destroyed(newheadrevs)
 
     # remove potential unknown phase
     # XXX using to_strip data would be faster
diff -r ddd4996740c7 -r 29779e7bd0c9 tests/test-commit-amend.t
--- a/tests/test-commit-amend.t	Tue May 08 12:05:45 2012 -0500
+++ b/tests/test-commit-amend.t	Fri May 11 10:35:54 2012 -0700
@@ -247,6 +247,7 @@
 
   $ echo b >> b
   $ hg ci -m 'reopen branch'
+  created new head
   reopening closed branch head 4
   $ echo b >> b
   $ hg ci --amend --close-branch
diff -r ddd4996740c7 -r 29779e7bd0c9 tests/test-mq-caches.t
--- a/tests/test-mq-caches.t	Tue May 08 12:05:45 2012 -0500
+++ b/tests/test-mq-caches.t	Fri May 11 10:35:54 2012 -0700
@@ -36,7 +36,8 @@
   $ hg qrefresh -m 'patch 1'
   $ show_branch_cache
   tip: 0
-  No branch cache
+  d986d5caac23a7d44a46efc0ddaf5eb9665844cf 0
+  d986d5caac23a7d44a46efc0ddaf5eb9665844cf default
 
 some regular revisions
 
@@ -75,8 +76,8 @@
   $ hg qrefresh -m 'patch 2'
   $ show_branch_cache 1
   tip: 3
-  c229711f16da3d7591f89b1b8d963b79bda22714 1
-  c229711f16da3d7591f89b1b8d963b79bda22714 bar
+  982611f6955f9c48d3365decea203217c945ef0d 2
+  982611f6955f9c48d3365decea203217c945ef0d bar
   dc25e3827021582e979f600811852e36cbe57341 foo
   branch foo: 3
   branch bar: 2
diff -r ddd4996740c7 -r 29779e7bd0c9 tests/test-rebase-cache.t
--- a/tests/test-rebase-cache.t	Tue May 08 12:05:45 2012 -0500
+++ b/tests/test-rebase-cache.t	Fri May 11 10:35:54 2012 -0700
@@ -2,10 +2,8 @@
   > [extensions]
   > graphlog=
   > rebase=
-  > 
   > [phases]
   > publish=False
-  > 
   > [alias]
   > tglog  = log -G --template "{rev}: '{desc}' {branches}\n"
   > theads = heads --template "{rev}: '{desc}' {branches}\n"
@@ -72,7 +70,7 @@
   $ hg clone -q -u . a a1
   $ cd a1
 
-  $ hg tglog  
+  $ hg tglog
   @  8: 'F' branch3
   |
   o  7: 'branch3' branch3
@@ -91,6 +89,7 @@
   |/
   o  0: 'A'
   
+
   $ hg branches
   branch3                        8:4666b71e8e32
   branch2                        6:5097051d331d
@@ -119,7 +118,7 @@
   2: 'B' branch1
   0: 'A' 
 
-  $ hg tglog  
+  $ hg tglog
   @  8: 'E' branch3
   |
   o  7: 'D' branch3
@@ -138,6 +137,7 @@
   |/
   o  0: 'A'
   
+
   $ cd ..
 
 
@@ -165,6 +165,7 @@
   |/
   o  0: 'A'
   
+
   $ hg rebase --detach -s 8 -d 6
   saved backup bundle to $TESTTMP/a2/.hg/strip-backup/*-backup.hg (glob)
 
@@ -200,6 +201,7 @@
   |/
   o  0: 'A'
   
+
   $ hg verify -q
 
   $ cd ..
@@ -229,6 +231,7 @@
   |/
   o  0: 'A'
   
+
   $ hg rebase --detach -s 7 -d 6
   saved backup bundle to $TESTTMP/a3/.hg/strip-backup/*-backup.hg (glob)
 
@@ -243,7 +246,7 @@
   2: 'B' branch1
   0: 'A' 
 
-  $ hg tglog   
+  $ hg tglog
   @  7: 'F' branch2
   |
   o  6: 'E' branch2
@@ -260,5 +263,6 @@
   |/
   o  0: 'A'
   
+
   $ hg verify -q
 


More information about the Mercurial-devel mailing list