[PATCH 8 of 8] tags: implement persistent tag caching (issue548)

Mon Jul 13 21:03:27 CDT 2009

# HG changeset patch
# User Greg Ward <greg-hg at gerg.ca>
# Date 1247536884 14400
# Node ID c456f1ebd65d7a9d947641c4f9234748bc618608
# Parent  bfc26f905db3664fc1932de253db7b6f20f85772
tags: implement persistent tag caching (issue548).

- rename findglobaltags() to findglobaltags1() (so the "no cache"
  implementation is still there if we need it)
- add findglobaltags2() and make findglobaltags() an alias for it
  (disabling tag caching is a one-line patch)
- factor out tagcache class with methods readcache() and writecache();
  the expensive part of tag finding (iterate over heads and find
  .hgtags filenode) is now in tagcache.readcache()

diff --git a/mercurial/tags.py b/mercurial/tags.py
--- a/mercurial/tags.py
+++ b/mercurial/tags.py
@@ -6,16 +6,16 @@
 # This software may be used and distributed according to the terms of the
 # GNU General Public License version 2, incorporated herein by reference.
 
-# Currently this module only deals with reading tags.  Soon it will grow
-# support for caching tag info.  Eventually, it could take care of
-# updating (adding/removing/moving) tags too.
+# Currently this module only deals with reading tags (with caching).
+# Eventually, it could take care of updating (adding/removing/moving)
+# tags too.
 
 from node import bin, hex
 from i18n import _
 import encoding
 import error
 
-def findglobaltags(ui, repo, alltags, tagtypes):
+def findglobaltags1(ui, repo, alltags, tagtypes):
     '''Find global tags in repo by reading .hgtags from every head that
     has a distinct version of it.  Updates the dicts alltags, tagtypes
     in place: alltags maps tag name to (node, hist) pair (see _readtags()
@@ -44,6 +44,32 @@
             ui, repo, fctx.data().splitlines(), fctx)
         _updatetags(filetags, "global", alltags, tagtypes)
 
+def findglobaltags2(ui, repo, alltags, tagtypes):
+    '''Same as findglobaltags1(), but with caching.'''
+    cache = tagcache(ui, repo)
+    (heads, tagfnode) = cache.readcache()
+
+    #ui.debug("iterating over %d head(s) for .hgtags...\n" % len(heads))
+    seen = set()                    # set of fnode
+    fctx = None
+    for head in reversed(heads):    # oldest to newest
+        fnode = tagfnode.get(head)
+        if fnode and fnode not in seen:
+            seen.add(fnode)
+            if not fctx:
+                fctx = repo.filectx('.hgtags', fileid=fnode)
+            else:
+                fctx = fctx.filectx(fnode)
+
+            filetags = _readtags(ui, repo, fctx.data().splitlines(), fctx)
+            _updatetags(filetags, 'global', alltags, tagtypes)
+
+    # and update the cache (if necessary)
+    cache.writecache(heads, tagfnode)
+
+# Set this to findglobaltags1 to disable tag caching.
+findglobaltags = findglobaltags2
+
 def readlocaltags(ui, repo, alltags, tagtypes):
     '''Read local tags in repo.  Update alltags and tagtypes.'''
     try:
@@ -120,3 +146,136 @@
         alltags[name] = anode, ahist
         tagtypes[name] = tagtype
 
+
+class tagcache(object):
+    '''Object for managing persistent tag cache.  Only needs to live for
+    as long as it takes to open, read, update, and write the cache.'''
+
+    # The tag cache only stores info about heads, not the tag contents
+    # from each head.  I.e. it doesn't try to squeeze out the maximum
+    # performance, but is simpler and has a better chance of actually
+    # working correctly.  And this gives the biggest performance win: it
+    # avoids looking up .hgtags in the manifest for every head, and it
+    # can avoid calling heads() at all if there have been no changes to
+    # the repo.
+    #
+    # Caching .hgtags content is unexpectedly difficult because of
+    # strip/rollback and tag rank.  I suspect the only way to get it
+    # right is to store the exact data from .hgtags on each head -- in
+    # which case why not just read it straight from the .hgtags revlog? 
+    # So that is exactly what we do.
+
+    __slots__ = ['ui',
+                 'repo',
+                 'shouldwrite',
+                ]
+
+    cacheversion = 0                    # format version
+
+    def __init__(self, ui, repo):
+        self.ui = ui
+        self.repo = repo
+        self.shouldwrite = False
+
+    def readcache(self):
+        '''Read the tag cache and return a tuple (heads, fnodes).  heads
+        is the list of all heads currently in the repository (ordered
+        from tip to oldest) and fnodes is a mapping from head to .hgtags
+        filenode.  Caller is responsible for reading tag info from each
+        head.'''
+
+        (ui, repo) = (self.ui, self.repo)
+        try:
+            cachefile = repo.opener('tags.cache', 'r')
+            #ui.debug('reading tag cache from %s\n' % cachefile.name)
+        except IOError:
+            cachefile = None
+
+        # The cache file consists of lines like
+        #   <headrev> <headnode> [<tagnode>]
+        # where <headrev> and <headnode> redundantly identify a
+        # repository head from the time the cache was written, and
+        # <tagnode> is the filenode of .hgtags on that head.  Heads with
+        # no .hgtags file will have no <tagnode>.  The cache is ordered
+        # from tip to oldest (which is why <headrev> is there: a quick
+        # visual check is all that's required to ensure correct order).
+        # 
+        # This information is enough to let us avoid the most expensive
+        # part of finding global tags, which is looking up <tagnode> in
+        # the manifest for each head.
+        cacheheads = []                 # list of headnode
+        cachefnode = {}                 # map headnode to filenode
+        if cachefile:
+            for line in cachefile:
+                line = line.rstrip().split()
+                head = bin(line[1])
+                cacheheads.append(head)
+                if len(line) == 3:
+                    fnode = bin(line[2])
+                    cachefnode[head] = fnode
+
+        # See if any heads have been destroyed by strip or rollback.
+        # That'll ensure that writecache() writes accurate data, and it
+        # makes life easier for our caller (see findglobaltags2()).
+        goodheads = []
+        for head in cacheheads:
+            try:
+                repo.changelog.rev(head)
+                goodheads.append(head)
+            except error.LookupError:
+                pass
+        if len(goodheads) < len(cacheheads):
+            self.shouldwrite = True
+
+        # If the first head == current tip, there have been no new
+        # changesets since the cache was written.  Just tell our caller
+        # to read .hgtags from every head that has it.
+        if goodheads and goodheads[0] == repo.changelog.tip():
+            #ui.debug('tag cache: tip not changed, so cache is up-to-date\n')
+            return (goodheads, cachefnode)
+
+        # Tip has changed, so we have to find new heads.
+        currentheads = repo.heads()
+        newheads = [head
+                    for head in currentheads
+                    if head not in set(goodheads)]
+        if newheads:
+            self.shouldwrite = True
+        #ui.debug('tag cache: found %d uncached head(s)\n' % len(newheads))
+
+        # Now we have to look up the .hgtags filenode for every new head.
+        # This is the most expensive part of finding tags, so
+        # performance will depend primarily on the size of newheads.
+        # Worst case: newheads == currentheads (no cache file).
+        for head in newheads:
+            cctx = self.repo[head]
+            try:
+                fnode = cctx.filenode('.hgtags')
+                cachefnode[head] = fnode
+            except error.LookupError:
+                # no .hgtags file on this head
+                pass
+
+        # Everything in newheads should be closer to tip than everything
+        # in goodheads.  And both are already sorted tip-to-oldest, so
+        # we can just concatenate them.  (XXX unless someone edits the
+        # cache file in a sneaky attempt to trip us up.)
+        return (newheads + goodheads, cachefnode)
+
+    def writecache(self, heads, tagfnode):
+        if not self.shouldwrite:
+            return
+
+        cachefile = self.repo.opener('tags.cache', 'w', atomictemp=True)
+        #self.ui.debug('writing cache file %s\n' % cachefile.name)
+
+        for head in heads:
+            rev = self.repo[head].rev()
+            fnode = tagfnode.get(head)
+            if fnode:
+                cachefile.write('%d %s %s\n' % (rev, hex(head), hex(fnode)))
+            else:
+                cachefile.write('%d %s\n' % (rev, hex(head)))
+
+        cachefile.rename()
+        cachefile.close()
diff --git a/tests/test-empty.out b/tests/test-empty.out
--- a/tests/test-empty.out
+++ b/tests/test-empty.out
@@ -19,3 +19,4 @@
 hgrc
 requires
 store
+tags.cache
diff --git a/tests/test-tags b/tests/test-tags
--- a/tests/test-tags
+++ b/tests/test-tags
@@ -4,7 +4,9 @@
 mkdir t
 cd t
 hg init
+[ -f .hg/tags.cache ] && echo "tag cache exists" || echo "no tag cache"
 hg id
+[ -f .hg/tags.cache ] && echo "tag cache exists" || echo "no tag cache"
 echo a > a
 hg add a
 hg commit -m "test"
@@ -25,6 +27,10 @@
 hg tags
 hg identify
 
+# repeat with cold tag cache
+rm -f .hg/tags.cache
+hg identify
+
 echo "% create a branch"
 echo bb > a
 hg status
diff --git a/tests/test-tags.out b/tests/test-tags.out
--- a/tests/test-tags.out
+++ b/tests/test-tags.out
@@ -1,5 +1,7 @@
 % setup
+no tag cache
 000000000000 tip
+tag cache exists
 0 files updated, 0 files merged, 0 files removed, 0 files unresolved
 acb14030fe0a tip
 % create local tag with long name
@@ -10,6 +12,7 @@
 tip                                1:b9154636be93
 first                              0:acb14030fe0a
 b9154636be93 tip
+b9154636be93 tip
 % create a branch
 M a
 b9154636be93+ tip
@@ -73,7 +76,10 @@
 rev 4: .hgtags:
 bbd179dfa0a71671c253b3ae0aa1513b60d199fa bar
 .hg/tags.cache:
-no such file
+4 0c192d7d5e6b78a714de54a2e9627952a877e25a 0c04f2a8af31de17fab7422878ee5a2dadbc943d
+3 6fa450212aeb2a21ed616a54aea39a4a27894cd7 7d3b718c964ef37b89e550ebdafd5789e76ce1b0
+2 7a94127795a33c10a370c93f731fd9fea0b79af6 0c04f2a8af31de17fab7422878ee5a2dadbc943d
+0 bbd179dfa0a71671c253b3ae0aa1513b60d199fa
 % test tag removal
 changeset:   5:5f6e8655b1c7
 tag:         tip