[PATCH 09 of 10] tags: implement persistent tag caching (issue548)

Thu Jul 16 09:42:55 CDT 2009

# HG changeset patch
# User Greg Ward <greg-hg at gerg.ca>
# Date 1247755182 14400
# Node ID 6e2c4b6f08cf2f9aa0ca1df2b10c72947d709492
# Parent  878a5abb739d62d2a6f419bf89d1be9e144f9968
tags: implement persistent tag caching (issue548).

- rename findglobaltags() to findglobaltags1() (so the "no cache"
  implementation is still there if we need it)
- add findglobaltags2() and make findglobaltags() an alias for it
  (disabling tag caching is a one-line patch)
- factor out tagcache class with methods readcache() and writecache();
  the expensive part of tag finding (iterate over heads and find
  .hgtags filenode) is now in tagcache.readcache()

diff --git a/mercurial/localrepo.py b/mercurial/localrepo.py
--- a/mercurial/localrepo.py
+++ b/mercurial/localrepo.py
@@ -915,11 +915,20 @@
         '''Inform the repository that nodes have been destroyed.
         Intended for use by strip and rollback, so there's a common
         place for anything that has to be done after destroying history.'''
-        # Do nothing for now: this is a placeholder that will be used
-        # when we add tag caching.
         # XXX it might be nice if we could take the list of destroyed
         # nodes, but I don't see an easy way for rollback() to do that
-        pass
+
+        # Ensure the persistent tag cache is updated.  Doing it now
+        # means that the tag cache only has to worry about destroyed
+        # heads immediately after a strip/rollback.  That in turn
+        # guarantees that "cachetip == currenttip" (comparing both rev
+        # and node) always means no nodes have been added or destroyed.
+
+        # XXX this is suboptimal when qrefresh'ing: we strip the current
+        # head, refresh the tag cache, then immediately add a new head.
+        # But I think doing it this way is necessary for the "instant
+        # tag cache retrieval" case to work.
+        tags_.findglobaltags(self.ui, self, {}, {})
 
     def walk(self, match, node=None):
         '''
diff --git a/mercurial/tags.py b/mercurial/tags.py
--- a/mercurial/tags.py
+++ b/mercurial/tags.py
@@ -6,16 +6,29 @@
 # This software may be used and distributed according to the terms of the
 # GNU General Public License version 2, incorporated herein by reference.
 
-# Currently this module only deals with reading tags.  Soon it will grow
-# support for caching tag info.  Eventually, it could take care of
-# updating (adding/removing/moving) tags too.
+# Currently this module only deals with reading and caching tags.
+# Eventually, it could take care of updating (adding/removing/moving)
+# tags too.
 
-from node import bin, hex
+import os
+from node import nullid, bin, hex, short
 from i18n import _
 import encoding
 import error
 
-def findglobaltags(ui, repo, alltags, tagtypes):
+def _debugalways(ui, *msg):
+    ui.write(*msg)
+
+def _debugconditional(ui, *msg):
+    ui.debug(*msg)
+
+def _debugnever(ui, *msg):
+    pass
+
+_debug = _debugalways
+_debug = _debugnever
+
+def findglobaltags1(ui, repo, alltags, tagtypes):
     '''Find global tags in repo by reading .hgtags from every head that
     has a distinct version of it.  Updates the dicts alltags, tagtypes
     in place: alltags maps tag name to (node, hist) pair (see _readtags()
@@ -44,6 +57,36 @@
             ui, repo, fctx.data().splitlines(), fctx)
         _updatetags(filetags, "global", alltags, tagtypes)
 
+def findglobaltags2(ui, repo, alltags, tagtypes):
+    '''Same as findglobaltags1(), but with caching.'''
+    (heads, tagfnode, shouldwrite) = _readtagcache(ui, repo)
+
+    _debug(ui, "reading tags from %d head(s): %s\n"
+           % (len(heads), map(short, reversed(heads))))
+    seen = set()                    # set of fnode
+    fctx = None
+    for head in reversed(heads):        # oldest to newest
+        assert head in repo.changelog.nodemap, \
+               "tag cache returned bogus head %s" % short(head)
+
+        fnode = tagfnode.get(head)
+        if fnode and fnode not in seen:
+            seen.add(fnode)
+            if not fctx:
+                fctx = repo.filectx('.hgtags', fileid=fnode)
+            else:
+                fctx = fctx.filectx(fnode)
+
+            filetags = _readtags(ui, repo, fctx.data().splitlines(), fctx)
+            _updatetags(filetags, 'global', alltags, tagtypes)
+
+    # and update the cache (if necessary)
+    if shouldwrite:
+        _writetagcache(ui, repo, heads, tagfnode)
+
+# Set this to findglobaltags1 to disable tag caching.
+findglobaltags = findglobaltags2
+
 def readlocaltags(ui, repo, alltags, tagtypes):
     '''Read local tags in repo.  Update alltags and tagtypes.'''
     try:
@@ -120,3 +163,148 @@
         alltags[name] = anode, ahist
         tagtypes[name] = tagtype
 
+
+# The tag cache only stores info about heads, not the tag contents
+# from each head.  I.e. it doesn't try to squeeze out the maximum
+# performance, but is simpler has a better chance of actually
+# working correctly.  And this gives the biggest performance win: it
+# avoids looking up .hgtags in the manifest for every head, and it
+# can avoid calling heads() at all if there have been no changes to
+# the repo.
+
+def _readtagcache(ui, repo):
+    '''Read the tag cache and return a tuple (heads, fnodes,
+    shouldwrite).  heads is the list of all heads currently in the
+    repository (ordered from tip to oldest) and fnodes is a mapping from
+    head to .hgtags filenode.  Caller is responsible for reading tag
+    info from each head.'''
+
+    try:
+        cachefile = repo.opener('tags.cache', 'r')
+        _debug(ui, 'reading tag cache from %s\n' % cachefile.name)
+    except IOError:
+        cachefile = None
+
+    # The cache file consists of lines like
+    #   <headrev> <headnode> [<tagnode>]
+    # where <headrev> and <headnode> redundantly identify a repository
+    # head from the time the cache was written, and <tagnode> is the
+    # filenode of .hgtags on that head.  Heads with no .hgtags file will
+    # have no <tagnode>.  The cache is ordered from tip to oldest (which
+    # is part of why <headrev> is there: a quick visual check is all
+    # that's required to ensure correct order).
+    # 
+    # This information is enough to let us avoid the most expensive part
+    # of finding global tags, which is looking up <tagnode> in the
+    # manifest for each head.
+    cacherevs = []                      # list of headrev
+    cacheheads = []                     # list of headnode
+    cachefnode = {}                     # map headnode to filenode
+    if cachefile:
+        for line in cachefile:
+            line = line.rstrip().split()
+            cacherevs.append(int(line[0]))
+            headnode = bin(line[1])
+            cacheheads.append(headnode)
+            if len(line) == 3:
+                fnode = bin(line[2])
+                cachefnode[headnode] = fnode
+
+        cachefile.close()
+
+    tipnode = repo.changelog.tip()
+    tiprev = len(repo.changelog) - 1
+
+    # Case 1 (common): tip is the same, so nothing has changed.
+    # (Unchanged tip trivially means no changesets have been added.
+    # But, thanks to localrepository.destroyed(), it also means none
+    # have been destroyed by strip or rollback.)
+    if cacheheads and cacheheads[0] == tipnode and cacherevs[0] == tiprev:
+        _debug(ui, "tag cache: tip unchanged\n")
+        return (cacheheads, cachefnode, False)
+        
+    repoheads = repo.heads()
+
+    # Case 2 (uncommon): empty repo; get out quickly and don't bother
+    # writing an empty cache.
+    if repoheads == [nullid]:
+        return ([], {}, False)
+
+    # Case 3 (uncommon): cache file missing or empty.
+    if not cacheheads:
+        _debug(ui, 'tag cache: cache file missing or empty\n')
+
+    # Case 4 (uncommon): tip rev decreased.  This should only happen
+    # when we're called from localrepository.destroyed().  Refresh the
+    # cache so future invocations will not see disappeared heads in the
+    # cache.
+    elif cacheheads and tiprev < cacherevs[0]:
+        _debug(ui,
+               'tag cache: tip rev decremented (from %d to %d), '
+               'so we must be destroying nodes\n'
+               % (cacherevs[0], tiprev))
+
+    # Case 5 (common): tip has changed, so we've added/replaced heads.
+    else:
+        _debug(ui,
+               'tag cache: tip has changed (%d:%s); must find new heads\n'
+               % (tiprev, short(tipnode)))
+
+    # Luckily, the code to handle cases 3, 4, 5 is the same.  So the
+    # above if/elif/else can disappear once we're confident this thing
+    # actually works and we don't need the debug output.
+
+    # N.B. in case 4 (nodes destroyed), "new head" really means "newly
+    # exposed".
+    newheads = [head
+                for head in repoheads
+                if head not in set(cacheheads)]
+    _debug(ui, 'tag cache: found %d head(s) not in cache: %s\n'
+           % (len(newheads), map(short, newheads)))
+
+    # Now we have to lookup the .hgtags filenode for every new head.
+    # This is the most expensive part of finding tags, so performance
+    # depends primarily on the size of newheads.  Worst case: no cache
+    # file, so newheads == repoheads.
+    for head in newheads:
+        cctx = repo[head]
+        try:
+            fnode = cctx.filenode('.hgtags')
+            cachefnode[head] = fnode
+        except error.LookupError:
+            # no .hgtags file on this head
+            pass
+
+    # Caller has to iterate over all heads, but can use the filenodes in
+    # cachefnode to get to each .hgtags revision quickly.
+    return (repoheads, cachefnode, True)
+
+def _writetagcache(ui, repo, heads, tagfnode):
+
+    cachefile = repo.opener('tags.cache', 'w', atomictemp=True)
+    _debug(ui, 'writing cache file %s\n' % cachefile.name)
+
+    realheads = repo.heads()            # for sanity checks below
+    for head in heads:
+        # temporary sanity checks; these can probably be removed
+        # once this code has been in crew for a few weeks
+        assert head in repo.changelog.nodemap, \
+               'trying to write non-existent node %s to tag cache' % short(head)
+        assert head in realheads, \
+               'trying to write non-head %s to tag cache' % short(head)
+        assert head != nullid, \
+               'trying to write nullid to tag cache'
+
+        # This can't fail because of the first assert above.  When/if we
+        # remove that assert, we might want to catch LookupError here
+        # and downgrade it to a warning.
+        rev = repo.changelog.rev(head)
+
+        fnode = tagfnode.get(head)
+        if fnode:
+            cachefile.write('%d %s %s\n' % (rev, hex(head), hex(fnode)))
+        else:
+            cachefile.write('%d %s\n' % (rev, hex(head)))
+
+    cachefile.rename()
+    cachefile.close()
diff --git a/tests/test-mq b/tests/test-mq
--- a/tests/test-mq
+++ b/tests/test-mq
@@ -107,9 +107,18 @@
 hg qpop
 checkundo qpop
 
-echo % qpush
+echo % qpush with dump of tag cache
 
+# Dump the tag cache to ensure that it has exactly one head after qpush.
+rm -f .hg/tags.cache
+hg tags > /dev/null
+echo ".hg/tags.cache (pre qpush):"
+sed 's/ [0-9a-f]*//' .hg/tags.cache
 hg qpush
+hg tags > /dev/null
+echo ".hg/tags.cache (post qpush):"
+sed 's/ [0-9a-f]*//' .hg/tags.cache
+
 checkundo qpush
 
 cd ..
diff --git a/tests/test-mq.out b/tests/test-mq.out
--- a/tests/test-mq.out
+++ b/tests/test-mq.out
@@ -110,9 +110,13 @@
 % qpop
 popping test.patch
 patch queue now empty
-% qpush
+% qpush with dump of tag cache
+.hg/tags.cache (pre qpush):
+1
 applying test.patch
 now at: test.patch
+.hg/tags.cache (post qpush):
+2
 % pop/push outside repo
 popping test.patch
 patch queue now empty
diff --git a/tests/test-tags b/tests/test-tags
--- a/tests/test-tags
+++ b/tests/test-tags
@@ -1,15 +1,22 @@
 #!/bin/sh
 
+cacheexists() {
+    [ -f .hg/tags.cache ] && echo "tag cache exists" || echo "no tag cache"
+}
+
 echo "% setup"
 mkdir t
 cd t
 hg init
+cacheexists
 hg id
+cacheexists
 echo a > a
 hg add a
 hg commit -m "test"
 hg co
 hg identify
+cacheexists
 
 echo "% create local tag with long name"
 T=`hg identify --debug --id`
@@ -25,6 +32,10 @@
 hg tags
 hg identify
 
+# repeat with cold tag cache
+rm -f .hg/tags.cache
+hg identify
+
 echo "% create a branch"
 echo bb > a
 hg status
diff --git a/tests/test-tags.out b/tests/test-tags.out
--- a/tests/test-tags.out
+++ b/tests/test-tags.out
@@ -1,7 +1,10 @@
 % setup
+no tag cache
 000000000000 tip
+no tag cache
 0 files updated, 0 files merged, 0 files removed, 0 files unresolved
 acb14030fe0a tip
+tag cache exists
 % create local tag with long name
 tip                                0:acb14030fe0a
 This is a local tag with a really long name!     0:acb14030fe0a
@@ -10,6 +13,7 @@
 tip                                1:b9154636be93
 first                              0:acb14030fe0a
 b9154636be93 tip
+b9154636be93 tip
 % create a branch
 M a
 b9154636be93+ tip
@@ -73,7 +77,9 @@
 rev 4: .hgtags:
 bbd179dfa0a71671c253b3ae0aa1513b60d199fa bar
 .hg/tags.cache:
-no such file
+4 0c192d7d5e6b78a714de54a2e9627952a877e25a 0c04f2a8af31de17fab7422878ee5a2dadbc943d
+3 6fa450212aeb2a21ed616a54aea39a4a27894cd7 7d3b718c964ef37b89e550ebdafd5789e76ce1b0
+2 7a94127795a33c10a370c93f731fd9fea0b79af6 0c04f2a8af31de17fab7422878ee5a2dadbc943d
 % test tag removal
 changeset:   5:5f6e8655b1c7
 tag:         tip