[PATCH 8 of 8] localrepo: implement persistent tag caching

Sun Jul 12 20:02:11 CDT 2009

# HG changeset patch
# User Greg Ward <greg-hg at gerg.ca>
# Date 1247446899 14400
# Node ID 932640a34ea88da78d680fdc3ab5e9b77e601592
# Parent  6106257165fc2d3a461f67c5c361085fd02271d4
localrepo: implement persistent tag caching

- factor out localrepository._findglobaltags()
- factor out tagcache class with methods readcache() and writecache();
  the expensive part of tag finding (iterate over heads and find
  .hgtags filenode) is now in tagcache.readcache()
- add nulltagcache so we can easily revert to non-cached tags
  if necessary

diff --git a/mercurial/localrepo.py b/mercurial/localrepo.py
--- a/mercurial/localrepo.py
+++ b/mercurial/localrepo.py
@@ -258,6 +258,7 @@
         # be one tagtype for all such "virtual" tags?  Or is the status
         # quo fine?
 
+        # tag names are in UTF-8 in these two dicts
         alltags = {}                    # map tag name to (node, hist)
         tagtypes = {}
 
@@ -285,27 +286,11 @@
                 alltags[name] = anode, ahist
                 tagtypes[name] = tagtype
 
-        seen = set()
-        fctx = None
-        ctxs = []                       # list of filectx
-        for node in self.heads():
-            try:
-                fnode = self[node].filenode('.hgtags')
-            except error.LookupError:
-                continue
-            if fnode not in seen:
-                seen.add(fnode)
-                if not fctx:
-                    fctx = self.filectx('.hgtags', fileid=fnode)
-                else:
-                    fctx = fctx.filectx(fnode)
-                ctxs.append(fctx)
+        self._findglobaltags(updatetags)
+        #self.ui.debug("found %d global tags\n" % len(alltags))
 
-        # read the tags file from each head, ending with the tip
-        for fctx in reversed(ctxs):
-            filetags = self._readtags(fctx.data().splitlines(), fctx)
-            updatetags(filetags, "global")
-
+        # this is cheap: no need to cache it
+        #self.ui.debug("reading localtags...\n")
         try:
             data = encoding.fromlocal(self.opener("localtags").read())
             # localtags are stored in the local character set
@@ -318,10 +303,15 @@
         tags = {}
         for (name, (node, hist)) in alltags.iteritems():
             if node != nullid:
-                tags[name] = node
+                tags[encoding.tolocal(name)] = node
         tags['tip'] = self.changelog.tip()
+
+        # re-encode tagtypes keys to local encoding
+        tagtypes = dict([(encoding.tolocal(name), ttype)
+                         for (name, ttype) in tagtypes.iteritems()])
         return (tags, tagtypes)
 
+    # tag names here: UTF-8 in and UTF-8 out
     def _readtags(self, lines, fn):
         '''Read tag definitions from a file (or any source of
         lines).  Return a mapping from tag name to (node, hist):
@@ -344,7 +334,7 @@
             except ValueError:
                 warn(_("cannot parse entry"))
                 continue
-            name = encoding.tolocal(name.strip()) # stored in UTF-8
+            name = name.strip()         # keep it in UTF-8 for now
             try:
                 nodebin = bin(nodehex)
             except TypeError:
@@ -362,6 +352,32 @@
             filetags[name] = (nodebin, hist)
         return filetags
 
+    def _findglobaltags(self, updatetags):
+        # Use nulltagcache rather than tagcache to disable tag caching.
+        # Useful if you find a bug in the tag cache or want to compare
+        # performance.
+        #cache = nulltagcache(self.ui, self)
+        cache = tagcache(self.ui, self)
+        (heads, tagfnode) = cache.readcache()
+
+        #self.ui.debug("iterating over %d head(s) for .hgtags...\n" % len(heads))
+        seen = set()                    # .hgtags filenodes already read
+        fctx = None
+        for head in reversed(heads):    # oldest to newest
+            fnode = tagfnode.get(head)  # None if head has no .hgtags
+            if fnode and fnode not in seen:
+                seen.add(fnode)
+                if not fctx:
+                    fctx = self.filectx('.hgtags', fileid=fnode)
+                else:
+                    fctx = fctx.filectx(fnode)
+
+                filetags = self._readtags(fctx.data().splitlines(), fctx)
+                updatetags(filetags, 'global')
+
+        # let the cache object rewrite its file (it decides if necessary)
+        cache.writecache(heads, tagfnode)
+
     def tagtype(self, tagname):
         '''
         return the type of the given tag. result can be:
@@ -2200,6 +2216,208 @@
             return self.stream_in(remote)
         return self.pull(remote, heads)
 
+class nulltagcache(object):
+    """Acts like tagcache but does no caching (readcache() scans every
+    head, writecache() is a no-op).  Restores the old, uncached behaviour."""
+
+    def __init__(self, ui, repo):
+        self.ui = ui
+        self.repo = repo
+
+    def readcache(self):
+        # XXX this loop is copied from tagcache.readcache(): should
+        # factor it out if nulltagcache is going to stick around for the
+        # long term
+        heads = self.repo.heads()
+        fnodes = {}                     # map head node to .hgtags filenode
+        for head in heads:
+            cctx = self.repo[head]
+            try:
+                fnodes[head] = cctx.filenode('.hgtags')
+            except error.LookupError:
+                pass                    # no .hgtags file on this head
+
+        return (heads, fnodes)
+
+    def writecache(self, heads, tagfnode):
+        pass                            # nothing to persist
+
+class tagcache(object):
+    '''Object for managing persistent tag cache.  Only needs to live for
+    as long as it takes to open, read, update, and write the cache.'''
+
+    # The tag cache stores only info about heads, not the tag contents
+    # from each head.  I.e. this doesn't try to squeeze out the maximum
+    # performance, but it is simpler and has a better chance at actually
+    # working correctly.  And this gives the biggest performance win: it
+    # avoids looking up .hgtags in the manifest for every head, and it
+    # can avoid calling heads() at all if there have been no changes to
+    # the repo.
+    #
+    # Caching .hgtags content is unexpectedly difficult because of
+    # strip/rollback and tag rank.  I suspect the only way to get it
+    # right is to store the exact data from .hgtags on each head -- in
+    # which case why not just read it straight from the .hgtags revlog?
+    # So that is exactly what we do.
+
+    __slots__ = ['ui',
+                 'repo',
+                 'shouldwrite',
+                ]
+
+    cacheversion = 0                    # format version
+
+    def __init__(self, ui, repo):
+        self.ui = ui
+        self.repo = repo
+        self.shouldwrite = False
+
+    def readcache(self):
+        '''Read the tag cache and return a tuple (heads, fnodes).  heads
+        is the list of all heads currently in the repository (ordered
+        from tip to oldest) and fnodes is a mapping from head to .hgtags
+        filenode.  Caller is responsible for reading tag info from each
+        head.'''
+
+        (ui, repo) = (self.ui, self.repo)
+        try:
+            # XXX should we treat this as a binary file (really, text
+            # Unix line endings on all platforms) from the beginning?
+            # That might avoid difficulties in reading the first 16
+            # bytes if we switch to a truly binary format in future (see
+            # _readversion()).
+            cachefile = repo.opener('tags.cache', 'r')
+            #ui.debug('reading tag cache from %s\n' % cachefile.name)
+        except IOError:
+            cachefile = None
+        else:
+            canuse = self._readversion(cachefile)
+            if not canuse:
+                cachefile.close()
+                cachefile = None
+
+        # The cache file consists of lines like
+        #   <headrev> <headnode> [<tagnode>]
+        # where <headrev> and <headnode> redundantly identify a
+        # repository head from the time the cache was written, and
+        # <tagnode> is the filenode of .hgtags on that head.  Heads with
+        # no .hgtags file will have no <tagnode>.  The cache is ordered
+        # from tip to oldest (which is why <headrev> is there: a quick
+        # visual check is all that's required to ensure correct order).
+        #
+        # This information is enough to let us avoid the most expensive
+        # part of finding global tags, which is looking up <tagnode> in
+        # the manifest for each head.
+        cacheheads = []                 # list of headnode
+        cachefnode = {}                 # map headnode to filenode
+        if cachefile:
+            for line in cachefile:
+                line = line.rstrip().split()
+                head = bin(line[1])
+                cacheheads.append(head)
+                if len(line) == 3:
+                    cachefnode[head] = bin(line[2])
+            cachefile.close()           # done reading: don't leak the handle
+
+        # Determine which heads have been destroyed by strip or
+        # rollback.  That'll ensure that writecache() writes accurate
+        # data, and it makes life easier for
+        # localrepo._findglobaltags().
+        goodheads = []
+        for head in cacheheads:
+            try:
+                repo.changelog.rev(head)
+                goodheads.append(head)
+            except error.LookupError:
+                pass
+        if len(goodheads) < len(cacheheads):
+            self.shouldwrite = True
+
+        # Optimization #1: if the first head == current tip, there have
+        # been no new changesets since the cache was written.  All we
+        # need to do is tell our caller to read .hgtags from each
+        # relevant head.
+        if goodheads and goodheads[0] == repo.changelog.tip():
+            #ui.debug('tag cache: tip not changed, so cache is up-to-date\n')
+            return (goodheads, cachefnode)
+
+        # Tip has changed, so we have to find new heads.
+        currentheads = repo.heads()
+        cached = set(goodheads)         # build once, not once per head
+        newheads = [head for head in currentheads
+                    if head not in cached]
+        if newheads:
+            self.shouldwrite = True
+        #ui.debug('tag cache: found %d uncached head(s)\n' % len(newheads))
+
+        # Now we have to lookup the .hgtags filenode for every new head.
+        # This is the most expensive part of finding tags, so
+        # performance will depend primarily on the size of newheads.
+        # When there is no cache file, newheads == currentheads, so
+        # that's the worst case.
+        for head in newheads:
+            cctx = self.repo[head]
+            try:
+                fnode = cctx.filenode('.hgtags')
+                cachefnode[head] = fnode
+            except error.LookupError:
+                # no .hgtags file on this head
+                pass
+
+        # Everything in newheads should be closer to tip than everything
+        # in goodheads.  And both are already sorted tip-to-oldest, so
+        # we can just concatenate them.  (XXX unless someone edits the
+        # cache file in a sneaky attempt to trip us up.)
+        return (newheads + goodheads, cachefnode)
+
+    def _readversion(self, cachefile):
+        '''Read the first line of the cache file, which contains the
+        file version number.  Return true if we can use this cache file.'''
+
+        # The first line looks like 'hgtagcache 0000', where 0000 is the
+        # cache file version number in hex.  (This should make it
+        # possible to switch to a binary format if necessary in future,
+        # where the first 16 bytes will be 'hgtagcache xxxx\n'.)
+        firstline = cachefile.readline()        # '' if the file is empty
+        if not (len(firstline) == 16 and
+                firstline[:11] == 'hgtagcache '):
+            self.ui.warn(_('invalid tag cache file (ignoring it)\n'))
+            return False
+        else:
+            try:
+                version = int(firstline[11:15], 16)
+            except ValueError:
+                self.ui.warn(_('invalid tag cache version (%r) '
+                               '(ignoring file)\n') % firstline[11:15])
+                return False
+
+        if version > self.cacheversion:
+            self.ui.warn(_('tag cache file from a later Mercurial version '
+                           '(ignoring it)\n'))
+            return False
+
+        return True
+
+    def writecache(self, heads, tagfnode):
+        if not self.shouldwrite:
+            return
+
+        cachefile = self.repo.opener('tags.cache', 'w', atomictemp=True)
+        #self.ui.debug('writing cache file %s\n' % cachefile.name)
+
+        cachefile.write('hgtagcache %04x\n' % self.cacheversion)
+        for head in heads:
+            rev = self.repo[head].rev()
+            fnode = tagfnode.get(head)
+            if fnode:
+                cachefile.write('%d %s %s\n' % (rev, hex(head), hex(fnode)))
+            else:
+                cachefile.write('%d %s\n' % (rev, hex(head)))
+
+        cachefile.rename()
+        cachefile.close()
+
+
 # used to avoid circular references so destructors work
 def aftertrans(files):
     renamefiles = [tuple(t) for t in files]
diff --git a/tests/test-empty.out b/tests/test-empty.out
--- a/tests/test-empty.out
+++ b/tests/test-empty.out
@@ -19,3 +19,4 @@
 hgrc
 requires
 store
+tags.cache
diff --git a/tests/test-tags b/tests/test-tags
--- a/tests/test-tags
+++ b/tests/test-tags
@@ -4,7 +4,9 @@
 mkdir t
 cd t
 hg init
+[ -f .hg/tags.cache ] && echo "tag cache exists" || echo "no tag cache"
 hg id
+[ -f .hg/tags.cache ] && echo "tag cache exists" || echo "no tag cache"
 echo a > a
 hg add a
 hg commit -m "test"
@@ -25,6 +27,10 @@
 hg tags
 hg identify
 
+# repeat with cold tag cache
+rm -f .hg/tags.cache
+hg identify
+
 echo "% create a branch"
 echo bb > a
 hg status
diff --git a/tests/test-tags.out b/tests/test-tags.out
--- a/tests/test-tags.out
+++ b/tests/test-tags.out
@@ -1,5 +1,7 @@
 % setup
+no tag cache
 000000000000 tip
+tag cache exists
 0 files updated, 0 files merged, 0 files removed, 0 files unresolved
 acb14030fe0a tip
 % create local tag with long name
@@ -10,6 +12,7 @@
 tip                                1:b9154636be93
 first                              0:acb14030fe0a
 b9154636be93 tip
+b9154636be93 tip
 % create a branch
 M a
 b9154636be93+ tip
@@ -73,7 +76,11 @@
 rev 4: .hgtags:
 bbd179dfa0a71671c253b3ae0aa1513b60d199fa bar
 .hg/tags.cache:
-no such file
+hgtagcache 0000
+4 0c192d7d5e6b78a714de54a2e9627952a877e25a 0c04f2a8af31de17fab7422878ee5a2dadbc943d
+3 6fa450212aeb2a21ed616a54aea39a4a27894cd7 7d3b718c964ef37b89e550ebdafd5789e76ce1b0
+2 7a94127795a33c10a370c93f731fd9fea0b79af6 0c04f2a8af31de17fab7422878ee5a2dadbc943d
+0 bbd179dfa0a71671c253b3ae0aa1513b60d199fa
 % test tag removal
 changeset:   5:5f6e8655b1c7
 tag:         tip