[PATCH] tags: extract .hgtags filenodes cache to a standalone file

Gregory Szorc gregory.szorc at gmail.com
Wed Apr 15 13:47:20 UTC 2015


# HG changeset patch
# User Gregory Szorc <gregory.szorc at gmail.com>
# Date 1429105088 14400
#      Wed Apr 15 09:38:08 2015 -0400
# Node ID 5fc661ca70da26fa42465eb490cf9ba9a8a2d18f
# Parent  4b085f907221b1c9c22e697849e362944dc66791
tags: extract .hgtags filenodes cache to a standalone file

Resolution of .hgtags filenodes values has historically been a
performance pain point for large repositories, where reading individual
manifests can take over 100ms. Multiplied by hundreds or even thousands
of heads and resolving .hgtags filenodes becomes a performance issue.

This patch establishes a standalone cache file holding the .hgtags
filenodes for each changeset. After this patch, the .hgtags filenode
for any particular changeset should only have to be computed once
during the lifetime of the repository.

The introduced hgtagsfnodes1 cache file is modeled after the rev branch
cache: the cache is effectively an array of entries consisting of a
changeset fragment and the filenode for a revision. The file grows in
proportion to the length of the repository (24 bytes per changeset) and
is truncated when the repository is stripped. The file is not written
unless tag info is requested and tags have changed since last time.

This patch partially addresses issue4550. Future patches will split the
"tags" cache file into per-filter files and will refactor the cache
format to not capture the .hgtags fnodes, as these are now stored in
the hgtagsfnodes1 cache. This patch is capable of standing alone. We
should not have to wait on the tags cache filter split and format
refactor for this patch to land.

diff --git a/mercurial/tags.py b/mercurial/tags.py
--- a/mercurial/tags.py
+++ b/mercurial/tags.py
@@ -14,17 +14,38 @@ from node import nullid, bin, hex, short
 from i18n import _
 import util
 import encoding
 import error
+from array import array
 import errno
 import time
 
-# The tags cache stores information about heads and the history of tags.
+# Tags computation can be expensive and caches exist to make it fast in
+# the common case.
+#
+# The "hgtagsfnodes1" cache file caches the .hgtags filenode values for
+# each revision in the repository. The file is effectively an array of
+# fixed length records. Read the docs for "hgtagsfnodescache" for technical
+# details.
+#
+# The .hgtags filenode cache grows in proportion to the length of the
+# changelog. The file is truncated when the # changelog is stripped.
+#
+# The purpose of the filenode cache is to avoid the most expensive part
+# of finding global tags, which is looking up the .hgtags filenode in the
+# manifest for each head. This can take dozens or over 100ms for
+# repositories with very large manifests. Multiplied by dozens or even
+# hundreds of heads and there is a significant performance concern.
+#
+# The "tags" cache stores information about heads and the history of tags.
 #
 # The cache file consists of two parts. The first part maps head nodes
 # to .hgtags filenodes. The second part is a history of tags. The two
 # parts are separated by an empty line.
 #
+# The filenodes part of "tags" has effectively been superseded by
+# "hgtagsfnodes1." It is being kept around for backwards compatbility.
+#
 # The first part consists of lines of the form:
 #
 #   <headrev> <headnode> [<hgtagsnode>]
 #
@@ -39,13 +60,8 @@ import time
 # <headrev> is there: a quick check of the tip from when the cache was
 # written against the current tip is all that is needed to check whether
 # the cache is up to date).
 #
-# The purpose of the filenode cache is to avoid the most expensive part
-# of finding global tags, which is looking up the .hgtags filenode in the
-# manifest for each head. This can take over a minute on repositories
-# that have large manifests and many heads.
-#
 # The second part of the tags cache consists of lines of the form:
 #
 #   <node> <tag>
 #
@@ -323,22 +339,25 @@ def _readtagcache(ui, repo):
     # Now we have to lookup the .hgtags filenode for every new head.
     # This is the most expensive part of finding tags, so performance
     # depends primarily on the size of newheads.  Worst case: no cache
     # file, so newheads == repoheads.
-    for head in reversed(newheads):
-        cctx = repo[head]
-        try:
-            fnode = cctx.filenode('.hgtags')
-            cachefnode[head] = fnode
-        except error.LookupError:
-            # no .hgtags file on this head
-            pass
+    fnodescache = hgtagsfnodescache(repo.unfiltered())
+    try:
+        for head in reversed(newheads):
+            fnode = fnodescache.getfnode(head)
+            if fnode != nullid:
+                cachefnode[head] = fnode
+
+        fnodescache.write()
+    finally:
+        # Break cycle.
+        fnodescache._repo = None
 
     duration = time.time() - starttime
     ui.log('tagscache',
-           'resolved %d tags cache entries from %d manifests in %0.4f '
+           '%d/%d cache hits/lookups in %0.4f '
            'seconds\n',
-           len(cachefnode), len(newheads), duration)
+           fnodescache.hitcount, fnodescache.lookupcount, duration)
 
     # Caller has to iterate over all heads, but can use the filenodes in
     # cachefnode to get to each .hgtags revision quickly.
     return (repoheads, cachefnode, None, True)
@@ -387,4 +406,129 @@ def _writetagcache(ui, repo, heads, tagf
     try:
         cachefile.close()
     except (OSError, IOError):
         pass
+
+_fnodescachefile = 'cache/hgtagsfnodes1'
+_fnodesrecsize = 4 + 20 # changeset fragment + filenode
+_fnodesmissingrec = '\xff' * 24
+
+class hgtagsfnodescache(object):
+    """Persisent cache mapping revisions to .hgtags filenodes.
+
+    The cache is an array of records. Each item in the array corresponds to
+    a changelog revision. Values in the array contain the first 4 bytes of
+    the node hash and the 20 bytes .hgtags filenode for that revision.
+
+    The first 4 bytes are present as a form of verification. Repository
+    stripping and rewriting may change the node at a numeric revision in the
+    changelog. The changeset fragment serves as a verifier to detect
+    rewriting. This logic is shared with the rev branch cache (see
+    branchmap.py).
+
+    The instance holds in memory the full cache content but entries are
+    only parsed on read.
+
+    Instances behave like lists. ``c[i]`` works where i is a rev or
+    changeset node. Missing indexes are populated automatically on access.
+    """
+    def __init__(self, repo):
+        assert repo.filtername is None
+
+        # This creates a cycle.
+        # ASSERTION: _readtagcache is only consumer and it manually breaks
+        # the cycle.
+        self._repo = repo
+
+        self.lookupcount = 0
+        self.hitcount = 0
+        self._dirtyoffset = None
+        self._raw = array('c')
+
+        data = repo.vfs.tryread(_fnodescachefile)
+        self._raw.fromstring(data)
+
+        # Adjust the array to agree with the length of the repo.
+        cllen = len(repo.changelog)
+        wantedlen = cllen * _fnodesrecsize
+        rawlen = len(self._raw)
+
+        if rawlen < wantedlen:
+            self._dirtyoffset = rawlen
+            self._raw.extend('\xff' * (wantedlen - rawlen))
+        elif rawlen > wantedlen:
+            for i in range(rawlen - wantedlen):
+                self._raw.pop()
+            self._dirtyoffset = len(self._raw)
+
+    def getfnode(self, node):
+        """Obtain the filenode of the .hgtags file at a specified revision.
+
+        If the value is in the cache, the entry will be validated and returned.
+        Otherwise, the filenode will be computed and returned.
+
+        If an .hgtags does not exist at the specified revision, nullid is
+        returned.
+        """
+        ctx = self._repo[node]
+        rev = ctx.rev()
+
+        self.lookupcount += 1
+
+        offset = rev * _fnodesrecsize
+        record = self._raw[offset:offset + _fnodesrecsize].tostring()
+        properprefix = node[0:4]
+
+        # Validate and return existing entry.
+        if record != _fnodesmissingrec:
+            fileprefix = record[0:4]
+
+            if fileprefix == properprefix:
+                self.hitcount += 1
+                return record[4:]
+
+            # Fall through.
+
+        # If we get here, the entry is either missing or invalid. Populate it.
+        try:
+            fnode = ctx.filenode('.hgtags')
+        except error.LookupError:
+            # No .hgtags file on this revision.
+            fnode = nullid
+
+        # Slices on array only accept other array.
+        entry = array('c', properprefix + fnode)
+        self._raw[offset:offset + _fnodesrecsize] = entry
+        # self._dirtyoffset could be None.
+        self._dirtyoffset = min(self._dirtyoffset, offset) or 0
+
+        return fnode
+
+    def write(self):
+        """Perform all necessary writes to cache file.
+
+        This may no-op if no writes are needed.
+        """
+        if self._dirtyoffset is None:
+            return
+
+        data = self._raw[self._dirtyoffset:]
+        if not data:
+            return
+
+        repo = self._repo
+
+        try:
+            f = repo.vfs.open(_fnodescachefile, 'ab')
+            f.seek(self._dirtyoffset)
+            f.truncate()
+            repo.ui.log('tagscache',
+                        'writing %d bytes to %s\n' % (
+                        len(data), _fnodescachefile))
+            f.write(data)
+            f.close()
+            self._dirtyoffset = None
+        except (IOError, OSError), inst:
+            repo.ui.log('tagscache',
+                        "couldn't write %s: %s\n" % (
+                        _fnodescachefile, inst))
+            return
diff --git a/tests/test-tags.t b/tests/test-tags.t
--- a/tests/test-tags.t
+++ b/tests/test-tags.t
@@ -11,8 +11,12 @@ Helper functions:
   $ cacheexists() {
   >   [ -f .hg/cache/tags ] && echo "tag cache exists" || echo "no tag cache"
   > }
 
+  $ fnodescacheexists() {
+  >   [ -f .hg/cache/hgtagsfnodes1 ] && echo "fnodes cache exists" || echo "no fnodes cache"
+  > }
+
   $ dumptags() {
   >     rev=$1
   >     echo "rev $rev: .hgtags:"
   >     hg cat -r$rev .hgtags
@@ -27,12 +31,16 @@ Setup:
   $ hg init t
   $ cd t
   $ cacheexists
   no tag cache
+  $ fnodescacheexists
+  no fnodes cache
   $ hg id
   000000000000 tip
   $ cacheexists
   no tag cache
+  $ fnodescacheexists
+  no fnodes cache
   $ echo a > a
   $ hg add a
   $ hg commit -m "test"
   $ hg co
@@ -40,16 +48,22 @@ Setup:
   $ hg identify
   acb14030fe0a tip
   $ cacheexists
   tag cache exists
+No fnodes cache because .hgtags file doesn't exist
+(this is an implementation detail)
+  $ fnodescacheexists
+  no fnodes cache
 
 Try corrupting the cache
 
   $ printf 'a b' > .hg/cache/tags
   $ hg identify
   acb14030fe0a tip
   $ cacheexists
   tag cache exists
+  $ fnodescacheexists
+  no fnodes cache
   $ hg identify
   acb14030fe0a tip
 
 Create local tag with long name:
@@ -73,18 +87,38 @@ Create a tag behind hg's back:
   first                              0:acb14030fe0a
   $ hg identify
   b9154636be93 tip
 
+We should have a fnodes cache now that we have a real tag
+The cache should have an empty entry for rev 0 and a valid entry for rev 1.
+
+
+  $ fnodescacheexists
+  fnodes cache exists
+  $ f --size --hexdump .hg/cache/hgtagsfnodes1
+  .hg/cache/hgtagsfnodes1: size=48
+  0000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
+  0010: ff ff ff ff ff ff ff ff b9 15 46 36 26 b7 b4 a7 |..........F6&...|
+  0020: 73 e0 9e e3 c5 2f 51 0e 19 e0 5e 1f f9 66 d8 59 |s..../Q...^..f.Y|
+
 Repeat with cold tag cache:
 
-  $ rm -f .hg/cache/tags
+  $ rm -f .hg/cache/tags .hg/cache/hgtagsfnodes1
   $ hg identify
   b9154636be93 tip
 
+  $ fnodescacheexists
+  fnodes cache exists
+  $ f --size --hexdump .hg/cache/hgtagsfnodes1
+  .hg/cache/hgtagsfnodes1: size=48
+  0000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
+  0010: ff ff ff ff ff ff ff ff b9 15 46 36 26 b7 b4 a7 |..........F6&...|
+  0020: 73 e0 9e e3 c5 2f 51 0e 19 e0 5e 1f f9 66 d8 59 |s..../Q...^..f.Y|
+
 And again, but now unable to write tag cache:
 
 #if unix-permissions
-  $ rm -f .hg/cache/tags
+  $ rm -f .hg/cache/tags .hg/cache/hgtagsfnodes1
   $ chmod 555 .hg
   $ hg identify
   b9154636be93 tip
   $ chmod 755 .hg
@@ -96,9 +130,9 @@ Tag cache debug info written to blackbox
   $ hg identify
   b9154636be93 tip
   $ hg blackbox -l 4
   1970/01/01 00:00:00 bob> identify
-  1970/01/01 00:00:00 bob> resolved 1 tags cache entries from 1 manifests in ?.???? seconds (glob)
+  1970/01/01 00:00:00 bob> 1/1 cache hits/lookups in * seconds (glob)
   1970/01/01 00:00:00 bob> writing tags cache file with 1 heads and 1 tags
   1970/01/01 00:00:00 bob> identify exited 0 after ?.?? seconds (glob)
 
 
@@ -120,11 +154,31 @@ Create a branch:
   $ echo 1 > b
   $ hg add b
   $ hg commit -m "branch"
   created new head
+
+Creating a new commit shouldn't append the .hgtags fnodes cache until
+tags info is accessed
+
+  $ f --size --hexdump .hg/cache/hgtagsfnodes1
+  .hg/cache/hgtagsfnodes1: size=48
+  0000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
+  0010: ff ff ff ff ff ff ff ff b9 15 46 36 26 b7 b4 a7 |..........F6&...|
+  0020: 73 e0 9e e3 c5 2f 51 0e 19 e0 5e 1f f9 66 d8 59 |s..../Q...^..f.Y|
+
   $ hg id
   c8edf04160c7 tip
 
+First 4 bytes of record 3 are changeset fragment
+
+  $ f --size --hexdump .hg/cache/hgtagsfnodes1
+  .hg/cache/hgtagsfnodes1: size=72
+  0000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
+  0010: ff ff ff ff ff ff ff ff b9 15 46 36 26 b7 b4 a7 |..........F6&...|
+  0020: 73 e0 9e e3 c5 2f 51 0e 19 e0 5e 1f f9 66 d8 59 |s..../Q...^..f.Y|
+  0030: c8 ed f0 41 00 00 00 00 00 00 00 00 00 00 00 00 |...A............|
+  0040: 00 00 00 00 00 00 00 00                         |........|
+
 Merge the two heads:
 
   $ hg merge 1
   1 files updated, 0 files merged, 0 files removed, 0 files unresolved
@@ -243,8 +297,109 @@ Dump cache:
   bbd179dfa0a71671c253b3ae0aa1513b60d199fa bar
   bbd179dfa0a71671c253b3ae0aa1513b60d199fa bar
   78391a272241d70354aa14c874552cad6b51bb42 bar
 
+  $ f --size --hexdump .hg/cache/hgtagsfnodes1
+  .hg/cache/hgtagsfnodes1: size=120
+  0000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
+  0010: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
+  0020: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
+  0030: 7a 94 12 77 0c 04 f2 a8 af 31 de 17 fa b7 42 28 |z..w.....1....B(|
+  0040: 78 ee 5a 2d ad bc 94 3d 6f a4 50 21 7d 3b 71 8c |x.Z-...=o.P!};q.|
+  0050: 96 4e f3 7b 89 e5 50 eb da fd 57 89 e7 6c e1 b0 |.N.{..P...W..l..|
+  0060: 0c 19 2d 7d 0c 04 f2 a8 af 31 de 17 fa b7 42 28 |..-}.....1....B(|
+  0070: 78 ee 5a 2d ad bc 94 3d                         |x.Z-...=|
+
+Corrupt the .hgtags fnodes cache
+Extra junk data at the end should get overwritten on next cache update
+
+  $ echo extra >> .hg/cache/hgtagsfnodes1
+  $ echo dummy1 > foo
+  $ hg commit -m throwaway1
+
+  $ hg tags
+  tip                                5:8dbfe60eff30
+  bar                                1:78391a272241
+
+  $ hg blackbox -l 5
+  1970/01/01 00:00:00 bob> tags
+  1970/01/01 00:00:00 bob> writing 24 bytes to cache/hgtagsfnodes1
+  1970/01/01 00:00:00 bob> 0/1 cache hits/lookups in * seconds (glob)
+  1970/01/01 00:00:00 bob> writing tags cache file with 3 heads and 1 tags
+  1970/01/01 00:00:00 bob> tags exited 0 after * seconds (glob)
+
+#if unix-permissions no-root
+Errors writing to .hgtags fnodes cache are silently ignored
+
+  $ echo dummy2 > foo
+  $ hg commit -m throwaway2
+
+  $ chmod a-w .hg/cache/hgtagsfnodes1
+  $ rm -f .hg/cache/tags
+
+  $ hg tags
+  tip                                6:b968051b5cf3
+  bar                                1:78391a272241
+
+  $ hg blackbox -l 5
+  1970/01/01 00:00:00 bob> tags
+  1970/01/01 00:00:00 bob> couldn't write cache/hgtagsfnodes1: [Errno 13] Permission denied: '$TESTTMP/t2/.hg/cache/hgtagsfnodes1'
+  1970/01/01 00:00:00 bob> 2/3 cache hits/lookups in * seconds (glob)
+  1970/01/01 00:00:00 bob> writing tags cache file with 3 heads and 1 tags
+  1970/01/01 00:00:00 bob> tags exited 0 after * seconds (glob)
+
+  $ chmod a+w .hg/cache/hgtagsfnodes1
+#endif
+
+  $ rm -f .hg/cache/tags
+  $ hg tags
+  tip                                6:b968051b5cf3
+  bar                                1:78391a272241
+
+  $ hg blackbox -l 5
+  1970/01/01 00:00:00 bob> tags
+  1970/01/01 00:00:00 bob> writing 24 bytes to cache/hgtagsfnodes1
+  1970/01/01 00:00:00 bob> 2/3 cache hits/lookups in * seconds (glob)
+  1970/01/01 00:00:00 bob> writing tags cache file with 3 heads and 1 tags
+  1970/01/01 00:00:00 bob> tags exited 0 after * seconds (glob)
+
+  $ f --size .hg/cache/hgtagsfnodes1
+  .hg/cache/hgtagsfnodes1: size=168
+
+Stripping doesn't truncate the tags cache until new data is available
+
+  $ hg -q --config extensions.strip= strip -r 5 --no-backup
+  $ hg tags
+  tip                                4:0c192d7d5e6b
+  bar                                1:78391a272241
+
+  $ hg blackbox -l 4
+  1970/01/01 00:00:00 bob> tags
+  1970/01/01 00:00:00 bob> 1/1 cache hits/lookups in * seconds (glob)
+  1970/01/01 00:00:00 bob> writing tags cache file with 3 heads and 1 tags
+  1970/01/01 00:00:00 bob> tags exited 0 after * seconds (glob)
+
+  $ f --size .hg/cache/hgtagsfnodes1
+  .hg/cache/hgtagsfnodes1: size=168
+
+  $ echo dummy > foo
+  $ hg commit -m throwaway3
+
+  $ hg tags
+  tip                                5:035f65efb448
+  bar                                1:78391a272241
+
+  $ hg blackbox -l 5
+  1970/01/01 00:00:00 bob> tags
+  1970/01/01 00:00:00 bob> writing 24 bytes to cache/hgtagsfnodes1
+  1970/01/01 00:00:00 bob> 0/1 cache hits/lookups in * seconds (glob)
+  1970/01/01 00:00:00 bob> writing tags cache file with 3 heads and 1 tags
+  1970/01/01 00:00:00 bob> tags exited 0 after * seconds (glob)
+  $ f --size .hg/cache/hgtagsfnodes1
+  .hg/cache/hgtagsfnodes1: size=144
+
+  $ hg -q --config extensions.strip= strip -r 5 --no-backup
+
 Test tag removal:
 
   $ hg tag --remove bar     # rev 5
   $ hg tip -vp


More information about the Mercurial-devel mailing list