[PATCH 2 of 2 v3] manifest: persist the manifestfulltext cache

Martijn Pieters mj at zopatista.com
Wed Aug 1 17:29:28 EDT 2018


# HG changeset patch
# User Martijn Pieters <mj at zopatista.com>
# Date 1533058674 -7200
#      Tue Jul 31 19:37:54 2018 +0200
# Branch stable
# Node ID 92180b2297bd8bff6e2f3bbbbbc9d6e4ea71ed9e
# Parent  23e31b60983e3fb78aeb8822de8c630798f46447
# EXP-Topic persistentmanifestcache
manifest: persist the manifestfulltext cache

Reconstructing the manifest from the revlog takes time, so much so that there
already is a LRU cache to avoid having to load a manifest multiple times.

This patch persists that LRU cache in the .hg/cache directory, so we can re-use
this cache across hg commands. Commit benchmark (run on Macos 10.13 on a
2017-model Macbook Pro with Core i7 2.9GHz and flash drive), testing without
and with patch run 5 times, baseline is r2a227782e754:

* committing to an existing file, against the mozilla-central repository.
  Baseline real time average 1.9692, with patch 1.3786.

A new debugcommand "hg debugmanifestfulltextcache" lets you inspect the cache,
clear it, or add specific manifest nodeids to it. When calling
repo.updatecaches(), the manifest(s) for the working copy parents are added to
the cache.

The hg perfmanifest command has an additional --clear-disk switch to clear this
cache when testing manifest loading performance.

Using this command to test performance on the firefox repository for revision
f947d902ed91, whose manifest has a delta chain length of 60540, we see:

$ hg perfmanifest f947d902ed91 --clear-disk
! wall 0.972253 comb 0.970000 user 0.850000 sys 0.120000 (best of 10)
$ hg debugmanifestfulltextcache -a `hg log --debug -r f947d902ed91 | grep manifest | cut -d: -f3`
Cache contains 1 manifest entries, in order of most to least recent:
id: 0294517df4aad07c70701db43bc7ff24c3ce7dbc, size 25.6 MB
Total cache data size 25.6 MB, on-disk 0 bytes
$ hg perfmanifest f947d902ed91
! wall 0.036748 comb 0.040000 user 0.020000 sys 0.020000 (best of 100)

Worst-case scenario: a manifest text loaded from a single delta; in the firefox
repository manifest node 9a1246ff762e is the chain base for the manifest
attached to revision f947d902ed91. Loading this from a full cache file is just
as fast as without the cache; the extra node ids ensure a big full cache:

$ for node in 9a1246ff762e 1a1922c14a3e 54a31d11a36a 0294517df4aa; do
>   hgd debugmanifestfulltextcache -a $node > /dev/null
> done
$ hgd perfmanifest -m 9a1246ff762e
! wall 0.077513 comb 0.080000 user 0.030000 sys 0.050000 (best of 100)
$ hgd perfmanifest -m 9a1246ff762e --clear-disk
! wall 0.078547 comb 0.080000 user 0.070000 sys 0.010000 (best of 100)

diff -r 23e31b60983e -r 92180b2297bd contrib/perf.py
--- a/contrib/perf.py	Wed Aug 01 23:04:02 2018 +0200
+++ b/contrib/perf.py	Tue Jul 31 19:37:54 2018 +0200
@@ -793,8 +793,9 @@
 
 @command('perfmanifest', [
             ('m', 'manifest-rev', False, 'Look up a manifest node revision'),
+            ('', 'clear-disk', False, 'clear on-disk caches too'),
          ], 'REV|NODE')
-def perfmanifest(ui, repo, rev, manifest_rev=False, **opts):
+def perfmanifest(ui, repo, rev, manifest_rev=False, clear_disk=False, **opts):
     """benchmark the time to read a manifest from disk and return a usable
     dict-like object
 
@@ -806,7 +807,7 @@
     else:
         t = repo.manifestlog._revlog.lookup(rev)
     def d():
-        repo.manifestlog.clearcaches()
+        repo.manifestlog.clearcaches(clear_persisted_data=clear_disk)
         repo.manifestlog[t].read()
     timer(d)
     fm.end()
diff -r 23e31b60983e -r 92180b2297bd mercurial/debugcommands.py
--- a/mercurial/debugcommands.py	Wed Aug 01 23:04:02 2018 +0200
+++ b/mercurial/debugcommands.py	Tue Jul 31 19:37:54 2018 +0200
@@ -1446,6 +1446,53 @@
 
     return held
 
+ at command('debugmanifestfulltextcache', [
+        ('', 'clear', False, _('clear the cache')),
+        ('a', 'add', '', _('add the given manifest node to the cache'),
+         _('NODE'))
+    ], '')
+def debugmanifestfulltextcache(ui, repo, add=None, **opts):
+    """show, clear or amend the contents of the manifest fulltext cache"""
+    with repo.lock():
+        r = repo.manifestlog._revlog
+        try:
+            cache = r._fulltextcache
+        except AttributeError:
+            ui.warn(_(
+                "Current revlog implementation doesn't appear to have a "
+                'manifest fulltext cache\n'))
+            return
+
+        if opts.get(r'clear'):
+            cache.clear()
+
+        if add:
+            try:
+                manifest = repo.manifestlog[r.lookup(add)]
+            except error.LookupError as e:
+                raise error.Abort(e, hint="Check your manifest node id")
+            manifest.read()  # stores revisision in cache too
+
+        if not len(cache):
+            ui.write(_('Cache empty'))
+        else:
+            ui.write(
+                _('Cache contains %d manifest entries, in order of most to '
+                  'least recent:\n') % (len(cache),))
+            totalsize = 0
+            for nodeid in cache:
+                # Use cache.get to not update the LRU order
+                data = cache.get(nodeid)
+                size = len(data)
+                totalsize += size + 24   # 20 bytes nodeid, 4 bytes size
+                ui.write(_('id: %s, size %s\n') % (
+                    hex(nodeid), util.bytecount(size)))
+            ondisk = cache._opener.stat('manifestfulltextcache').st_size
+            ui.write(
+                _('Total cache data size %s, on-disk %s\n') % (
+                    util.bytecount(totalsize), util.bytecount(ondisk))
+            )
+
 @command('debugmergestate', [], '')
 def debugmergestate(ui, repo, *args):
     """print merge state
diff -r 23e31b60983e -r 92180b2297bd mercurial/localrepo.py
--- a/mercurial/localrepo.py	Wed Aug 01 23:04:02 2018 +0200
+++ b/mercurial/localrepo.py	Tue Jul 31 19:37:54 2018 +0200
@@ -1610,6 +1610,10 @@
                 rbc.branchinfo(r)
             rbc.write()
 
+            # ensure the working copy parents are in the manifestfulltextcache
+            for ctx in self['.'].parents():
+                ctx.manifest()  # accessing the manifest is enough
+
     def invalidatecaches(self):
 
         if '_tagscache' in vars(self):
diff -r 23e31b60983e -r 92180b2297bd mercurial/manifest.py
--- a/mercurial/manifest.py	Wed Aug 01 23:04:02 2018 +0200
+++ b/mercurial/manifest.py	Tue Jul 31 19:37:54 2018 +0200
@@ -10,6 +10,7 @@
 import heapq
 import itertools
 import struct
+import weakref
 
 from .i18n import _
 from .node import (
@@ -1136,6 +1137,115 @@
             for subtree in subm.walksubtrees(matcher=matcher):
                 yield subtree
 
+class manifestfulltextcache(util.lrucachedict):
+    """File-backed LRU cache for the manifest cache
+
+    File consists of entries, up to EOF:
+
+    - 20 bytes node, 4 bytes length, <length> manifest data
+
+    These are written in reverse cache order (oldest to newest).
+
+    """
+    def __init__(self, max):
+        super(manifestfulltextcache, self).__init__(max)
+        self._dirty = False
+        self._read = False
+        self._opener = None
+
+    def read(self):
+        if self._read or self._opener is None:
+            return
+
+        try:
+            with self._opener('manifestfulltextcache') as fp:
+                set = super(manifestfulltextcache, self).__setitem__
+                # ignore trailing data, this is a cache, corruption is skipped
+                while True:
+                    node = fp.read(20)
+                    if len(node) < 20:
+                        break
+                    try:
+                        size = struct.unpack('>L', fp.read(4))[0]
+                    except struct.error:
+                        break
+                    value = bytearray(fp.read(size))
+                    if len(value) != size:
+                        break
+                    set(node, value)
+        except IOError:
+            # the file is allowed to be missing
+            pass
+
+        self._read = True
+        self._dirty = False
+
+    def write(self):
+        if not self._dirty or self._opener is None:
+            return
+        # rotate backwards to the first used node
+        with self._opener(
+                'manifestfulltextcache', 'w', atomictemp=True, checkambig=True
+            ) as fp:
+            node = self._head.prev
+            while True:
+                if node.key in self._cache:
+                    fp.write(node.key)
+                    fp.write(struct.pack('>L', len(node.value)))
+                    fp.write(node.value)
+                if node is self._head:
+                    break
+                node = node.prev
+
+    def __len__(self):
+        if not self._read:
+            self.read()
+        return super(manifestfulltextcache, self).__len__()
+
+    def __contains__(self, k):
+        if not self._read:
+            self.read()
+        return super(manifestfulltextcache, self).__contains__(k)
+
+    def __iter__(self):
+        if not self._read:
+            self.read()
+        return super(manifestfulltextcache, self).__iter__()
+
+    def __getitem__(self, k):
+        if not self._read:
+            self.read()
+        # the cache lru order can change on read
+        setdirty = self._cache.get(k) is not self._head
+        value = super(manifestfulltextcache, self).__getitem__(k)
+        if setdirty:
+            self._dirty = True
+        return value
+
+    def __setitem__(self, k, v):
+        if not self._read:
+            self.read()
+        super(manifestfulltextcache, self).__setitem__(k, v)
+        self._dirty = True
+
+    def __delitem__(self, k):
+        if not self._read:
+            self.read()
+        super(manifestfulltextcache, self).__delitem__(k)
+        self._dirty = True
+
+    def get(self, k, default=None):
+        if not self._read:
+            self.read()
+        return super(manifestfulltextcache, self).get(k, default=default)
+
+    def clear(self, clear_persisted_data=False):
+        super(manifestfulltextcache, self).clear()
+        if clear_persisted_data:
+            self._dirty = True
+            self.write()
+        self._read = False
+
 class manifestrevlog(revlog.revlog):
     '''A revlog that stores manifest texts. This is responsible for caching the
     full-text manifest contents.
@@ -1164,7 +1274,7 @@
 
         self._treeondisk = optiontreemanifest or treemanifest
 
-        self._fulltextcache = util.lrucachedict(cachesize)
+        self._fulltextcache = manifestfulltextcache(cachesize)
 
         if dir:
             assert self._treeondisk, 'opts is %r' % opts
@@ -1186,13 +1296,35 @@
                                              checkambig=not bool(dir),
                                              mmaplargeindex=True)
 
+    def _setupmanifestcachehooks(self, repo):
+        """Persist the manifestfulltextcache on lock release"""
+        if not util.safehasattr(repo, '_lockref'):
+            return
+
+        self._fulltextcache._opener = repo.cachevfs
+        reporef = weakref.ref(repo)
+        manifestrevlogref = weakref.ref(self)
+
+        def persistmanifestcache():
+            repo = reporef()
+            self = manifestrevlogref()
+            if repo is None or self is None:
+                return
+            if repo.manifestlog._revlog is not self:
+                # there's a different manifest in play now, abort
+                return
+            self._fulltextcache.write()
+
+        if repo._currentlock(repo._lockref) is not None:
+            repo._afterlock(persistmanifestcache)
+
     @property
     def fulltextcache(self):
         return self._fulltextcache
 
-    def clearcaches(self):
+    def clearcaches(self, clear_persisted_data=False):
         super(manifestrevlog, self).clearcaches()
-        self._fulltextcache.clear()
+        self._fulltextcache.clear(clear_persisted_data=clear_persisted_data)
         self._dirlogcache = {'': self}
 
     def dirlog(self, d):
@@ -1288,6 +1420,7 @@
         self._treeinmem = usetreemanifest
 
         self._revlog = repo._constructmanifest()
+        self._revlog._setupmanifestcachehooks(repo)
         self._narrowmatch = repo.narrowmatch()
 
         # A cache of the manifestctx or treemanifestctx for each directory
@@ -1345,9 +1478,9 @@
             mancache[node] = m
         return m
 
-    def clearcaches(self):
+    def clearcaches(self, clear_persisted_data=False):
         self._dirmancache.clear()
-        self._revlog.clearcaches()
+        self._revlog.clearcaches(clear_persisted_data=clear_persisted_data)
 
     def rev(self, node):
         return self._revlog.rev(node)
@@ -1421,9 +1554,12 @@
                 self._data = manifestdict()
             else:
                 rl = self._revlog()
-                text = rl.revision(self._node)
-                arraytext = bytearray(text)
-                rl._fulltextcache[self._node] = arraytext
+                if self._node in rl._fulltextcache:
+                    text = pycompat.bytestr(rl._fulltextcache[self._node])
+                else:
+                    text = rl.revision(self._node)
+                    arraytext = bytearray(text)
+                    rl._fulltextcache[self._node] = arraytext
                 self._data = manifestdict(text)
         return self._data
 
@@ -1523,9 +1659,12 @@
                 m.setnode(self._node)
                 self._data = m
             else:
-                text = rl.revision(self._node)
-                arraytext = bytearray(text)
-                rl.fulltextcache[self._node] = arraytext
+                if self._node in rl.fulltextcache:
+                    text = pycompat.bytestr(rl.fulltextcache[self._node])
+                else:
+                    text = rl.revision(self._node)
+                    arraytext = bytearray(text)
+                    rl.fulltextcache[self._node] = arraytext
                 self._data = treemanifest(dir=self._dir, text=text)
 
         return self._data
diff -r 23e31b60983e -r 92180b2297bd tests/test-clone.t
--- a/tests/test-clone.t	Wed Aug 01 23:04:02 2018 +0200
+++ b/tests/test-clone.t	Tue Jul 31 19:37:54 2018 +0200
@@ -47,6 +47,7 @@
   checklink (symlink !)
   checklink-target (symlink !)
   checknoexec (execbit !)
+  manifestfulltextcache
   rbc-names-v1
   rbc-revs-v1
 
diff -r 23e31b60983e -r 92180b2297bd tests/test-completion.t
--- a/tests/test-completion.t	Wed Aug 01 23:04:02 2018 +0200
+++ b/tests/test-completion.t	Tue Jul 31 19:37:54 2018 +0200
@@ -98,6 +98,7 @@
   debugknown
   debuglabelcomplete
   debuglocks
+  debugmanifestfulltextcache
   debugmergestate
   debugnamecomplete
   debugobsolete
@@ -284,6 +285,7 @@
   debugknown: 
   debuglabelcomplete: 
   debuglocks: force-lock, force-wlock, set-lock, set-wlock
+  debugmanifestfulltextcache: clear, add
   debugmergestate: 
   debugnamecomplete: 
   debugobsolete: flags, record-parents, rev, exclusive, index, delete, date, user, template
diff -r 23e31b60983e -r 92180b2297bd tests/test-debugcommands.t
--- a/tests/test-debugcommands.t	Wed Aug 01 23:04:02 2018 +0200
+++ b/tests/test-debugcommands.t	Tue Jul 31 19:37:54 2018 +0200
@@ -411,6 +411,7 @@
   $ ls -r .hg/cache/*
   .hg/cache/rbc-revs-v1
   .hg/cache/rbc-names-v1
+  .hg/cache/manifestfulltextcache
   .hg/cache/branch2-served
 
 Test debugcolor
diff -r 23e31b60983e -r 92180b2297bd tests/test-fncache.t
--- a/tests/test-fncache.t	Wed Aug 01 23:04:02 2018 +0200
+++ b/tests/test-fncache.t	Tue Jul 31 19:37:54 2018 +0200
@@ -88,6 +88,7 @@
   .hg/00manifest.i
   .hg/cache
   .hg/cache/branch2-served
+  .hg/cache/manifestfulltextcache
   .hg/cache/rbc-names-v1
   .hg/cache/rbc-revs-v1
   .hg/data
@@ -121,6 +122,7 @@
   .hg/00changelog.i
   .hg/cache
   .hg/cache/branch2-served
+  .hg/cache/manifestfulltextcache
   .hg/cache/rbc-names-v1
   .hg/cache/rbc-revs-v1
   .hg/dirstate
diff -r 23e31b60983e -r 92180b2297bd tests/test-hardlinks.t
--- a/tests/test-hardlinks.t	Wed Aug 01 23:04:02 2018 +0200
+++ b/tests/test-hardlinks.t	Tue Jul 31 19:37:54 2018 +0200
@@ -241,6 +241,7 @@
   2 r4/.hg/cache/checkisexec (execbit !)
   ? r4/.hg/cache/checklink-target (glob) (symlink !)
   2 r4/.hg/cache/checknoexec (execbit !)
+  2 r4/.hg/cache/manifestfulltextcache
   2 r4/.hg/cache/rbc-names-v1
   2 r4/.hg/cache/rbc-revs-v1
   2 r4/.hg/dirstate
@@ -291,6 +292,7 @@
   2 r4/.hg/cache/checkisexec (execbit !)
   2 r4/.hg/cache/checklink-target (symlink !)
   2 r4/.hg/cache/checknoexec (execbit !)
+  2 r4/.hg/cache/manifestfulltextcache
   2 r4/.hg/cache/rbc-names-v1
   2 r4/.hg/cache/rbc-revs-v1
   1 r4/.hg/dirstate
diff -r 23e31b60983e -r 92180b2297bd tests/test-help.t
--- a/tests/test-help.t	Wed Aug 01 23:04:02 2018 +0200
+++ b/tests/test-help.t	Tue Jul 31 19:37:54 2018 +0200
@@ -966,6 +966,9 @@
    debuginstall  test Mercurial installation
    debugknown    test whether node ids are known to a repo
    debuglocks    show or modify state of locks
+   debugmanifestfulltextcache
+                 show, clear or amend the contents of the manifest fulltext
+                 cache
    debugmergestate
                  print merge state
    debugnamecomplete
diff -r 23e31b60983e -r 92180b2297bd tests/test-inherit-mode.t
--- a/tests/test-inherit-mode.t	Wed Aug 01 23:04:02 2018 +0200
+++ b/tests/test-inherit-mode.t	Tue Jul 31 19:37:54 2018 +0200
@@ -69,6 +69,7 @@
   00600 ./.hg/00changelog.i
   00770 ./.hg/cache/
   00660 ./.hg/cache/branch2-served
+  00660 ./.hg/cache/manifestfulltextcache
   00660 ./.hg/cache/rbc-names-v1
   00660 ./.hg/cache/rbc-revs-v1
   00660 ./.hg/dirstate
diff -r 23e31b60983e -r 92180b2297bd tests/test-share.t
--- a/tests/test-share.t	Wed Aug 01 23:04:02 2018 +0200
+++ b/tests/test-share.t	Tue Jul 31 19:37:54 2018 +0200
@@ -32,6 +32,7 @@
   [1]
   $ ls -1 ../repo1/.hg/cache
   branch2-served
+  manifestfulltextcache
   rbc-names-v1
   rbc-revs-v1
   tags2-visible


More information about the Mercurial-devel mailing list