[PATCH 1 of 2 v4] localrepo: persistent caching of branch names

Mads Kiilerich mads at kiilerich.com
Wed Oct 15 20:48:23 CDT 2014


# HG changeset patch
# User Mads Kiilerich <madski at unity3d.com>
# Date 1413424006 -7200
#      Thu Oct 16 03:46:46 2014 +0200
# Node ID d87009e720063a8d6d80afdbe6bb9323e2849030
# Parent  48c0b101a9de1fdbd638daa858da845cd05a6be7
localrepo: persistent caching of branch names

It is expensive to retrieve the branch name. Very expensive when creating a
changectx and calling .branch() - slightly less when using
changelog.branchinfo().

Now, to really speed things up, cache the results on disk. To get efficient
lookup for revisions (constant size records) and avoid storing the same branch
name over and ever, store the name of each branch once with a fixed ordering.
For each repo revision, store the node hash and the index of the branch name.
To make it 100% stable against repository mutations, always check the node hash
before using the cache content.

The code for this is kind of similar to the branchmap handling and is placed in
the same module even though the name is not completely spot on.

This new method promise to make some operations up 20 times faster once it
actually is used.

A simpler approach that didn't store and validate node hashes for every
revision was significantly faster (x2) but could be tricked when modifying
history. The usual worst case would be that the whole cache was invalidated
when the repository history was modified, but when trying very hard it could be
tricked into not noticing changes.

diff --git a/mercurial/branchmap.py b/mercurial/branchmap.py
--- a/mercurial/branchmap.py
+++ b/mercurial/branchmap.py
@@ -9,6 +9,7 @@ from node import bin, hex, nullid, nullr
 import encoding
 import util
 import time
+import struct, array
 
 def _filename(repo):
     """name of a branchcache file for a given repo or repoview"""
@@ -285,3 +286,97 @@ class branchcache(dict):
         duration = time.time() - starttime
         repo.ui.log('branchcache', 'updated %s branch cache in %.4f seconds\n',
                     repo.filtername, duration)
+
+class revbranchcache(object):
+    """Persistent cache mapping from revision number to branch name.
+    Consistency is guaranteed by verifying the node hash."""
+
+    filename = 'cache/branchnames'
+    formatversion = 2345164374
+    headerfmt = '>LLL' # file header: version, records start, records length
+    recfmt = '>20sH' # a record: node hash, branch name reference
+    headersize = struct.calcsize(headerfmt)
+    recsize = struct.calcsize(recfmt)
+
+    def __init__(self, repo):
+        self._repo = repo
+        self._loaded = False
+        self._dirty = False
+        self._names = [] # branch names referenced from recfmt records
+        self._records = array.array('c') # bytes with structs of type recfmt
+
+    def _load(self):
+        """Load cached branch names."""
+        try:
+            data = self._repo.vfs.open(self.filename).read()
+        except IOError:
+            data = ''
+
+        self._dirty = True
+        reporecslen = len(self._repo) * self.recsize
+        if len(data) >= self.headersize:
+            # header
+            v, recsstart, recslen = struct.unpack_from(self.headerfmt, data)
+            if v == self.formatversion and len(data) == recsstart + recslen:
+                # between header and records: \0 separated branch names
+                if recsstart != self.headersize:
+                    self._names = \
+                        data[self.headersize:recsstart].split('\0')
+                # read records, cap at repo size
+                self._records.fromstring(
+                    buffer(data, recsstart, min(recslen, reporecslen)))
+                # only dirty if too many records (after strip)
+                self._dirty = recslen > reporecslen
+            else:
+                self._repo.ui.debug('branch cache file was invalid\n')
+
+        # pad to repo size
+        if len(self._records) < reporecslen:
+            self._records.extend(
+                '\xff' * (reporecslen - len(self._records)))
+
+        self._branchnamesindex = dict((b, r)
+                                      for r, b in enumerate(self._names))
+        self._node = self._repo.changelog.node
+        self._branchinfo = self._repo.changelog.branchinfo
+        self._loaded = True
+
+    def branch(self, rev):
+        """Return branch name of rev, using and updating persistent cache."""
+        if not self._loaded:
+            self._load()
+
+        node = self._node(rev)
+        cachenode, branchidx = struct.unpack_from(self.recfmt, self._records,
+                                                  rev * self.recsize)
+        if cachenode == node and branchidx < len(self._names):
+            return self._names[branchidx]
+        b, _close = self._branchinfo(rev)
+        if b in self._branchnamesindex:
+            branchidx = self._branchnamesindex[b]
+        else:
+            branchidx = len(self._names)
+            self._names.append(b)
+            self._branchnamesindex[b] = branchidx
+        struct.pack_into(self.recfmt, self._records, rev * self.recsize,
+                         node, branchidx)
+        self._dirty = True
+        return b
+
+    def save(self):
+        """Save branch cache if it is dirty."""
+        if self._dirty:
+            self._repo.ui.debug('writing branch cache file\n')
+            try:
+                f = self._repo.vfs.open(self.filename, 'w', atomictemp=True)
+                s = '\0'.join(self._names)
+                f.write(struct.pack(self.headerfmt, self.formatversion,
+                                    self.headersize + len(s),
+                                    len(self._records)))
+                f.write(s)
+                f.write(self._records)
+                f.close()
+            except IOError:
+                pass
+            self._dirty = False
+
diff --git a/mercurial/localrepo.py b/mercurial/localrepo.py
--- a/mercurial/localrepo.py
+++ b/mercurial/localrepo.py
@@ -297,8 +297,11 @@ class localrepository(object):
         # - bookmark changes
         self.filteredrevcache = {}
 
+        self.revbranchcache = branchmap.revbranchcache(self)
+
     def close(self):
-        pass
+        if self.revbranchcache:
+            self.revbranchcache.save()
 
     def _restrictcapabilities(self, caps):
         # bundle2 is not ready for prime time, drop it unless explicitly
diff --git a/mercurial/statichttprepo.py b/mercurial/statichttprepo.py
--- a/mercurial/statichttprepo.py
+++ b/mercurial/statichttprepo.py
@@ -141,6 +141,7 @@ class statichttprepository(localrepo.loc
         self._branchcaches = {}
         self.encodepats = None
         self.decodepats = None
+        self.revbranchcache = None
 
     def _restrictcapabilities(self, caps):
         caps = super(statichttprepository, self)._restrictcapabilities(caps)


More information about the Mercurial-devel mailing list