[PATCH 1 of 2 v5] localrepo: persistent caching of branch names

Thu Oct 16 09:49:33 CDT 2014

# HG changeset patch
# User Mads Kiilerich <madski at unity3d.com>
# Date 1413470215 -7200
#      Thu Oct 16 16:36:55 2014 +0200
# Node ID efe19367c4809045dece68abded2a3297c16a7a8
# Parent  da2758c0aca04fe20fef2797f80c6079099afca5
localrepo: persistent caching of branch names

It is expensive to retrieve the branch name. Very expensive when creating a
changectx and calling .branch() - slightly less when using
changelog.branchinfo().

Now, to really speed things up, cache the results on disk. To get efficient
lookup for revisions (constant size records) and avoid storing the same branch
name over and over, store the name of each branch once with a fixed ordering.
For each repo revision, store the node hash and the index of the branch name.
To make it 100% stable against repository mutations, always check the node hash
before using the cache content.

The code for this is kind of similar to the branchmap handling and is placed in
the same module even though the name is not completely spot on.

This new method promises to make some operations up to 20 times faster once it
is actually used.

A simpler approach that didn't store and validate node hashes for every
revision was significantly faster (x2) but could be tricked when modifying
history. The usual worst case would be that the whole cache was invalidated
when the repository history was modified, but when trying very hard it could be
tricked into not noticing changes.

diff --git a/mercurial/branchmap.py b/mercurial/branchmap.py
--- a/mercurial/branchmap.py
+++ b/mercurial/branchmap.py
@@ -9,6 +9,7 @@ from node import bin, hex, nullid, nullr
 import encoding
 import util
 import time
+import struct, array
 
 def _filename(repo):
     """name of a branchcache file for a given repo or repoview"""
@@ -285,3 +286,128 @@ class branchcache(dict):
         duration = time.time() - starttime
         repo.ui.log('branchcache', 'updated %s branch cache in %.4f seconds\n',
                     repo.filtername, duration)
+
+class revbranchcache(object):
+    """Persistent cache mapping from revision number to branch name.
+
+    Consistency is guaranteed by verifying the node hash stored in each
+    record, so outdated cache content only costs a cache miss."""
+
+    filename = 'cache/branchnames'
+    magic = 2345164374
+    headerfmt = '>LLL' # file header: magic, records start, records length
+    recfmt = '>20sH' # a record: node hash, branch name reference
+    headersize = struct.calcsize(headerfmt)
+    recsize = struct.calcsize(recfmt)
+    # File format:
+    # First a header (headerfmt) with magic and start and length of the records.
+    # After the header and until the records start: branch names separated
+    # by \0. Branch names are appended on demand, with fixed indices once added.
+    # The records (recfmt) run until end of file; the record of a revision
+    # holds the hash of its node and the index of its branch name.
+
+    def __init__(self, repo):
+        """Set up an empty cache for repo; the cache file is not read until
+        branch() is called for the first time."""
+        self._repo = repo
+        self._loaded = False
+        self._dirty = False
+        self._names = [] # branch names referenced from recfmt records
+        self._records = array.array('c') # bytes with structs of type recfmt
+
+    def _load(self):
+        """Load cached branch names.
+
+        A missing, truncated or otherwise inconsistent cache file is treated
+        as empty. The records are always capped or padded to the size of the
+        repo."""
+        try:
+            data = self._repo.vfs.open(self.filename).read()
+        except IOError:
+            data = ''
+
+        # assume the file has to be rewritten until it has been read OK
+        self._dirty = True
+        reporecslen = len(self._repo) * self.recsize
+        if len(data) >= self.headersize:
+            # header
+            v, recsstart, recslen = struct.unpack_from(self.headerfmt, data)
+            if v == self.magic and len(data) == recsstart + recslen:
+                # between header and records: \0 separated branch names
+                # (''.split('\0') would give one bogus empty name)
+                if recsstart != self.headersize:
+                    self._names = \
+                        data[self.headersize:recsstart].split('\0')
+                # read records, cap at repo size
+                self._records.fromstring(
+                    buffer(data, recsstart, min(recslen, reporecslen)))
+                # successful read, only dirty if too many records (after strip)
+                self._dirty = recslen > reporecslen
+
+        # pad to repo size; \xff filled records are not expected to match a node
+        if len(self._records) < reporecslen:
+            self._records.extend(
+                '\xff' * (reporecslen - len(self._records)))
+
+        self._branchnamesindex = dict((b, r)
+                                      for r, b in enumerate(self._names))
+        self._node = self._repo.changelog.node
+        self._branchinfo = self._repo.changelog.branchinfo
+        self._loaded = True
+
+    def branch(self, rev):
+        """Return branch name of rev, using and updating persistent cache.
+
+        Negative revision numbers (nullrev) have no record and are looked up
+        without the cache. Records for revisions added after the cache was
+        loaded are allocated on demand."""
+        if not self._loaded:
+            self._load()
+
+        if rev < 0:
+            # a negative offset would address a record counted from the end
+            # of the buffer and overwrite the entry of another revision
+            return self._branchinfo(rev)[0]
+
+        node = self._node(rev)
+        offset = rev * self.recsize
+        missing = offset + self.recsize - len(self._records)
+        if missing > 0:
+            # the repo has grown since _load(): pad up to and including rev
+            self._records.extend('\xff' * missing)
+        cachenode, branchidx = struct.unpack_from(self.recfmt, self._records,
+                                                  offset)
+        if cachenode == node and branchidx < len(self._names):
+            return self._names[branchidx]
+        # cache miss: compute the branch name and store it in the record
+        b, _close = self._branchinfo(rev)
+        if b in self._branchnamesindex:
+            branchidx = self._branchnamesindex[b]
+        else:
+            branchidx = len(self._names)
+            self._names.append(b)
+            self._branchnamesindex[b] = branchidx
+        struct.pack_into(self.recfmt, self._records, offset, node, branchidx)
+        self._dirty = True
+        return b
+
+    def save(self):
+        """Save branch cache if it is dirty.
+
+        The cache is only an optimization, so failure to write the file is
+        silently ignored."""
+        if self._dirty:
+            self._repo.ui.debug('writing branch cache file\n')
+            try:
+                f = self._repo.vfs.open(self.filename, 'w', atomictemp=True)
+                s = '\0'.join(self._names)
+                f.write(struct.pack(self.headerfmt, self.magic,
+                                    self.headersize + len(s),
+                                    len(self._records)))
+                f.write(s)
+                f.write(self._records)
+                f.close()
+            except IOError:
+                pass
+            self._dirty = False
+
diff --git a/mercurial/localrepo.py b/mercurial/localrepo.py
--- a/mercurial/localrepo.py
+++ b/mercurial/localrepo.py
@@ -297,8 +297,11 @@ class localrepository(object):
         # - bookmark changes
         self.filteredrevcache = {}
 
+        self.revbranchcache = branchmap.revbranchcache(self) # loaded lazily
+
     def close(self):
-        pass
+        if self.revbranchcache: # statichttprepository sets it to None
+            self.revbranchcache.save() # only writes if the cache is dirty
 
     def _restrictcapabilities(self, caps):
         # bundle2 is not ready for prime time, drop it unless explicitly
diff --git a/mercurial/statichttprepo.py b/mercurial/statichttprepo.py
--- a/mercurial/statichttprepo.py
+++ b/mercurial/statichttprepo.py
@@ -141,6 +141,7 @@ class statichttprepository(localrepo.loc
         self._branchcaches = {}
         self.encodepats = None
         self.decodepats = None
+        self.revbranchcache = None # falsy: localrepository.close() saves nothing
 
     def _restrictcapabilities(self, caps):
         caps = super(statichttprepository, self)._restrictcapabilities(caps)