[PATCH 3 of 6] branchcache: introduce revbranchcache for caching of revision branch names

Mads Kiilerich mads at kiilerich.com
Sun Dec 14 12:34:22 CST 2014


# HG changeset patch
# User Mads Kiilerich <madski at unity3d.com>
# Date 1418581983 -3600
#      Sun Dec 14 19:33:03 2014 +0100
# Node ID eeeae034dc7aa8501c3e8f510a0788a77e54bc9f
# Parent  4f99561d776947cc3911b645c485be1bc2cf0c1f
branchcache: introduce revbranchcache for caching of revision branch names

It is expensive to retrieve the branch name of a revision. Very expensive when
creating a changectx and calling .branch() every time - slightly less when
using changelog.branchinfo().

Now, to really speed things up, we cache the results on disk. To get constant
size records (so we can make efficient lookups based on revision number) and
avoid storing the same branch name over and ever, we store each branch name
once with a fixed index. For each repo revision, we store the node hash and the
index of the branch name. The whole cache file is rewritten atomically every
time it is updated. To make the system 100% stable against repository
mutations, we always check the node hash for each revision before using the
cache content.

This new method is still unused but promise to make some operations up 20 times
faster once it actually is used.

diff --git a/mercurial/branchmap.py b/mercurial/branchmap.py
--- a/mercurial/branchmap.py
+++ b/mercurial/branchmap.py
@@ -9,6 +9,7 @@
 import encoding
 import util
 import time
+import struct, array
 
 def _filename(repo):
     """name of a branchcache file for a given repo or repoview"""
@@ -286,3 +287,110 @@
         duration = time.time() - starttime
         repo.ui.log('branchcache', 'updated %s branch cache in %.4f seconds\n',
                     repo.filtername, duration)
+
+# Revision branch name cache
+
+rbcfilename = 'cache/revbranchnames'
+rbcheaderfmt = '>LL' # file header: records start, records length
+rbcrecfmt = '>20sH' # a record: node hash, branch name reference
+rbcheadersize = struct.calcsize(rbcheaderfmt)
+rbcrecsize = struct.calcsize(rbcrecfmt)
+rbcclosebitmask = 1 << (8 * struct.calcsize('>H') - 1)
+rbcbranchidxmask = rbcclosebitmask - 1
+
+class revbranchcache(object):
+    """Persistent cache mapping from revision number to branch name.
+    Consistency is guaranteed by verifying the node hash before use.
+
+    The file format is binary, everything in one file to ensure consistency:
+    First a header (rbcheaderfmt) with start and length of the records.
+    After header and until records starts: UTF-8 branch names separated by 0.
+    Branch names are added to the list on demand, with indices fixed once added.
+    Records run until end of file and is records (rbcrecfmt) with the hash of
+    the corresponding node, the index of the corresponding branch name, and a
+    flag for closed.
+    """
+
+    def __init__(self, repo):
+        self._repo = repo
+        self._dirty = False
+        reporecslen = len(self._repo) * rbcrecsize
+
+        try:
+            data = self._repo.vfs.open(rbcfilename).read()
+        except (IOError, OSError, util.Abort):
+            data = ''
+
+        self._namesutf8 = [] # utf8 branch names referenced from the records
+        self._records = array.array('c') # bytes with structs of type rbcrecfmt
+        if len(data) >= rbcheadersize:
+            # header
+            recsstart, recslen = struct.unpack_from(rbcheaderfmt, data)
+            if len(data) == recsstart + recslen:
+                # between header and records: \0 separated branch names
+                if recsstart != rbcheadersize:
+                    self._namesutf8 = \
+                        data[rbcheadersize:recsstart].split('\0')
+                # read records, cap at repo size
+                self._records.fromstring(
+                    buffer(data, recsstart, min(recslen, reporecslen)))
+
+        # pad to repo size
+        if len(self._records) < reporecslen:
+            self._records.extend(
+                '\xff' * (reporecslen - len(self._records)))
+
+        self._branchnamesindex = dict((b, r)
+                                      for r, b in enumerate(self._namesutf8))
+
+    def branchinfoutf8(self, rev):
+        """Return branch name and close flag for rev, using and updating
+        persistent cache."""
+        node = self._repo.changelog.node(rev)
+        recordindex = rev * rbcrecsize
+        if len(self._records) <= recordindex:
+            self._records.extend(
+                '\xff' * (len(self._repo) * rbcrecsize - len(self._records)))
+
+        # fast path: extract data from cache, use it if node is matching
+        cachenode, branchidx = struct.unpack_from(rbcrecfmt, self._records,
+                                                  recordindex)
+        close = branchidx & rbcclosebitmask
+        branchidx &= rbcbranchidxmask
+        if cachenode == node and branchidx < len(self._namesutf8):
+            butf8 = self._namesutf8[branchidx]
+            return butf8, close
+
+        # slow path: retrieve from changelog and store in cache
+        butf8, close = self._repo.changelog.branchinfoutf8(rev)
+        # reuse branch name entry to add a new one
+        if butf8 in self._branchnamesindex:
+            branchidx = self._branchnamesindex[butf8]
+        else:
+            branchidx = len(self._namesutf8)
+            if branchidx >= rbcbranchidxmask:
+                # fall-back to no caching if cache full
+                return butf8, close
+            self._namesutf8.append(butf8)
+            self._branchnamesindex[butf8] = branchidx
+        if close:
+            branchidx |= rbcclosebitmask
+        struct.pack_into(rbcrecfmt, self._records, recordindex,
+                         node, branchidx)
+        self._dirty = True
+        return butf8, close
+
+    def save(self):
+        """Save branch cache if it is dirty."""
+        if self._dirty:
+            try:
+                f = self._repo.vfs.open(rbcfilename, 'w', atomictemp=True)
+                s = '\0'.join(self._namesutf8)
+                f.write(struct.pack(rbcheaderfmt, rbcheadersize + len(s),
+                                    len(self._records)))
+                f.write(s)
+                f.write(self._records)
+                f.close()
+            except (IOError, OSError, util.Abort):
+                pass
+            self._dirty = False


More information about the Mercurial-devel mailing list