[PATCH 1 of 3 v2] branchcache: introduce revbranchcache for caching of revision branch names

Mads Kiilerich mads at kiilerich.com
Thu Jan 8 04:23:53 UTC 2015


# HG changeset patch
# User Mads Kiilerich <madski at unity3d.com>
# Date 1420671663 -3600
#      Thu Jan 08 00:01:03 2015 +0100
# Node ID 313a4d827c4d7b0a05c3feb99627be0933e72a60
# Parent  ed645dc672e56a44145fae3ae9f2c09021035da9
branchcache: introduce revbranchcache for caching of revision branch names

It is expensive to retrieve the branch name of a revision. Very expensive when
creating a changectx and calling .branch() every time - slightly less when
using changelog.branchinfo().

Now, to speed things up, provide a way to cache the results on disk in an
efficient format. Each branchname is assigned a number, and for each revision
we store the number of the corresponding branch name. The branch names are
stored in a dedicated file which is strictly append only.

Branch names are usually reused across several revisions, and the total list of
branch names will thus be so small that the whole set of names can be read
before using the cache.

The revision entries are stored in another file. This file is usually append
only, but if the repository has been modified, the file will be truncated and
the relevant parts rewritten on demand.

The entries for each revision are currently 23 bytes each, and the whole
revision file will thus be approximately 1/3 of 00changelog.i.

Each revision entry contains the corresponding node hash which always is
checked before the entry is used. That check is expensive but it makes sure
history modification is detected and handled correctly. It will also detect and
handle most revision file corruptions.

This is just a cache. A new format can always be introduced if other
requirements or ideas make that seem like a good idea. Rebuilding the cache is
not really more expensive than it was to run for example 'hg log -b branchname'
before this cache was introduced.

This new method is still unused but promise to make some operations several
times faster once it actually is used.

diff --git a/mercurial/branchmap.py b/mercurial/branchmap.py
--- a/mercurial/branchmap.py
+++ b/mercurial/branchmap.py
@@ -8,7 +8,7 @@
 from node import bin, hex, nullid, nullrev
 import encoding
 import util
-import time
+import time, struct, array
 
 def _filename(repo):
     """name of a branchcache file for a given repo or repoview"""
@@ -285,3 +285,125 @@ class branchcache(dict):
         duration = time.time() - starttime
         repo.ui.log('branchcache', 'updated %s branch cache in %.4f seconds\n',
                     repo.filtername, duration)
+
+# Revision branch info cache
+
+rbcversion = '-v1'
+rbcnames = 'cache/rbcnames' + rbcversion
+rbcrevs = 'cache/rbcrevs' + rbcversion
+# a record: 20 bytes node hash, 2 bytes branch name reference, 1 byte close
+rbcrecfmt = '>20sHB'
+rbcrecsize = struct.calcsize(rbcrecfmt)
+
+class revbranchcache(object):
+    """Persistent cache, mapping from revision number to branch name and close.
+    This is a low level cache, independent of filtering.
+
+    Branch names are stored in 'rbcnames' in internal encoding separated by 0.
+    rbcnames is append-only, and each branch name is only stored once and will
+    thus have a unique index.
+
+    The branch info for each revision is stored in 'rbcrevs' as constant size
+    records. The whole file is read into memory, but it is only 'parsed' on
+    demand. The file is usually append-only but will be truncated if repo
+    modification is detected. The record for each revision contains the
+    corresponding node hash, and the record is only used if it still matches.
+    Even a completely trashed rbcrevs fill thus still give the right result
+    while converging towards full recovery ... assuming no incorrectly matching
+    node hashes. rbcrevs is thus somewhat similar to 00changelog.i and will
+    have approximately the same size.
+    """
+
+    def __init__(self, repo):
+        self._names = [] # branch names in local encoding with static index
+        self._rbcrevs = array.array('c') # structs of type rbcrecfmt
+        try:
+            bndata = repo.vfs.read(rbcnames)
+            self._names = [encoding.tolocal(bn) for bn in bndata.split('\0')]
+        except (IOError, OSError, util.Abort), inst:
+            repo.ui.debug("couldn't read revision branch cache names: %s\n" %
+                          inst)
+        if self._names:
+            try:
+                data = repo.vfs.read(rbcrevs)
+                self._rbcrevs.fromstring(data)
+            except (IOError, OSError), inst:
+                repo.ui.debug("couldn't read revision branch cache: %s\n" %
+                              inst)
+        # remember number of good records on disk
+        self._rbcrevslen = min(len(self._rbcrevs) // rbcrecsize,
+                               len(repo))
+        if self._rbcrevslen == 0:
+            self._names = []
+        self._rbcnameslen = len(self._names) # number of good names on disk
+        self._namesreverse = dict((b, r) for r, b in enumerate(self._names))
+
+    def branchinfo(self, changelog, rev):
+        """Return branch name and close flag for rev, using and updating
+        persistent cache."""
+        rbcrevidx = rev * rbcrecsize
+
+        # if requested rev is missing add and populate all missing revs
+        if len(self._rbcrevs) < rbcrevidx + rbcrecsize:
+            first = len(self._rbcrevs) // rbcrecsize
+            self._rbcrevs.extend('\0' * (len(changelog) * rbcrecsize -
+                                         len(self._rbcrevs)))
+            for r in xrange(first, len(changelog)):
+                self._branchinfo(changelog, r)
+
+        # fast path: extract data from cache, use it if node is matching
+        reponode = changelog.node(rev)
+        cachenode, branchidx, close = struct.unpack_from(
+            rbcrecfmt, self._rbcrevs, rbcrevidx)
+        if cachenode == reponode:
+            return self._names[branchidx], close
+        # fall back to slow path and make sure it will be written to disk
+        self._rbcrevslen = min(self._rbcrevslen, rev)
+        return self._branchinfo(changelog, rev)
+
+    def _branchinfo(self, changelog, rev):
+        """Retrieve branch info from changelog and update _rbcrevs"""
+        b, close = changelog.branchinfo(rev)
+        if b in self._namesreverse:
+            branchidx = self._namesreverse[b]
+        else:
+            branchidx = len(self._names)
+            self._names.append(b)
+            self._namesreverse[b] = branchidx
+        reponode = changelog.node(rev)
+        struct.pack_into(rbcrecfmt, self._rbcrevs, rev * rbcrecsize,
+                         reponode, branchidx, bool(close))
+        return b, close
+
+    def write(self, repo):
+        """Save branch cache if it is dirty."""
+        if self._rbcnameslen < len(self._names):
+            try:
+                if self._rbcnameslen == 0:
+                    f = repo.vfs.open(rbcnames, 'wb')
+                else:
+                    f = repo.vfs.open(rbcnames, 'ab')
+                    f.write('\0')
+                f.write('\0'.join(encoding.fromlocal(b)
+                                  for b in self._names[self._rbcnameslen:]))
+                f.close()
+            except (IOError, OSError, util.Abort), inst:
+                repo.ui.debug("couldn't write revision branch cache names: "
+                              "%s\n" % inst)
+                return
+            self._rbcnameslen = len(self._names)
+
+        start = self._rbcrevslen * rbcrecsize
+        if start < len(self._rbcrevs):
+            try:
+                f = repo.vfs.open(rbcrevs, 'ab')
+                if f.tell() != start:
+                    f.seek(start)
+                    f.truncate()
+                f.write(self._rbcrevs[start : len(repo) * rbcrecsize])
+                f.close()
+            except (IOError, OSError, util.Abort), inst:
+                repo.ui.debug("couldn't write revision branch cache: %s\n" %
+                              inst)
+                return
+            self._rbcrevslen = len(repo)


More information about the Mercurial-devel mailing list