[PATCH] localrepo: introduce persistent caching of revset revision's branch names

Mads Kiilerich mads at kiilerich.com
Tue Oct 14 19:33:12 CDT 2014


# HG changeset patch
# User Mads Kiilerich <madski at unity3d.com>
# Date 1413333190 -7200
#      Wed Oct 15 02:33:10 2014 +0200
# Node ID 85189122b49b31dddbcddb7c0925afa019dc4403
# Parent  48c0b101a9de1fdbd638daa858da845cd05a6be7
localrepo: introduce persistent caching of revset revision's branch names

It is expensive to create a changectx and extract the branch name. That shows
up when filtering on branches in revsets.

To speed things up, cache the results on disk. To avoid using too much space,
all branch names are only stored once and each revision references one of these
names. To verify that the cache is valid, we also store the tip hash in the
cache file.

Test case, run on the relatively small hg repo:
  rm -f .hg/cache/branchnames
  HGRCPATH=/dev/null ./hg --time log -r 'branch(stable)&branch(default)'
  HGRCPATH=/dev/null ./hg --time log -r 'branch(stable)&branch(default)'
  HGRCPATH=/dev/null ./hg --time log -r 'branch(stable)&branch(default)'

Before:

time: real 1.230 secs (user 1.220+0.000 sys 0.010+0.000)
time: real 1.200 secs (user 1.180+0.000 sys 0.020+0.000)
time: real 1.180 secs (user 1.170+0.000 sys 0.010+0.000)

After (3 runs):

time: real 1.330 secs (user 1.320+0.000 sys 0.010+0.000)
time: real 0.080 secs (user 0.080+0.000 sys 0.000+0.000)
time: real 0.080 secs (user 0.080+0.000 sys 0.000+0.000)

time: real 1.310 secs (user 1.310+0.000 sys 0.020+0.000)
time: real 0.070 secs (user 0.080+0.000 sys 0.000+0.000)
time: real 0.080 secs (user 0.080+0.000 sys 0.000+0.000)

time: real 1.300 secs (user 1.280+0.000 sys 0.020+0.000)
time: real 0.070 secs (user 0.070+0.000 sys 0.000+0.000)
time: real 0.080 secs (user 0.090+0.000 sys 0.000+0.000)

Tests on bigger repo shows the same: First run is slightly slower while not
getting cache hits, next run will be up to 20 times faster.

diff --git a/mercurial/localrepo.py b/mercurial/localrepo.py
--- a/mercurial/localrepo.py
+++ b/mercurial/localrepo.py
@@ -6,7 +6,7 @@
 # GNU General Public License version 2 or any later version.
 from node import hex, nullid, short
 from i18n import _
-import urllib
+import urllib, struct, array
 import peer, changegroup, subrepo, pushkey, obsolete, repoview
 import changelog, dirstate, filelog, manifest, context, bookmarks, phases
 import lock as lockmod
@@ -21,6 +21,13 @@ import branchmap, pathutil
 propertycache = util.propertycache
 filecache = scmutil.filecache
 
+branchcachefn = 'cache/branchnames'
+branchcachever = 2345164374 # TODO: reset when upstreaming
+headerfmt = '>LL20sL'
+headersize = struct.calcsize(headerfmt)
+reffmt = '>H'
+refsize = struct.calcsize(reffmt)
+
 class repofilecache(filecache):
     """All filecache usage on repo are done for logic that should be unfiltered
     """
@@ -179,6 +186,7 @@ class localrepository(object):
     openerreqs = set(('revlogv1', 'generaldelta'))
     requirements = ['revlogv1']
     filtername = None
+    _branchcachedirty = None
 
     # a list of (ui, featureset) functions.
     # only functions defined in module of enabled extensions are invoked
@@ -298,7 +306,7 @@ class localrepository(object):
         self.filteredrevcache = {}
 
     def close(self):
-        pass
+        self._branchcachesave()
 
     def _restrictcapabilities(self, caps):
         # bundle2 is not ready for prime time, drop it unless explicitly
@@ -723,6 +731,80 @@ class localrepository(object):
         repo = (remote and remote.local()) and remote or self
         return repo[key].branch()
 
+    def branch(self, rev):
+        """return name of rev's branch as self[rev].branch() but cache it.
+        First invocation of this method will load cached values and replace
+        this method with a more efficient getter."""
+        try:
+            data = self.vfs.open(branchcachefn).read()
+        except IOError:
+            data = ''
+
+        self._branches = []
+        self._branchrefs = array.array('c') # bytes from struct type reffmt
+        self.__dict__['_branchcachedirty'] = True
+        if len(data) >= headersize:
+            v, revs, headnode, bnsize = struct.unpack_from(headerfmt, data)
+            if (v == branchcachever and
+                (revs - 1) in self and
+                self[revs - 1].node() == headnode and
+                len(data) == headersize + bnsize + revs * refsize
+                ):
+                if bnsize:
+                    self._branches = \
+                        data[headersize : headersize + bnsize].split('\0')
+                self._branchrefs.fromstring(
+                    buffer(data, headersize + bnsize, revs * refsize))
+                self.__dict__['_branchcachedirty'] = False
+            else:
+                self.ui.debug('branch cache was outdated\n')
+        else:
+            self.ui.debug('branch cache was invalid\n')
+
+        if len(self._branchrefs) < len(self) * refsize:
+            self._branchrefs.extend(
+                '\xff' * (len(self) * refsize - len(self._branchrefs)))
+
+        self._branchnamesindex = dict((b, r)
+                                      for r, b in enumerate(self._branches))
+
+        self.__dict__['branch'] = self._branch
+        return self.branch(rev)
+
+    def _branch(self, rev):
+        """return name of rev's branch as self[rev].branch() using cache."""
+        branchref = struct.unpack_from(reffmt, self._branchrefs,
+                                       rev * refsize)[0]
+        if branchref < len(self._branches):
+            return self._branches[branchref]
+        b = self[rev].branch()
+        if b in self._branchnamesindex:
+            branchref = self._branchnamesindex[b]
+        else:
+            branchref = len(self._branches)
+            self._branches.append(b)
+            self._branchnamesindex[b] = branchref
+        struct.pack_into(reffmt, self._branchrefs, rev * refsize, branchref)
+        self.__dict__['_branchcachedirty'] = True
+        return b
+
+    def _branchcachesave(self):
+        """save branch cache if it is dirty"""
+        if self._branchcachedirty:
+            self.ui. debug('writing branch cache\n')
+            try:
+                f = self.vfs.open(branchcachefn, 'w', atomictemp=True)
+                s = '\0'.join(self._branches)
+                revs = len(self._branchrefs) / refsize
+                f.write(struct.pack(headerfmt, branchcachever, revs,
+                        self[revs - 1].node(), len(s)))
+                f.write(s)
+                f.write(self._branchrefs)
+                f.close()
+            except IOError:
+                pass
+            self.__dict__['_branchcachedirty'] = False
+
     def known(self, nodes):
         nm = self.changelog.nodemap
         pc = self._phasecache
@@ -1003,10 +1085,11 @@ class localrepository(object):
         return 0
 
     def invalidatecaches(self):
-
+        # can't use delattr on proxy
         if '_tagscache' in vars(self):
-            # can't use delattr on proxy
             del self.__dict__['_tagscache']
+        if 'branch' in vars(self):
+            del self.__dict__['branch']
 
         self.unfiltered()._branchcaches.clear()
         self.invalidatevolatilesets()
diff --git a/mercurial/revset.py b/mercurial/revset.py
--- a/mercurial/revset.py
+++ b/mercurial/revset.py
@@ -489,16 +489,16 @@ def branch(repo, subset, x):
             # note: falls through to the revspec case if no branch with
             # this name exists
             if pattern in repo.branchmap():
-                return subset.filter(lambda r: matcher(repo[r].branch()))
+                return subset.filter(lambda r: matcher(repo.branch(r)))
         else:
-            return subset.filter(lambda r: matcher(repo[r].branch()))
+            return subset.filter(lambda r: matcher(repo.branch(r)))
 
     s = getset(repo, spanset(repo), x)
     b = set()
     for r in s:
-        b.add(repo[r].branch())
+        b.add(repo.branch(r))
     c = s.__contains__
-    return subset.filter(lambda r: c(r) or repo[r].branch() in b)
+    return subset.filter(lambda r: c(r) or repo.branch(r) in b)
 
 def bumped(repo, subset, x):
     """``bumped()``
@@ -1431,7 +1431,7 @@ def matching(repo, subset, x):
     getfieldfuncs = []
     _funcs = {
         'user': lambda r: repo[r].user(),
-        'branch': lambda r: repo[r].branch(),
+        'branch': lambda r: repo.branch(r),
         'date': lambda r: repo[r].date(),
         'description': lambda r: repo[r].description(),
         'files': lambda r: repo[r].files(),
@@ -1532,9 +1532,9 @@ def sort(repo, subset, x):
             elif k == '-rev':
                 e.append(-r)
             elif k == 'branch':
-                e.append(c.branch())
+                e.append(repo.branch(r))
             elif k == '-branch':
-                e.append(invert(c.branch()))
+                e.append(invert(repo.branch(r)))
             elif k == 'desc':
                 e.append(c.description())
             elif k == '-desc':
diff --git a/tests/test-revset.t b/tests/test-revset.t
--- a/tests/test-revset.t
+++ b/tests/test-revset.t
@@ -744,6 +744,39 @@ issue2437
   $ log 'ancestors(8) and (heads(branch("-a-b-c-")) or heads(branch(é)))'
   4
 
+branchname caching
+
+  $ rm .hg/cache/branchnames
+  $ log 'branch("-a-b-c-")'
+  4
+  $ "$TESTDIR/md5sum.py" .hg/cache/branchnames
+  f9bf54761390e8736e245e7de0b5d9e6  .hg/cache/branchnames
+  $ echo > .hg/cache/branchnames
+  $ log 'branch("-a-b-c-")'
+  4
+  $ "$TESTDIR/md5sum.py" .hg/cache/branchnames
+  f9bf54761390e8736e245e7de0b5d9e6  .hg/cache/branchnames
+  $ echo >> .hg/cache/branchnames
+  $ log 'branch("-a-b-c-")'
+  4
+  $ "$TESTDIR/md5sum.py" .hg/cache/branchnames
+  f9bf54761390e8736e245e7de0b5d9e6  .hg/cache/branchnames
+  $ hg tag tag
+  $ "$TESTDIR/md5sum.py" .hg/cache/branchnames
+  f9bf54761390e8736e245e7de0b5d9e6  .hg/cache/branchnames
+  $ log 'branch("-a-b-c-")'
+  4
+  $ "$TESTDIR/md5sum.py" .hg/cache/branchnames
+  9025ff4c8e9d36f49921ae9b2752e6b9  .hg/cache/branchnames
+  $ hg up -qr '.^'
+  $ hg rollback -qf
+  $ "$TESTDIR/md5sum.py" .hg/cache/branchnames
+  9025ff4c8e9d36f49921ae9b2752e6b9  .hg/cache/branchnames
+  $ log 'branch("-a-b-c-")'
+  4
+  $ "$TESTDIR/md5sum.py" .hg/cache/branchnames
+  f9bf54761390e8736e245e7de0b5d9e6  .hg/cache/branchnames
+
 issue2654: report a parse error if the revset was not completely parsed
 
   $ log '1 OR 2'


More information about the Mercurial-devel mailing list