[PATCH 1 of 2 v2] localrepo: persistent caching of branch names

Augie Fackler raf at durin42.com
Wed Oct 15 13:35:45 CDT 2014


On Wed, Oct 15, 2014 at 08:14:22PM +0200, Mads Kiilerich wrote:
> # HG changeset patch
> # User Mads Kiilerich <madski at unity3d.com>
> # Date 1413396806 -7200
> #      Wed Oct 15 20:13:26 2014 +0200
> # Node ID bf7a0169677c0545a63e64690b0e49e50b376703
> # Parent  48c0b101a9de1fdbd638daa858da845cd05a6be7
> localrepo: persistent caching of branch names
>
> It is expensive to retrieve the branch name. Very expensive when creating a
> changectx and calling .branch() - slightly less when using
> changelog.branchinfo().
>
> Now, to really speed things up, cache the results on disk. For each repo
> revision store the node hash and a reference to the branch name. To avoid using
> too much space, each branch name is only stored once. To make it 100% stable
> against repository mutations, always check the node hash before using the
> cache content.
>
> This change promise to speed some operations up 4-6 times when it actually is
> used.
>
> A simpler approach that didn't store and validate node hashes for every
> revision had a 20 x speedup but could be tricked when modifying history. It
> would usually reset the cache, but when trying very hard it could be tricked
> into not noticing changes.
>
> diff --git a/mercurial/localrepo.py b/mercurial/localrepo.py
> --- a/mercurial/localrepo.py
> +++ b/mercurial/localrepo.py
> @@ -6,7 +6,7 @@
>  # GNU General Public License version 2 or any later version.
>  from node import hex, nullid, short
>  from i18n import _
> -import urllib
> +import urllib, struct, array
>  import peer, changegroup, subrepo, pushkey, obsolete, repoview
>  import changelog, dirstate, filelog, manifest, context, bookmarks, phases
>  import lock as lockmod
> @@ -21,6 +21,14 @@ import branchmap, pathutil
>  propertycache = util.propertycache
>  filecache = scmutil.filecache
>
> +# branch name caching
> +bcfilename = 'cache/branchnames'
> +bcversion = 2345164374
> +bcheadfmt = '>LLL'
> +bcheadsize = struct.calcsize(bcheadfmt)
> +bcrecfmt = '>20sH'
> +bcrecsize = struct.calcsize(bcrecfmt)
> +
>  class repofilecache(filecache):
>      """All filecache usage on repo are done for logic that should be unfiltered
>      """
> @@ -179,6 +187,7 @@ class localrepository(object):
>      openerreqs = set(('revlogv1', 'generaldelta'))
>      requirements = ['revlogv1']
>      filtername = None
> +    _branchcachedirty = None
>
>      # a list of (ui, featureset) functions.
>      # only functions defined in module of enabled extensions are invoked
> @@ -298,7 +307,7 @@ class localrepository(object):

Oh, and I'm not enthusiastic about stuffing new behavior into
localrepo. I'm open to suggestions on how we can better factor this.

>          self.filteredrevcache = {}
>
>      def close(self):
> -        pass
> +        self._branchcachesave()
>
>      def _restrictcapabilities(self, caps):
>          # bundle2 is not ready for prime time, drop it unless explicitly
> @@ -723,6 +732,74 @@ class localrepository(object):
>          repo = (remote and remote.local()) and remote or self
>          return repo[key].branch()
>
> +    def _branchcacheload(self):
> +        """Load cached branch values."""
> +        try:
> +            data = self.vfs.open(bcfilename).read()
> +        except IOError:
> +            data = ''
> +
> +        self._branches = []
> +        self._branchrecs = array.array('c') # bytes of struct type bcrecfmt
> +        self.__dict__['_branchcachedirty'] = True
> +        reporecslen = len(self) * bcrecsize
> +        if len(data) >= bcheadsize:
> +            v, recsstart, recslen = struct.unpack_from(bcheadfmt, data)
> +            if v == bcversion and len(data) == recsstart + recslen:
> +                if recsstart:
> +                    self._branches = \
> +                        data[bcheadsize:recsstart].split('\0')
> +                self._branchrecs.fromstring(
> +                    buffer(data, recsstart, min(recslen, reporecslen)))
> +                self.__dict__['_branchcachedirty'] = recslen > reporecslen
> +            else:
> +                self.ui.debug('branch cache file was invalid\n')
> +
> +        if len(self._branchrecs) < reporecslen:
> +            self._branchrecs.extend(
> +                '\xff' * (reporecslen - len(self._branchrecs)))
> +
> +        self._branchnamesindex = dict((b, r)
> +                                      for r, b in enumerate(self._branches))
> +
> +    def branch(self, rev):
> +        """return branch name of rev, using and updating persistent cache."""
> +        if self._branchcachedirty is None:
> +            self._branchcacheload()
> +
> +        node = self.changelog.node(rev)
> +        cachenode, branchidx = struct.unpack_from(bcrecfmt, self._branchrecs,
> +                                                  rev * bcrecsize)
> +        if cachenode == node and branchidx < len(self._branches):
> +            return self._branches[branchidx]
> +        b, _close = self.changelog.branchinfo(rev)
> +        if b in self._branchnamesindex:
> +            branchidx = self._branchnamesindex[b]
> +        else:
> +            branchidx = len(self._branches)
> +            self._branches.append(b)
> +            self._branchnamesindex[b] = branchidx
> +        struct.pack_into(bcrecfmt, self._branchrecs, rev * bcrecsize,
> +                         node, branchidx)
> +        self.__dict__['_branchcachedirty'] = True
> +        return b
> +
> +    def _branchcachesave(self):
> +        """save branch cache if it is dirty"""
> +        if self._branchcachedirty:
> +            self.ui.debug('writing branch cache file\n')
> +            try:
> +                f = self.vfs.open(bcfilename, 'w', atomictemp=True)
> +                s = '\0'.join(self._branches)
> +                f.write(struct.pack(bcheadfmt, bcversion,
> +                                    bcheadsize + len(s), len(self._branchrecs)))
> +                f.write(s)
> +                f.write(self._branchrecs)
> +                f.close()
> +            except IOError:
> +                pass
> +            self.__dict__['_branchcachedirty'] = False
> +
>      def known(self, nodes):
>          nm = self.changelog.nodemap
>          pc = self._phasecache
> _______________________________________________
> Mercurial-devel mailing list
> Mercurial-devel at selenic.com
> http://selenic.com/mailman/listinfo/mercurial-devel


More information about the Mercurial-devel mailing list