[PATCH 2 of 6 V2] obscache: add an abstract base class for changelog+obstore cache

Sat May 20 11:30:16 EDT 2017

# HG changeset patch
# User Pierre-Yves David <pierre-yves.david at octobus.net>
# Date 1495194829 -7200
#      Fri May 19 13:53:49 2017 +0200
# Node ID 41f64bdc68e3361782f19172c95db4775c44fa6c
# Parent  221be1ef98902fa695a709371f75e63f9b3e950a
# EXP-Topic obscache
# Available At https://www.mercurial-scm.org/repo/users/marmoute/mercurial/
#              hg pull https://www.mercurial-scm.org/repo/users/marmoute/mercurial/ -r 41f64bdc68e3
obscache: add an abstract base class for changelog+obstore cache

There are many useful properties to cache around obsolescence. Some of them
needs data from both the changelog and the obsstore. We needs logic handle to
track changes and strip to both data structure are the same time. We also wants
these to allow incremental update when possible instead of recomputing the whole
set all the time.

As this is a common needs, we start by adding an abstract class reusable by
multiple cache.

This code was initially part of the evolve extensions and matured there.

diff --git a/mercurial/obsolete.py b/mercurial/obsolete.py
--- a/mercurial/obsolete.py
+++ b/mercurial/obsolete.py
@@ -69,6 +69,7 @@ comment associated with each format for 
 """
 from __future__ import absolute_import
 
+import abc
 import errno
 import hashlib
 import struct
@@ -1141,6 +1142,234 @@ def successorssets(repo, initialnode, ca
                 cache[current] = final
     return cache[initialnode]
 
+class dualsourcecache(object):
+    """An abstract class for cache that needs both changelog and obsstore
+
+    This class handle the tracking of changelog and obsstore update. It provide
+    data to performs incremental update (see the 'updatefrom' function for
+    details).  This class can also detect stripping of the changelog or the
+    obsstore and can reset the cache in this cache (see the 'clear' function
+    for details).
+
+    For this purpose it use a cache key covering changelog and obsstore
+    content:
+
+    The cache key parts are:
+    - tip-rev,
+    - tip-node,
+    - obsstore-length (nb markers),
+    - obsstore-file-size (in bytes),
+    - obsstore "cache key"
+    """
+
+    __metaclass__ = abc.ABCMeta
+
+    # default key used for an empty cache
+    emptykey = (node.nullrev, node.nullid, 0, 0, node.nullid)
+    _cachename = None # used for debug message
+
+    @abc.abstractmethod
+    def __init__(self):
+        super(dualsourcecache, self).__init__()
+        self._cachekey = None
+
+    @abc.abstractmethod
+    def _updatefrom(self, repo, revs, obsmarkers):
+        """override this method to update your cache data incrementally
+
+        revs:      list of new revision in the changelog
+        obsmarker: list of new obsmarkers in the obsstore
+        """
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    def clear(self, reset=False):
+        """invalidate the cache content
+
+        if 'reset' is passed, we detected a strip and the cache will have to be
+        recomputed.
+
+        Subclasses MUST overide this method to actually affect the cache data.
+        """
+        if reset:
+            self._cachekey = self.emptykey if reset else None
+        else:
+            self._cachekey = None
+
+    @abc.abstractmethod
+    def load(self, repo):
+        """Load data from disk
+
+        Subclasses MUST restore the "cachekey" attribute while doing so.
+        """
+        raise NotImplementedError
+
+    # Useful public function (no need to override them)
+
+    def uptodate(self, repo):
+        """return True if the cache content is up to date False otherwise
+
+        This method can be used to detect of the cache is lagging behind new
+        data in either changelog or obsstore.
+        """
+        repo = repo.unfiltered()
+        if self._cachekey is None:
+            self.load(repo)
+        status = self._checkkey(repo.changelog, repo.obsstore)
+        return (status is not None
+                and status[0] == self._tiprev
+                and status[1] == self._obssize)
+
+    def update(self, repo):
+        """update the cache with new repository data
+
+        The update will be incremental when possible"""
+        repo = repo.unfiltered()
+        # If we do not have any data, try loading from disk
+        if self._cachekey is None:
+            self.load(repo)
+
+        cl = repo.changelog
+
+        upgrade = self._upgradeneeded(repo)
+        if upgrade is None:
+            return
+
+        reset, revs, obsmarkers, obskeypair = upgrade
+        if reset or self._cachekey is None:
+            repo.ui.log('cache', 'strip detected, %s cache reset\n'
+                        % self._cachename)
+            self.clear(reset=True)
+
+        starttime = util.timer()
+        self._updatefrom(repo, revs, obsmarkers)
+        duration = util.timer() - starttime
+        repo.ui.log('cache', 'updated %s in %.4f seconds (%sr, %so)\n',
+                    self._cachename, duration, len(revs), len(obsmarkers))
+
+        # update the key from the new data
+        key = list(self._cachekey)
+        if revs:
+            # tiprev
+            key[0] = revs[-1]
+            # tipnode
+            key[1] = cl.node(key[0])
+        if obsmarkers:
+            # obsstore length (nb obsmarker)
+            key[2] += len(obsmarkers)
+            # obsstore size + key
+            key[3], key[4] = obskeypair
+        self._cachekey = tuple(key)
+
+    # from here, there are internal function only
+
+    @property
+    def _tiprev(self):
+        """small utility to make the code less error prone"""
+        if self._cachekey is None:
+            return None
+        return self._cachekey[0]
+
+    @property
+    def _obssize(self):
+        """small utility to make the code less error prone"""
+        if self._cachekey is None:
+            return None
+        return self._cachekey[3]
+
+    def _checkkey(self, changelog, obsstore):
+        """key current key against changelog and obsstore
+
+        return:
+        - None: when the key is invalid (no key, strip detect, etc),
+        - (current-tiprev, current-obsszie, current-obskey) otherwise.
+        """
+        key = self._cachekey
+        if key is None:
+            return None
+
+        ### Is the cache valid ?
+        keytiprev, keytipnode, keyobslength, keyobssize, keyobskey = key
+        # check for changelog strip
+        tiprev = len(changelog) - 1
+        if (tiprev < keytiprev
+                or changelog.node(keytiprev) != keytipnode):
+            return None
+        # check for obsstore strip
+        obssize, obskey = obsstore.cachekey(index=keyobssize)
+        if obskey != keyobskey:
+            return None
+        if obssize != keyobssize:
+            # we want to return the obskey for the new size
+            __, obskey = obsstore.cachekey(index=obssize)
+        return tiprev, obssize, obskey
+
+    def _upgradeneeded(self, repo):
+        """return (reset, revs, markers, (obssize, obskey)) or None
+
+        If 'None' is returned, the cache is up to date, no upgrade are needed.
+
+        'reset': True if the current data must be destroyed (strip detected),
+
+        'revs': full list of new revisions that the cache must take in account,
+        'markers': full list of new markers the cache must take in account,
+        '(obssize, obskey)': the new obs-cachekey that match the new markers.
+
+        note: the new 'tiprev' can be computed using 'max(revs)' or revs[-1].
+        """
+
+        # We need to ensure we use the same changelog and obsstore through the
+        # processing. Otherwise some invalidation could update the object and
+        # their content after we computed the cache key.
+        cl = repo.changelog
+        obsstore = repo.obsstore
+
+        reset = False
+
+        status = self._checkkey(cl, obsstore)
+        if status is None:
+            reset = True
+            key = self.emptykey
+            obssize, obskey = obsstore.cachekey()
+            tiprev = len(cl) - 1
+        else:
+            key = self._cachekey
+            tiprev, obssize, obskey = status
+        keytiprev = key[0]
+        keyobslength = key[2]
+        keyobssize = key[3]
+
+        if not reset and keytiprev == tiprev and keyobssize == obssize:
+            return None # nothing to upgrade
+
+        ### cache is valid, is there anything to update
+
+        # any new changesets ?
+        revs = ()
+        if keytiprev < tiprev:
+            revs = list(cl.revs(start=keytiprev + 1, stop=tiprev))
+
+        # any new markers
+        markers = ()
+        if keyobssize < obssize:
+            # XXX Three are a small race change here. Since the obsstore might
+            # have move forward between the time we computed the cache key and
+            # we access the data. To fix this we need so "up to" argument when
+            # fetching the markers here. Otherwise we might return more markers
+            # than covered by the cache key.
+            #
+            # In practice the cache target usage only update in a transaction
+            # context, with the lock taken. So we should be fine.
+
+            # XXX we currently naively access all markers on the obsstore for
+            # the sake of simplicity. This will be updated to ways that does
+            # not requires to load the whole obsstore in the future.
+            markers = list(obsstore)
+            if keyobslength:
+                markers = markers[keyobslength:]
+
+        return reset, revs, markers, (obssize, obskey)
+
 # mapping of 'set-name' -> <function to compute this set>
 cachefuncs = {}
 def cachefor(name):