[PATCH 2 of 3 V3] chgcache: implement a smartcache layer

Wed Mar 8 13:52:06 EST 2017

On 08/03/2017 06:35, Jun Wu wrote:
> # HG changeset patch
> # User Jun Wu <quark at fb.com>
> # Date 1488949878 28800
> #      Tue Mar 07 21:11:18 2017 -0800
> # Node ID f0bded8d53c5c9a5cfb25d29dd99cf4eb3fb79b2
> # Parent  60eb2c2b5196a62d635dbe0eb1e29fdd945d5058
> # Available At https://bitbucket.org/quark-zju/hg-draft
> #              hg pull https://bitbucket.org/quark-zju/hg-draft -r f0bded8d53c5
> chgcache: implement a smartcache layer
>
> See the docstring of smartcache. Basically it sets up a pattern where
> everything in the cache has a corresponding hash value to help test if it's
> valid quickly.
>
> It's mainly designed to be used with the repo state. See the next patch.
>
> diff --git a/mercurial/chgcache.py b/mercurial/chgcache.py
> --- a/mercurial/chgcache.py
> +++ b/mercurial/chgcache.py
> @@ -21,2 +21,61 @@ def set(key, value):
>      else:
>          _cache[key] = value
> +
> +class smartcache(object):
> +    """cache knowing how to load and invalidate values, for predefined keys
> +
> +    The cache object will only answer to a key who is in loadfunctable. The

"which is in", not "who is in", and drop the "The" at the end of this 
line, for better English.

> +    loadfunctable stores load functions which will do hashing and loading.
> +    smartcache will update or invalidate entries according to the hash, and
> +    provide the hash and value to load functions being called next time.
> +
> +    There is no "set" method. To pre-populate the cache, call "get" instead.
> +    This will make sure the hash values are always correctly set.
> +
> +    The end users using smartcache.get will only notice the values, the hashes
> +    and the cache is transparent to them.
> +
> +    A load function has the signature:
> +
> +        (state, oldhash, oldvalue) -> (newhash, newvalue)
> +
> +    Where state is where the load function reads information. oldhash, oldvalue
> +    is what currently being stored in the cache. The returned hash and value
> +    will be used to update the cache.
> +
> +    A load function usually looks like:
> +
> +        def loadfunc(state, oldhash, oldvalue):
> +            hash = state.quickhash()
> +            if hash == oldhash:
> +                return oldhash, oldvalue
> +            value = state.loadvalue()
> +            hash = hashvalue(value)
> +            # or, if hashvalue is expensive
> +            hashagain = state.quickhash()
> +            if hashagain != hash:
> +                # invalidate the cache entry without filling a new one
> +                hash = None
> +            return hash, value
> +

I don't like this interface - there's a lot for a load function author
to get right. I would prefer to see it split into a pair of functions -
one that computes the hash, one that loads the data - and have the cache
code handle the state management (comparing against the old hash,
re-checking the hash after loading, and invalidating the entry on a
mismatch) instead.

> +    If predefined keys are not flexible enough, loadfunctable could be an
> +    object implementing "get" which generates load functions dynamically.
> +    """
> +
> +    def __init__(self, keyprefix, state, loadfunctable):
> +        self.keyprefix = keyprefix
> +        self.state = state
> +        self.loadfunctable = loadfunctable
> +
> +    def get(self, key):
> +        loadfunc = self.loadfunctable.get(key)
> +        if loadfunc is None:
> +            return None
> +        fullkey = self.keyprefix + key
> +        oldhash, oldvalue = get(fullkey) or [None, None]
> +        newhash, newvalue = loadfunc(self.state, oldhash, oldvalue)
> +        if newhash is None:
> +            set(fullkey, None)
> +        elif newvalue is not oldvalue or newhash != oldhash:
> +            set(fullkey, (newhash, newvalue))
> +        return newvalue
> diff --git a/tests/test-chgcache.py b/tests/test-chgcache.py
> new file mode 100644
> --- /dev/null
> +++ b/tests/test-chgcache.py
> @@ -0,0 +1,56 @@
> +from __future__ import absolute_import, print_function
> +
> +import os
> +
> +from mercurial import (
> +    chgcache,
> +    scmutil,
> +)
> +
> +filename = 'foo'
> +
> +def readfoo(vfs, oldhash, oldvalue):
> +    # NOTE: st_size is intentional for this test. Do not use it in real code if
> +    # the file could be rewritten (not append-only).
> +    try:
> +        newhash = vfs.stat(filename).st_size
> +    except OSError:
> +        return None, None
> +    if oldhash == newhash:
> +        print('cache hit')
> +        return oldhash, oldvalue
> +    else:
> +        print('cache miss')
> +        value = vfs.read(filename)
> +        # NOTE: This is wrong. In reality, we need to calculate the hash again,
> +        # and return None as the "newhash" if hashes mismatch, to mitigate
> +        # filesystem race conditions.
> +        # That said, in this test we do know nobody else will touch the file,
> +        # so it's fine.

I'd prefer to avoid the need for this comment - instead, re-check the
hash after reading and replace the final "return newhash, value" with:

finalhash = vfs.stat(filename).st_size
if finalhash == newhash:
     return newhash, value
else:
     return None, value

This has the advantage of being a worked example for people who learn 
better from examples than from documentation.

> +        return newhash, value
> +
> +loadfuncs = {'foo': readfoo}
> +vfs = scmutil.vfs(os.environ['TESTTMP'])
> +
> +cache = chgcache.smartcache('vfs', vfs, loadfuncs)
> +
> +def printcache():
> +    print('cache["foo"] = %r' % cache.get('foo'))
> +
> +printcache() # None, because the file does not exist
> +
> +vfs.write(filename, 'a')
> +printcache() # cache miss, 'a'
> +printcache() # cache hit, 'a'
> +
> +vfs.write(filename, 'ab')
> +printcache() # cache miss, 'ab'
> +
> +vfs.write(filename, 'cd')
> +printcache() # cache hit, 'ab'
> +
> +vfs.unlink('foo')
> +printcache() # None, will invalidate the cache
> +
> +vfs.write(filename, 'ef')
> +printcache() # cache miss, 'ef'
> diff --git a/tests/test-chgcache.py.out b/tests/test-chgcache.py.out
> new file mode 100644
> --- /dev/null
> +++ b/tests/test-chgcache.py.out
> @@ -0,0 +1,12 @@
> +cache["foo"] = None
> +cache miss
> +cache["foo"] = 'a'
> +cache hit
> +cache["foo"] = 'a'
> +cache miss
> +cache["foo"] = 'ab'
> +cache hit
> +cache["foo"] = 'ab'
> +cache["foo"] = None
> +cache miss
> +cache["foo"] = 'ef'
> _______________________________________________
> Mercurial-devel mailing list
> Mercurial-devel at mercurial-scm.org
> https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel
>

-- 
Simon Farnsworth