[PATCH 15 of 15] speedy: add support for arbitrary file patterns in path query

Augie Fackler raf at durin42.com
Wed Dec 12 21:39:37 CST 2012


Overall, I think I like this. I'm excited that the super-simple implementation offers such great performance benefits. One thing I wonder about is if the local implementation (I'm assuming y'all are planning a large-scale logindex-as-a-service thing at fb) should require that the client have all the indexed changes, and make it possible to do reindexing after pulls.

Another thing is that maybe there could be support for multiple repositories somehow in the protocol, but I'm not sure that really makes sense on the other hand.

On Dec 11, 2012, at 12:38 PM, Tomasz Kleczek <tkleczek at fb.com> wrote:

> # HG changeset patch
> # User Tomasz Kleczek <tkleczek at fb.com>
> # Date 1355207317 28800
> # Branch stable
> # Node ID 88deb68f287f6ee05ed8e066fc52d01a5a218fe4
> # Parent  4b29d21f6585f07c8a55c64cb93ce6178be4e3eb
> speedy: add support for arbitrary file patterns in path query
> 
> This change speeds up commands such as:
> 
> log "glob:*"
> log "relglob:*.py"
> hg log d2 --include "**.py"
> 
> <placeholder: performance info>
> 
> Only the fileset patterns are still not supported server-side.
> 
> diff --git a/hgext/speedy/client.py b/hgext/speedy/client.py
> --- a/hgext/speedy/client.py
> +++ b/hgext/speedy/client.py
> @@ -91,18 +91,26 @@
>         `rev` is in fncache then fncache[rev] is a list with filenames
>         this rev modifies that are matching according to `match`.
> 
> -        If match containts only 'path' or 'relpath' patterns the query is sent
> +        If match doesn't contain any fileset expressions, the query is sent
>         to the server, otherwise it is performed locally as server doesn't
> -        support arbitrary patterns just yet.
> +        support fileset expressions.
>         """
> -        anynonpaths = bool(filter(lambda (k, v): k != 'path', match._pats))
> -        if match._includepats or match._excludepats or anynonpaths:
> -            # For now, server supports only literal paths and not arbitrary
> -            # patterns, fall back to the local query
> +        allpats = match._pats + match._includepats + match._excludepats
> +        anyfilesetexp = bool(filter(lambda (k, v): k == 'set', allpats))
> +        if anyfilesetexp:
> +            # Server doesn't support filesets, fall back to the local
> +            # fileset query
>             return cmdutil.filterrevs(self._repo, list(self._repo), match)
>         else:
> -            paths = [v for k, v in match._pats]
> -            wanted = self._proxy.request('path', (paths,))
> +            # Would like to send the match object over the network, but
> +            # it is not possible as it contains functions and working
> +            # dir context object. Send all parameters used to init the
> +            # object in a dict instead and create the object server-side.
> +            # match._ctx field is used only if one of the patterns to match is
> +            # a 'set:' pattern. Therefore we do not loose any information here.
> +            matchdict = dict(patterns=match._pats, include=match._includepats,
> +                    exclude=match._excludepats)
> +            wanted = self._proxy.request('path', (matchdict,))
>             return set(nodestorevs(self._repo, wanted)), {}
> 
> def _patchedauthor(metapeer, repo, subset, pats):
> diff --git a/hgext/speedy/index.py b/hgext/speedy/index.py
> --- a/hgext/speedy/index.py
> +++ b/hgext/speedy/index.py
> @@ -58,3 +58,16 @@
>         for path in paths:
>             filechgs.setdefault(path, []).append(ctx.node())
>     return filechgs
> +
> +def makefiles(ctxs):
> +    """Return the `files` index in the form of a dict.
> +
> +    `files` is keyed by file name, with each value being an empty string
> +
> +    ctxs: an iterable with change contexts for changes from which the index
> +        is to be created.
> +    """
> +    files = set()
> +    for ctx in ctxs:
> +        files.update(ctx.files())
> +    return dict([(fn, '') for fn in files])
> diff --git a/hgext/speedy/server.py b/hgext/speedy/server.py
> --- a/hgext/speedy/server.py
> +++ b/hgext/speedy/server.py
> @@ -14,6 +14,7 @@
> from mercurial import cmdutil
> from mercurial.i18n import _
> from mercurial import util
> +from mercurial import match as matchmod
> import index
> import protocol
> import tcptransport
> @@ -59,10 +60,44 @@
>         return [node for node, date in self.chgdate.iteritems()
>             if matcher(date)]
> 
> -    def path(self, paths):
> -        """Return a list of changesets that modify any of the paths.
> +    def path(self, matchargs):
> +        """Return a list of changesets that modify the specified paths.
> 
> -        Only the changes present in `subset` are returned.
> +        matchargs: a dict that may contain the following parameters:
> +            'patterns' - a pattern list
> +            'include' - a pattern list describing additional paths
> +                to include
> +            'exclude' - a pattern list describing additional paths
> +                to exclude
> +
> +        A pattern list is a an iterable with pairs of (kind, pattern),
> +                kind may be any pattern kind recognized by match.match
> +                constructor except for 'relpath' and 'set'.
> +
> +        If all patterns are literal paths, we can compute answer very
> +        fast using `filechgs` index (see _literalpath method). Otherwise,
> +        we have to fall back to an exhaustive search (see _patternpath
> +        method).
> +        """
> +        pats = matchargs.get('patterns', [])
> +        include = matchargs.get('include', [])
> +        exclude = matchargs.get('exclude', [])
> +        kinds = [k for k, v in pats]
> +        if not include and not exclude and kinds == ['path'] * len(kinds):
> +            paths = [ v for k, v in pats ]
> +            wanted = self._literalpath(paths)
> +        else:
> +            def patsconvert(pats):
> +                return[':'.join(p) for p in pats]
> +            patterns = patsconvert(pats)
> +            match = matchmod.match(self.repo.root, self.repo.root,
> +                    patterns, include=patsconvert(include),
> +                    exclude=patsconvert(exclude))
> +            wanted = self._patternpath(match)
> +        return wanted
> +
> +    def _literalpath(self, paths):
> +        """Return a list of changesets touching any of the paths.
> 
>         Uses `filechgs` index which provides the mapping from paths
>         (files and directories) to a list of changes touching this path.
> @@ -74,10 +109,23 @@
>             nodes.update(newnodes)
>         return list(nodes)
> 
> +    def _patternpath(self, match):
> +        """Return a list of changesets matching given files.
> +
> +        match: a callable that defines which files are relevant.
> +               File is relevant if match(filename) == True.
> +
> +        Slow compared to _literalpath since it iterates through all filenames
> +        in the repository history.
> +        """
> +        matchingfiles = filter(match, self.files.keys())
> +        return self._literalpath(matchingfiles)
> +
> indicecfg = {
>     'userchgs': index.makeuserchgs,
>     'chgdate': index.makechgdate,
>     'filechgs': index.makefilechgs,
> +    'files': index.makefiles,
> }
> 
> def makeserver(repo):
> diff --git a/tests/test-speedy.t b/tests/test-speedy.t
> --- a/tests/test-speedy.t
> +++ b/tests/test-speedy.t
> @@ -273,10 +273,10 @@
>   $ hg log --rev "date(10/20/2012) & user(testuser2)"
>   chg1
> 
> -  $ cat >> $TESTTMP/localrepo/.hg/hgrc <<EOF_END
> -  > [speedy]
> -  > client = False
> -  > EOF_END
> +  $ hg log "glob:d2/*" --exclude "**.py"
> +  chgx
> +  chgpushed
> +  chg4
> 
>   $ cd $TESTTMP/serverrepo
> 
> _______________________________________________
> Mercurial-devel mailing list
> Mercurial-devel at selenic.com
> http://selenic.com/mailman/listinfo/mercurial-devel



More information about the Mercurial-devel mailing list