[PATCH 2 of 5] findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes

Benoit Boissinot benoit.boissinot at ens-lyon.org
Sun Mar 21 08:48:33 CDT 2010


On Sun, Mar 21, 2010 at 06:33:27PM +1100, David Greenaway wrote:
> 
> diff --git a/mercurial/similar.py b/mercurial/similar.py
> --- a/mercurial/similar.py
> +++ b/mercurial/similar.py
> @@ -5,34 +5,56 @@
>  # This software may be used and distributed according to the terms of the
>  # GNU General Public License version 2 or any later version.
>  
> +import hashlib
> +

There is no hashlib in Python 2.4 (it was added in 2.5).

>  from i18n import _
>  import util
>  import mdiff
>  import bdiff
>  
> -def findrenames(repo, added, removed, threshold):
> -    '''find renamed files -- yields (before, after, score) tuples'''
> +def _findexactmatches(repo, added, removed):
> +    '''find renamed files that have no changes
> +
> +    Takes a list of new filectxs and a list of removed filectxs, and yields
> +    (before, after) tuples of exact matches.
> +    '''
> +
> +    # Get hashes of added files.
> +    hashes = {}
> +    for i, fctx in enumerate(added):
> +        repo.ui.progress(_('searching for exact renames'),
> +                i, total=(len(added) + len(removed)))
> +        h = hashlib.sha1(fctx.data()).digest()

use util.sha1() instead

> +        hashes.setdefault(h, []).append(fctx)
> +
> +    # For each removed file, see if it corresponds to an added file.
> +    for i, fctx in enumerate(removed):
> +        repo.ui.progress(_('searching for exact renames'),
> +                i + len(added), total=(len(added) + len(removed)))
> +        h = hashlib.sha1(fctx.data()).digest()

Ditto: use util.sha1() here as well.

>  
> +def findrenames(repo, added, removed, threshold):
> +    '''find renamed files -- yields (before, after, score) tuples'''
> +    commitedctx = repo['.']

I'd name this parentctx instead.

> +    workingctx = repo[None]
>  
> +    # Fetch contexts for added and removed files.

I'm not sure a comment is needed; you just describe what the two lines
do, not the intent.

> +    addedfiles = [workingctx[fp] for fp in added]
> +    removedfiles = [commitedctx[fp] for fp in removed if fp in commitedctx]
> +
> +    #
> +    # Zero length files will be frequently unrelated to each other, and
> +    # tracking the deletion/addition of such a file will probably cause more
> +    # harm than good. We strip them out here to avoid matching them later on.
> +    #
> +    addedfiles = [x for x in addedfiles if x.size() > 0]
> +    removedfiles = [x for x in removedfiles if x.size() > 0]
> +
> +    # Find exact matches.
> +    handledfiles = set()
> +    for (a, b) in _findexactmatches(repo, addedfiles, removedfiles):
> +        handledfiles.add(b)
> +        yield (a, b, 1.0)
> +
> +    # If the user requested similar files to be matched, search for
> +    # them also.
> +    if threshold < 1.0:
> +        # Remove files already discovered.
> +        addedfiles = [x for x in addedfiles if x.path() not in handledfiles]
> +
> +        # Find partial matches.
> +        for (a, b, score) in _findsimilarmatches(repo,
> +                addedfiles, removedfiles, threshold):
> +            yield (a, b, score)
> +
> _______________________________________________
> Mercurial-devel mailing list
> Mercurial-devel at selenic.com
> http://selenic.com/mailman/listinfo/mercurial-devel

-- 
:wq


More information about the Mercurial-devel mailing list