[PATCH 2 of 5] findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes
Benoit Boissinot
benoit.boissinot at ens-lyon.org
Sun Mar 21 08:48:33 CDT 2010
On Sun, Mar 21, 2010 at 06:33:27PM +1100, David Greenaway wrote:
>
> diff --git a/mercurial/similar.py b/mercurial/similar.py
> --- a/mercurial/similar.py
> +++ b/mercurial/similar.py
> @@ -5,34 +5,56 @@
> # This software may be used and distributed according to the terms of the
> # GNU General Public License version 2 or any later version.
>
> +import hashlib
> +
There is no hashlib module in Python 2.4.
> from i18n import _
> import util
> import mdiff
> import bdiff
>
> -def findrenames(repo, added, removed, threshold):
> - '''find renamed files -- yields (before, after, score) tuples'''
> +def _findexactmatches(repo, added, removed):
> + '''find renamed files that have no changes
> +
> + Takes a list of new filectxs and a list of removed filectxs, and yields
> + (before, after) tuples of exact matches.
> + '''
> +
> + # Get hashes of added files.
> + hashes = {}
> + for i, fctx in enumerate(added):
> + repo.ui.progress(_('searching for exact renames'),
> + i, total=(len(added) + len(removed)))
> + h = hashlib.sha1(fctx.data()).digest()
use util.sha1() instead
> + hashes.setdefault(h, []).append(fctx)
> +
> + # For each removed file, see if it corresponds to an added file.
> + for i, fctx in enumerate(removed):
> + repo.ui.progress(_('searching for exact renames'),
> + i + len(added), total=(len(added) + len(removed)))
> + h = hashlib.sha1(fctx.data()).digest()
ditto
>
> +def findrenames(repo, added, removed, threshold):
> + '''find renamed files -- yields (before, after, score) tuples'''
> + commitedctx = repo['.']
I'd use parentctx instead?
> + workingctx = repo[None]
>
> + # Fetch contexts for added and removed files.
I'm not sure a comment is needed: you just describe what the two lines
do, not the intent.
> + addedfiles = [workingctx[fp] for fp in added]
> + removedfiles = [commitedctx[fp] for fp in removed if fp in commitedctx]
> +
> + #
> + # Zero length files will be frequently unrelated to each other, and
> + # tracking the deletion/addition of such a file will probably cause more
> + # harm than good. We strip them out here to avoid matching them later on.
> + #
> + addedfiles = [x for x in addedfiles if x.size() > 0]
> + removedfiles = [x for x in removedfiles if x.size() > 0]
> +
> + # Find exact matches.
> + handledfiles = set()
> + for (a, b) in _findexactmatches(repo, addedfiles, removedfiles):
> + handledfiles.add(b)
> + yield (a, b, 1.0)
> +
> + # If the user requested similar files to be matched, search for
> + # them also.
> + if threshold < 1.0:
> + # Remove files already discovered.
> + addedfiles = [x for x in addedfiles if x.path() not in handledfiles]
> +
> + # Find partial matches.
> + for (a, b, score) in _findsimilarmatches(repo,
> + addedfiles, removedfiles, threshold):
> + yield (a, b, score)
> +
> _______________________________________________
> Mercurial-devel mailing list
> Mercurial-devel at selenic.com
> http://selenic.com/mailman/listinfo/mercurial-devel
--
:wq
More information about the Mercurial-devel
mailing list