[PATCH 2 of 5] findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes
Nicolas Dumazet
nicdumz at gmail.com
Mon Mar 22 21:12:08 CDT 2010
Hello!
2010/3/21 David Greenaway <hg-dev at davidgreenaway.com>:
> # HG changeset patch
> # User David Greenaway <hg-dev at davidgreenaway.com>
> # Date 1269151891 -39600
> # Node ID 153c602f7b20c95484033d2b91d05171f62f91b2
> # Parent 8bf2c8f1b35705f4f5b615aaf954a3cb034c5dc0
> findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
>
> We speed up 'findrenames' for the usecase when a user specifies they want
> a similarity of 100% by matching files by their exact SHA1 hash value. This
> reduces the number of comparisons required to find exact matches from O(n^2) to
> O(n).
>
> While it would be nice if we could just use mercurial's pre-calculated SHA1
> hash for existing files, this hash includes the file's ancestor information
> making it unsuitable for our purposes. Instead, we calculate the hash of old
> content from scratch.
>
> The following benchmarks were taken on the current head of crew:
>
> addremove 100% similarity:
> rm -rf *; hg up -C; mv tests tests.new
> hg --time addremove -s100 --dry-run
>
> before: real 176.350 secs (user 128.890+0.000 sys 47.430+0.000)
> after: real 2.130 secs (user 1.890+0.000 sys 0.240+0.000)
>
> addremove 75% similarity:
> rm -rf *; hg up -C; mv tests tests.new; \
> for i in tests.new/*; do echo x >> $i; done
> hg --time addremove -s75 --dry-run
>
> before: real 264.560 secs (user 215.130+0.000 sys 49.410+0.000)
> after: real 218.710 secs (user 172.790+0.000 sys 45.870+0.000)
>
> diff --git a/mercurial/similar.py b/mercurial/similar.py
> --- a/mercurial/similar.py
> +++ b/mercurial/similar.py
> @@ -5,34 +5,56 @@
> # This software may be used and distributed according to the terms of the
> # GNU General Public License version 2 or any later version.
>
> +import hashlib
> +
> from i18n import _
> import util
> import mdiff
> import bdiff
>
> -def findrenames(repo, added, removed, threshold):
> - '''find renamed files -- yields (before, after, score) tuples'''
> +def _findexactmatches(repo, added, removed):
> + '''find renamed files that have no changes
> +
> + Takes a list of new filectxs and a list of removed filectxs, and yields
> + (before, after) tuples of exact matches.
> + '''
> +
> + # Get hashes of added files.
> + hashes = {}
> + for i, fctx in enumerate(added):
> + repo.ui.progress(_('searching for exact renames'),
> + i, total=(len(added) + len(removed)))
Nitpick:
You might want a local "totallength" or "filecount" variable here.
Cheers,
> + h = hashlib.sha1(fctx.data()).digest()
> + hashes.setdefault(h, []).append(fctx)
> +
> + # For each removed file, see if it corresponds to an added file.
> + for i, fctx in enumerate(removed):
> + repo.ui.progress(_('searching for exact renames'),
> + i + len(added), total=(len(added) + len(removed)))
> + h = hashlib.sha1(fctx.data()).digest()
> + for match in hashes.get(h, []):
> + yield (fctx.path(), match.path())
> +
> + # Done
> + repo.ui.progress(_('searching for exact renames'), None)
> +
> +def _findsimilarmatches(repo, added, removed, threshold):
> + '''find potentially renamed files based on similar file content
> +
> + Takes a list of new filectxs and a list of removed filectxs, and yields
> + (before, after, score) tuples of partial matches.
> + '''
> copies = {}
> - ctx = repo['.']
> for i, r in enumerate(removed):
> - repo.ui.progress(_('searching'), i, total=len(removed))
> - if r not in ctx:
> - continue
> - fctx = ctx.filectx(r)
> + repo.ui.progress(_('searching for similar files'), i, total=len(removed))
>
> # lazily load text
> @util.cachefunc
> def data():
> - orig = fctx.data()
> + orig = r.data()
> return orig, mdiff.splitnewlines(orig)
>
> def score(text):
> - if not len(text):
> - return 0.0
> - if not fctx.cmp(text):
> - return 1.0
> - if threshold == 1.0:
> - return 0.0
> orig, lines = data()
> # bdiff.blocks() returns blocks of matching lines
> # count the number of bytes in each
> @@ -47,13 +69,46 @@
>
> for a in added:
> bestscore = copies.get(a, (None, threshold))[1]
> - myscore = score(repo.wread(a))
> + myscore = score(a.data())
> if myscore >= bestscore:
> copies[a] = (r, myscore)
> repo.ui.progress(_('searching'), None)
>
> for dest, v in copies.iteritems():
> source, score = v
> - yield source, dest, score
> + yield source.path(), dest.path(), score
>
> +def findrenames(repo, added, removed, threshold):
> + '''find renamed files -- yields (before, after, score) tuples'''
> + commitedctx = repo['.']
> + workingctx = repo[None]
>
> + # Fetch contexts for added and removed files.
> + addedfiles = [workingctx[fp] for fp in added]
> + removedfiles = [commitedctx[fp] for fp in removed if fp in commitedctx]
> +
> + #
> + # Zero length files will be frequently unrelated to each other, and
> + # tracking the deletion/addition of such a file will probably cause more
> + # harm than good. We strip them out here to avoid matching them later on.
> + #
> + addedfiles = [x for x in addedfiles if x.size() > 0]
> + removedfiles = [x for x in removedfiles if x.size() > 0]
> +
> + # Find exact matches.
> + handledfiles = set()
> + for (a, b) in _findexactmatches(repo, addedfiles, removedfiles):
> + handledfiles.add(b)
> + yield (a, b, 1.0)
> +
> + # If the user requested similar files to be matched, search for
> + # them also.
> + if threshold < 1.0:
> + # Remove files already discovered.
> + addedfiles = [x for x in addedfiles if x.path() not in handledfiles]
> +
> + # Find partial matches.
> + for (a, b, score) in _findsimilarmatches(repo,
> + addedfiles, removedfiles, threshold):
> + yield (a, b, score)
> +
> _______________________________________________
> Mercurial-devel mailing list
> Mercurial-devel at selenic.com
> http://selenic.com/mailman/listinfo/mercurial-devel
>
--
Nicolas Dumazet — NicDumZ
More information about the Mercurial-devel
mailing list