[PATCH 2 of 5] findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes

Nicolas Dumazet nicdumz at gmail.com
Mon Mar 22 21:12:08 CDT 2010


Hello!


2010/3/21 David Greenaway <hg-dev at davidgreenaway.com>:
> # HG changeset patch
> # User David Greenaway <hg-dev at davidgreenaway.com>
> # Date 1269151891 -39600
> # Node ID 153c602f7b20c95484033d2b91d05171f62f91b2
> # Parent  8bf2c8f1b35705f4f5b615aaf954a3cb034c5dc0
> findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
>
> We speed up 'findrenames' for the use case where a user specifies they want
> a similarity of 100% by matching files by their exact SHA1 hash value. This
> reduces the number of comparisons required to find exact matches from O(n^2) to
> O(n).
>
> While it would be nice if we could just use mercurial's pre-calculated SHA1
> hash for existing files, this hash includes the file's ancestor information
> making it unsuitable for our purposes. Instead, we calculate the hash of old
> content from scratch.
>
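
(Side note for anyone wondering why the stored nodeid cannot be reused here:
a filelog nodeid mixes the parent nodeids into the hash, so two files with
identical content but different history hash differently. From memory, the
revlog hash is essentially:

    import hashlib

    def revlog_hash(text, p1, p2):
        # p1/p2 are the 20-byte binary nodeids of the file's parents
        a, b = sorted([p1, p2])
        return hashlib.sha1(a + b + text).digest()

so recomputing a plain sha1 of the file content, as the patch does, seems like
the right call.)
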
> The following benchmarks were taken on the current head of crew:
>
> addremove 100% similarity:
>  rm -rf *; hg up -C; mv tests tests.new
>  hg --time addremove -s100 --dry-run
>
>  before:  real 176.350 secs (user 128.890+0.000 sys 47.430+0.000)
>  after:   real   2.130 secs (user   1.890+0.000 sys  0.240+0.000)
>
> addremove 75% similarity:
>  rm -rf *; hg up -C; mv tests tests.new; \
>      for i in tests.new/*; do echo x >> $i; done
>  hg --time addremove -s75  --dry-run
>
>  before: real 264.560 secs (user 215.130+0.000 sys 49.410+0.000)
>  after:  real 218.710 secs (user 172.790+0.000 sys 45.870+0.000)
>
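
The exact-match pass is a nice trick: bucket the added files by a sha1 of
their content, then each removed file becomes a single dict lookup instead of
a pairwise comparison. A stripped-down sketch of the idea (the names here are
mine, not from the patch):

    import hashlib

    def exactmatches(added, removed):
        # added/removed are (path, data) pairs; yields (removedpath, addedpath)
        buckets = {}
        for path, data in added:
            buckets.setdefault(hashlib.sha1(data).digest(), []).append(path)
        for path, data in removed:
            for match in buckets.get(hashlib.sha1(data).digest(), []):
                yield path, match

That explains the O(n^2) -> O(n) drop in the benchmarks above.
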
> diff --git a/mercurial/similar.py b/mercurial/similar.py
> --- a/mercurial/similar.py
> +++ b/mercurial/similar.py
> @@ -5,34 +5,56 @@
>  # This software may be used and distributed according to the terms of the
>  # GNU General Public License version 2 or any later version.
>
> +import hashlib
> +
>  from i18n import _
>  import util
>  import mdiff
>  import bdiff
>
> -def findrenames(repo, added, removed, threshold):
> -    '''find renamed files -- yields (before, after, score) tuples'''
> +def _findexactmatches(repo, added, removed):
> +    '''find renamed files that have no changes
> +
> +    Takes a list of new filectxs and a list of removed filectxs, and yields
> +    (before, after) tuples of exact matches.
> +    '''
> +
> +    # Get hashes of added files.
> +    hashes = {}
> +    for i, fctx in enumerate(added):
> +        repo.ui.progress(_('searching for exact renames'),
> +                i, total=(len(added) + len(removed)))

Nitpick:
You might want a local "totallength" or "filecount" variable here.
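
Untested, and "filecount" is only a suggested name, but roughly:

    filecount = len(added) + len(removed)
    for i, fctx in enumerate(added):
        repo.ui.progress(_('searching for exact renames'), i,
                         total=filecount)
        ...
    for i, fctx in enumerate(removed):
        repo.ui.progress(_('searching for exact renames'), i + len(added),
                         total=filecount)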

Cheers,

> +        h = hashlib.sha1(fctx.data()).digest()
> +        hashes.setdefault(h, []).append(fctx)
> +
> +    # For each removed file, see if it corresponds to an added file.
> +    for i, fctx in enumerate(removed):
> +        repo.ui.progress(_('searching for exact renames'),
> +                i + len(added), total=(len(added) + len(removed)))
> +        h = hashlib.sha1(fctx.data()).digest()
> +        for match in hashes.get(h, []):
> +            yield (fctx.path(), match.path())
> +
> +    # Done
> +    repo.ui.progress(_('searching for exact renames'), None)
> +
> +def _findsimilarmatches(repo, added, removed, threshold):
> +    '''find potentially renamed files based on similar file content
> +
> +    Takes a list of new filectxs and a list of removed filectxs, and yields
> +    (before, after, score) tuples of partial matches.
> +    '''
>     copies = {}
> -    ctx = repo['.']
>     for i, r in enumerate(removed):
> -        repo.ui.progress(_('searching'), i, total=len(removed))
> -        if r not in ctx:
> -            continue
> -        fctx = ctx.filectx(r)
> +        repo.ui.progress(_('searching for similar files'), i, total=len(removed))
>
>         # lazily load text
>         @util.cachefunc
>         def data():
> -            orig = fctx.data()
> +            orig = r.data()
>             return orig, mdiff.splitnewlines(orig)
>
>         def score(text):
> -            if not len(text):
> -                return 0.0
> -            if not fctx.cmp(text):
> -                return 1.0
> -            if threshold == 1.0:
> -                return 0.0
>             orig, lines = data()
>             # bdiff.blocks() returns blocks of matching lines
>             # count the number of bytes in each
> @@ -47,13 +69,46 @@
>
>         for a in added:
>             bestscore = copies.get(a, (None, threshold))[1]
> -            myscore = score(repo.wread(a))
> +            myscore = score(a.data())
>             if myscore >= bestscore:
>                 copies[a] = (r, myscore)
>     repo.ui.progress(_('searching'), None)
>
>     for dest, v in copies.iteritems():
>         source, score = v
> -        yield source, dest, score
> +        yield source.path(), dest.path(), score
>
> +def findrenames(repo, added, removed, threshold):
> +    '''find renamed files -- yields (before, after, score) tuples'''
> +    commitedctx = repo['.']
> +    workingctx = repo[None]
>
> +    # Fetch contexts for added and removed files.
> +    addedfiles = [workingctx[fp] for fp in added]
> +    removedfiles = [commitedctx[fp] for fp in removed if fp in commitedctx]
> +
> +    #
> +    # Zero length files will be frequently unrelated to each other, and
> +    # tracking the deletion/addition of such a file will probably cause more
> +    # harm than good. We strip them out here to avoid matching them later on.
> +    #
> +    addedfiles = [x for x in addedfiles if x.size() > 0]
> +    removedfiles = [x for x in removedfiles if x.size() > 0]
> +
> +    # Find exact matches.
> +    handledfiles = set()
> +    for (a, b) in _findexactmatches(repo, addedfiles, removedfiles):
> +        handledfiles.add(b)
> +        yield (a, b, 1.0)
> +
> +    # If the user requested similar files to be matched, search for
> +    # them also.
> +    if threshold < 1.0:
> +        # Remove files already discovered.
> +        addedfiles = [x for x in addedfiles if x.path() not in handledfiles]
> +
> +        # Find partial matches.
> +        for (a, b, score) in _findsimilarmatches(repo,
> +                addedfiles, removedfiles, threshold):
> +            yield (a, b, score)
> +



-- 
Nicolas Dumazet — NicDumZ

