[PATCH 2 of 5] findrenames: Replace current similarity detection algorithm with an O(n + m) algorithm

David Greenaway hg-dev at davidgreenaway.com
Sat Mar 6 22:12:49 CST 2010


# HG changeset patch
# User David Greenaway <hg-dev at davidgreenaway.com>
# Date 1267934964 -39600
# Node ID 838395f8b30b2077d387c87aa5f0408f3071a345
# Parent  10649eca0e852b7f229e392f36812bbd6f89773c
findrenames: Replace current similarity detection algorithm with an O(n + m) algorithm.

The current findrenames() algorithm performs O(n * m) file comparisons,
which becomes intractable for large repositories.

We implement a match-by-hash algorithm for finding similar files. The new
algorithm only requires reading each file once. A detailed description of
the algorithm is available as comments in the source.

Timings of:

    rm -rf *; hg up -C; for i in `find . -name "*.py"` ; do mv $i $i.new; echo x >> $i.new; done
    hg --time addremove -s 75

are as follows:

    before: Time: real 54.740 secs (user 50.020+0.000 sys 4.440+0.000)
    after:  Time: real 42.930 secs (user 42.440+0.000 sys 0.470+0.000)

The difference will grow as repositories become larger.
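
For reviewers, here is a rough, self-contained sketch of the matching
idea, using a toy block size and plain substring keys rather than the
hashed blocks and rolling hash the patch itself uses (all names below
are illustrative only):

    BLOCK = 8

    def similarity(old, new):
        # Index the old file's non-overlapping blocks.
        index = {}
        for i in range(0, len(old) - BLOCK + 1, BLOCK):
            b = old[i:i + BLOCK]
            index[b] = index.get(b, 0) + 1
        # Slide a one-byte-at-a-time window over the new file and count
        # how many indexed blocks we see, crediting each old block at
        # most as often as it actually occurs there.
        seen = {}
        for i in range(len(new) - BLOCK + 1):
            b = new[i:i + BLOCK]
            seen[b] = seen.get(b, 0) + 1
        matched = sum(min(c, index.get(b, 0)) for b, c in seen.items())
        return float(matched) * BLOCK / max(len(old), len(new), BLOCK)

    # For example, similarity("to be or not to be, that is the question",
    #                         "to be -- or not to be: that's the question")
    # returns roughly 0.38 (two of the old file's five blocks survive).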

diff --git a/mercurial/cmdutil.py b/mercurial/cmdutil.py
--- a/mercurial/cmdutil.py
+++ b/mercurial/cmdutil.py
@@ -9,6 +9,7 @@
 from i18n import _
 import os, sys, errno, re, glob, tempfile
 import mdiff, bdiff, util, templater, patch, error, encoding, templatekw
+import similar
 import match as _match
 
 revrangesep = ':'
@@ -285,48 +286,6 @@
 def matchfiles(repo, files):
     return _match.exact(repo.root, repo.getcwd(), files)
 
-def findrenames(added, removed, threshold):
-    """
-    Given two lists of files, yield (source, destination, score) tuples of
-    similar files.
-
-    The input 'added' and 'removed' lists should be lists of tuples containing
-    (filename, function to retrieve file data). The retrieval functions will
-    be given a single argument: the name of the file to retrieve.
-    """
-    copies = {}
-    for (r, r_data) in removed:
-        orig = r_data(r)
-
-        def score(text):
-            if not len(text):
-                return 0.0
-            if orig == text:
-                return 1.0
-            if threshold == 1.0:
-                return 0.0
-            # bdiff.blocks() returns blocks of matching lines
-            # count the number of bytes in each
-            equal = 0
-            alines = mdiff.splitnewlines(text)
-            matches = bdiff.blocks(text, orig)
-            for x1, x2, y1, y2 in matches:
-                for line in alines[x1:x2]:
-                    equal += len(line)
-
-            lengths = len(text) + len(orig)
-            return equal * 2.0 / lengths
-
-        for (a, a_data) in added:
-            bestscore = copies.get(a, (None, threshold))[1]
-            myscore = score(a_data(a))
-            if myscore >= bestscore:
-                copies[a] = (r, myscore)
-
-    for dest, v in copies.iteritems():
-        source, score = v
-        yield source, dest, score
-
 def addremove(repo, pats=[], opts={}, dry_run=None, similarity=None):
     if dry_run is None:
         dry_run = opts.get('dry_run')
@@ -372,7 +331,7 @@
                 for x in removed + deleted if x in ctx]
 
         # Find renames
-        for old, new, score in findrenames(
+        for old, new, score in similar.findrenames(
                 source_data, target_data, similarity):
             if repo.ui.verbose or not m.exact(old) or not m.exact(new):
                 repo.ui.status(_('recording removal of %s as rename to %s '
diff --git a/mercurial/hash.py b/mercurial/hash.py
new file mode 100644
--- /dev/null
+++ b/mercurial/hash.py
@@ -0,0 +1,80 @@
+# hash.py - misc hashing functionality
+#
+# Copyright 2010 David Greenaway <hg-dev at davidgreenaway.com>
+#
+# This software may be used and distributed according to the terms of the
+# GNU General Public License version 2, incorporated herein by reference.
+
+"""Miscellaneous hashing functionality"""
+
+_HASH_MULT = 257L
+_HASH_MOD = 4294967291L
+
+def simplehash(data):
+    """
+    Return a simple 32-bit hash for the given data string.
+    """
+    r = 0L
+    for i in data:
+        r *= _HASH_MULT
+        r += ord(i)
+        r %= _HASH_MOD
+    return int(r)
+
+def blocked_simplehash(data, block_size):
+    """
+    Split data into 'block_size' sized chunks, and return an array
+    of simple 32-bit hashes, one for each block. 'len(data) / block_size'
+    hashes will be returned.
+    """
+    result = []
+    x = 0
+    while x < len(data) - block_size + 1:
+        result.append(simplehash(data[x:x + block_size]))
+        x += block_size
+    return result
+
+def rolling_simplehash(data, block_size):
+    """
+    Return an array of simple 32-bit hashes, one for each substring of length
+    'block_size' in the string 'data'. 'len(data) - block_size + 1' hashes will
+    be returned.
+
+    This call is equivalent to:
+
+        [simplehash(data[x:block_size+x]) \\
+                for x in range(len(data) - block_size + 1)]
+
+    but is implemented in a way that should run significantly faster,
+    especially for large values of 'block_size'.
+    """
+    hashes = []
+
+    # Ensure we have enough data to calculate at least one hash.
+    data_len = len(data)
+    if data_len < block_size:
+        return hashes
+
+    # Calculate an "inverse" hash value, allowing us to remove the
+    # oldest byte's contribution from the hash.
+    inverse_hash = _HASH_MULT
+    for i in xrange(block_size - 1):
+        inverse_hash *= _HASH_MULT
+        inverse_hash %= _HASH_MOD
+
+    # Prime the hash variable.
+    h = simplehash(data[0:block_size])
+    hashes.append(int(h))
+
+    # Calculate a new hash for each new byte in the input data.
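+    # The loop below applies the standard polynomial rolling-hash update.
+    # With M = _HASH_MULT, P = _HASH_MOD, k = block_size, and d[j] the
+    # byte value at offset j, each step computes:
+    #
+    #   h_new = (h_old * M - d[i - 1] * M**k + d[i + k - 1]) mod P
+    #
+    # 'inverse_hash' holds M**k mod P, and the '_HASH_MOD * 256' term
+    # merely keeps the intermediate sum non-negative.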
+    for i in xrange(1, len(data) - block_size + 1):
+        h *= _HASH_MULT
+        h += (_HASH_MOD * 256) - (inverse_hash * ord(data[i - 1]))
+        h += ord(data[i + block_size - 1])
+        h %= _HASH_MOD
+        hashes.append(int(h))
+
+    return hashes
+
diff --git a/mercurial/similar.py b/mercurial/similar.py
new file mode 100644
--- /dev/null
+++ b/mercurial/similar.py
@@ -0,0 +1,185 @@
+# similar.py - mechanisms for finding similar files
+#
+# Copyright 2009 David Greenaway <hg-dev at davidgreenaway.com>
+#
+# This software may be used and distributed according to the terms of the
+# GNU General Public License version 2, incorporated herein by reference.
+#
+
+import hash
+
+BLOCK_SIZE = 64
+
+def find_similar_files(old_files, new_files):
+    """
+    For each file in 'new_files', yield a (filename, matches) tuple,
+    where 'matches' is a list of (old filename, similarity) tuples of
+    similar files in 'old_files', sorted by decreasing similarity.
+    """
+
+    #
+    # The algorithm we use here (inspired by Andrew Tridgell's "rsync"
+    # algorithm) attempts to find "similar" files between a list of 'n'
+    # files and a list of 'm' target files in O(n + m) time.
+    #
+    # At a high level, it splits the source files into k-byte blocks,
+    # and places them all into a hash table. We then see if any of the
+    # target files contain any matching blocks from the source files.
+    # The more blocks a particular target file has in common with
+    # a particular source file, the more similar those files are
+    # considered.
+    #
+    # The details of the algorithm are as follows:
+    #
+    # We split all of the source files into k-byte blocks and calculate
+    # a hash for each. (A file consisting of 'n' bytes will have 'n / k'
+    # hashes). We then store each hash in a dictionary, mapping it to
+    # the name of the file the block came from:
+    #
+    #             +------------+------------+------------+-->
+    #    foo.txt: |To be, or not to be: that is the question> ...
+    #             +------------+------------+------------+-->
+    #              \          / \          / \          /
+    #               0xaabc30aa   0xbb9b0fbb   0xcc94bbcc
+    #
+    #    hashes:  { 0xaabc30aa : "foo.txt",
+    #               0xbb9b0fbb : "foo.txt",
+    #               0xcc94bbcc : "foo.txt",
+    #               ... }
+    #
+    # For each file in the target list, we then calculate hashes for
+    # each k-byte substring. (A file consisting of 'n' bytes will have
+    # 'n - k + 1' hashes). We search the dictionary of source-file
+    # hashes and track which source files we have common blocks with:
+    #
+    #             +------------+---------------+------------->
+    #    baz.txt: |To be, or not to be --- That is the questi> ...
+    #             +------------+---------------+------------->
+    #              \          /                 \          /
+    #               0xaabc30aa                   0xcc94bbcc
+    #
+    # Because we calculate the hash for every k-byte substring (and not
+    # just at block boundaries, like we did in the first stage), we
+    # still find matching blocks, even if the matches are not aligned
+    # to block boundaries.
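+    #
+    # In the example above, 'baz.txt' shares the first and third of
+    # 'foo.txt's three pictured blocks (the inserted " --- " and the
+    # capitalization change break the middle block), so the two files
+    # would be scored as roughly 2/3 similar: matching blocks divided
+    # by the larger file's block count.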
+    #
+    # Once all matching blocks have been found, we can determine which
+    # source file we have the most in common with, and hence which we are
+    # most "similar" to.
+    #
+    # There are a few catches:
+    #
+    # 1. We assume that if hash(x) == hash(y), then x == y.
+    #    This assumption will not always be true. We rely on false
+    #    matches being uncommon. Two unrelated files having one
+    #    or two collisions should not matter; if unrelated files
+    #    produce many colliding hashes, incorrect matches may result.
+    #
+    # 2. Permutations of blocks will not be detected. We may have two files
+    #    reported as "100% similar" that are not in fact identical.
+    #
+    # 3. The algorithm, as stated, will misbehave when either the source
+    #    or target file contains duplicate blocks. We perform some trickery to
+    #    avoid such problems. Pathological cases such as large files filled
+    #    with zeroes should be handled correctly.
+    #
+
+    # Length of data in old_files.
+    original_lengths = {}
+
+    # A dictionary mapping each hash to a list of (filename, count)
+    # tuples recording how many times that hash appears in each file.
+    source_hashes = {}
+
+    #
+    # Split source files into k-byte sized blocks of data, and add them to our
+    # hash table.
+    #
+    for (filename, data_func) in old_files:
+        # Retrieve the data.
+        file_data = data_func(filename)
+        original_lengths[filename] = len(file_data)
+
+        # Hash every k-byte block.
+        file_hashes = {}
+        for h in hash.blocked_simplehash(file_data, BLOCK_SIZE):
+            file_hashes[h] = file_hashes.get(h, 0) + 1
+
+        # Add the counts to the global table.
+        for (h, c) in file_hashes.items():
+            source_hashes.setdefault(h, []).append((filename, c))
+
+    #
+    # Iterate through every k-byte sized block of data in the destination
+    # files, and search for them in the hash table to find similar blocks.
+    #
+    for (filename, data_func) in new_files:
+        # Retrieve this file's data, and calculate a hash for every
+        # k-byte block of data in it.
+        file_data = data_func(filename)
+        hashes = collapse(hash.rolling_simplehash(file_data, BLOCK_SIZE))
+
+        # Search for files with similar k-byte blocks.
+        possible_matches = {}
+        for (h, count) in hashes:
+            files = source_hashes.get(h, [])
+            for (f, hits) in files:
+                # Credit at most as many matching blocks as occur in
+                # both files; this keeps duplicate blocks (e.g. long
+                # runs of zeroes) from skewing the score.
+                hits = min(hits, count)
+                possible_matches[f] = possible_matches.get(f, 0) + hits
+
+        # Score the possible matches.
+        results = []
+        for matched_file, num_equal_blocks in possible_matches.iteritems():
+            max_possible_matches = max(original_lengths[matched_file],
+                    len(file_data)) / BLOCK_SIZE
+            percent_match = (1.0 * num_equal_blocks) / max_possible_matches
+            results.append((matched_file, percent_match))
+
+        # Sort matches by decreasing similarity.
+        results.sort(key=lambda v: -v[1])
+
+        # Return the result.
+        yield filename, results
+
+def collapse(x):
+    """
+    Remove duplicate items from a sequence, returning an iterable of
+    (item, count) tuples giving the number of times each item occurred.
+    Order is not preserved.
+
+    For example, collapse(["cat", "cat", "dog", "cat"]) yields
+    ("cat", 3) and ("dog", 1), in some order.
+    """
+    results = {}
+    for i in x:
+        results[i] = results.get(i, 0) + 1
+    return results.iteritems()
+
+def findrenames(added, removed, threshold):
+    """
+    Given two lists of files, yield (old, new, score) tuples of similar files.
+
+    The input 'added' and 'removed' lists should be lists of tuples containing
+    (filename, function to retrieve file data). The retrieval functions will
+    be given a single argument: the name of the file to retrieve.
+    """
+
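+    # Illustrative use (the file names and the 'read' callback below are
+    # hypothetical; any callable taking a filename and returning its
+    # data will do):
+    #
+    #   read = lambda fn: open(fn, 'rb').read()
+    #   added = [('copy.txt', read)]
+    #   removed = [('orig.txt', read)]
+    #   for old, new, score in findrenames(added, removed, 0.75):
+    #       print old, '->', new, score
+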
+    # Find similar files.
+    for added_file, matches in find_similar_files(removed, added):
+        # Did we find any matches?
+        if not matches:
+            continue
+
+        # Did the best match meet our minimum threshold?
+        old_file, match_percent = matches[0]
+        if match_percent < threshold:
+            continue
+
+        # Return the result.
+        yield old_file, added_file, match_percent
+
diff --git a/tests/test-hash.py b/tests/test-hash.py
new file mode 100644
--- /dev/null
+++ b/tests/test-hash.py
@@ -0,0 +1,62 @@
+#!/usr/bin/env python
+
+from mercurial import hash
+
+# Ensure the two given arrays are equal.
+def ensure_arrays_equal(a, b):
+    if len(a) != len(b):
+        print "*** Differing lengths: %d vs %d" % (len(a), len(b))
+        return
+
+    for i in xrange(len(a)):
+        if a[i] != b[i]:
+            print "*** Values differ: %08x vs %08x" % (a[i], b[i])
+            return
+
+#
+# Ensure that the rolling hash returns the same result as calling the
+# individual hash function.
+#
+def test_rolling_hash(data, block_size):
+    hashes_a = [hash.simplehash(data[x:block_size+x]) \
+            for x in range(len(data) - block_size + 1)]
+    hashes_b = hash.rolling_simplehash(data, block_size)
+    ensure_arrays_equal(hashes_a, hashes_b)
+
+test_rolling_hash("", 1)
+test_rolling_hash("a", 1)
+test_rolling_hash("a", 2)
+test_rolling_hash("abc", 1)
+test_rolling_hash("abc", 3)
+test_rolling_hash("\0\0\0\0\0\0\0", 1)
+test_rolling_hash("the quick brown fox jumps over the lazy dog", 5)
+test_rolling_hash("".join([str(x) for x in xrange(10000)]), 100)
+
+#
+# Ensure that the blocked hash returns the same result as calling the
+# individual hash function.
+#
+def test_blocked_hash(data, block_size):
+    hashes_a = [hash.simplehash(data[x:block_size+x]) \
+            for x in range(0, len(data) - block_size + 1, block_size)]
+    hashes_b = hash.blocked_simplehash(data, block_size)
+    ensure_arrays_equal(hashes_a, hashes_b)
+
+test_blocked_hash("", 1)
+test_blocked_hash("a", 1)
+test_blocked_hash("a", 2)
+test_blocked_hash("abc", 1)
+test_blocked_hash("abc", 3)
+test_blocked_hash("\0\0\0\0\0\0\0", 1)
+test_blocked_hash("the quick brown fox jumps over the lazy dog", 5)
+test_blocked_hash("".join([str(x) for x in xrange(10000)]), 100)
+
+#
+# Ensure that the standard hash function is half-sane; there should
+# be no collisions when hashing 1000 different strings.
+#
+hashes = [hash.simplehash(str(x)) for x in xrange(1000)]
+if len(hashes) != len(set(hashes)):
+    print "Detected %d collisions in %d items." % (
+            len(hashes) - len(set(hashes)), len(hashes))
+

