[PATCH 2 of 5] findrenames: Replace current similarity detection algorithm with an O(n + m) algorithm
David Greenaway
hg-dev at davidgreenaway.com
Sat Mar 6 22:12:49 CST 2010
# HG changeset patch
# User David Greenaway <hg-dev at davidgreenaway.com>
# Date 1267934964 -39600
# Node ID 838395f8b30b2077d387c87aa5f0408f3071a345
# Parent 10649eca0e852b7f229e392f36812bbd6f89773c
findrenames: Replace current similarity detection algorithm with an O(n + m) algorithm.
The current findrenames() algorithm compares O(n * m) pairs of files. This
becomes intractable for large repositories.
We implement a match-by-hash algorithm for finding similar files. The new
algorithm only requires reading each file once. A detailed description of
the algorithm is available as comments in the source.
Timings of:
rm -rf *; hg up -C; for i in `find . -name "*.py"` ; do mv $i $i.new; echo x >> $i.new; done
hg --time addremove -s 75
are as follows:
before: Time: real 54.740 secs (user 50.020+0.000 sys 4.440+0.000)
after: Time: real 42.930 secs (user 42.440+0.000 sys 0.470+0.000)
The difference will only grow as repositories become larger.
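
For reviewers who want the gist before reading the diff, here is a toy
sketch of the match-by-hash idea. The names and the CRC stand-in hash are
illustrative only; they are not the patch's API:

    import zlib

    K = 8  # block size

    def block_hashes(data):
        # Hash each aligned K-byte block of 'data'.
        return [zlib.crc32(data[i:i + K])
                for i in xrange(0, len(data) - K + 1, K)]

    def window_hashes(data):
        # Hash every K-byte window of 'data', at every offset.
        return set(zlib.crc32(data[i:i + K])
                   for i in xrange(len(data) - K + 1))

    old = "the quick brown fox jumps over the lazy dog"
    new = "the quick brown fox leaps over the lazy dog"
    windows = window_hashes(new)
    common = len([h for h in block_hashes(old) if h in windows])
    print "similarity: %.2f" % (float(common) / (max(len(old), len(new)) // K))

Blocks of 'old' that survive anywhere in 'new' count as matches, so an
edit in the middle only costs the block it touches.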
diff --git a/mercurial/cmdutil.py b/mercurial/cmdutil.py
--- a/mercurial/cmdutil.py
+++ b/mercurial/cmdutil.py
@@ -9,6 +9,7 @@
 from i18n import _
 import os, sys, errno, re, glob, tempfile
 import mdiff, bdiff, util, templater, patch, error, encoding, templatekw
+import similar
 import match as _match
 
 revrangesep = ':'
@@ -285,48 +286,6 @@
 def matchfiles(repo, files):
     return _match.exact(repo.root, repo.getcwd(), files)
 
-def findrenames(added, removed, threshold):
-    """
-    Given two lists of files, yield (source, destination, score) tuples of
-    similar files.
-
-    The input 'added' and 'removed' lists should be lists of tuples containing
-    (filename, function to retrieve file data). The retrieval functions will
-    be given a single argument: the name of the file to retrieve.
-    """
-    copies = {}
-    for (r, r_data) in removed:
-        orig = r_data(r)
-
-        def score(text):
-            if not len(text):
-                return 0.0
-            if orig == text:
-                return 1.0
-            if threshold == 1.0:
-                return 0.0
-            # bdiff.blocks() returns blocks of matching lines
-            # count the number of bytes in each
-            equal = 0
-            alines = mdiff.splitnewlines(text)
-            matches = bdiff.blocks(text, orig)
-            for x1, x2, y1, y2 in matches:
-                for line in alines[x1:x2]:
-                    equal += len(line)
-
-            lengths = len(text) + len(orig)
-            return equal * 2.0 / lengths
-
-        for (a, a_data) in added:
-            bestscore = copies.get(a, (None, threshold))[1]
-            myscore = score(a_data(a))
-            if myscore >= bestscore:
-                copies[a] = (r, myscore)
-
-    for dest, v in copies.iteritems():
-        source, score = v
-        yield source, dest, score
-
 def addremove(repo, pats=[], opts={}, dry_run=None, similarity=None):
     if dry_run is None:
         dry_run = opts.get('dry_run')
@@ -372,7 +331,7 @@
                    for x in removed + deleted if x in ctx]
 
     # Find renames
-    for old, new, score in findrenames(
+    for old, new, score in similar.findrenames(
+            source_data, target_data, similarity):
         if repo.ui.verbose or not m.exact(old) or not m.exact(new):
             repo.ui.status(_('recording removal of %s as rename to %s '
diff --git a/mercurial/hash.py b/mercurial/hash.py
new file mode 100644
--- /dev/null
+++ b/mercurial/hash.py
@@ -0,0 +1,80 @@
+# hash.py - misc hashing functionality
+#
+# Copyright 2010 David Greenaway <hg-dev at davidgreenaway.com>
+#
+# This software may be used and distributed according to the terms of the
+# GNU General Public License version 2, incorporated herein by reference.
+
+"""Miscellaneous hashing functionality"""
+
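+# Parameters for the polynomial hash: a prime base, and the largest
+# prime below 2**32 as the modulus.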
+_HASH_MULT = 257L
+_HASH_MOD = 4294967291L
+
+def simplehash(data):
+    """
+    Return a simple 32-bit hash for the given data string.
+    """
+    r = 0L
+    for i in data:
+        r *= _HASH_MULT
+        r += ord(i)
+        r %= _HASH_MOD
+    return int(r)
+
+def blocked_simplehash(data, block_size):
+    """
+    Split data into 'block_size' sized chunks, and return a list of
+    simple 32-bit hashes, one for each block. 'len(data) / block_size'
+    hashes will be returned.
+    """
+    result = []
+    x = 0
+    while x < len(data) - block_size + 1:
+        result.append(simplehash(data[x:x + block_size]))
+        x += block_size
+    return result
+
+def rolling_simplehash(data, block_size):
+    """
+    Return a list of simple 32-bit hashes, one for each substring of
+    length 'block_size' in the string 'data'. 'len(data) - block_size + 1'
+    hashes will be returned.
+
+    This call is equivalent to:
+
+        [simplehash(data[x:x + block_size])
+            for x in range(len(data) - block_size + 1)]
+
+    but is implemented in a way that should run significantly faster,
+    especially for large values of 'block_size'.
+    """
+    hashes = []
+
+    # Ensure we have enough data to calculate at least one hash.
+    data_len = len(data)
+    if data_len < block_size:
+        return hashes
+
+    # Calculate an "inverse" hash value (_HASH_MULT ** block_size,
+    # modulo _HASH_MOD), allowing us to remove bytes from the hash.
+    inverse_hash = _HASH_MULT
+    for i in xrange(block_size - 1):
+        inverse_hash *= _HASH_MULT
+        inverse_hash %= _HASH_MOD
+
+    # Prime the hash variable with the first block.
+    h = simplehash(data[0:block_size])
+    hashes.append(int(h))
+
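+    # Rolling update for each subsequent window:
+    #
+    #   h_new = (h_old * _HASH_MULT)
+    #           - ord(dropped_byte) * inverse_hash
+    #           + ord(added_byte)             (mod _HASH_MOD)
+    #
+    # The '_HASH_MOD * 256' term below keeps the intermediate sum
+    # non-negative.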
+    # Calculate a new hash for each new byte in the input data.
+    for i in xrange(1, len(data) - block_size + 1):
+        h *= _HASH_MULT
+        h += (_HASH_MOD * 256) - (inverse_hash * ord(data[i - 1]))
+        h += ord(data[i + block_size - 1])
+        h %= _HASH_MOD
+        hashes.append(int(h))
+
+    return hashes
+
diff --git a/mercurial/similar.py b/mercurial/similar.py
new file mode 100644
--- /dev/null
+++ b/mercurial/similar.py
@@ -0,0 +1,185 @@
+# similar.py - mechanisms for finding similar files
+#
+# Copyright 2009 David Greenaway <hg-dev at davidgreenaway.com>
+#
+# This software may be used and distributed according to the terms of the
+# GNU General Public License version 2, incorporated herein by reference.
+#
+
+import hash
+
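+# Size (in bytes) of the blocks files are fingerprinted with. Smaller
+# blocks detect finer-grained similarity, but produce more hashes to
+# store and compare.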
+BLOCK_SIZE = 64
+
+def find_similar_files(old_files, new_files):
+    """
+    For each file in 'new_files', yield a (filename, matches) pair,
+    where 'matches' is a list of (filename, similarity) tuples of
+    similar files in 'old_files', sorted by decreasing similarity.
+    """
+
+    #
+    # The algorithm we use here (inspired by Andrew Tridgell's "rsync"
+    # algorithm) attempts to find "similar" files between a list of 'n'
+    # source files and a list of 'm' target files in O(n + m) time.
+    #
+    # At a high level, it splits the source files into k-byte blocks
+    # and places them all into a hash table. We then see if any of the
+    # target files contain any matching blocks from the source files.
+    # The more blocks a particular target file has in common with
+    # a particular source file, the more similar those files are
+    # considered.
+    #
+    # The details of the algorithm are as follows:
+    #
+    # We split all of the source files into k-byte blocks and calculate
+    # a hash for each. (A file consisting of 'n' bytes will have 'n / k'
+    # hashes). We then store the hash in a dictionary, mapping to the
+    # filename of where the block came from:
+    #
+    #              +------------+------------+------------+-->
+    #     foo.txt: |To be, or not to be: that is the question> ...
+    #              +------------+------------+------------+-->
+    #               \          / \          / \          /
+    #                0xaabc30aa   0xbb9b0fbb   0xcc94bbcc
+    #
+    #     hashes: { 0xaabc30aa : "foo.txt",
+    #               0xbb9b0fbb : "foo.txt",
+    #               0xcc94bbcc : "foo.txt",
+    #               ... }
+    #
+    # For each file in the target list, we then calculate hashes for
+    # each k-byte substring. (A file consisting of 'n' bytes will have
+    # 'n - k + 1' hashes). We search the dictionary of source-file
+    # hashes and track which source files we have common blocks with:
+    #
+    #              +------------+---------------+------------->
+    #     baz.txt: |To be, or not to be --- That is the questi> ...
+    #              +------------+---------------+------------->
+    #               \          /                 \          /
+    #                0xaabc30aa                    0xcc94bbcc
+    #
+    # Because we calculate the hash for every k-byte substring (and not
+    # just at block boundaries, like we did in the first stage), we
+    # still find matching blocks, even if the matches are not aligned
+    # to block boundaries.
+    #
+    # Once all matching blocks have been found, we can determine which
+    # source file we have the most in common with, and hence which file
+    # we are most "similar" to.
+    #
+    # There are a few catches:
+    #
+    # 1. We assume that if hash(x) == hash(y), then x == y.
+    #    This assumption will not always be true, so we rely on false
+    #    matches being uncommon. Two unrelated files sharing one or two
+    #    colliding hashes should not matter; unrelated files sharing
+    #    many colliding hashes could lead to incorrect matches.
+    #
+    # 2. Permutations of blocks will not be detected. We may have two
+    #    files that are "100% similar" which are not in fact identical.
+    #
+    # 3. The algorithm, as stated, will misbehave when either the source
+    #    or target file contains duplicate blocks. We perform some
+    #    trickery to avoid such problems. Pathological cases such as
+    #    large files filled with zeroes should be handled correctly.
+    #
+    # Length of the data in each old file, keyed by filename.
+    original_lengths = {}
+
+    # A dictionary mapping each block hash to a list of
+    # (filename, count) pairs, recording which source files the block
+    # appears in and how many times.
+    source_hashes = {}
+
+    #
+    # Split source files into k-byte sized blocks of data, and add them
+    # to our hash table.
+    #
+    for (filename, data_func) in old_files:
+        # Retrieve the data.
+        file_data = data_func(filename)
+        original_lengths[filename] = len(file_data)
+
+        # Hash every k-byte block.
+        file_hashes = {}
+        for h in hash.blocked_simplehash(file_data, BLOCK_SIZE):
+            file_hashes[h] = file_hashes.get(h, 0) + 1
+
+        # Add the counts to the global table.
+        for (h, c) in file_hashes.items():
+            source_hashes.setdefault(h, []).append((filename, c))
+
+    #
+    # Iterate through every k-byte sized block of data in the destination
+    # files, and search for them in the hash table to find similar blocks.
+    #
+    for (filename, data_func) in new_files:
+        # Retrieve this file's data, and calculate a hash for every
+        # k-byte block of data in it.
+        file_data = data_func(filename)
+        hashes = collapse(hash.rolling_simplehash(file_data, BLOCK_SIZE))
+
+        # Search for files with similar k-byte blocks. Capping 'hits'
+        # at 'count' is the trickery mentioned above: duplicate blocks
+        # in either file may only match each other once.
+        possible_matches = {}
+        for (h, count) in hashes:
+            files = source_hashes.get(h, [])
+            for (f, hits) in files:
+                hits = min(hits, count)
+                possible_matches[f] = possible_matches.get(f, 0) + hits
+
+        # Score each match: the number of common blocks, relative to the
+        # largest number of blocks the pair of files could have in common.
+        results = []
+        for (matched_filename, num_equal_blocks) in possible_matches.iteritems():
+            max_possible_matches = max(original_lengths[matched_filename],
+                                       len(file_data)) / BLOCK_SIZE
+            percent_match = (1.0 * num_equal_blocks) / max_possible_matches
+            results.append((matched_filename, percent_match))
+
+        # Sort matches by decreasing similarity.
+        results.sort(key=lambda x: -x[1])
+
+        yield filename, results
+
+def collapse(x):
+    """
+    Remove duplicate items from a sequence, returning an iterable of
+    (item, count) tuples: each distinct item, paired with the number of
+    times it occurred. Order is not preserved.
+
+    For example, collapse(["cat", "cat", "dog", "cat"]) might yield
+    ("cat", 3) and ("dog", 1), in either order.
+    """
+    results = {}
+    for i in x:
+        results[i] = results.get(i, 0) + 1
+    return results.iteritems()
+
+def findrenames(added, removed, threshold):
+    """
+    Given two lists of files, yield (old, new, score) tuples of similar
+    files.
+
+    The input 'added' and 'removed' lists should be lists of tuples
+    containing (filename, function to retrieve file data). The retrieval
+    functions will be given a single argument: the name of the file to
+    retrieve.
+    """
+
+    # Find similar files.
+    for (added_file, removed_files) in find_similar_files(removed, added):
+        # Did we find any matches?
+        if len(removed_files) == 0:
+            continue
+
+        # Did the best match meet our minimum threshold?
+        (old_file, match_percent) = removed_files[0]
+        if match_percent < threshold:
+            continue
+
+        # Return the result.
+        yield old_file, added_file, match_percent
+
diff --git a/tests/test-hash.py b/tests/test-hash.py
new file mode 100644
--- /dev/null
+++ b/tests/test-hash.py
@@ -0,0 +1,62 @@
+#!/usr/bin/env python
+
+from mercurial import hash
+
+# Ensure the two given arrays are equal.
+def ensure_arrays_equal(a, b):
+    if len(a) != len(b):
+        print "*** Differing lengths: %d vs %d" % (len(a), len(b))
+        return
+
+    for i in xrange(len(a)):
+        if a[i] != b[i]:
+            print "*** Values differ: %08x vs %08x" % (a[i], b[i])
+            return
+
+#
+# Ensure that the rolling hash returns the same result as calling the
+# individual hash function.
+#
+def test_rolling_hash(data, block_size):
+    hashes_a = [hash.simplehash(data[x:x + block_size])
+                for x in range(len(data) - block_size + 1)]
+    hashes_b = hash.rolling_simplehash(data, block_size)
+    ensure_arrays_equal(hashes_a, hashes_b)
+
+test_rolling_hash("", 1)
+test_rolling_hash("a", 1)
+test_rolling_hash("a", 2)
+test_rolling_hash("abc", 1)
+test_rolling_hash("abc", 3)
+test_rolling_hash("\0\0\0\0\0\0\0", 1)
+test_rolling_hash("the quick brown fox jumps over the lazy dog", 5)
+test_rolling_hash("".join([str(x) for x in xrange(10000)]), 100)
+
+#
+# Ensure that the blocked hash returns the same result as calling the
+# individual hash function.
+#
+def test_blocked_hash(data, block_size):
+    hashes_a = [hash.simplehash(data[x:x + block_size])
+                for x in range(0, len(data) - block_size + 1, block_size)]
+    hashes_b = hash.blocked_simplehash(data, block_size)
+    ensure_arrays_equal(hashes_a, hashes_b)
+
+test_blocked_hash("", 1)
+test_blocked_hash("a", 1)
+test_blocked_hash("a", 2)
+test_blocked_hash("abc", 1)
+test_blocked_hash("abc", 3)
+test_blocked_hash("\0\0\0\0\0\0\0", 1)
+test_blocked_hash("the quick brown fox jumps over the lazy dog", 5)
+test_blocked_hash("".join([str(x) for x in xrange(10000)]), 100)
+
+#
+# Ensure that the standard hash function is half-sane; ensure
+# there are no collisions hashing 1000 different strings.
+#
+hashes = [hash.simplehash(str(x)) for x in xrange(1000)]
+if len(hashes) != len(set(hashes)):
+    print "Detected %d collisions in %d items." % (
+        len(hashes) - len(set(hashes)), len(hashes))
+
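
For anyone who wants to poke at the new module from a Python prompt,
something along these lines should work (the file names and the read()
helper are made up; the tuples mirror the (filename, data function)
pairs that addremove builds):

    from mercurial import similar

    def read(fn):
        return open(fn).read()

    added = [('b.py', read)]
    removed = [('a.py', read)]

    for old, new, score in similar.findrenames(added, removed, 0.75):
        print "%s renamed to %s (%d%% similar)" % (old, new, score * 100)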