[PATCH 3 of 5] findrenames: Implementing simple hashing functionality

Sun Mar 21 02:33:28 CDT 2010

# HG changeset patch
# User David Greenaway <hg-dev at davidgreenaway.com>
# Date 1269151891 -39600
# Node ID 7e52acf702667d3720588e4b7ec8833b11797faa
# Parent  153c602f7b20c95484033d2b91d05171f62f91b2
findrenames: Implementing simple hashing functionality.

This patch implements a hashing library that calculates simple hashes of
strings and "rolling" hashes of strings (that is, efficient calculation of
hashes of several substrings of a single long string).

This functionality is used in later patches in order to implement a fast
findrenames() implementation.

diff --git a/mercurial/hash.py b/mercurial/hash.py
new file mode 100644
--- /dev/null
+++ b/mercurial/hash.py
@@ -0,0 +1,73 @@
+# hash.py - misc hashing functionality
+#
+# Copyright 2010 David Greenaway <hg-dev at davidgreenaway.com>
+#
+# This software may be used and distributed according to the terms of the
+# GNU General Public License version 2 or any later version.
+
+_HASH_MULT = 257L
+_HASH_MOD = 4294967291L
+
+def simplehash(data):
+    '''return a simple 32-bit hash for the given data string'''
+    r = 0L
+    for i in data:
+        r *= _HASH_MULT
+        r += ord(i)
+        r %= _HASH_MOD
+    return int(r)
+
+def blockedhash(data, blocksize):
+    '''return a list of hashes of the input data
+
+    The data is split into 'blocksize' sized chunks and a simple 32-bit hash
+    is taken for each block. 'len(data) / blocksize' hashes will be returned.
+    '''
+    result = []
+    x = 0
+    while x < len(data) - blocksize + 1:
+        result.append(simplehash(data[x:x + blocksize]))
+        x += blocksize
+    return result
+
+def rollinghash(data, blocksize):
+    '''return a list of hashes of the input data
+
+    Each block of size 'blocksize' will be hashed with a simple 32-but hash
+    function. 'len(data) - blocksize + 1' hashes will be returned.
+
+    This call is equivalent to:
+
+        [simplehash(data[x:blocksize+x]) \\
+                for x in range(len(data) - blocksize + 1)]
+
+    but is implemented in a way that should run significantly faster,
+    especially for large values of 'blocksize'.
+    '''
+    hashes = []
+
+    # Ensure we have enough data to calculate at least one hash.
+    if len(data) < blocksize:
+        return hashes
+
+    # Calculate a "inverse" hash value, allowing us to remove
+    # values from the hash.
+    inversehash = _HASH_MULT
+    for i in xrange(blocksize - 1):
+        inversehash *= _HASH_MULT
+        inversehash %= _HASH_MOD
+
+    # Prime the hash variable.
+    h = simplehash(data[0:blocksize])
+    hashes.append(int(h))
+
+    # Calculate a new hash for each new byte in the input data.
+    for i in xrange(1, len(data) - blocksize + 1):
+        h *= _HASH_MULT
+        h += (_HASH_MOD * 256) - (inversehash * ord(data[i - 1]))
+        h += ord(data[i + blocksize - 1])
+        h %= _HASH_MOD
+        hashes.append(int(h))
+
+    return hashes
+
diff --git a/tests/test-hash.py b/tests/test-hash.py
new file mode 100644
--- /dev/null
+++ b/tests/test-hash.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python
+
+from mercurial import hash
+
+# Ensure the two given arrays are equal.
+def listsequal(a, b):
+    if len(a) != len(b):
+        print "*** Differing lengths: %d vs %d" % (len(a), len(b))
+        return
+
+    for i in xrange(len(a)):
+        if a[i] != b[i]:
+            print "*** Values differ: %08x vs %08x" % (a[i], b[i])
+            return
+
+#
+# Ensure that the rolling hash returns the same result as calling the
+# individual hash function.
+#
+def testrollinghash(data, blocksize):
+    a = [hash.simplehash(data[x:blocksize+x]) \
+            for x in range(len(data) - blocksize + 1)]
+    b = hash.rollinghash(data, blocksize)
+    listsequal(a, b)
+
+print "# test rolling hash"
+testrollinghash("", 1)
+testrollinghash("a", 1)
+testrollinghash("a", 2)
+testrollinghash("abc", 1)
+testrollinghash("abc", 3)
+testrollinghash("\0\0\0\0\0\0\0", 1)
+testrollinghash("the quick brown fox jumps over the lazy dog", 5)
+testrollinghash("".join([str(x) for x in xrange(10000)]), 100)
+
+#
+# Ensure that the blocked hash returns the same result as calling the
+# individual hash function.
+#
+def testblockedhash(data, blocksize):
+    a = [hash.simplehash(data[x:blocksize+x]) \
+            for x in range(0, len(data) - blocksize + 1, blocksize)]
+    b = hash.blockedhash(data, blocksize)
+    listsequal(a, b)
+
+print "# test blocked hash"
+testblockedhash("", 1)
+testblockedhash("a", 1)
+testblockedhash("a", 2)
+testblockedhash("abc", 1)
+testblockedhash("abc", 3)
+testblockedhash("\0\0\0\0\0\0\0", 1)
+testblockedhash("the quick brown fox jumps over the lazy dog", 5)
+testblockedhash("".join([str(x) for x in xrange(10000)]), 100)
+
+#
+# Ensure that the standard hash function is half-sane; ensure
+# there are no collisions hashing 1000 different strings.
+#
+print "# test simple hash"
+hashes = [hash.simplehash(str(x)) for x in xrange(1000)]
+if len(hashes) != len(set(hashes)):
+    print "Detected %d collissions in %d items." % (
+            len(hashes) - len(set(hashes)), len(hashes))
+