[PATCH 3 of 5] findrenames: Implementing simple hashing functionality
David Greenaway
hg-dev at davidgreenaway.com
Sun Mar 21 02:33:28 CDT 2010
# HG changeset patch
# User David Greenaway <hg-dev at davidgreenaway.com>
# Date 1269151891 -39600
# Node ID 7e52acf702667d3720588e4b7ec8833b11797faa
# Parent 153c602f7b20c95484033d2b91d05171f62f91b2
findrenames: Implementing simple hashing functionality.
This patch implements a hashing library that calculates simple hashes of
strings and "rolling" hashes of strings (that is, efficient calculation of
hashes of several substrings of a single long string).
This functionality is used in later patches in order to implement a fast
findrenames() implementation.
diff --git a/mercurial/hash.py b/mercurial/hash.py
new file mode 100644
--- /dev/null
+++ b/mercurial/hash.py
@@ -0,0 +1,73 @@
+# hash.py - misc hashing functionality
+#
+# Copyright 2010 David Greenaway <hg-dev at davidgreenaway.com>
+#
+# This software may be used and distributed according to the terms of the
+# GNU General Public License version 2 or any later version.
+
+_HASH_MULT = 257L
+_HASH_MOD = 4294967291L
+
+def simplehash(data):
+ '''return a simple 32-bit hash for the given data string'''
+ r = 0L
+ for i in data:
+ r *= _HASH_MULT
+ r += ord(i)
+ r %= _HASH_MOD
+ return int(r)
+
+def blockedhash(data, blocksize):
+ '''return a list of hashes of the input data
+
+ The data is split into 'blocksize' sized chunks and a simple 32-bit hash
+ is taken for each block. 'len(data) / blocksize' hashes will be returned.
+ '''
+ result = []
+ x = 0
+ while x < len(data) - blocksize + 1:
+ result.append(simplehash(data[x:x + blocksize]))
+ x += blocksize
+ return result
+
+def rollinghash(data, blocksize):
+ '''return a list of hashes of the input data
+
+ Each block of size 'blocksize' will be hashed with a simple 32-but hash
+ function. 'len(data) - blocksize + 1' hashes will be returned.
+
+ This call is equivalent to:
+
+ [simplehash(data[x:blocksize+x]) \\
+ for x in range(len(data) - blocksize + 1)]
+
+ but is implemented in a way that should run significantly faster,
+ especially for large values of 'blocksize'.
+ '''
+ hashes = []
+
+ # Ensure we have enough data to calculate at least one hash.
+ if len(data) < blocksize:
+ return hashes
+
+ # Calculate a "inverse" hash value, allowing us to remove
+ # values from the hash.
+ inversehash = _HASH_MULT
+ for i in xrange(blocksize - 1):
+ inversehash *= _HASH_MULT
+ inversehash %= _HASH_MOD
+
+ # Prime the hash variable.
+ h = simplehash(data[0:blocksize])
+ hashes.append(int(h))
+
+ # Calculate a new hash for each new byte in the input data.
+ for i in xrange(1, len(data) - blocksize + 1):
+ h *= _HASH_MULT
+ h += (_HASH_MOD * 256) - (inversehash * ord(data[i - 1]))
+ h += ord(data[i + blocksize - 1])
+ h %= _HASH_MOD
+ hashes.append(int(h))
+
+ return hashes
+
diff --git a/tests/test-hash.py b/tests/test-hash.py
new file mode 100644
--- /dev/null
+++ b/tests/test-hash.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python
+
+from mercurial import hash
+
+# Ensure the two given arrays are equal.
+def listsequal(a, b):
+ if len(a) != len(b):
+ print "*** Differing lengths: %d vs %d" % (len(a), len(b))
+ return
+
+ for i in xrange(len(a)):
+ if a[i] != b[i]:
+ print "*** Values differ: %08x vs %08x" % (a[i], b[i])
+ return
+
+#
+# Ensure that the rolling hash returns the same result as calling the
+# individual hash function.
+#
+def testrollinghash(data, blocksize):
+ a = [hash.simplehash(data[x:blocksize+x]) \
+ for x in range(len(data) - blocksize + 1)]
+ b = hash.rollinghash(data, blocksize)
+ listsequal(a, b)
+
+print "# test rolling hash"
+testrollinghash("", 1)
+testrollinghash("a", 1)
+testrollinghash("a", 2)
+testrollinghash("abc", 1)
+testrollinghash("abc", 3)
+testrollinghash("\0\0\0\0\0\0\0", 1)
+testrollinghash("the quick brown fox jumps over the lazy dog", 5)
+testrollinghash("".join([str(x) for x in xrange(10000)]), 100)
+
+#
+# Ensure that the blocked hash returns the same result as calling the
+# individual hash function.
+#
+def testblockedhash(data, blocksize):
+ a = [hash.simplehash(data[x:blocksize+x]) \
+ for x in range(0, len(data) - blocksize + 1, blocksize)]
+ b = hash.blockedhash(data, blocksize)
+ listsequal(a, b)
+
+print "# test blocked hash"
+testblockedhash("", 1)
+testblockedhash("a", 1)
+testblockedhash("a", 2)
+testblockedhash("abc", 1)
+testblockedhash("abc", 3)
+testblockedhash("\0\0\0\0\0\0\0", 1)
+testblockedhash("the quick brown fox jumps over the lazy dog", 5)
+testblockedhash("".join([str(x) for x in xrange(10000)]), 100)
+
+#
+# Ensure that the standard hash function is half-sane; ensure
+# there are no collisions hashing 1000 different strings.
+#
+print "# test simple hash"
+hashes = [hash.simplehash(str(x)) for x in xrange(1000)]
+if len(hashes) != len(set(hashes)):
+ print "Detected %d collissions in %d items." % (
+ len(hashes) - len(set(hashes)), len(hashes))
+
More information about the Mercurial-devel
mailing list