D2626: xdiff: use xxhash in xdiff

indygreg (Gregory Szorc) phabricator at mercurial-scm.org
Sun Mar 4 01:38:03 UTC 2018


indygreg created this revision.
Herald added a subscriber: mercurial-devel.
Herald added a reviewer: hg-reviewers.

REVISION SUMMARY
  Diffing functions split inputs into blocks, typically lines. When
  inspecting each block, the diffing function needs to see if two blocks
  are identical. Because repeated memcmp() can be expensive, diff
  implementations will hash each block to an integer and then do a simple
  integer/register compare to check for block equivalence.
  
  It looks like xxhash was using djb2 for this hash. This hash was fine
  for its use. But the hash was operating on single bytes at a time
  and the output from a previous byte was needed before feeding in a
  new byte. So our CPU pipeline was very constricted.
  
  This commit changes the hash to xxhash. This hashing function takes
  a start address and a length. Internally, it operates on multiple
  bytes at a time. This requires fewer assembly instructions and makes
  the hash faster.
  
  It's worth noting we are using the 32-bit version of xxhash. The 64-bit
  version is faster on 64-bit hardware and we should ideally use it.
  But the 64-bit version returns an unsigned long long and xdiff wants
  the hash to be 32 bits. We could probably truncate the hash. Let's
  use the 32-bit hash for now.
  
  This change yields a minor perf win on the mozilla-central repo:
  
    $ hg perfbdiff --alldata -c --count 10 --blocks --xdiff 400000
    ! wall 0.845893 comb 0.840000 user 0.840000 sys 0.000000 (best of 12)
    ! wall 0.796796 comb 0.790000 user 0.790000 sys 0.000000 (best of 13)
    
    $ hg perfbdiff -m --count 100 --blocks --xdiff 400000
    ! wall 10.026769 comb 10.030000 user 9.010000 sys 1.020000 (best of 3)
    ! wall  9.450092 comb  9.460000 user 8.470000 sys 0.990000 (best of 3)
  
  Taken on its own, this isn't a significant improvement. But it does
  open the door to more efficient line searching...

REPOSITORY
  rHG Mercurial

REVISION DETAIL
  https://phab.mercurial-scm.org/D2626

AFFECTED FILES
  mercurial/thirdparty/xdiff/xutils.c
  setup.py

CHANGE DETAILS

diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -871,9 +871,13 @@
               include_dirs=common_include_dirs,
               depends=common_depends),
     Extension('mercurial.cext.bdiff', ['mercurial/bdiff.c',
-                                       'mercurial/cext/bdiff.c'] + xdiff_srcs,
+                                       'mercurial/cext/bdiff.c',
+                                       'mercurial/thirdparty/xxhash/xxhash.c',
+                                       ] + xdiff_srcs,
               include_dirs=common_include_dirs,
-              depends=common_depends + ['mercurial/bdiff.h'] + xdiff_headers),
+              depends=common_depends + ['mercurial/bdiff.h',
+                                        'mercurial/thirdparty/xxhash/xxhash.h',
+                                        ] + xdiff_headers),
     Extension('mercurial.cext.diffhelpers', ['mercurial/cext/diffhelpers.c'],
               include_dirs=common_include_dirs,
               depends=common_depends),
diff --git a/mercurial/thirdparty/xdiff/xutils.c b/mercurial/thirdparty/xdiff/xutils.c
--- a/mercurial/thirdparty/xdiff/xutils.c
+++ b/mercurial/thirdparty/xdiff/xutils.c
@@ -22,10 +22,13 @@
 
 #include <limits.h>
 #include <assert.h>
-#include "xinclude.h"
 
+/* Define XXHASH functions in a custom namespace */
+#define XXH_NAMESPACE XDIFF_
+#define XXH_PRIVATE_API
+#include "thirdparty/xxhash/xxhash.h"
 
-
+#include "xinclude.h"
 
 long xdl_bogosqrt(long n) {
 	long i;
@@ -299,19 +302,16 @@
 }
 
 unsigned long xdl_hash_record(char const **data, char const *top, long flags) {
-	unsigned long ha = 5381;
+	XXH32_hash_t h;
 	char const *ptr = *data;
 
 	if (flags & XDF_WHITESPACE_FLAGS)
 		return xdl_hash_record_with_whitespace(data, top, flags);
 
-	for (; ptr < top && *ptr != '\n'; ptr++) {
-		ha += (ha << 5);
-		ha ^= (unsigned long) *ptr;
-	}
+	for (; ptr < top && *ptr != '\n'; ptr++) { }
+	h = XDIFF_XXH32(*data, ptr - *data + 1, 0);
 	*data = ptr < top ? ptr + 1: ptr;
-
-	return ha;
+	return h;
 }
 
 unsigned int xdl_hashbits(unsigned int size) {



To: indygreg, #hg-reviewers
Cc: mercurial-devel


More information about the Mercurial-devel mailing list