D2631: [RFC] xdiff: skip trimmed lines when preparing the hashtable

Sun Mar 4 02:51:55 UTC 2018

quark created this revision.
Herald added a subscriber: mercurial-devel.
Herald added a reviewer: hg-reviewers.

REVISION SUMMARY
  NOTE: I'm still reading xdiffi.c to understand whether this is a safe
  optimization or not.
  
  xdiff has a "xdl_trim_ends" function that removes common prefix and suffix.
  
  Previously, xdiff will build a hashtable for all lines. That is a waste of
  time for trimmed lines. This diff changes the logic so trimmed lines will be
  ignored when building the hashtable. Note: the hashtable is still needed for
  shifting purpose, so it does not blindly take whatever `xdl_trim_ends` says,
  but also looks around.
  
  For the following test case:
  
    #!python
    open('a','w').write(''.join('%s\n' % (i % 100000) for i in xrange(10000000)))
    open('b','w').write(''.join('%s\n' % (i % 100000) for i in xrange(10000001)))
  
  This series reduces xdiff's time for the above case from 1.1 seconds (https://phab.mercurial-scm.org/D2604)
  to 0.6 seconds.
  
  However, GNU diffutils can perform even better (<0.1 seconds), there are
  still things to catch up.

REPOSITORY
  rHG Mercurial

REVISION DETAIL
  https://phab.mercurial-scm.org/D2631

AFFECTED FILES
  mercurial/thirdparty/xdiff/xprepare.c

CHANGE DETAILS

diff --git a/mercurial/thirdparty/xdiff/xprepare.c b/mercurial/thirdparty/xdiff/xprepare.c
--- a/mercurial/thirdparty/xdiff/xprepare.c
+++ b/mercurial/thirdparty/xdiff/xprepare.c
@@ -62,6 +62,7 @@
 static int xdl_clean_mmatch(char const *dis, long i, long s, long e);
 static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xdf2);
 static int xdl_trim_ends(xdfile_t *xdf1, xdfile_t *xdf2);
+static long xdl_trim_reserved_lines(xdfile_t *xdf1, xdfile_t *xdf2);
 
 
 
@@ -234,21 +235,32 @@
  * unique. This makes future calculation faster - they can just compare "ha"
  * instead of comparing line content.
  */
-static int xdl_prepare_hashtable(unsigned int pass, mmfile_t *mf,
-		xpparam_t const *xpp, xdlclassifier_t *cf, xdfile_t *xdf) {
+static int xdl_prepare_hashtable(unsigned int pass, long reserved, mmfile_t
+		*mf, xpparam_t const *xpp, xdlclassifier_t *cf, xdfile_t *xdf)
+{
 	xrecord_t **rhash = NULL;
-	long nrec = xdf->nrec;
+	long nrec;
+
 	unsigned int hbits;
 	long hsize;
 	long i;
+	long start = xdf->dstart - reserved;
+	long end = xdf->dend + reserved;
+
+	if (start < 0)
+		start = 0;
+	if (end >= xdf->nrec)
+		end = xdf->nrec - 1;
+
+	nrec = end - start;
 
 	hbits = xdl_hashbits((unsigned int) nrec);
 	hsize = 1 << hbits;
 	if (!(rhash = (xrecord_t **) xdl_malloc(hsize * sizeof(xrecord_t *))))
 		goto abort;
 	memset(rhash, 0, hsize * sizeof(xrecord_t *));
 
-	for (i = 0; i < nrec; ++i) {
+	for (i = start; i <= end; i++) {
 		if (xdl_classify_record(pass, cf, rhash, hbits, xdf->recs[i]) < 0)
 			goto abort;
 	}
@@ -275,7 +287,7 @@
 
 int xdl_prepare_env(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
 		    xdfenv_t *xe) {
-	long enl1, enl2, sample;
+	long enl1, enl2, sample, reserved;
 	xdlclassifier_t cf;
 
 	memset(&cf, 0, sizeof(cf));
@@ -307,13 +319,14 @@
 		return -1;
 	}
 
-	if (xdl_prepare_hashtable(1, mf1, xpp, &cf, &xe->xdf1) < 0) {
+	reserved = xdl_trim_reserved_lines(&xe->xdf1, &xe->xdf2);
+	if (xdl_prepare_hashtable(1, reserved, mf1, xpp, &cf, &xe->xdf1) < 0) {
 		xdl_free_ctx(&xe->xdf1);
 		xdl_free_ctx(&xe->xdf2);
 		xdl_free_classifier(&cf);
 		return -1;
 	}
-	if (xdl_prepare_hashtable(2, mf2, xpp, &cf, &xe->xdf2) < 0) {
+	if (xdl_prepare_hashtable(2, reserved, mf2, xpp, &cf, &xe->xdf2) < 0) {
 		xdl_free_ctx(&xe->xdf1);
 		xdl_free_ctx(&xe->xdf2);
 		xdl_free_classifier(&cf);
@@ -462,7 +475,6 @@
 	return 0;
 }
 
-
 /*
  * Early trim initial and terminal matching records.
  */
@@ -500,3 +512,22 @@
 
 	return 0;
 }
+
+
+/*
+ * Return "reserved lines" for possible hunk shifting. Normally, only look at
+ * lines in dstart..dend range. But hunk shifting also needs accurate line
+ * hashes. Estimated hunk size and reserve lines for shifting purpose.
+ *
+ * This would be used by xdl_prepare_hashtable, to build accurate hash values.
+ */
+static long xdl_trim_reserved_lines(xdfile_t *xdf1, xdfile_t *xdf2) {
+	long lines = 0;
+	if (xdf1->dend > xdf1->dstart)
+		lines += xdf1->dend - xdf1->dstart;
+	if (xdf2->dend > xdf2->dstart)
+		lines += xdf2->dend - xdf2->dstart;
+	return lines;
+}
+
+



To: quark, #hg-reviewers
Cc: mercurial-devel