D2631: [RFC] xdiff: skip trimmed lines when preparing the hashtable
quark (Jun Wu)
phabricator at mercurial-scm.org
Sun Mar 4 02:51:55 UTC 2018
quark created this revision.
Herald added a subscriber: mercurial-devel.
Herald added a reviewer: hg-reviewers.
REVISION SUMMARY
NOTE: I'm still reading xdiffi.c to understand whether this is a safe
optimization or not.
xdiff has a "xdl_trim_ends" function that removes common prefix and suffix.
Previously, xdiff will build a hashtable for all lines. That is a waste of
time for trimmed lines. This diff changes the logic so trimmed lines will be
ignored when building the hashtable. Note: the hashtable is still needed for
shifting purpose, so it does not blindly take whatever `xdl_trim_ends` says,
but also looks around.
For the following test case:
#!python
open('a','w').write(''.join('%s\n' % (i % 100000) for i in xrange(10000000)))
open('b','w').write(''.join('%s\n' % (i % 100000) for i in xrange(10000001)))
This series reduces xdiff's time for the above case from 1.1 seconds (https://phab.mercurial-scm.org/D2604)
to 0.6 seconds.
However, GNU diffutils can perform even better (<0.1 seconds), there are
still things to catch up.
REPOSITORY
rHG Mercurial
REVISION DETAIL
https://phab.mercurial-scm.org/D2631
AFFECTED FILES
mercurial/thirdparty/xdiff/xprepare.c
CHANGE DETAILS
diff --git a/mercurial/thirdparty/xdiff/xprepare.c b/mercurial/thirdparty/xdiff/xprepare.c
--- a/mercurial/thirdparty/xdiff/xprepare.c
+++ b/mercurial/thirdparty/xdiff/xprepare.c
@@ -62,6 +62,7 @@
static int xdl_clean_mmatch(char const *dis, long i, long s, long e);
static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xdf2);
static int xdl_trim_ends(xdfile_t *xdf1, xdfile_t *xdf2);
+static long xdl_trim_reserved_lines(xdfile_t *xdf1, xdfile_t *xdf2);
@@ -234,21 +235,32 @@
* unique. This makes future calculation faster - they can just compare "ha"
* instead of comparing line content.
*/
-static int xdl_prepare_hashtable(unsigned int pass, mmfile_t *mf,
- xpparam_t const *xpp, xdlclassifier_t *cf, xdfile_t *xdf) {
+static int xdl_prepare_hashtable(unsigned int pass, long reserved, mmfile_t
+ *mf, xpparam_t const *xpp, xdlclassifier_t *cf, xdfile_t *xdf)
+{
xrecord_t **rhash = NULL;
- long nrec = xdf->nrec;
+ long nrec;
+
unsigned int hbits;
long hsize;
long i;
+ long start = xdf->dstart - reserved;
+ long end = xdf->dend + reserved;
+
+ if (start < 0)
+ start = 0;
+ if (end >= xdf->nrec)
+ end = xdf->nrec - 1;
+
+ nrec = end - start;
hbits = xdl_hashbits((unsigned int) nrec);
hsize = 1 << hbits;
if (!(rhash = (xrecord_t **) xdl_malloc(hsize * sizeof(xrecord_t *))))
goto abort;
memset(rhash, 0, hsize * sizeof(xrecord_t *));
- for (i = 0; i < nrec; ++i) {
+ for (i = start; i <= end; i++) {
if (xdl_classify_record(pass, cf, rhash, hbits, xdf->recs[i]) < 0)
goto abort;
}
@@ -275,7 +287,7 @@
int xdl_prepare_env(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
xdfenv_t *xe) {
- long enl1, enl2, sample;
+ long enl1, enl2, sample, reserved;
xdlclassifier_t cf;
memset(&cf, 0, sizeof(cf));
@@ -307,13 +319,14 @@
return -1;
}
- if (xdl_prepare_hashtable(1, mf1, xpp, &cf, &xe->xdf1) < 0) {
+ reserved = xdl_trim_reserved_lines(&xe->xdf1, &xe->xdf2);
+ if (xdl_prepare_hashtable(1, reserved, mf1, xpp, &cf, &xe->xdf1) < 0) {
xdl_free_ctx(&xe->xdf1);
xdl_free_ctx(&xe->xdf2);
xdl_free_classifier(&cf);
return -1;
}
- if (xdl_prepare_hashtable(2, mf2, xpp, &cf, &xe->xdf2) < 0) {
+ if (xdl_prepare_hashtable(2, reserved, mf2, xpp, &cf, &xe->xdf2) < 0) {
xdl_free_ctx(&xe->xdf1);
xdl_free_ctx(&xe->xdf2);
xdl_free_classifier(&cf);
@@ -462,7 +475,6 @@
return 0;
}
-
/*
* Early trim initial and terminal matching records.
*/
@@ -500,3 +512,22 @@
return 0;
}
+
+
+/*
+ * Return "reserved lines" for possible hunk shifting. Normally, only look at
+ * lines in dstart..dend range. But hunk shifting also needs accurate line
+ * hashes. Estimated hunk size and reserve lines for shifting purpose.
+ *
+ * This would be used by xdl_prepare_hashtable, to build accurate hash values.
+ */
+static long xdl_trim_reserved_lines(xdfile_t *xdf1, xdfile_t *xdf2) {
+ long lines = 0;
+ if (xdf1->dend > xdf1->dstart)
+ lines += xdf1->dend - xdf1->dstart;
+ if (xdf2->dend > xdf2->dstart)
+ lines += xdf2->dend - xdf2->dstart;
+ return lines;
+}
+
+
To: quark, #hg-reviewers
Cc: mercurial-devel
More information about the Mercurial-devel
mailing list