[PATCH] auto rename: best matches and speed improvements PLEASE REVIEW

Herbert Griebel herbertg at gmx.at
Sat Sep 27 18:48:59 CDT 2008


# HG changeset patch
# User Herbert Griebel <herbertg at gmx.at>
# Date 1222555608 -7200
# Node ID d5cab1d3339f14d07723944493f88c9b28b49e28
# Parent  f29b674cc2210126c2899d94d882c367a8ea64bc
auto rename: best matches and speed improvements

The auto rename feature of addremove did not pick the best match and
was very slow. The new implementation fixes this by considering all
matches and by using statistics of the file content to speed up the
matching.

To minimize file loading, file sizes and byte histograms are consulted
before two files are actually loaded and compared. If, based on the
sizes and histograms, the current best match for a file cannot be
exceeded, the comparison is deferred, i.e., done only when needed.
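
For reviewers, this is roughly the upper bound that the new
bdiff.maxscorehist computes, written here as a Python sketch (not part
of the patch; the helper name maxscore_bound is made up):

    def maxscore_bound(n1, n2, h1, h2):
        # n1, n2: file sizes; h1, h2: 256-entry byte histograms
        if n1 == 0 and n2 == 0:
            return 1.0
        if n1 == 0 or n2 == 0:
            return 0.0
        delta = abs(n1 - n2) + sum(abs(h1[i] - h2[i]) for i in xrange(256))
        return 1.0 - delta / (2.0 * max(n1, n2))

A pair of files is loaded and diffed only if this bound (and the
simpler size-ratio bound min(n1, n2)/max(n1, n2)) can still beat the
current best score.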

File names are also matched when a group of removed files has the same
content score against a group of added files. This makes it possible to
find the best matches when identical files are moved.
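
When several candidate pairs share the same content score, the ranking
falls back to the file names. A Python sketch of the sort key used by
the new _namematches (illustrative only; name_rank is not a function in
the patch):

    import os
    from mercurial import bdiff

    def name_rank(removed_path, added_path):
        rname = os.path.split(removed_path)[1]
        aname = os.path.split(added_path)[1]
        rext = os.path.splitext(removed_path)[1]
        aext = os.path.splitext(added_path)[1]
        return (rext == aext,                     # extension change is least likely
                bdiff.strsim(rname, aname),       # file name similarity (Levenshtein)
                bdiff.strsim(removed_path, added_path),      # full path similarity
                bdiff.reversesim(removed_path, added_path))  # common suffix length

Pairs with an equal content score are assigned in descending key order;
the real code additionally prefers matches that depend on fewer
deferred comparisons.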

diff -r f29b674cc221 -r d5cab1d3339f mercurial/bdiff.c
--- a/mercurial/bdiff.c	Wed Sep 17 11:34:37 2008 +0200
+++ b/mercurial/bdiff.c	Sun Sep 28 00:46:48 2008 +0200
@@ -356,11 +356,153 @@
 	return result ? result : PyErr_NoMemory();
 }

-static char mdiff_doc[] = "Efficient binary diff.";
+
+typedef struct {
+	unsigned int hist[256];
+	unsigned int filelen;
+} STATDATA;
+
+static PyObject* statistics(PyObject *self, PyObject *args)
+{
+	unsigned char *s;
+	int filelen, i;
+	PyObject *dataobj;
+	STATDATA *stat;
+	unsigned int *hist;
+	if (! PyArg_ParseTuple(args, "s#:statistics", &s, &filelen))
+		return NULL;
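+	/* pack the byte histogram and the file length into a raw string buffer */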
+	dataobj = PyString_FromStringAndSize(NULL, sizeof(STATDATA));
+	if (!dataobj)
+		return NULL;
+	stat = (STATDATA*) PyString_AS_STRING(dataobj);
+	hist = stat->hist;
+	stat->filelen = filelen;
+
+	/* build the byte histogram */
+	for (i=0; i<256; i++)
+		hist[i] = 0;
+	for (i=0; i<filelen; i++)
+		hist[s[i]] += 1;
+
+	return dataobj;
+}
+
+static PyObject* maxscorehist(PyObject *self, PyObject *args)
+{
+	int len1, len2, i;
+	STATDATA *stat1, *stat2;
+	unsigned int maxlen;
+	double maxscore;
+
+	if (! PyArg_ParseTuple(args, "s#s#:maxscorehist",
+			&stat1, &len1, &stat2, &len2))
+		return NULL;
+	if (len1 != sizeof(STATDATA) || len2 != sizeof(STATDATA)) {
+		PyErr_SetString(PyExc_ValueError, "length of data invalid");
+		return NULL;
+	}
+
+	maxlen = stat1->filelen > stat2->filelen ? stat1->filelen : stat2->filelen;
+	if (maxlen == 0)
+		maxscore = 1.;
+	else if (stat1->filelen == 0 || stat2->filelen == 0)
+		maxscore = 0.;
+	else {
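+		/* bound the number of bytes that can match using the size and histogram differences */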
+		unsigned int *h1 = stat1->hist;
+		unsigned int *h2 = stat2->hist;
+		unsigned int delta = stat1->filelen > stat2->filelen ?
+			stat1->filelen - stat2->filelen : stat2->filelen - stat1->filelen;
+		for (i=0; i<256; i++)
+			delta += (h1[i] > h2[i]) ? (h1[i]-h2[i]) : (h2[i]-h1[i]);
+		maxscore = 1. - delta / (double)(2 * maxlen);
+	}
+
+	return PyFloat_FromDouble(maxscore);
+}
+
+static int levenshtein_distance(char *s, int n, char *t, int m)
+{
+	int k, i, j, cost, dmin, *d, d1, d2, d3, p;
+	if (n == 0)
+		return m;
+	if (m == 0)
+		return n;
+
+	m++;
+	n++;
+	d = (int*) malloc(m * n * sizeof(int));
+	if (!d) /* out of memory: fall back to the worst-case distance */
+		return (n > m ? n : m) - 1;
+	for (k=0; k<n; k++)
+		d[k] = k;
+	for (k=0; k<m; k++)
+		d[k*n] = k;
+
+	for (i=1; i<n; i++) {
+		for (j=1; j<m; j++) {
+			p = j*n + i;
+			cost = (s[i-1] == t[j-1]) ? 0 : 1;
+			d3 = d[p-n-1] + cost;
+			d2 = d[p-1] + 1;
+			d1 = d[p-n] + 1;
+			if (d1 > d2)
+				d1 = d2;
+			if (d1 > d3)
+				d1 = d3;
+			d[p] = d1;
+		}
+	}
+	dmin = d[n*m-1];
+	free(d);
+
+	return dmin;
+}
+
+static PyObject* strsim(PyObject *self, PyObject *args)
+{
+	int n, m, d;
+	char *s, *t;
+	double sim;
+	if (! PyArg_ParseTuple(args, "s#s#:strsim", &s, &n, &t, &m))
+		return NULL;
+	d = levenshtein_distance(s, n, t, m);
+	if (n==0 && m==0)
+		sim = 1.0;
+	else if (n > m)
+		sim = (n-d) / (double)n;
+	else
+		sim = (m-d) / (double)m;
+	return PyFloat_FromDouble(sim);
+}
+
+static PyObject* reversesim(PyObject *self, PyObject *args)
+{
+	int n1, n2;
+	char *s1, *s2;
+	if (! PyArg_ParseTuple(args, "s#s#:reversesim", &s1, &n1, &s2, &n2))
+		return NULL;
+	if (n1 > n2) {
+		s1 += n1 - n2;
+		n1 = n2;
+	} else {
+		s2 += n2 - n1;
+		n2 = n1;
+	}
+	while (n1 > 0) {
+		n1--;
+		if (s1[n1] != s2[n1]) {
+			n1++; /* mismatch: this character is not part of the common suffix */
+			break;
+		}
+	}
+	return PyLong_FromLong(n2 - n1);
+}
+
+
+
+static char mdiff_doc[] = "Efficient binary diff and string similarity.";

 static PyMethodDef methods[] = {
 	{"bdiff", bdiff, METH_VARARGS, "calculate a binary diff\n"},
 	{"blocks", blocks, METH_VARARGS, "find a list of matching lines\n"},
+	{"statistics", statistics, METH_VARARGS, "get byte statistics\n"},
+	{"maxscorehist", maxscorehist, METH_VARARGS, "calculate maximum score\n"},
+	{"strsim", strsim, METH_VARARGS, "get similarity measure of strings\n"},
+	{"reversesim", reversesim, METH_VARARGS, "equal characters from right\n"},
 	{NULL, NULL}
 };

@@ -368,4 +510,3 @@
 {
 	Py_InitModule3("bdiff", methods, mdiff_doc);
 }
-
diff -r f29b674cc221 -r d5cab1d3339f mercurial/cmdutil.py
--- a/mercurial/cmdutil.py	Wed Sep 17 11:34:37 2008 +0200
+++ b/mercurial/cmdutil.py	Sun Sep 28 00:46:48 2008 +0200
@@ -10,6 +10,7 @@
 import os, sys, bisect, stat
 import mdiff, bdiff, util, templater, templatefilters, patch, errno
 import match as _match
+import copy as _copy

 revrangesep = ':'

@@ -245,29 +246,371 @@
     '''find renamed files -- yields (before, after, score) tuples'''
     if added is None or removed is None:
         added, removed = repo.status()[1:3]
+    if not added or not removed:
+        return
     ctx = repo['.']
-    for a in added:
-        aa = repo.wread(a)
-        bestname, bestscore = None, threshold
-        for r in removed:
-            rr = ctx.filectx(r).data()

-            # bdiff.blocks() returns blocks of matching lines
-            # count the number of bytes in each
-            equal = 0
-            alines = mdiff.splitnewlines(aa)
-            matches = bdiff.blocks(aa, rr)
-            for x1,x2,y1,y2 in matches:
-                for line in alines[x1:x2]:
-                    equal += len(line)
+    # cache file attributes
+    nr = len(removed)
+    na = len(added)
+    rfiles = [()] * nr
+    afiles = [()] * na
+    for i, r in enumerate(removed):
+        rfiles[i] = (ctx.filectx(r).size(),
+                     os.path.split(r)[1], os.path.splitext(r)[1], i)
+    for i, a in enumerate(added):
+        afiles[i] = (repo.wfilesize(a),
+                     os.path.split(a)[1], os.path.splitext(a)[1], i)

-            lengths = len(aa) + len(rr)
-            if lengths:
-                myscore = equal*2.0 / lengths
-                if myscore >= bestscore:
-                    bestname, bestscore = r, myscore
-        if bestname:
-            yield bestname, a, bestscore
+    # check loop order
+    if na == 1:
+        addedfirst = True
+    elif nr == 1:
+        addedfirst = False
+    else:
+        # estimate times to decide loops
+        Sr = sum([f[0] for f in rfiles])
+        Sa = sum([f[0] for f in afiles])
+        dr = Sr*1.60882e-008 + nr*0.00145002
+        da = Sa*9.03138e-009 + na*0.00110696
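+        # d1: added files in the outer loop, d2: removed files in the outer loop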
+        d1 = da + na*dr
+        d2 = dr + nr*da
+        addedfirst = d1 <= d2
+
+    # # This code is only for getting the estimator coefficients
+    # # and should never run in a released version
+    #
+    # # Assumption:
+    # # duration = numBytesTotal * c1 + numFiles * c2
+    #
+    # def _measure(f1, f2):
+    #     s1a, s1r = 0, 0
+    #     added1 = added[int(na*f1):int(na*f2)]
+    #     removed1 = removed[int(nr*f1):int(nr*f2)]
+    #     n1a, n1r = len(added1), len(removed1)
+    #     import time
+    #     t0 = time.clock()
+    #     for a in added1:
+    #         s1a += len(repo.wread(a))
+    #     t1 = time.clock()
+    #     for r in removed1:
+    #         s1r += len(ctx.filectx(r).data())
+    #     t2 = time.clock()
+    #     d1a, d1r = t1 - t0, t2 - t1
+    #     return s1r, s1a, n1r, n1a, d1r, d1a
+    #
+    # def _solve2d(a1,a2,b1,b2,e1,e2):
+    #     """a1*x + b1*y = e1
+    #        a2*x + b2*y = e2"""
+    #     def det(a1,a2,b1,b2):
+    #         return a1*b2 - b1*a2
+    #     d = det(a1,a2,b1,b2)
+    #     x = det(e1,e2,b1,b2) / d
+    #     y = det(a1,a2,e1,e2) / d
+    #     return x, y
+    #
+    # s1r, s1a, n1r, n1a, d1r, d1a = _measure(0, .33)
+    # s2r, s2a, n2r, n2a, d2r, d2a = _measure(.33, 1)
+    # xc1r, xc2r = _solve2d(s1r, s2r, n1r, n2r, d1r, d2r)
+    # xc1a, xc2a = _solve2d(s1a, s2a, n1a, n2a, d1a, d2a)
+    # print 'dr = Sr*%g + nr*%g' % (xc1r, xc2r)
+    # print 'da = Sa*%g + na*%g' % (xc1a, xc2a)
+
+    if addedfirst:
+        filelist1, filelist2 = afiles, _copy.copy(rfiles)
+    else:
+        filelist1, filelist2 = rfiles, _copy.copy(afiles)
+
+    # init
+    minscore = threshold
+    rbased = nr < na
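+    # match record: [score, namesim, compared, ir, ia, isbest]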
+    bestmatches = [[0, 0, True, -1, -1, False] for i in xrange(min(nr, na))]
+    rmatches = [[] for i in xrange(nr)]
+    amatches = [[] for i in xrange(na)]
+    rbestscores = [0] * nr
+    abestscores = [0] * na
+    rstat = [''] * nr
+    astat = [''] * na
+    # named constants
+    iscompared = True # needed for max function: True > False
+    notcompared = False
+
+    def _getscore(rr, aa, alines):
+        '''bdiff.blocks() returns blocks of matching lines;
+           count the number of bytes in each and normalize by
+           the size of the larger file'''
+        equal = 0
+        matches = bdiff.blocks(aa, rr)
+        for x1,x2,y1,y2 in matches:
+            for line in alines[x1:x2]:
+                equal += len(line)
+        score = float(equal) / max(len(aa), len(rr))
+        return score
+
+    def _addmatch(score, compared):
+        '''enters match into the 3 lists'''
+        namesim = bdiff.reversesim(removed[ir], added[ia])
+        bm = [score, namesim, compared, ir, ia, False]
+        rmatches[ir].append(bm)
+        amatches[ia].append(bm)
+        if rbased:
+            if bestmatches[ir] < bm:
+                bestmatches[ir][5] = False
+                bestmatches[ir] = bm
+                bm[5] = True
+        else:
+            if bestmatches[ia] < bm:
+                bestmatches[ia][5] = False
+                bestmatches[ia] = bm
+                bm[5] = True
+        if rbestscores[ir] < score:
+            rbestscores[ir] = score
+        if abestscores[ia] < score:
+            abestscores[ia] = score
+
+    # loop 1
+    for size1, name1, ext1, ifile1 in filelist1:
+        if addedfirst:
+            ia = ifile1
+        else:
+            ir = ifile1
+        loadfile1 = True
+
+        # check most similar file names first (moves)
+        nmoves = 0
+        for i in xrange(len(filelist2)):
+            if name1 == filelist2[i][1]:
+                if i != nmoves:
+                    # swap files
+                    filelist2[nmoves], filelist2[i] = \
+                            filelist2[i], filelist2[nmoves]
+                nmoves += 1
+
+        # loop 2
+        for size2, name2, ext2, ifile2 in filelist2:
+            if addedfirst:
+                ir = ifile2
+            else:
+                ia = ifile2
+
+            # special case: empty files
+            if not size1 or not size2:
+                if size1 == size2:
+                    score = 1
+                    _addmatch(score, iscompared)
+                continue
+
+            # maximum score that could be reached because of sizes
+            maxscoresize = \
+                float(size2)/size1 if size1>size2 else float(size1)/size2
+            if maxscoresize < minscore:
+                continue
+
+            # maximum score that could be reached because of statistics
+            maxscore = maxscoresize
+            if rstat[ir] and astat[ia]:
+                # histogram delta
+                maxscorehist = bdiff.maxscorehist(rstat[ir], astat[ia])
+                if maxscorehist < minscore:
+                    continue
+                # defer compares, take lowest upper bound
+                if maxscorehist < maxscore:
+                    maxscore = maxscorehist
+
+            # deferred compares
+            if ifile1 and (maxscore < rbestscores[ir] or \
+                    maxscore < abestscores[ia]):
+                _addmatch(maxscore, notcompared)
+                continue
+
+            # load file 1
+            if loadfile1:
+                if addedfirst:
+                    aa = repo.wread(added[ia])
+                    alines = mdiff.splitnewlines(aa)
+                    astat[ia] = bdiff.statistics(aa)
+                else:
+                    rr = ctx.filectx(removed[ir]).data()
+                    rstat[ir] = bdiff.statistics(rr)
+                loadfile1 = False
+
+            # load file 2
+            if addedfirst:
+                rr = ctx.filectx(removed[ir]).data()
+                if not rstat[ir]:
+                    rstat[ir] = bdiff.statistics(rr)
+            else:
+                aa = repo.wread(added[ia])
+                alines = mdiff.splitnewlines(aa)
+                if not astat[ia]:
+                    astat[ia] = bdiff.statistics(aa)
+
+            # score it
+            score = _getscore(rr, aa, alines)
+            if score >= minscore:
+                _addmatch(score, iscompared)
+
+    def _deferredcompare(m):
+        '''resolve a deferred match: return its score if the real
+           comparison confirms the stored upper bound, otherwise 0'''
+        maxscore, namesim, compared, ir, ia, isbest = m
+        if compared:
+            return maxscore
+        if not maxscore:
+            return 0
+        rr = ctx.filectx(removed[ir]).data()
+        aa = repo.wread(added[ia])
+        alines = mdiff.splitnewlines(aa)
+        score = _getscore(rr, aa, alines)
+        m[2] = iscompared # not deferred anymore
+        if score == maxscore:
+            return maxscore
+        # update lists and get next best score
+        m[0] = 0 if score < minscore else score
+        if isbest:
+            m[5] = False # remove old from best
+            if rbased:
+                l = rmatches[ir]
+                m = max(l) if l else 0
+                bestmatches[ir] = m
+            else:
+                l = amatches[ia]
+                m = max(l) if l else 0
+                bestmatches[ia] = m
+            if m:
+                m[5] = True # add new to best
+        return 0
+
+    def _namematches(scorematches):
+        '''Find best matches for a group of removed and added files
+        Matching score (high to low): file extension equal
+                                      file name similarity (levenshtein)
+                                      full pathname similarity (levenshtein)
+                                      common substring length from right
+        The heuristic assumes that a file
+        extension change is the least likely.
+        '''
+        if len(scorematches) == 1:
+            yield scorematches[0][0]
+            return
+        best = [0] * len(scorematches)
+        for i,sm in enumerate(scorematches):
+            score, namesim, compared, ir, ia, isbest = sm[0]
+            r = removed[ir]
+            a = added[ia]
+            rname = rfiles[ir][1]
+            aname = afiles[ia][1]
+            rext = rfiles[ir][2]
+            aext = afiles[ia][2]
+            score1 = rext == aext
+            score2 = bdiff.strsim(rname, aname)
+            score3 = bdiff.strsim(r, a)
+            score4 = bdiff.reversesim(r, a)
+            ndepends = -len(sm[1]) + (0 if compared else -1)
+            best[i] = (score1, score2, score3, score4, ndepends, sm)
+        best.sort(reverse = True)
+        for score1, score2, score3, score4, n, sm in best:
+            m, depends = sm
+            score, namesim, compared, ir, ia, isbest = m
+            # depends on other deferred?
+            dismissed = False
+            for d in depends:
+                if _deferredcompare(d) != score:
+                    dismissed = True
+                    break
+            if dismissed:
+                continue
+            # check if deferred
+            if not compared:
+                if _deferredcompare(m) != score:
+                    continue
+            yield m
+
+    # assign best matches
+    while True:
+        bm = max(bestmatches) # get next best
+        if not bm:
+            break
+        score, namesim, compared, ir, ia, isbest = bm
+        if not score:
+            break
+        if not compared:
+            if _deferredcompare(bm) != score:
+                continue
+
+        # Find all files that have the same similarity to the two files.
+        # Name matching is necessary because:
+        #
+        # a -> x    score       basic case, scores higher have already
+        # a -> y    score       been handled, scores lower are not relevant
+        #
+        # a -> x    score       basic case
+        # b -> x    score
+        #
+        # a -> x    score       general case, finding all matches must
+        # a -> y    score       be done iteratively
+        # b -> y    score
+        # b -> z    score
+        # ...
+        #
+        # a -> x    score       not necessary to follow "100% links"
+        # b -> x    1           because link a -> y exists
+        # b -> y    1
+        # a -> y    score
+
+        rset, aset = [ir], [ia]
+        sm = [bm,[]]
+        rsetnew, asetnew = [sm], [sm]
+        scorematches = [sm]
+        while rsetnew:
+            # get all added files that match rsetnew files
+            for newm, depends in rsetnew:
+                if newm[2] == notcompared:
+                    depends = _copy.copy(depends)
+                    depends.append(newm)
+                l = [[m, depends] for m in rmatches[newm[3]] \
+                        if m[0]==score and m[4] not in aset]
+                scorematches += l
+                asetnew += l
+            rset += [m[3] for m, d in rsetnew]
+            rsetnew = []
+            # get all removed files that match asetnew files
+            for newm, depends in asetnew:
+                if newm[2] == notcompared:
+                    depends = _copy.copy(depends)
+                    depends.append(newm)
+                l = [[m, depends] for m in amatches[newm[4]] \
+                        if m[0]==score and m[3] not in rset]
+                scorematches += l
+                rsetnew += l
+            aset += [m[4] for m, d in asetnew]
+            asetnew = []
+
+        # find best name matches of the files with equal score
+        for bm in _namematches(scorematches):
+            score, namesim, compared, ir, ia, isbest = bm
+            if not score:
+                continue
+            yield removed[ir], added[ia], score
+            # invalidate ir and ia
+            for m in rmatches[ir]:
+                m[0] = 0
+            for m in amatches[ia]:
+                m[0] = 0
+            rmatches[ir] = []
+            amatches[ia] = []
+            # get next best match for invalidated
+            for m in bestmatches:
+                if m and not m[0]:
+                    score, namesim, compared, ir2, ia2, isbest = m
+                    if rbased:
+                        l = rmatches[ir2]
+                        m = max(l) if l else 0
+                        bestmatches[ir2] = m
+                    else:
+                        l = amatches[ia2]
+                        m = max(l) if l else 0
+                        bestmatches[ia2] = m
+                    if m:
+                        m[5] = True

 def addremove(repo, pats=[], opts={}, dry_run=None, similarity=None):
     if dry_run is None:
diff -r f29b674cc221 -r d5cab1d3339f mercurial/localrepo.py
--- a/mercurial/localrepo.py	Wed Sep 17 11:34:37 2008 +0200
+++ b/mercurial/localrepo.py	Sun Sep 28 00:46:48 2008 +0200
@@ -528,6 +528,9 @@

     def adddatafilter(self, name, filter):
         self._datafilters[name] = filter
+
+    def wfilesize(self, filename):
+        return os.stat(self.wjoin(filename))[6]

     def wread(self, filename):
         if self._link(filename):
diff -r f29b674cc221 -r d5cab1d3339f tests/test-addremove-similar
--- a/tests/test-addremove-similar	Wed Sep 17 11:34:37 2008 +0200
+++ b/tests/test-addremove-similar	Sun Sep 28 00:46:48 2008 +0200
@@ -1,49 +1,97 @@
 #!/bin/sh

+echo % basic content-matching test
 hg init rep; cd rep

+echo % add 2 files
 touch empty-file
 python -c 'for x in range(10000): print x' > large-file
-
 hg addremove
-
 hg commit -m A

+echo % rename large-file to another-file, remove other
 rm large-file empty-file
 python -c 'for x in range(10,10000): print x' > another-file
-
-hg addremove -s50
-
+hg addremove -s 50
 hg commit -m B

-echo % comparing two empty files caused ZeroDivisionError in the past
-hg update -C 0
-rm empty-file
-touch another-empty-file
-hg addremove -s50
+echo % rename another-file to file1, add file2
+rm another-file
+python -c 'for x in range(20,10000): print x' > file1
+python -c 'for x in range(30,10000): print x' > file2
+hg addremove -s 50
+hg commit -m C

-cd ..
-
-hg init rep2; cd rep2
-
-python -c 'for x in range(10000): print x' > large-file
-python -c 'for x in range(50): print x' > tiny-file
-
-hg addremove
-
-hg commit -m A
-
-python -c 'for x in range(70): print x' > small-file
-rm tiny-file
-rm large-file
-
-hg addremove -s50
-
-hg commit -m B
-
-echo % should all fail
+echo % option-value test, should all fail
 hg addremove -s foo
 hg addremove -s -1
 hg addremove -s 1e6

+echo % content-matching test
+cd ..
+hg init rep2; cd rep2
+
+echo % add 2 files
+python -c 'for x in range(10000): print x' > large.file
+python -c 'for x in range(50): print x' > tiny.file
+hg addremove -s 50
+hg commit -m A
+
+echo % rename tiny.file to x/small.file
+python -c 'for x in range(70): print x' > small.file
+rm tiny.file
+rm large.file
+mkdir x
+mv *.file x
+hg addremove -s 50
+hg commit -m B
+
+echo % name-matching tests
+cd ..
+hg init rep3; cd rep3
+
+echo % add 5 files
+touch a.file
+touch b.file
+touch c.file
+mkdir dir1
+mkdir dir2
+touch dir1/z.file
+touch dir2/z.file
+hg addremove -s50
+hg commit -m T1
+
+echo % move all to dir0, add z.file
+mkdir dir0
+mv *.file dir0
+mv dir1 dir0
+mv dir2 dir0
+touch z.file
+hg addremove -s50
+hg commit -m T2
+
+echo % rename */z.file to */o.file
+mv dir0/dir1/z.file dir0/dir1/o.file
+mv dir0/dir2/z.file dir0/dir2/o.file
+hg addremove -s50
+hg commit -m T3
+
+echo % rename dir1 to dirA, dir2 to dirB
+mv dir0/dir1 dir0/dirA
+mv dir0/dir2 dir0/dirB
+hg addremove -s50
+hg commit -m T4
+
+echo % remove dir0 files
+rm dir0/*.file
+hg commit -A -m T5
+
+echo % rename dir0/dirA/o.file to dirX/dirA/x.file
+echo % rename dir0/dirB/o.file to dirX/dirB/x.file
+mv dir0/dirA/o.file dir0/dirA/x.file
+mv dir0/dirB/o.file dir0/dirB/x.file
+mv dir0 dirX
+hg addremove -s50
+hg commit -m T6
+
 true
diff -r f29b674cc221 -r d5cab1d3339f tests/test-addremove-similar.out
--- a/tests/test-addremove-similar.out	Wed Sep 17 11:34:37 2008 +0200
+++ b/tests/test-addremove-similar.out	Sun Sep 28 00:46:48 2008 +0200
@@ -1,20 +1,77 @@
+% basic content-matching test
+% add 2 files
 adding empty-file
 adding large-file
+% rename large-file to another-file, remove other
 adding another-file
 removing empty-file
 removing large-file
 recording removal of large-file as rename to another-file (99% similar)
-% comparing two empty files caused ZeroDivisionError in the past
-2 files updated, 0 files merged, 1 files removed, 0 files unresolved
-adding another-empty-file
-removing empty-file
-adding large-file
-adding tiny-file
-removing large-file
-adding small-file
-removing tiny-file
-recording removal of tiny-file as rename to small-file (82% similar)
-% should all fail
+% rename another-file to file1, add file2
+removing another-file
+adding file1
+adding file2
+recording removal of another-file as rename to file1 (99% similar)
+% option-value test, should all fail
 abort: similarity must be a number
 abort: similarity must be between 0 and 100
 abort: similarity must be between 0 and 100
+% content-matching test
+% add 2 files
+adding large.file
+adding tiny.file
+% rename tiny.file to x/small.file
+removing large.file
+removing tiny.file
+adding x/small.file
+recording removal of tiny.file as rename to x/small.file (70% similar)
+% name-matching tests
+% add 5 files
+adding a.file
+adding b.file
+adding c.file
+adding dir1/z.file
+adding dir2/z.file
+% move all to dir0, add z.file
+removing a.file
+removing b.file
+removing c.file
+adding dir0/a.file
+adding dir0/b.file
+adding dir0/c.file
+adding dir0/dir1/z.file
+adding dir0/dir2/z.file
+removing dir1/z.file
+removing dir2/z.file
+adding z.file
+recording removal of dir2/z.file as rename to dir0/dir2/z.file (100% similar)
+recording removal of dir1/z.file as rename to dir0/dir1/z.file (100% similar)
+recording removal of c.file as rename to dir0/c.file (100% similar)
+recording removal of b.file as rename to dir0/b.file (100% similar)
+recording removal of a.file as rename to dir0/a.file (100% similar)
+% rename */z.file to */o.file
+adding dir0/dir1/o.file
+removing dir0/dir1/z.file
+adding dir0/dir2/o.file
+removing dir0/dir2/z.file
+recording removal of dir0/dir2/z.file as rename to dir0/dir2/o.file (100% similar)
+recording removal of dir0/dir1/z.file as rename to dir0/dir1/o.file (100% similar)
+% rename dir1 to dirA, dir2 to dirB
+removing dir0/dir1/o.file
+removing dir0/dir2/o.file
+adding dir0/dirA/o.file
+adding dir0/dirB/o.file
+recording removal of dir0/dir2/o.file as rename to dir0/dirB/o.file (100% similar)
+recording removal of dir0/dir1/o.file as rename to dir0/dirA/o.file (100% similar)
+% remove dir0 files
+removing dir0/a.file
+removing dir0/b.file
+removing dir0/c.file
+% rename dir0/dirA/o.file to dirX/dirA/x.file
+% rename dir0/dirB/o.file to dirX/dirB/x.file
+removing dir0/dirA/o.file
+removing dir0/dirB/o.file
+adding dirX/dirA/x.file
+adding dirX/dirB/x.file
+recording removal of dir0/dirB/o.file as rename to dirX/dirB/x.file (100% similar)
+recording removal of dir0/dirA/o.file as rename to dirX/dirA/x.file (100% similar)
diff -r f29b674cc221 -r d5cab1d3339f tests/test-bdiff
--- a/tests/test-bdiff	Wed Sep 17 11:34:37 2008 +0200
+++ b/tests/test-bdiff	Sun Sep 28 00:46:48 2008 +0200
@@ -39,4 +39,57 @@
 test("a\n", "a\n")
 test("a\nb", "a\nb")

+
+def test(a, b, r, show=True):
+    s1 = bdiff.statistics(a)
+    s2 = bdiff.statistics(b)
+    s = bdiff.maxscorehist(s1, s2)
+    if not show:
+        a = b = '?'
+    print 'maxscorehist:', a, b, 'assert', s, '=', r, s==r
+    if r == 1.0:
+        print ' equal:', s1 == s2
+
+str1 = str2 = ''
+for i in xrange(256):
+    str1 += chr(i)
+    str2 += chr(255 - i)
+test(str1, str2, 1.0, show=False)
+test('a', 'a', 1.0)
+test('aa', 'ab', 0.5)
+test('aa', 'ba', 0.5)
+test('aa', 'bbba', 0.25)
+test('a', 'b', 0.0)
+test('a', '', 0.0)
+test('', 'a', 0.0)
+test('', '', 1.0)
+
+
+def test(a, b, r):
+    s = bdiff.strsim(a, b)
+    print 'strsim:', a, b, 'assert', s, '=', r, s==r
+
+test('a', 'a', 1.0)
+test('a', 'b', 0.0)
+test('a', 'aa', 0.5)
+test('aa', 'a', 0.5)
+test('aaab', 'aaaa', 0.75)
+test('aaaa', 'aabb', 0.5)
+test('aabb', 'aaaa', 0.5)
+test('a', '', 0.0)
+test('', 'a', 0.0)
+test('', '', 1.0)
+
+
+def test(a, b, r):
+    s = bdiff.reversesim(a, b)
+    print 'reversesim:', a, b, 'assert', s, '=', r, s==r
+
+test('a', 'a', 1)
+test('aa', 'a', 1)
+test('a', 'aa', 1)
+test('a', '', 0)
+test('', 'a', 0)
+test('', '', 0)
+
 print "done"
diff -r f29b674cc221 -r d5cab1d3339f tests/test-bdiff.out
--- a/tests/test-bdiff.out	Wed Sep 17 11:34:37 2008 +0200
+++ b/tests/test-bdiff.out	Sun Sep 28 00:46:48 2008 +0200
@@ -17,4 +17,32 @@
 *** 'abc' 'abc'
 *** 'a\n' 'a\n'
 *** 'a\nb' 'a\nb'
+maxscorehist: ? ? assert 1.0 = 1.0 True
+ equal: True
+maxscorehist: a a assert 1.0 = 1.0 True
+ equal: True
+maxscorehist: aa ab assert 0.5 = 0.5 True
+maxscorehist: aa ba assert 0.5 = 0.5 True
+maxscorehist: aa bbba assert 0.25 = 0.25 True
+maxscorehist: a b assert 0.0 = 0.0 True
+maxscorehist: a  assert 0.0 = 0.0 True
+maxscorehist:  a assert 0.0 = 0.0 True
+maxscorehist:   assert 1.0 = 1.0 True
+ equal: True
+strsim: a a assert 1.0 = 1.0 True
+strsim: a b assert 0.0 = 0.0 True
+strsim: a aa assert 0.5 = 0.5 True
+strsim: aa a assert 0.5 = 0.5 True
+strsim: aaab aaaa assert 0.75 = 0.75 True
+strsim: aaaa aabb assert 0.5 = 0.5 True
+strsim: aabb aaaa assert 0.5 = 0.5 True
+strsim: a  assert 0.0 = 0.0 True
+strsim:  a assert 0.0 = 0.0 True
+strsim:   assert 1.0 = 1.0 True
+reversesim: a a assert 1 = 1 True
+reversesim: aa a assert 1 = 1 True
+reversesim: a aa assert 1 = 1 True
+reversesim: a  assert 0 = 0 True
+reversesim:  a assert 0 = 0 True
+reversesim:   assert 0 = 0 True
 done