[PATCH] auto rename: best matches and speed improvements UPDATE1

Wed Oct 1 07:50:04 CDT 2008

# HG changeset patch
# User Herbert Griebel <herbertg at gmx.at>
# Date 1222862946 -7200
# Node ID 7b673a95a55d0e32df824830a8419d4c1a220b9a
# Parent  f29b674cc2210126c2899d94d882c367a8ea64bc
auto rename: best matches and speed improvements

The auto rename feature of addremove did not rename to the best match
and was very slow. The new implementation fixes this using all matches
and using statistics of the file content to speed up the matching.

To minimize file loading, file sizes and byte-histograms are used
before actually loading and comparing two files. If the current
best match for a file cannot be exceeded, based on file sizes and
histograms, the compare is deferred, i.e., done later when needed.

File names are also matched for a group of removed files that have
the same score to a group of added files. This allows to find best
matches when moving identical files.

diff -r f29b674cc221 -r 7b673a95a55d mercurial/addremove.py

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mercurial/addremove.py	Wed Oct 01 14:09:06 2008 +0200
@@ -0,0 +1,356 @@
+# addremove.py - command addremove
+#
+# Copyright 2008 Herbert Griebel <herbertg at gmx.at>
+#
+# This software may be used and distributed according to the terms
+# of the GNU General Public License, incorporated herein by reference.
+
+from i18n import _
+import os
+import mdiff, bdiff, util
+import copy
+
+def findrenames(repo, added=None, removed=None, threshold=0.5):
+    '''find renamed files -- yields (before, after, score) tuples'''
+    if added is None or removed is None:
+        added, removed = repo.status()[1:3]
+    if not added or not removed:
+        return
+    ctx = repo['.']
+
+    # cash file attributes
+    nr = len(removed)
+    na = len(added)
+    rfiles = [()] * nr
+    afiles = [()] * na
+    for i, r in enumerate(removed):
+        rfiles[i] = (ctx.filectx(r).size(),
+                     os.path.split(r)[1], os.path.splitext(r)[1], i)
+    for i, a in enumerate(added):
+        afiles[i] = (repo.wfilesize(a),
+                     os.path.split(a)[1], os.path.splitext(a)[1], i)
+
+    # check loop order
+    if na == 1:
+        addedfirst = True
+    elif nr == 1:
+        addedfirst = False
+    else:
+        # estimate times to decide loops
+        Sr = sum([f[0] for f in rfiles])
+        Sa = sum([f[0] for f in afiles])
+        dr = Sr*1.60882e-008 + nr*0.00145002
+        da = Sa*9.03138e-009 + na*0.00110696
+        d1 = da + na*dr
+        d2 = dr + nr*da
+        addedfirst = d1 <= d2
+
+    # init
+    if addedfirst:
+        filelist1, filelist2 = afiles, copy.copy(rfiles)
+    else:
+        filelist1, filelist2 = rfiles, copy.copy(afiles)
+    minscore = threshold
+    rbased = nr < na
+    bestmatches = [[0, 0, True, -1, -1, False]] * min([nr, na])
+    rmatches = [[] for i in xrange(nr)]
+    amatches = [[] for i in xrange(na)]
+    rbestscores = [0] * nr
+    abestscores = [0] * na
+    rstat = [''] * nr
+    astat = [''] * na
+    # named constants
+    iscompared = True # needed for max function: True > False
+    notcompared = False
+
+    def _getscore(rr, aa, alines):
+        '''bdiff.blocks() returns blocks of matching lines
+           count the number of bytes in each'''
+        equal = 0
+        matches = bdiff.blocks(aa, rr)
+        for x1,x2,y1,y2 in matches:
+            for line in alines[x1:x2]:
+                equal += len(line)
+        score = float(equal)  / max([len(aa), len(rr)])
+        return score
+
+    def _addmatch(score, compared):
+        '''enters match into the 3 lists'''
+        namesim = bdiff.reversesim(removed[ir], added[ia])
+        m = [score, namesim, compared, ir, ia, False]
+        rmatches[ir].append(m)
+        amatches[ia].append(m)
+        if rbased:
+            if bestmatches[ir] < m:
+                bestmatches[ir][5] = False
+                bestmatches[ir] = m
+                m[5] = True
+        else:
+            if bestmatches[ia] < m:
+                bestmatches[ia][5] = False
+                bestmatches[ia] = m
+                m[5] = True
+        if rbestscores[ir] < score:
+            rbestscores[ir] = score
+        if abestscores[ia] < score:
+            abestscores[ia] = score
+
+    # loop 1
+    for size1, name1, ext1, ifile1 in filelist1:
+        if addedfirst:
+            ia = ifile1
+        else:
+            ir = ifile1
+        loadfile1 = True
+
+        # check similar file names first (moves)
+        nmoves = 0
+        for i in xrange(len(filelist2)):
+            if name1 == filelist2[i][1]:
+                if i != nmoves:
+                    # swap files
+                    filelist2[nmoves], filelist2[i] = \
+                            filelist2[i], filelist2[nmoves]
+                nmoves += 1
+
+        # loop 2
+        for size2, name2, ext2, ifile2 in filelist2:
+            if addedfirst:
+                ir = ifile2
+            else:
+                ia = ifile2
+
+            # special case: empty files
+            if not size1 or not size2:
+                score = 1 if size1 == size2 else 0
+                if score >= minscore:
+                    _addmatch(score, iscompared)
+                continue
+
+            # maximum score that could be reached because of sizes
+            maxscoresize = \
+                float(size2)/size1 if size1>size2 else float(size1)/size2
+            if maxscoresize < minscore:
+                continue
+
+            # maximum score that could be reached because of statistics
+            maxscore = maxscoresize
+            if rstat[ir] and astat[ia]:
+                # histogram delta
+                maxscorehist = bdiff.maxscorehist(rstat[ir], astat[ia])
+                if maxscorehist < minscore:
+                    continue
+                # defer compares, take lowest upper bound
+                if maxscorehist < maxscore:
+                    maxscore = maxscorehist
+
+            # deferred compares
+            if ifile1 and (maxscore < rbestscores[ir] or \
+                        maxscore < abestscores[ia]):
+                _addmatch(maxscore, notcompared)
+                continue
+
+            # load file 1
+            if loadfile1:
+                if addedfirst:
+                    aa = repo.wread(added[ia])
+                    alines = mdiff.splitnewlines(aa)
+                    astat[ia] = bdiff.statistics(aa)
+                else:
+                    rr = ctx.filectx(removed[ir]).data()
+                    rstat[ir] = bdiff.statistics(rr)
+                loadfile1 = False
+
+            # load file 2
+            if addedfirst:
+                rr = ctx.filectx(removed[ir]).data()
+                if not rstat[ir]:
+                    rstat[ir] = bdiff.statistics(rr)
+            else:
+                aa = repo.wread(added[ia])
+                alines = mdiff.splitnewlines(aa)
+                if not astat[ia]:
+                    astat[ia] = bdiff.statistics(aa)
+
+            # score it
+            score = _getscore(rr, aa, alines)
+            if score >= minscore:
+                _addmatch(score, iscompared)
+
+    def _deferredcompare(m):
+        '''function returns 0 if maxscore was not reached
+        and updates bestmatches'''
+        maxscore, namesim, compared, ir, ia, isbest = m
+        if compared or not maxscore:
+            return maxscore
+        rr = aa = ''
+        if not rstat[ir]:
+            rr = ctx.filectx(removed[ir]).data()
+            rstat[ir] = bdiff.statistics(rr)
+        if not astat[ia]:
+            aa = repo.wread(added[ia])
+            astat[ia] = bdiff.statistics(aa)
+        # check histogram diff before comparing
+        maxscorehist = bdiff.maxscorehist(rstat[ir], astat[ia])
+        if maxscorehist < minscore:
+            maxscorehist = 0
+        if maxscore > maxscorehist:
+            m[0] = maxscorehist
+            maxscore = 0 # do not compare, we need new max
+        if maxscore: # actually compare the files
+            if not rr:
+                rr = ctx.filectx(removed[ir]).data()
+            if not aa:
+                aa = repo.wread(added[ia])
+            alines = mdiff.splitnewlines(aa)
+            score = _getscore(rr, aa, alines)
+            m[2] = iscompared # not deferred anymore
+            if score == maxscore:
+                return maxscore
+            m[0] = 0 if score < minscore else score
+        if isbest: # get best score
+            m[5] = False # remove from best
+            if rbased:
+                l = rmatches[ir]
+                bm = max(l) if l else 0
+                bestmatches[ir] = bm
+            else:
+                l = amatches[ia]
+                bm = max(l) if l else 0
+                bestmatches[ia] = bm
+            if bm:
+                bm[5] = True # flag as best
+        return 0
+
+    def _namematches(scorematches):
+        '''Find best matches for a group of removed and added files
+        Matching score (high to low): file extension equal
+                                      file name similarity (levenshtein)
+                                      full pathname similarity (levenshtein)
+                                      common substring length from right
+        Hence we assume that a file extension change is least likely.'''
+        if len(scorematches) == 1:
+            yield scorematches[0][0]
+            return
+        # "name-score" all matches
+        best = [0] * len(scorematches)
+        for i,sm in enumerate(scorematches):
+            score, namesim, compared, ir, ia, isbest = sm[0]
+            r = removed[ir]
+            a = added[ia]
+            rname = rfiles[ir][1]
+            aname = afiles[ia][1]
+            rext = rfiles[ir][2]
+            aext = afiles[ia][2]
+            score1 = rext == aext
+            score2 = bdiff.strsim(rname, aname)
+            score3 = bdiff.strsim(r, a)
+            score4 = bdiff.reversesim(r, a)
+            ndepends = -len(sm[1]) + (0 if compared else -1)
+            best[i] = (score1, score2, score3, score4, ndepends, sm)
+        # output best name matches
+        best.sort(reverse = True)
+        for score1, score2, score3, score4, n, sm in best:
+            m, depends = sm
+            score, namesim, compared, ir, ia, isbest = m
+            # depends on other deferred?
+            if depends:
+                dismissed = False
+                for d in depends:
+                    if _deferredcompare(d) != score:
+                        dismissed = True
+                        break
+                if dismissed:
+                    continue
+            # check if deferred
+            if not compared:
+                if _deferredcompare(m) != score:
+                    continue
+            yield m
+
+    # assign best matches
+    while True:
+        bm = max(bestmatches) # get best match
+        if not bm:
+            break
+        score, namesim, compared, ir, ia, isbest = bm
+        if not score:
+            break
+        if not compared:
+            if _deferredcompare(bm) != score:
+                continue
+
+        # Find all files that have the same similarity to the two files.
+        # Examples when name matching is necessary:
+        #
+        # a -> x    score       basic case, a has the same score to x and y
+        # a -> y    score
+        #
+        # a -> x    score       basic case, x has the same score to a and b
+        # b -> x    score
+        #
+        # a -> x    score       general case, finding all matches must
+        # a -> y    score       be done iteratively; if the compare is
+        # b -> y    score       deferred, a dependency list is used,
+        # b -> z    score       because the score is only an upper bound
+        # ...
+        #
+        # a -> x    score       no need to follow "100% links"
+        # b -> x    1           because link a -> y exists
+        # b -> y    1
+        # a -> y    score
+        rset, aset = [ir], [ia]
+        sm = [bm, []]
+        rsetnew, asetnew = [sm], [sm]
+        scorematches = [sm]
+        while rsetnew:
+            # get all added files that match rsetnew files
+            for newm, depends in rsetnew:
+                if newm[2] == notcompared:
+                    depends = copy.copy(depends)
+                    depends.append(newm)
+                l = [[m, depends] for m in rmatches[newm[3]] \
+                        if m[0]==score and m[4] not in aset]
+                scorematches += l
+                asetnew += l
+            rset += [m[3] for m, d in rsetnew]
+            rsetnew = []
+            # get all removed files that match asetnew files
+            for newm, depends in asetnew:
+                if newm[2] == notcompared:
+                    depends = copy.copy(depends)
+                    depends.append(newm)
+                l = [[m, depends] for m in amatches[newm[4]] \
+                        if m[0]==score and m[3] not in rset]
+                scorematches += l
+                rsetnew += l
+            aset += [m[4] for m, d in asetnew]
+            asetnew = []
+
+        # find best name matches
+        for bm in _namematches(scorematches):
+            score, namesim, compared, ir, ia, isbest = bm
+            if not score:
+                continue
+            yield removed[ir], added[ia], score
+            # invalidate ir and ia
+            for m in rmatches[ir]:
+                m[0] = 0
+            for m in amatches[ia]:
+                m[0] = 0
+            rmatches[ir] = []
+            amatches[ia] = []
+            # get next best match for invalidated
+            for m in bestmatches:
+                if m and not m[0]:
+                    score, namesim, compared, ir2, ia2, isbest = m
+                    if rbased:
+                        l = rmatches[ir2]
+                        m = max(l) if l else 0
+                        bestmatches[ir2] = m
+                    else:
+                        l = amatches[ia2]
+                        m = max(l) if l else 0
+                        bestmatches[ia2] = m
+                    if m:
+                        m[5] = True # flag as best
diff -r f29b674cc221 -r 7b673a95a55d mercurial/bdiff.c
--- a/mercurial/bdiff.c	Wed Sep 17 11:34:37 2008 +0200
+++ b/mercurial/bdiff.c	Wed Oct 01 14:09:06 2008 +0200
@@ -2,6 +2,7 @@
  bdiff.c - efficient binary diff extension for Mercurial

  Copyright 2005, 2006 Matt Mackall <mpm at selenic.com>
+ Copyright 2008 Herbert Griebel <herbertg at gmx.at>

  This software may be used and distributed according to the terms of
  the GNU General Public License, incorporated herein by reference.
@@ -356,11 +357,153 @@
 	return result ? result : PyErr_NoMemory();
 }

-static char mdiff_doc[] = "Efficient binary diff.";
+
+typedef struct {
+	unsigned int hist[256];
+	unsigned int filelen;
+} STATDATA;
+
+static PyObject* statistics(PyObject *self, PyObject *args)
+{
+	unsigned char *s;
+	int filelen, i;
+	PyObject *dataobj;
+	STATDATA *stat;
+	unsigned int *hist;
+	if (! PyArg_ParseTuple(args, "s#:statistics", &s, &filelen))
+		return NULL;
+	dataobj = PyString_FromStringAndSize(NULL, sizeof(STATDATA));
+	stat = (STATDATA*) PyString_AS_STRING(dataobj);
+	hist = stat->hist;
+	stat->filelen = filelen;
+
+	/*  build histo */
+	for (i=0; i<256; i++)
+		hist[i] = 0;
+	for (i=0; i<filelen; i++)
+		hist[s[i]] += 1;
+
+	return dataobj;
+}
+
+static PyObject* maxscorehist(PyObject *self, PyObject *args)
+{
+	int len1, len2, i;
+	STATDATA *stat1, *stat2;
+	unsigned int maxlen;
+	double maxscore;
+
+	if (! PyArg_ParseTuple(args, "s#s#:maxscorehist",
+			&stat1, &len1, &stat2, &len2))
+		return NULL;
+	if (len1 != sizeof(STATDATA) || len2 != sizeof(STATDATA)) {
+		PyErr_SetString(PyExc_ValueError, "length of data invalid");
+		return NULL;
+	}
+
+	maxlen = stat1->filelen > stat2->filelen ? stat1->filelen : stat2->filelen;
+	if (maxlen == 0)
+		maxscore = 1.;
+	else if (stat1->filelen == 0 || stat2->filelen == 0)
+		maxscore = 0.;
+	else {
+		unsigned int *h1 = stat1->hist;
+		unsigned int *h2 = stat2->hist;
+		unsigned int delta = stat1->filelen > stat2->filelen ?
+			stat1->filelen - stat2->filelen : stat2->filelen - stat1->filelen;
+		for (i=0; i<256; i++)
+			delta += (h1[i] > h2[i]) ? (h1[i]-h2[i]) : (h2[i]-h1[i]);
+		maxscore = 1. - delta / (double)(2 * maxlen);
+	}
+
+	return PyFloat_FromDouble(maxscore);
+}
+
+int levenshtein_distance(char* s, int n, char* t, int m)
+{
+	int k, i, j, cost, dmin, *d, d1, d2, d3, p;
+	if (n == 0)
+		return m;
+	if (m == 0)
+		return n;
+
+	m++;
+	n++;
+	d = (int*) malloc(m * n * sizeof(int));
+	for (k=0; k<n; k++)
+		d[k] = k;
+	for (k=0; k<m; k++)
+		d[k*n] = k;
+
+	for (i=1; i<n; i++) {
+		for (j=1; j<m; j++) {
+			p = j*n + i;
+			cost = (s[i-1] == t[j-1]) ? 0 : 1;
+			d3 = d[p-n-1] + cost;
+			d2 = d[p-1] + 1;
+			d1 = d[p-n] + 1;
+			if (d1 > d2)
+				d1 = d2;
+			if (d1 > d3)
+				d1 = d3;
+			d[p] = d1;
+		}
+	}
+	dmin = d[n*m-1];
+	free(d);
+
+	return dmin;
+}
+
+static PyObject* strsim(PyObject *self, PyObject *args)
+{
+	int n, m, d;
+	char *s, *t;
+	double sim;
+	if (! PyArg_ParseTuple(args, "s#s#:strsim", &s, &n, &t, &m))
+		return NULL;
+	d = levenshtein_distance(s, n, t, m);
+	if (n==0 && m==0)
+		sim = 1.0;
+	else if (n > m)
+		sim = (n-d) / (double)n;
+	else
+		sim = (m-d) / (double)m;
+	return PyFloat_FromDouble(sim);
+}
+
+static PyObject* reversesim(PyObject *self, PyObject *args)
+{
+	int n1, n2;
+	char *s1, *s2;
+	if (! PyArg_ParseTuple(args, "s#s#:reversesim", &s1, &n1, &s2, &n2))
+		return NULL;
+	if (n1 > n2) {
+		s1 += n1 - n2;
+		n1 = n2;
+	} else {
+		s2 += n2 - n1;
+		n2 = n1;
+	}
+	while (n1 > 0) {
+		n1--;
+		if (s1[n1] != s2[n1])
+			break;
+	}
+	return PyLong_FromLong(n2 - n1);
+}
+
+
+
+static char mdiff_doc[] = "Efficient binary diff and string similarity.";

 static PyMethodDef methods[] = {
 	{"bdiff", bdiff, METH_VARARGS, "calculate a binary diff\n"},
 	{"blocks", blocks, METH_VARARGS, "find a list of matching lines\n"},
+	{"statistics", statistics, METH_VARARGS, "get byte statistics\n"},
+	{"maxscorehist", maxscorehist, METH_VARARGS, "calculate maximum score\n"},
+	{"strsim", strsim, METH_VARARGS, "get similarity measure of strings\n"},
+	{"reversesim", reversesim, METH_VARARGS, "equal characters from right\n"},
 	{NULL, NULL}
 };

@@ -368,4 +511,3 @@
 {
 	Py_InitModule3("bdiff", methods, mdiff_doc);
 }
-
diff -r f29b674cc221 -r 7b673a95a55d mercurial/cmdutil.py
--- a/mercurial/cmdutil.py	Wed Sep 17 11:34:37 2008 +0200
+++ b/mercurial/cmdutil.py	Wed Oct 01 14:09:06 2008 +0200
@@ -10,6 +10,7 @@
 import os, sys, bisect, stat
 import mdiff, bdiff, util, templater, templatefilters, patch, errno
 import match as _match
+import addremove as _addremove

 revrangesep = ':'

@@ -241,34 +242,6 @@
 def matchfiles(repo, files):
     return _match.exact(repo.root, repo.getcwd(), files)

-def findrenames(repo, added=None, removed=None, threshold=0.5):
-    '''find renamed files -- yields (before, after, score) tuples'''
-    if added is None or removed is None:
-        added, removed = repo.status()[1:3]
-    ctx = repo['.']
-    for a in added:
-        aa = repo.wread(a)
-        bestname, bestscore = None, threshold
-        for r in removed:
-            rr = ctx.filectx(r).data()
-
-            # bdiff.blocks() returns blocks of matching lines
-            # count the number of bytes in each
-            equal = 0
-            alines = mdiff.splitnewlines(aa)
-            matches = bdiff.blocks(aa, rr)
-            for x1,x2,y1,y2 in matches:
-                for line in alines[x1:x2]:
-                    equal += len(line)
-
-            lengths = len(aa) + len(rr)
-            if lengths:
-                myscore = equal*2.0 / lengths
-                if myscore >= bestscore:
-                    bestname, bestscore = r, myscore
-        if bestname:
-            yield bestname, a, bestscore
-
 def addremove(repo, pats=[], opts={}, dry_run=None, similarity=None):
     if dry_run is None:
         dry_run = opts.get('dry_run')
@@ -302,7 +275,8 @@
         repo.remove(remove)
         repo.add(add)
     if similarity > 0:
-        for old, new, score in findrenames(repo, add, remove, similarity):
+        for old, new, score in _addremove.findrenames( \
+                    repo, add, remove, similarity):
             oldrel, oldexact = mapping[old]
             newrel, newexact = mapping[new]
             if repo.ui.verbose or not oldexact or not newexact:
diff -r f29b674cc221 -r 7b673a95a55d mercurial/localrepo.py
--- a/mercurial/localrepo.py	Wed Sep 17 11:34:37 2008 +0200
+++ b/mercurial/localrepo.py	Wed Oct 01 14:09:06 2008 +0200
@@ -528,6 +528,9 @@

     def adddatafilter(self, name, filter):
         self._datafilters[name] = filter
+
+    def wfilesize(self, filename):
+        return os.stat(self.wjoin(filename))[6]

     def wread(self, filename):
         if self._link(filename):
diff -r f29b674cc221 -r 7b673a95a55d tests/test-addremove-similar
--- a/tests/test-addremove-similar	Wed Sep 17 11:34:37 2008 +0200
+++ b/tests/test-addremove-similar	Wed Oct 01 14:09:06 2008 +0200
@@ -1,49 +1,97 @@
 #!/bin/sh

+echo % basic content-matching test
 hg init rep; cd rep

+echo % add 2 files
 touch empty-file
 python -c 'for x in range(10000): print x' > large-file
-
 hg addremove
-
 hg commit -m A

+echo % rename large-file to another-file, remove other
 rm large-file empty-file
 python -c 'for x in range(10,10000): print x' > another-file
-
-hg addremove -s50
-
+hg addremove -s 50
 hg commit -m B

-echo % comparing two empty files caused ZeroDivisionError in the past
-hg update -C 0
-rm empty-file
-touch another-empty-file
-hg addremove -s50
+echo % rename another-file to file1, add file2
+rm another-file
+python -c 'for x in range(20,10000): print x' > file1
+python -c 'for x in range(30,10000): print x' > file2
+hg addremove -s 50
+hg commit -m C

-cd ..
-
-hg init rep2; cd rep2
-
-python -c 'for x in range(10000): print x' > large-file
-python -c 'for x in range(50): print x' > tiny-file
-
-hg addremove
-
-hg commit -m A
-
-python -c 'for x in range(70): print x' > small-file
-rm tiny-file
-rm large-file
-
-hg addremove -s50
-
-hg commit -m B
-
-echo % should all fail
+echo % option-value test, should all fail
 hg addremove -s foo
 hg addremove -s -1
 hg addremove -s 1e6

+echo % content-matching test
+cd ..
+hg init rep2; cd rep2
+
+echo % add 2 files
+python -c 'for x in range(10000): print x' > large.file
+python -c 'for x in range(50): print x' > tiny.file
+hg addremove -s 50
+hg commit -m A
+
+echo % rename tiny.file to x/small.file
+python -c 'for x in range(70): print x' > small.file
+rm tiny.file
+rm large.file
+mkdir x
+mv *.file x
+hg addremove -s 50
+hg commit -m B
+
+echo % name-matching tests
+cd ..
+hg init rep3; cd rep3
+
+echo % add 5 files
+touch a.file
+touch b.file
+touch c.file
+mkdir dir1
+mkdir dir2
+touch dir1/z.file
+touch dir2/z.file
+hg addremove -s50
+hg commit -m T1
+
+echo % move all to dir0, add z.file
+mkdir dir0
+mv *.file dir0
+mv dir1 dir0
+mv dir2 dir0
+touch z.file
+hg addremove -s50
+hg commit -m T2
+
+echo % rename */z.file to */o.file
+mv dir0/dir1/z.file dir0/dir1/o.file
+mv dir0/dir2/z.file dir0/dir2/o.file
+hg addremove -s50
+hg commit -m T3
+
+echo % rename dir1 to dirA, dir2 to dirB
+mv dir0/dir1 dir0/dirA
+mv dir0/dir2 dir0/dirB
+hg addremove -s50
+hg commit -m T4
+
+echo % remove dir0 files
+rm dir0/*.file
+hg commit -A -m T5
+
+echo % rename dir0/dirA/o.file to dirX/dirA/x.file
+echo % rename dir0/dirB/o.file to dirX/dirB/x.file
+mv dir0/dirA/o.file dir0/dirA/x.file
+mv dir0/dirB/o.file dir0/dirB/x.file
+mv dir0 dirX
+hg addremove -s50
+hg commit -m T6
+
 true
diff -r f29b674cc221 -r 7b673a95a55d tests/test-addremove-similar.out
--- a/tests/test-addremove-similar.out	Wed Sep 17 11:34:37 2008 +0200
+++ b/tests/test-addremove-similar.out	Wed Oct 01 14:09:06 2008 +0200
@@ -1,20 +1,77 @@
+% basic content-matching test
+% add 2 files
 adding empty-file
 adding large-file
+% rename large-file to another-file, remove other
 adding another-file
 removing empty-file
 removing large-file
 recording removal of large-file as rename to another-file (99% similar)
-% comparing two empty files caused ZeroDivisionError in the past
-2 files updated, 0 files merged, 1 files removed, 0 files unresolved
-adding another-empty-file
-removing empty-file
-adding large-file
-adding tiny-file
-removing large-file
-adding small-file
-removing tiny-file
-recording removal of tiny-file as rename to small-file (82% similar)
-% should all fail
+% rename another-file to file1, add file2
+removing another-file
+adding file1
+adding file2
+recording removal of another-file as rename to file1 (99% similar)
+% option-value test, should all fail
 abort: similarity must be a number
 abort: similarity must be between 0 and 100
 abort: similarity must be between 0 and 100
+% content-matching test
+% add 2 files
+adding large.file
+adding tiny.file
+% rename tiny.file to x/small.file
+removing large.file
+removing tiny.file
+adding x/small.file
+recording removal of tiny.file as rename to x/small.file (70% similar)
+% name-matching tests
+% add 5 files
+adding a.file
+adding b.file
+adding c.file
+adding dir1/z.file
+adding dir2/z.file
+% move all to dir0, add z.file
+removing a.file
+removing b.file
+removing c.file
+adding dir0/a.file
+adding dir0/b.file
+adding dir0/c.file
+adding dir0/dir1/z.file
+adding dir0/dir2/z.file
+removing dir1/z.file
+removing dir2/z.file
+adding z.file
+recording removal of dir2/z.file as rename to dir0/dir2/z.file (100% similar)
+recording removal of dir1/z.file as rename to dir0/dir1/z.file (100% similar)
+recording removal of c.file as rename to dir0/c.file (100% similar)
+recording removal of b.file as rename to dir0/b.file (100% similar)
+recording removal of a.file as rename to dir0/a.file (100% similar)
+% rename */z.file to */o.file
+adding dir0/dir1/o.file
+removing dir0/dir1/z.file
+adding dir0/dir2/o.file
+removing dir0/dir2/z.file
+recording removal of dir0/dir2/z.file as rename to dir0/dir2/o.file (100% similar)
+recording removal of dir0/dir1/z.file as rename to dir0/dir1/o.file (100% similar)
+% rename dir1 to dirA, dir2 to dirB
+removing dir0/dir1/o.file
+removing dir0/dir2/o.file
+adding dir0/dirA/o.file
+adding dir0/dirB/o.file
+recording removal of dir0/dir2/o.file as rename to dir0/dirB/o.file (100% similar)
+recording removal of dir0/dir1/o.file as rename to dir0/dirA/o.file (100% similar)
+% remove dir0 files
+removing dir0/a.file
+removing dir0/b.file
+removing dir0/c.file
+% rename dir0/dirA/o.file to dirX/dirA/x.file
+% rename dir0/dirB/o.file to dirX/dirB/x.file
+removing dir0/dirA/o.file
+removing dir0/dirB/o.file
+adding dirX/dirA/x.file
+adding dirX/dirB/x.file
+recording removal of dir0/dirB/o.file as rename to dirX/dirB/x.file (100% similar)
+recording removal of dir0/dirA/o.file as rename to dirX/dirA/x.file (100% similar)
diff -r f29b674cc221 -r 7b673a95a55d tests/test-bdiff
--- a/tests/test-bdiff	Wed Sep 17 11:34:37 2008 +0200
+++ b/tests/test-bdiff	Wed Oct 01 14:09:06 2008 +0200
@@ -39,4 +39,57 @@
 test("a\n", "a\n")
 test("a\nb", "a\nb")

+
+def test(a, b, r, show=True):
+    s1 = bdiff.statistics(a)
+    s2 = bdiff.statistics(b)
+    s = bdiff.maxscorehist(s1, s2)
+    if not show:
+        a = b = '?'
+    print 'maxscorehist:', a, b, 'assert', s, '=', r, s==r
+    if r == 1.0:
+        print ' equal:', s1 == s2
+
+str1 = str2 = ''
+for i in xrange(256):
+    str1 += chr(i)
+    str2 += chr(255 - i)
+test(str1, str2, 1.0, show=False)
+test('a', 'a', 1.0)
+test('aa', 'ab', 0.5)
+test('aa', 'ba', 0.5)
+test('aa', 'bbba', 0.25)
+test('a', 'b', 0.0)
+test('a', '', 0.0)
+test('', 'a', 0.0)
+test('', '', 1.0)
+
+
+def test(a, b, r):
+    s = bdiff.strsim(a, b)
+    print 'strsim:', a, b, 'assert', s, '=', r, s==r
+
+test('a', 'a', 1.0)
+test('a', 'b', 0.0)
+test('a', 'aa', 0.5)
+test('aa', 'a', 0.5)
+test('aaab', 'aaaa', 0.75)
+test('aaaa', 'aabb', 0.5)
+test('aabb', 'aaaa', 0.5)
+test('a', '', 0.0)
+test('', 'a', 0.0)
+test('', '', 1.0)
+
+
+def test(a, b, r):
+    s = bdiff.reversesim(a, b)
+    print 'reversesim:', a, b, 'assert', s, '=', r, s==r
+
+test('a', 'a', 1)
+test('aa', 'a', 1)
+test('a', 'aa', 1)
+test('a', '', 0)
+test('', 'a', 0)
+test('', '', 0)
+
 print "done"
diff -r f29b674cc221 -r 7b673a95a55d tests/test-bdiff.out
--- a/tests/test-bdiff.out	Wed Sep 17 11:34:37 2008 +0200
+++ b/tests/test-bdiff.out	Wed Oct 01 14:09:06 2008 +0200
@@ -17,4 +17,32 @@
 *** 'abc' 'abc'
 *** 'a\n' 'a\n'
 *** 'a\nb' 'a\nb'
+maxscorehist: ? ? assert 1.0 = 1.0 True
+ equal: True
+maxscorehist: a a assert 1.0 = 1.0 True
+ equal: True
+maxscorehist: aa ab assert 0.5 = 0.5 True
+maxscorehist: aa ba assert 0.5 = 0.5 True
+maxscorehist: aa bbba assert 0.25 = 0.25 True
+maxscorehist: a b assert 0.0 = 0.0 True
+maxscorehist: a  assert 0.0 = 0.0 True
+maxscorehist:  a assert 0.0 = 0.0 True
+maxscorehist:   assert 1.0 = 1.0 True
+ equal: True
+strsim: a a assert 1.0 = 1.0 True
+strsim: a b assert 0.0 = 0.0 True
+strsim: a aa assert 0.5 = 0.5 True
+strsim: aa a assert 0.5 = 0.5 True
+strsim: aaab aaaa assert 0.75 = 0.75 True
+strsim: aaaa aabb assert 0.5 = 0.5 True
+strsim: aabb aaaa assert 0.5 = 0.5 True
+strsim: a  assert 0.0 = 0.0 True
+strsim:  a assert 0.0 = 0.0 True
+strsim:   assert 1.0 = 1.0 True
+reversesim: a a assert 1 = 1 True
+reversesim: aa a assert 1 = 1 True
+reversesim: a aa assert 1 = 1 True
+reversesim: a  assert 0 = 0 True
+reversesim:  a assert 0 = 0 True
+reversesim:   assert 0 = 0 True
 done