[PATCH] auto rename: best matches and speed improvement

Herbert Griebel herbertg at gmx.at
Mon Sep 15 16:15:35 UTC 2008


# HG changeset patch
# User Herbert Griebel <herbertg at gmx.at>
# Date 1221494924 -7200
# Node ID 3a69568152a76c82bcc20a7a18ecba2cc76e7cff
# Parent  0d513661d6c280fae68a96aea16231702959bfb6
auto rename: best matches and speed improvements

The auto rename feature of addremove did not rename to the best match
and was very slow. The new implementation fixes this using all matches
and using statistics of the file content to speed up the matching.

To minimize file loading, statistics are used before actually loading and
comparing two files. The statistics are the file size and the histogram
of the bytes, if the file has been loaded at least once. If the current
best match for a file cannot be exceeded, based on files sizes and
histogram, the compare is deferred when needed.

File names are also matched for a group of removed files that have
the same score to a group of added files. This allows to find best
matches when moving identical files.

diff -r 0d513661d6c2 -r 3a69568152a7 mercurial/bdiff.c
--- a/mercurial/bdiff.c	Sat Sep 13 10:46:47 2008 -0500
+++ b/mercurial/bdiff.c	Mon Sep 15 18:08:44 2008 +0200
@@ -356,11 +356,126 @@
 	return result ? result : PyErr_NoMemory();
 }

-static char mdiff_doc[] = "Efficient binary diff.";
+
+typedef struct {
+	unsigned int hist[256];
+	unsigned int filelen;
+} STATDATA;
+
+static PyObject* statistics(PyObject *self, PyObject *args)
+{
+	unsigned char *s;
+	int filelen, i;
+	PyObject *dataobj;
+	STATDATA *stat;
+	unsigned int *hist;
+	if (! PyArg_ParseTuple(args, "s#:statistics", &s, &filelen))
+		return NULL;
+	dataobj = PyString_FromStringAndSize(NULL, sizeof(STATDATA));
+	stat = (STATDATA*) PyString_AS_STRING(dataobj);
+	hist = stat->hist;
+	stat->filelen = filelen;
+
+	/*  build histo */
+	for (i=0; i<256; i++)
+		hist[i] = 0;
+	for (i=0; i<filelen; i++)
+		hist[s[i]] += 1;
+
+	return dataobj;
+}
+
+static PyObject* maxscorehist(PyObject *self, PyObject *args)
+{
+	int len1, len2, i, maxlen;
+	STATDATA *stat1, *stat2;
+	unsigned int *h1, *h2;
+	unsigned int delta = 0;
+	double maxscore;
+	if (! PyArg_ParseTuple(args, "s#s#:maxscorehist",
+								&stat1, &len1, &stat2, &len2))
+		return NULL;
+	if (len1 != sizeof(STATDATA) || len2 != sizeof(STATDATA)) {
+		PyErr_SetString(PyExc_ValueError, "length of data invalid");
+		return NULL;
+	}
+	h1 = stat1->hist;
+	h2 = stat2->hist;
+
+	for (i=0; i<256; i++)
+		delta += h1[i] > h2[i] ? h1[i]-h2[i] : h2[i]-h1[i];
+	maxlen = stat1->filelen > stat2->filelen ? stat1->filelen : stat2->filelen;
+	if (maxlen == 0)
+		maxscore = 1.;
+	else if (stat1->filelen == 0 || stat2->filelen == 0)
+		maxscore = 0.;
+	else
+		maxscore = 1. - delta / (double)(2 * maxlen);
+	return PyFloat_FromDouble(maxscore);
+}
+
+int levenshtein_distance(char* s, int n, char* t, int m)
+{
+	int k, i, j, cost, dmin, *d, d1, d2, d3;
+	if (n == 0)
+		return m;
+	if (m == 0)
+		return n;
+
+	m++;
+	n++;
+	d = (int*) malloc(m * n * sizeof(int));
+	for (k=0; k<n; k++)
+		d[k] = k;
+	for (k=0; k<m; k++)
+		d[k*n] = k;
+
+	for (i=1; i<n; i++) {
+		for (j=1; j<m; j++) {
+			cost = (s[i-1] == t[j-1]) ? 0 : 1;
+			d1 = d[(j-1)*n + i] + 1;
+			d2 = d[j*n + i-1] + 1;
+			d3 = d[(j-1)*n + i-1] + cost;
+			if (d1 > d2)
+				d1 = d2;
+			if (d1 > d3)
+				d1 = d3;
+			d[j*n+i] = d1;
+		}
+	}
+	dmin = d[n*m-1];
+	free(d);
+
+	return dmin;
+}
+
+static PyObject* strsim(PyObject *self, PyObject *args)
+{
+	int n, m, d;
+	char *s, *t;
+	double sim;
+	if (! PyArg_ParseTuple(args, "s#s#:strsim", &s, &n, &t, &m))
+		return NULL;
+	d = levenshtein_distance(s, n, t, m);
+	if (n==0 && m==0)
+		sim = 1.0;
+	else if (n > m)
+		sim = (n-d) / (double)n;
+	else
+		sim = (m-d) / (double)m;
+	return PyFloat_FromDouble(sim);
+}
+
+
+
+static char mdiff_doc[] = "Efficient binary diff and string similarity.";

 static PyMethodDef methods[] = {
 	{"bdiff", bdiff, METH_VARARGS, "calculate a binary diff\n"},
 	{"blocks", blocks, METH_VARARGS, "find a list of matching lines\n"},
+	{"statistics", statistics, METH_VARARGS, "get byte statistics\n"},
+	{"maxscorehist", maxscorehist, METH_VARARGS, "calculate maximum score\n"},
+	{"strsim", strsim, METH_VARARGS, "get similarity measure of strings\n"},
 	{NULL, NULL}
 };

@@ -368,4 +483,3 @@
 {
 	Py_InitModule3("bdiff", methods, mdiff_doc);
 }
-
diff -r 0d513661d6c2 -r 3a69568152a7 mercurial/cmdutil.py
--- a/mercurial/cmdutil.py	Sat Sep 13 10:46:47 2008 -0500
+++ b/mercurial/cmdutil.py	Mon Sep 15 18:08:44 2008 +0200
@@ -241,33 +241,319 @@
 def matchfiles(repo, files):
     return _match.exact(repo.root, repo.getcwd(), files)

-def findrenames(repo, added=None, removed=None, threshold=0.5):
+def reversesim(s1, s2):
+    imax = min([len(s1), len(s2)])
+    i = 1
+    while i <= imax and s1[-i] == s2[-i]:
+        i += 1
+    return i - 1
+
+def findrenames(repo, added=None, removed=None, similarity=50):
     '''find renamed files -- yields (before, after, score) tuples'''
     if added is None or removed is None:
         added, removed = repo.status()[1:3]
+    if not added or not removed:
+        return
+    na = len(added)
+    nr = len(removed)
     ctx = repo['.']
-    for a in added:
-        aa = repo.wread(a)
-        bestname, bestscore = None, threshold
-        for r in removed:
-            rr = ctx.filectx(r).data()

-            # bdiff.blocks() returns blocks of matching lines
-            # count the number of bytes in each
-            equal = 0
-            alines = mdiff.splitnewlines(aa)
-            matches = bdiff.blocks(aa, rr)
-            for x1,x2,y1,y2 in matches:
-                for line in alines[x1:x2]:
-                    equal += len(line)
+    # cash file attributes
+    rfiles = [()] * nr
+    afiles = [()] * na
+    for i, r in enumerate(removed):
+        rfiles[i] = (ctx.filectx(r).size(),
+                     os.path.split(r)[1],
+                     os.path.splitext(r)[1],
+                     i)
+    for i, a in enumerate(added):
+        afiles[i] = (repo.wfilesize(a),
+                     os.path.split(a)[1],
+                     os.path.splitext(a)[1],
+                     i)

-            lengths = len(aa) + len(rr)
-            if lengths:
-                myscore = equal*2.0 / lengths
-                if myscore >= bestscore:
-                    bestname, bestscore = r, myscore
-        if bestname:
-            yield bestname, a, bestscore
+    # check loop order
+    if na == 1:
+        addedfirst = True
+    elif nr == 1:
+        addedfirst = False
+    else:
+        # estimate times to decide loops
+        Sr = sum([f[0] for f in rfiles])
+        Sa = sum([f[0] for f in afiles])
+        dr = Sr*1.60882e-008 + nr*0.00145002
+        da = Sa*9.03138e-009 + na*0.00110696
+        d1 = da + dr*na
+        d2 = dr + da*nr
+        addedfirst = d1 <= d2
+
+    # # This code is only for getting the estimator coefficients
+    # # and shoult never run in a released version
+    #
+    # # Assumption:
+    # # duration = numBytesTotal * c1 + numFiles * c2
+    #
+    # def _measure(f1, f2):
+    #     s1a, s1r = 0, 0
+    #     added1 = added[int(na*f1):int(na*f2)]
+    #     removed1 = removed[int(nr*f1):int(nr*f2)]
+    #     n1a, n1r = len(added1), len(removed1)
+    #     import time
+    #     t0 = time.clock()
+    #     for a in added1:
+    #         s1a += len(repo.wread(a))
+    #     t1 = time.clock()
+    #     for r in removed1:
+    #         s1r += len(ctx.filectx(r).data())
+    #     t2 = time.clock()
+    #     d1a, d1r = t1 - t0, t2 - t1
+    #     return s1r, s1a, n1r, n1a, d1r, d1a
+    #
+    # def _solve2d(a1,a2,b1,b2,e1,e2):
+    #     """a1*x + b1*y = e1
+    #        a2*x + b2*y = e2"""
+    #     def det(a1,a2,b1,b2):
+    #         return a1*b2 - b1*a2
+    #     d = det(a1,a2,b1,b2)
+    #     x = det(e1,e2,b1,b2) / d
+    #     y = det(a1,a2,e1,e2) / d
+    #     return x, y
+    #
+    # s1r, s1a, n1r, n1a, d1r, d1a = _measure(0, .33)
+    # s2r, s2a, n2r, n2a, d2r, d2a = _measure(.33, 1)
+    # xc1r, xc2r = _solve2d(s1r, s2r, n1r, n2r, d1r, d2r)
+    # xc1a, xc2a = _solve2d(s1a, s2a, n1a, n2a, d1a, d2a)
+    # print 'dr = Sr*%g + nr*%g' % (xc1r, xc2r)
+    # print 'da = Sa*%g + na*%g' % (xc1a, xc2a)
+
+    if addedfirst:
+        filelist1, filelist2 = afiles, rfiles
+    else:
+        filelist1, filelist2 = rfiles, afiles
+
+    # init
+    minscore = similarity / 100
+    rbestscores = [0] * nr
+    abestscores = [0] * na
+    stat1 = [''] * len(filelist1)
+    stat2 = [''] * len(filelist2)
+    bestmatches = []
+    notdeferred = 0
+    isdeferred = 1
+    ignoredeferred = 2
+
+    def _getscore(rr, aa, alines):
+        '''bdiff.blocks() returns blocks of matching lines
+           count the number of bytes in each'''
+        equal = 0
+        matches = bdiff.blocks(aa, rr)
+        for x1,x2,y1,y2 in matches:
+            for line in alines[x1:x2]:
+                equal += len(line)
+        score = equal*2.0 / (len(aa) + len(rr))
+        return score
+
+    # loop 1
+    for size1, name1, ext1, ifile1 in filelist1:
+        if addedfirst:
+            ia = ifile1
+        else:
+            ir = ifile1
+        loadfile1 = True
+
+        # loop 2
+        for size2, name2, ext2, ifile2 in filelist2:
+            if addedfirst:
+                ir = ifile2
+            else:
+                ia = ifile2
+
+            # special case: empty files
+            if not size1 or not size2:
+                if size1 == size2:
+                    bestmatches.append([1, ir, ia, notdeferred])
+                    rbestscores[ir] = 1
+                    abestscores[ia] = 1
+                continue
+
+            # maximum score that could be reached because of sizes
+            maxscoresize = \
+                float(size2)/size1 if size1>size2 else float(size1)/size2
+            if maxscoresize < minscore:
+                continue
+
+            # maximum score that could be reached because of statistics
+            maxscore = maxscoresize
+            if stat1[ifile1] and stat2[ifile2]:
+                # histogram delta
+                maxscorehist = bdiff.maxscorehist(stat1[ifile1], stat2[ifile2])
+                if maxscorehist < minscore:
+                    continue
+                # defer compares, take lowest upper bound
+                if maxscorehist < maxscore:
+                    maxscore = maxscorehist
+
+            # deferred compares
+            if maxscore <= rbestscores[ir] or maxscore <= abestscores[ia]:
+                bestmatches.append([maxscore, ir, ia, isdeferred])
+                continue
+
+            # load file 1
+            if loadfile1:
+                if addedfirst:
+                    aa = repo.wread(added[ia])
+                    alines = mdiff.splitnewlines(aa)
+                    stat1[ifile1] = bdiff.statistics(aa)
+                else:
+                    rr = ctx.filectx(removed[ir]).data()
+                    stat1[ifile1] = bdiff.statistics(rr)
+                loadfile1 = False
+
+            # load file 2
+            if addedfirst:
+                rr = ctx.filectx(removed[ir]).data()
+                if not stat2[ifile2]:
+                    stat2[ifile2] = bdiff.statistics(rr)
+            else:
+                aa = repo.wread(added[ia])
+                alines = mdiff.splitnewlines(aa)
+                if not stat2[ifile2]:
+                    stat2[ifile2] = bdiff.statistics(aa)
+
+            # score it
+            score = _getscore(rr, aa, alines)
+            if score >= minscore:
+                bestmatches.append([score, ir, ia, notdeferred])
+                if rbestscores[ir] <= score:
+                    rbestscores[ir] = score
+                    abestscores[ia] = score
+
+    def _deferredcompare(imatch):
+        maxscore, ir, ia, deferred = bestmatches[imatch]
+        bestmatches[imatch][3] = ignoredeferred
+        rr = ctx.filectx(removed[ir]).data()
+        aa = repo.wread(added[ia])
+        alines = mdiff.splitnewlines(aa)
+        score = _getscore(rr, aa, alines)
+        if score < minscore:
+            return 0
+        # insert sorted in list
+        iins = imatch + 1
+        while iins < nmatches and bestmatches[iins][0] > score:
+            iins += 1
+        while iins < nmatches and bestmatches[iins][0] == score and \
+                bestmatches[iins][1] > ir:
+            iins += 1
+        bestmatches.insert(iins, [score, ir, ia, notdeferred])
+        return score
+
+    def _namematches(seti):
+        '''Find best matches for removed files with added files
+        Matching score (high to low): file-extension
+                                      full pathname similarity (levenshtein)
+                                      common substring lenght from right
+        So the heuristic assumes that a file extension change
+        is the least likely.
+        '''
+        if len(seti) == 1:
+            score, ir, ia, deferred = bestmatches[seti[0]]
+            yield ir, ia
+            return
+        best = [()] * len(seti)
+        for i,imatch2 in enumerate(seti):
+            score, ir, ia, deferred = bestmatches[imatch2]
+            r = removed[ir]
+            a = added[ia]
+            rname = rfiles[ir][1]
+            aname = afiles[ia][1]
+            rext = rfiles[ir][2]
+            aext = afiles[ia][2]
+            score1 = 1 if rext==aext else 0
+            score2 = bdiff.strsim(rname, aname)
+            score3 = bdiff.strsim(r, a)
+            score4 = reversesim(r, a)
+            best[i] = (score1, score2, score3, score4, imatch2)
+        best.sort(reverse = True)
+        irx, iax = [], []
+        i, imax = 0, min([len(setr), len(seta)])
+        for ibest in xrange(len(seti)):
+            score, ir, ia, deferred = bestmatches[best[ibest][4]]
+            if ir in irx or ia in iax:
+                continue
+            if deferred:
+                score2 = _deferredcompare(imatch2)
+                if score2 != score :
+                    continue
+            irx.append(ir)
+            iax.append(ia)
+            yield ir, ia
+            i += 1
+            if i == imax:
+                break
+
+    # assign best matches
+    bestmatches.sort(reverse = True)
+    renamenew, renameold = [], []
+    nassignes = min([nr, na])
+    imatch = 0
+    nmatches = len(bestmatches)
+    while imatch < nmatches:
+        score, ir, ia, deferred = bestmatches[imatch]
+        imatch += 1
+        if ia in renamenew or ir in renameold:
+            continue
+        if deferred:
+            if deferred == isdeferred:
+                if _deferredcompare(imatch-1):
+                    nmatches += 1
+            continue
+
+        # find all files with equal score to the two files
+        setr, seta = [ir], [ia]
+        seti = [imatch-1]
+        bChanged = True
+        bFirst = True
+        while bChanged:
+            bChanged = False
+            # get all added files for setr files
+            irmin = min(setr)
+            imatch2 = imatch
+            while imatch2 < nmatches:
+                score2, ir, ia, deferred = bestmatches[imatch2]
+                if ir < irmin or score2 != score:
+                    break
+                if deferred != ignoredeferred:
+                    if ir in setr and imatch2 not in seti:
+                        seti.append(imatch2)
+                        if ia not in seta:
+                            seta.append(ia)
+                            bChanged = True
+                imatch2 += 1
+            if not bFirst and not bChanged:
+                break
+            bFirst = False
+            # get all removed files for seta files
+            imatch2 = imatch
+            while imatch2 < nmatches:
+                score2, ir, ia, deferred = bestmatches[imatch2]
+                if score2 != score:
+                    break
+                if deferred != ignoredeferred:
+                    if ia in seta and imatch2 not in seti:
+                        seti.append(imatch2)
+                        if ir not in setr:
+                            setr.append(ir)
+                            bChanged = True
+                imatch2 += 1
+
+        # find best name matches of the file set with equal score
+        for ir, ia in _namematches(seti):
+            renameold.append(ir)
+            renamenew.append(ia)
+            yield removed[ir], added[ia], score
+        if len(renameold) >= nassignes:
+            return
+        nmatches = len(bestmatches) # update from deferred compares

 def addremove(repo, pats=[], opts={}, dry_run=None, similarity=None):
     if dry_run is None:
diff -r 0d513661d6c2 -r 3a69568152a7 mercurial/commands.py
--- a/mercurial/commands.py	Sat Sep 13 10:46:47 2008 -0500
+++ b/mercurial/commands.py	Mon Sep 15 18:08:44 2008 +0200
@@ -62,12 +62,12 @@
     parameter. Detecting renamed files this way can be expensive.
     """
     try:
-        sim = float(opts.get('similarity') or 0)
+        similarity = float(opts.get('similarity') or 0)
     except ValueError:
         raise util.Abort(_('similarity must be a number'))
-    if sim < 0 or sim > 100:
+    if similarity < 0 or similarity > 100:
         raise util.Abort(_('similarity must be between 0 and 100'))
-    return cmdutil.addremove(repo, pats, opts, similarity=sim/100.)
+    return cmdutil.addremove(repo, pats, opts, similarity=similarity)

 def annotate(ui, repo, *pats, **opts):
     """show changeset information per file line
diff -r 0d513661d6c2 -r 3a69568152a7 mercurial/localrepo.py
--- a/mercurial/localrepo.py	Sat Sep 13 10:46:47 2008 -0500
+++ b/mercurial/localrepo.py	Mon Sep 15 18:08:44 2008 +0200
@@ -528,6 +528,9 @@

     def adddatafilter(self, name, filter):
         self._datafilters[name] = filter
+
+    def wfilesize(self, filename):
+        return os.stat(self.wjoin(filename))[6]

     def wread(self, filename):
         if self._link(filename):
diff -r 0d513661d6c2 -r 3a69568152a7 tests/test-addremove-similar
--- a/tests/test-addremove-similar	Sat Sep 13 10:46:47 2008 -0500
+++ b/tests/test-addremove-similar	Mon Sep 15 18:08:44 2008 +0200
@@ -1,49 +1,84 @@
 #!/bin/sh

+echo % basic content-matching test
 hg init rep; cd rep

 touch empty-file
 python -c 'for x in range(10000): print x' > large-file
-
 hg addremove
-
 hg commit -m A

 rm large-file empty-file
 python -c 'for x in range(10,10000): print x' > another-file
-
-hg addremove -s50
-
+hg addremove -s 50
 hg commit -m B

-echo % comparing two empty files caused ZeroDivisionError in the past
-hg update -C 0
-rm empty-file
-touch another-empty-file
-hg addremove -s50
-
-cd ..
-
-hg init rep2; cd rep2
-
-python -c 'for x in range(10000): print x' > large-file
-python -c 'for x in range(50): print x' > tiny-file
-
-hg addremove
-
-hg commit -m A
-
-python -c 'for x in range(70): print x' > small-file
-rm tiny-file
-rm large-file
-
-hg addremove -s50
-
-hg commit -m B
-
-echo % should all fail
+echo % option-value test, should all fail
 hg addremove -s foo
 hg addremove -s -1
 hg addremove -s 1e6

+echo % content-matching test
+cd ..
+hg init rep2; cd rep2
+
+python -c 'for x in range(10000): print x' > large.file
+python -c 'for x in range(50): print x' > tiny.file
+hg addremove -s 50
+hg commit -m A
+
+python -c 'for x in range(70): print x' > small.file
+rm tiny.file
+rm large.file
+mkdir x
+mv *.file x
+hg addremove -s 50
+hg commit -m B
+
+echo % name-matching tests
+cd ..
+hg init rep3; cd rep3
+
+touch a.file
+touch b.file
+touch c.file
+mkdir dir1
+mkdir dir2
+touch dir1/z.file
+touch dir2/z.file
+hg addremove -s50
+hg commit -m T1
+
+echo % move all to dir0
+mkdir dir0
+mv *.file dir0
+mv dir1 dir0
+mv dir2 dir0
+touch z.file
+hg addremove -s50
+hg commit -m T2
+
+echo % change file name
+mv dir0/dir1/z.file dir0/dir1/o.file
+mv dir0/dir2/z.file dir0/dir2/o.file
+hg addremove -s50
+hg commit -m T3
+
+echo % change directories
+mv dir0/dir1 dir0/dirA
+mv dir0/dir2 dir0/dirB
+hg addremove -s50
+hg commit -m T4
+
+echo % remove dir0 files
+rm dir0/*.file
+hg commit -A -m T5
+
+echo % change file name and directories
+mv dir0/dirA/o.file dir0/dirA/x.file
+mv dir0/dirB/o.file dir0/dirB/x.file
+mv dir0 dirX
+hg addremove -s50
+hg commit -m T6
+
 true
diff -r 0d513661d6c2 -r 3a69568152a7 tests/test-addremove-similar.out
--- a/tests/test-addremove-similar.out	Sat Sep 13 10:46:47 2008 -0500
+++ b/tests/test-addremove-similar.out	Mon Sep 15 18:08:44 2008 +0200
@@ -1,20 +1,66 @@
+% basic content-matching test
 adding empty-file
 adding large-file
 adding another-file
 removing empty-file
 removing large-file
 recording removal of large-file as rename to another-file (99% similar)
-% comparing two empty files caused ZeroDivisionError in the past
-2 files updated, 0 files merged, 1 files removed, 0 files unresolved
-adding another-empty-file
-removing empty-file
-adding large-file
-adding tiny-file
-removing large-file
-adding small-file
-removing tiny-file
-recording removal of tiny-file as rename to small-file (82% similar)
-% should all fail
+% option-value test, should all fail
 abort: similarity must be a number
 abort: similarity must be between 0 and 100
 abort: similarity must be between 0 and 100
+% content-matching test
+adding large.file
+adding tiny.file
+removing large.file
+removing tiny.file
+adding x/small.file
+recording removal of tiny.file as rename to x/small.file (82% similar)
+% name-matching tests
+adding a.file
+adding b.file
+adding c.file
+adding dir1/z.file
+adding dir2/z.file
+% move all to dir0
+removing a.file
+removing b.file
+removing c.file
+adding dir0/a.file
+adding dir0/b.file
+adding dir0/c.file
+adding dir0/dir1/z.file
+adding dir0/dir2/z.file
+removing dir1/z.file
+removing dir2/z.file
+adding z.file
+recording removal of dir1/z.file as rename to dir0/dir1/z.file (100% similar)
+recording removal of dir2/z.file as rename to dir0/dir2/z.file (100% similar)
+recording removal of a.file as rename to dir0/a.file (100% similar)
+recording removal of b.file as rename to dir0/b.file (100% similar)
+recording removal of c.file as rename to dir0/c.file (100% similar)
+% change file name
+adding dir0/dir1/o.file
+removing dir0/dir1/z.file
+adding dir0/dir2/o.file
+removing dir0/dir2/z.file
+recording removal of dir0/dir1/z.file as rename to dir0/dir1/o.file (100% similar)
+recording removal of dir0/dir2/z.file as rename to dir0/dir2/o.file (100% similar)
+% change directories
+removing dir0/dir1/o.file
+removing dir0/dir2/o.file
+adding dir0/dirA/o.file
+adding dir0/dirB/o.file
+recording removal of dir0/dir1/o.file as rename to dir0/dirA/o.file (100% similar)
+recording removal of dir0/dir2/o.file as rename to dir0/dirB/o.file (100% similar)
+% remove dir0 files
+removing dir0/a.file
+removing dir0/b.file
+removing dir0/c.file
+% change file name and directories
+removing dir0/dirA/o.file
+removing dir0/dirB/o.file
+adding dirX/dirA/x.file
+adding dirX/dirB/x.file
+recording removal of dir0/dirA/o.file as rename to dirX/dirA/x.file (100% similar)
+recording removal of dir0/dirB/o.file as rename to dirX/dirB/x.file (100% similar)
diff -r 0d513661d6c2 -r 3a69568152a7 tests/test-bdiff
--- a/tests/test-bdiff	Sat Sep 13 10:46:47 2008 -0500
+++ b/tests/test-bdiff	Mon Sep 15 18:08:44 2008 +0200
@@ -39,4 +39,36 @@
 test("a\n", "a\n")
 test("a\nb", "a\nb")

+
+def test(a, b, r):
+    s1 = bdiff.statistics(a)
+    s2 = bdiff.statistics(b)
+    s = bdiff.maxscorehist(s1, s2)
+    print 'maxscorehist:', a, b, s, '=', r, s==r
+    if r == 1.0:
+        print ' equal:', s1 == s2
+
+test('a', 'a', 1.0)
+test('aa', 'ab', 0.5)
+test('ab', 'ba', 1.0)
+test('aa', 'ba', 0.5)
+test('abc', 'cba', 1.0)
+test('a', 'b', 0.0)
+test('a', '', 0.0)
+test('', '', 1.0)
+
+
+def test(a, b, r):
+    s = bdiff.strsim(a, b)
+    print 'strsim:', a, b, s, '=', r, s==r
+
+test('a', 'a', 1.0)
+test('a', 'b', 0.0)
+test('aaab', 'aaaa', 0.75)
+test('aaaa', 'aabb', 0.5)
+test('aabb', 'aaaa', 0.5)
+test('a', '', 0.0)
+test('', 'a', 0.0)
+
+
 print "done"
diff -r 0d513661d6c2 -r 3a69568152a7 tests/test-bdiff.out
--- a/tests/test-bdiff.out	Sat Sep 13 10:46:47 2008 -0500
+++ b/tests/test-bdiff.out	Mon Sep 15 18:08:44 2008 +0200
@@ -17,4 +17,23 @@
 *** 'abc' 'abc'
 *** 'a\n' 'a\n'
 *** 'a\nb' 'a\nb'
+maxscorehist: a a 1.0 = 1.0 True
+ equal: True
+maxscorehist: aa ab 0.5 = 0.5 True
+maxscorehist: ab ba 1.0 = 1.0 True
+ equal: True
+maxscorehist: aa ba 0.5 = 0.5 True
+maxscorehist: abc cba 1.0 = 1.0 True
+ equal: True
+maxscorehist: a b 0.0 = 0.0 True
+maxscorehist: a  0.0 = 0.0 True
+maxscorehist:   1.0 = 1.0 True
+ equal: True
+strsim: a a 1.0 = 1.0 True
+strsim: a b 0.0 = 0.0 True
+strsim: aaab aaaa 0.75 = 0.75 True
+strsim: aaaa aabb 0.5 = 0.5 True
+strsim: aabb aaaa 0.5 = 0.5 True
+strsim: a  0.0 = 0.0 True
+strsim:  a 0.0 = 0.0 True
 done

-------------- next part --------------
An embedded and charset-unspecified text was scrubbed...
Name: autorename_bestmatches_and_speed.patch
Url: http://selenic.com/pipermail/mercurial-devel/attachments/20080915/cd59473d/attachment.txt 


More information about the Mercurial-devel mailing list