[PATCH] auto rename: best matches and speed improvements

Herbert Griebel herbertg at gmx.at
Thu Sep 11 15:39:45 CDT 2008


# HG changeset patch
# User Herbert Griebel <herbertg at gmx.at>
# Date 1221165456 -7200
# Node ID b99693bd438686e62174098c8d0df05f43e8b1b9
# Parent  6dcbe191a9b51799a4cd40bd398e826bcd9b4dac
auto rename: best matches and speed improvements

The auto rename detection of addremove did not pick the best match
and was very slow. The new implementation fixes this by scoring all
candidate pairs, assigning the best matches globally, and using byte
statistics of the file contents to skip most of the expensive
comparisons.
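
To illustrate the idea behind the statistics check (this is only a
sketch with made-up helper names, not part of the patch): cheap
per-file byte statistics give an upper bound on the similarity score
that bdiff.blocks() could possibly report, so many candidate pairs can
be rejected from cached statistics alone, without running the
expensive diff.

  # Python 2 sketch; byte_histogram/score_upper_bound are illustrative names
  def byte_histogram(data):
      hist = [0] * 256
      for c in data:               # iterating a str yields 1-byte strings
          hist[ord(c)] += 1
      return hist

  def score_upper_bound(size1, hist1, size2, hist2):
      '''upper bound on the bdiff score 2.0 * equal / (size1 + size2)'''
      if not size1 or not size2:
          return float(size1 == size2)
      total = size1 + size2
      # at most min(size1, size2) bytes can lie in matching lines
      bound_size = 2.0 * min(size1, size2) / total
      # a byte value occurring m times in one file and n times in the
      # other can account for at most min(m, n) matched bytes per file
      dhist = sum([abs(m - n) for m, n in zip(hist1, hist2)])
      bound_hist = 1.0 - float(dhist) / total
      return min(bound_size, bound_hist)

A pair can be skipped whenever such a bound stays below
similarity / 100; findrenames() below applies the same idea with its
own size, sum and histogram checks.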

diff -r 6dcbe191a9b5 -r b99693bd4386 mercurial/bdiff.c
--- a/mercurial/bdiff.c	Mon Aug 18 16:50:36 2008 -0500
+++ b/mercurial/bdiff.c	Thu Sep 11 22:37:36 2008 +0200
@@ -356,11 +356,51 @@
 	return result ? result : PyErr_NoMemory();
 }

-static char mdiff_doc[] = "Efficient binary diff.";
+
+
+/****************************************/
+/* diff statistics */
+/****************************************/
+
+static PyObject* statistics(PyObject *self, PyObject *args)
+{
+	unsigned char *s;
+	int len, i;
+	int cmax = 0;
+	double sum = 0;
+	double hist[256];
+	PyObject *histtuple;
+	if (!PyArg_ParseTuple(args, "s#:statistics", &s, &len))
+		return NULL;
+	histtuple = PyTuple_New(256);
+	if (!histtuple)
+		return NULL;
+
+	/* build the byte histogram */
+	for (i=0; i<256; i++)
+		hist[i] = 0;
+	for (i=0; i<len; i++)
+		hist[s[i]] += 1;
+
+	/* max and sum */
+	for (i=0; i<256; i++)
+	{
+		if (hist[i])
+			cmax = i;
+		sum += i * hist[i];
+	}
+
+	for (i = 0; i < 256; i++)
+		PyTuple_SET_ITEM(histtuple, i, PyFloat_FromDouble(hist[i]));
+	/* "d" copies sum, "i" copies cmax, "N" steals the histtuple reference */
+	return Py_BuildValue("diN", sum, cmax, histtuple);
+}
+static char mdiff_doc[] = "Efficient binary diff and diff statistics.";

 static PyMethodDef methods[] = {
 	{"bdiff", bdiff, METH_VARARGS, "calculate a binary diff\n"},
 	{"blocks", blocks, METH_VARARGS, "find a list of matching lines\n"},
+	{"statistics", statistics, METH_VARARGS, "get byte statistics\n"},
 	{NULL, NULL}
 };

@@ -368,4 +408,3 @@
 {
 	Py_InitModule3("bdiff", methods, mdiff_doc);
 }
-
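
For reference, the new bdiff.statistics() behaves roughly like the
following pure Python (the patch uses the C version above for speed;
this sketch only documents the returned triple):

  def statistics(data):
      '''return (byte-value weighted sum, highest byte value present,
         256-entry histogram of byte counts as floats)'''
      hist = [0.0] * 256
      for c in data:
          hist[ord(c)] += 1
      cmax = 0
      total = 0.0
      for i in xrange(256):
          if hist[i]:
              cmax = i
          total += i * hist[i]
      return total, cmax, tuple(hist)

findrenames() uses the sum and the histogram to bound how similar two
files can possibly be, and the highest byte value to bound how much a
single differing byte can change the sum.
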
diff -r 6dcbe191a9b5 -r b99693bd4386 mercurial/cmdutil.py
--- a/mercurial/cmdutil.py	Mon Aug 18 16:50:36 2008 -0500
+++ b/mercurial/cmdutil.py	Thu Sep 11 22:37:36 2008 +0200
@@ -7,7 +7,7 @@

 from node import hex, nullid, nullrev, short
 from i18n import _
-import os, sys, bisect, stat
+import os, sys, math, bisect, stat
 import mdiff, bdiff, util, templater, templatefilters, patch, errno
 import match as _match

@@ -241,33 +241,314 @@
 def matchfiles(repo, files):
     return _match.exact(repo.root, repo.getcwd(), files)

-def findrenames(repo, added=None, removed=None, threshold=0.5):
-    '''find renamed files -- yields (before, after, score) tuples'''
+def findrenames(repo, similarity, added=None, removed=None):
+    '''find renamed files -- returns three parallel lists
+       [renameold, ...], [renamenew, ...], [renamescore, ...], where
+       renameold and renamenew are indices into removed and added and
+       renamescore is the similarity score (0..1) of each pair;
+       similarity is the minimum similarity as a percentage (0..100)'''
     if added is None or removed is None:
         added, removed = repo.status()[1:3]
+    if not added or not removed:
+        return [], [], []
+    na = len(added)
+    nr = len(removed)
     ctx = repo['.']
-    for a in added:
-        aa = repo.wread(a)
-        bestname, bestscore = None, threshold
-        for r in removed:
-            rr = ctx.filectx(r).data()

-            # bdiff.blocks() returns blocks of matching lines
-            # count the number of bytes in each
-            equal = 0
-            alines = mdiff.splitnewlines(aa)
-            matches = bdiff.blocks(aa, rr)
-            for x1,x2,y1,y2 in matches:
-                for line in alines[x1:x2]:
-                    equal += len(line)
+    # preload attributes
+    rfiles = [()] * nr
+    afiles = [()] * na
+    for i, r in enumerate(removed):
+        rfiles[i] = (ctx.filectx(r).size(),
+                     os.path.split(r)[1],
+                     os.path.splitext(r)[1],
+                     i)
+    for i, a in enumerate(added):
+        afiles[i] = (repo.wfilesize(a),
+                     os.path.split(a)[1],
+                     os.path.splitext(a)[1],
+                     i)

-            lengths = len(aa) + len(rr)
-            if lengths:
-                myscore = equal*2.0 / lengths
-                if myscore >= bestscore:
-                    bestname, bestscore = r, myscore
-        if bestname:
-            yield bestname, a, bestscore
+    # check loop order
+    if na == 1:
+        addedfirst = True
+    elif nr == 1:
+        addedfirst = False
+    else:
+        # decide loop order
+
+        # # this code is only for getting the estimator coefficients
+        # import time
+        # S1a, S1r = 0, 0
+        # added1, removed1 = added[:na/3], removed[:nr/3]
+        # N1a, N1r = len(added1), len(removed1)
+        # t0 = time.clock()
+        # for a in added1:
+        #     S1a += len(repo.wread(a))
+        # t1 = time.clock()
+        # for r in removed1:
+        #     S1r += len(ctx.filectx(r).data())
+        # t2 = time.clock()
+        # d1a, d1r = t1 - t0, t2 - t1
+        #
+        # S2a, S2r = 0, 0
+        # added2, removed2 = added[na/3:], removed[nr/3:]
+        # N2a, N2r = len(added2), len(removed2)
+        # t0 = time.clock()
+        # for a in added2:
+        #     S2a += len(repo.wread(a))
+        # t1 = time.clock()
+        # for r in removed2:
+        #     S2r += len(ctx.filectx(r).data())
+        # t2 = time.clock()
+        # d2a, d2r = t1 - t0, t2 - t1
+        #
+        # # calc estimator
+        #
+        # def solve2d(a1,a2,b1,b2,e1,e2):
+        #     """a1*x + b1*y = e1
+        #         a2*x + b2*y = e2"""
+        #     def det(a1,a2,b1,b2):
+        #         return a1*b2 - b1*a2
+        #     d = det(a1,a2,b1,b2)
+        #     x = det(e1,e2,b1,b2) / d
+        #     y = det(a1,a2,e1,e2) / d
+        #     return x, y
+        #
+        # # S*c1 + N*c2 = duration
+        # print S1a, S2a, N1a, N2a, d1a, d2a
+        # xc1a, xc2a = solve2d(S1a, S2a, N1a, N2a, d1a, d2a)
+        # xc1r, xc2r = solve2d(S1r, S2r, N1r, N2r, d1r, d2r)
+        # print 'c1r, c2r, c1a, c2a = %g, %g, %g, %g' \
+        #           % (xc1r, xc2r, xc1a, xc2a)
+
+        # estimate times to decide the loop order
+        # (coefficients were measured once with the commented-out
+        # estimator above; they are rough, machine-dependent values
+        # that only need to be good enough to pick the cheaper order)
+        c1r, c2r, c1a, c2a = 1.60882e-008, 0.00145002, 9.03138e-009, 0.00110696
+        Sr = sum([f[0] for f in rfiles])
+        Sa = sum([f[0] for f in afiles])
+        dr = Sr*c1r + nr*c2r
+        da = Sa*c1a + na*c2a
+        d1 = da + dr*na
+        d2 = dr + da*nr
+        addedfirst = d1 <= d2
+    if addedfirst:
+        filelist1, filelist2 = afiles, rfiles
+    else:
+        filelist1, filelist2 = rfiles, afiles
+
+    # init
+    minscore = similarity / 100
+    rbestscores = [0] * nr
+    rbestsumdiff = [-1] * nr
+    nhist = 256
+    sum1 = [-1] * len(filelist1)
+    sum2 = [-1] * len(filelist2)
+    maxbyte1 = [255] * len(filelist1)
+    maxbyte2 = [255] * len(filelist2)
+    hist1 = [(0,) * nhist] * len(filelist1)
+    hist2 = [(0,) * nhist] * len(filelist2)
+    bestmatches = []
+    notdeferred = 0
+    ignoredeferred = 1
+    isdeferred = 2
+
+    def _getscore(aa, rr, alines):
+        '''bdiff.blocks() returns blocks of matching lines;
+           count the number of bytes in each and relate it to the
+           total size of both files'''
+        equal = 0
+        matches = bdiff.blocks(aa, rr)
+        for x1, x2, y1, y2 in matches:
+            for line in alines[x1:x2]:
+                equal += len(line)
+        return equal * 2.0 / (len(aa) + len(rr))
+
+    def _maxsumdiff(sim):
+        '''max sum difference based on similarity
+
+        example: 2 files, sim=40% (first 4 bytes), #=diff, o=equal
+
+                   uuuuu  <- upper bytes different
+        file1 oooo###### 40% 60% len=10
+                  l       <- lower bytes different
+        file2 oooo#      80% 20% len=5
+
+        maxscoresize = 50%
+        bounds: 0 <= actual sum upper bytes <= maxsumupper
+                0 <= actual sum lower bytes <= maxsumlower
+        max sum diff possible: maxsumlower + maxsumupper
+        '''
+        maxbyte = max([maxbyte1[ifile1], maxbyte2[ifile2]])
+        if size1 > size2:
+            maxsumupper = (size1 - size2) * maxbyte1[ifile1]
+            maxsumlower = math.ceil(size1 * (maxscoresize - sim)) * maxbyte
+        else:
+            maxsumlower = math.ceil(size2 * (maxscoresize - sim)) * maxbyte
+            maxsumupper = (size2 - size1) * maxbyte2[ifile2]
+        dmax = maxsumlower + maxsumupper
+        return dmax
+
+    def _maxscoresum():
+        '''maximum score possible based on sum difference d
+
+        when expecting the maximum sum for lower and upper
+        you get the min numbers of bytes different
+
+        see also _maxsumdiff
+        '''
+        maxbyte = max([maxbyte1[ifile1], maxbyte2[ifile2]])
+        if size1 > size2:
+            minbytesupper = size1 - size2
+            maxsumupper = minbytesupper * maxbyte1[ifile1]
+            minbyteslower = max([0, math.ceil((d - maxsumupper) / maxbyte)])
+            maxscoresum = 1 - float(minbyteslower + minbytesupper) / size1
+        else:
+            minbytesupper = size2 - size1
+            maxsumupper = minbytesupper * maxbyte2[ifile2]
+            minbyteslower = max([0, math.ceil((d - maxsumupper) / maxbyte)])
+            maxscoresum = 1 - float(minbyteslower + minbytesupper) / size2
+        return maxscoresum
+
+    ##################
+    # loop 1
+    for size1, name1, ext1, ifile1 in filelist1:
+        if addedfirst:
+            ia = ifile1
+            a = added[ia]
+        else:
+            ir = ifile1
+            r = removed[ir]
+        loadfile1 = True
+
+        ##################
+        # loop 2
+        for size2, name2, ext2, ifile2 in filelist2:
+            if addedfirst:
+                ir = ifile2
+                r = removed[ir]
+            else:
+                ia = ifile2
+                a = added[ia]
+
+            # special case: empty files
+            if not size1 or not size2:
+                if size1 == size2:
+                    bestmatches.append([1, ir, ia, notdeferred])
+                    rbestscores[ir] = 1
+                    rbestsumdiff[ir] = 0
+                continue
+
+            # maximum score that could be reached because of sizes
+            if size1 > size2:
+                maxscoresize = float(size2) / size1
+            else:
+                maxscoresize = float(size1) / size2
+            if maxscoresize < minscore:
+                continue
+
+            # maximum score that could be reached because of statistics
+            maxscore = maxscoresize
+            if sum1[ifile1] >= 0 and sum2[ifile2] >= 0:
+                # max delta of sums
+                d = abs(sum1[ifile1] - sum2[ifile2])
+                dmax = _maxsumdiff(minscore)
+                if d > dmax:
+                    continue
+                maxscoresum = _maxscoresum()
+                if maxscoresum < minscore:
+                    continue
+                # histogram delta
+                dhist = 0.0
+                for i in xrange(nhist):
+                    dhist += abs(hist1[ifile1][i] - hist2[ifile2][i])
+                maxscorehist = 1 - dhist / (2 * max([size1, size2]))
+                if maxscorehist < minscore:
+                    continue
+
+                # deferred compares
+                if maxscorehist < maxscore:
+                    maxscore = maxscorehist
+                if maxscoresum < maxscore:
+                    maxscore = maxscoresum
+                if rbestscores[ir] and rbestscores[ir] < maxscore:
+                    # a better match has a smaller d
+                    if rbestsumdiff[ir] > 0 and d > rbestsumdiff[ir]:
+                        maxscore = rbestscores[ir] # defer compare
+                    else:
+                        dmax = _maxsumdiff(rbestscores[ir])
+                        if d > dmax:
+                            # current best score cannot be reached with d
+                            maxscore = rbestscores[ir] # defer compare
+
+            # deferred compares
+            if maxscore <= rbestscores[ir]:
+                bestmatches.append([maxscore, ir, ia, isdeferred])
+                continue
+
+            # load file 1
+            if loadfile1:
+                if addedfirst:
+                    aa = repo.wread(a)
+                    alines = mdiff.splitnewlines(aa)
+                    sum1[ifile1], maxbyte1[ifile1], hist1[ifile1] = \
+                        bdiff.statistics(aa)
+                else:
+                    rr = ctx.filectx(r).data()
+                    sum1[ifile1], maxbyte1[ifile1], hist1[ifile1] = \
+                        bdiff.statistics(rr)
+                loadfile1 = False
+
+            # load file 2
+            if addedfirst:
+                rr = ctx.filectx(r).data()
+                if sum2[ifile2] < 0:
+                    sum2[ifile2], maxbyte2[ifile2], hist2[ifile2] = \
+                        bdiff.statistics(rr)
+            else:
+                aa = repo.wread(a)
+                alines = mdiff.splitnewlines(aa)
+                if sum2[ifile2] < 0:
+                    sum2[ifile2], maxbyte2[ifile2], hist2[ifile2] = \
+                        bdiff.statistics(aa)
+
+            # score it
+            score = _getscore(aa, rr, alines)
+            if score >= minscore:
+                bestmatches.append([score, ir, ia, notdeferred])
+                if rbestscores[ir] <= score:
+                    rbestscores[ir] = score
+                    rbestsumdiff[ir] = abs(sum1[ifile1] - sum2[ifile2])
+
+    def _deferredcompare(imatch, nmatches):
+        '''run a comparison that was deferred earlier; if it still
+           reaches the minimum score, insert the result into
+           bestmatches in sorted order and return the updated
+           number of matches'''
+        score, ir, ia, deferred = bestmatches[imatch]
+        bestmatches[imatch][3] = ignoredeferred
+        rr = ctx.filectx(removed[ir]).data()
+        aa = repo.wread(added[ia])
+        alines = mdiff.splitnewlines(aa)
+        score = _getscore(aa, rr, alines)
+        if score >= minscore:
+            # insert sorted in list
+            iins = imatch + 1
+            while iins < nmatches and bestmatches[iins][0] > score:
+                iins += 1
+            bestmatches.insert(iins, [score, ir, ia, notdeferred])
+            nmatches += 1
+        return nmatches
+
+    # assign best matches
+    bestmatches.sort(reverse = True)
+    renamenew, renameold, renamescore = [], [], []
+    imatch = 0
+    nmatches = len(bestmatches)
+    while imatch < nmatches:
+        score, ir, ia, deferred = bestmatches[imatch]
+        imatch += 1
+        if ia in renamenew or ir in renameold:
+            continue
+        if deferred == isdeferred:
+            # the better match found earlier was not used for this
+            # removed file, so compute the real score of the pair now
+            nmatches = _deferredcompare(imatch - 1, nmatches)
+            continue
+        if deferred == ignoredeferred:
+            continue
+        # add the best match
+        renameold.append(ir)
+        renamenew.append(ia)
+        renamescore.append(score)
+    return renameold, renamenew, renamescore

 def addremove(repo, pats=[], opts={}, dry_run=None, similarity=None):
     if dry_run is None:
@@ -275,42 +556,47 @@
     if similarity is None:
         similarity = float(opts.get('similarity') or 0)
     add, remove = [], []
-    mapping = {}
-    audit_path = util.path_auditor(repo.root)
+    renamenew, renameold, renamescore = [], [], []
+    sadd, sremove = [], []
+    auditpath = util.path_auditor(repo.root)
     m = match(repo, pats, opts)
     for abs in repo.walk(m):
         target = repo.wjoin(abs)
         good = True
         try:
-            audit_path(abs)
+            auditpath(abs)
         except:
             good = False
         rel = m.rel(abs)
         exact = m.exact(abs)
         if good and abs not in repo.dirstate:
             add.append(abs)
-            mapping[abs] = rel, m.exact(abs)
+            sadd.append(((pats and rel) or abs, exact))
             if repo.ui.verbose or not exact:
                 repo.ui.status(_('adding %s\n') % ((pats and rel) or abs))
         if repo.dirstate[abs] != 'r' and (not good or not util.lexists(target)
             or (os.path.isdir(target) and not os.path.islink(target))):
             remove.append(abs)
-            mapping[abs] = rel, exact
+            sremove.append(((pats and rel) or abs, exact))
             if repo.ui.verbose or not exact:
                 repo.ui.status(_('removing %s\n') % ((pats and rel) or abs))
     if not dry_run:
         repo.remove(remove)
         repo.add(add)
+    # find file renames
     if similarity > 0:
-        for old, new, score in findrenames(repo, add, remove, similarity):
-            oldrel, oldexact = mapping[old]
-            newrel, newexact = mapping[new]
-            if repo.ui.verbose or not oldexact or not newexact:
-                repo.ui.status(_('recording removal of %s as rename to %s '
-                                 '(%d%% similar)\n') %
-                               (oldrel, newrel, score * 100))
-            if not dry_run:
-                repo.copy(old, new)
+        renameold, renamenew, renamescore = \
+            findrenames(repo, similarity, add, remove)
+    # output
+    for i in xrange(len(renameold)):
+        oldrel, oldexact = sremove[renameold[i]]
+        newrel, newexact = sadd[renamenew[i]]
+        if repo.ui.verbose or not oldexact or not newexact:
+            repo.ui.status(_('recording removal of %s as rename to %s '
+                             '(%d%% similar)\n') %
+                           (oldrel, newrel, renamescore[i] * 100))
+        if not dry_run:
+            repo.copy(remove[renameold[i]], add[renamenew[i]])

 def copy(ui, repo, pats, opts, rename=False):
     # called with the repo lock held
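
To make the loop order estimate in findrenames() concrete, here is the
same cost model with rounded coefficients and made-up file counts
(only the formulas match the patch, the numbers do not):

  c1a, c2a = 0.9e-8, 1.1e-3       # added:   seconds per byte / per file
  c1r, c2r = 1.6e-8, 1.5e-3       # removed: seconds per byte / per file
  Sa, na = 50 * 1024 * 1024, 20   # 50 MB of added data in 20 files
  Sr, nr = 5 * 1024 * 1024, 200   # 5 MB of removed data in 200 files

  da = Sa * c1a + na * c2a        # read every added file once   (~0.5 s)
  dr = Sr * c1r + nr * c2r        # read every removed file once (~0.4 s)
  d1 = da + dr * na               # added outer loop:   ~8 s worst case
  d2 = dr + da * nr               # removed outer loop: ~99 s worst case
  addedfirst = d1 <= d2           # here True: iterate added files first

Because the inner list may be re-read once per outer file in the worst
case, the estimate puts the list whose repeated re-reads would cost
more on the outside, where it is read only once.
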
diff -r 6dcbe191a9b5 -r b99693bd4386 mercurial/commands.py
--- a/mercurial/commands.py	Mon Aug 18 16:50:36 2008 -0500
+++ b/mercurial/commands.py	Thu Sep 11 22:37:36 2008 +0200
@@ -62,12 +62,12 @@
     parameter. Detecting renamed files this way can be expensive.
     """
     try:
-        sim = float(opts.get('similarity') or 0)
+        similarity = float(opts.get('similarity') or 0)
     except ValueError:
         raise util.Abort(_('similarity must be a number'))
-    if sim < 0 or sim > 100:
+    if similarity < 0 or similarity > 100:
         raise util.Abort(_('similarity must be between 0 and 100'))
-    return cmdutil.addremove(repo, pats, opts, similarity=sim/100.)
+    return cmdutil.addremove(repo, pats, opts, similarity=similarity)

 def annotate(ui, repo, *pats, **opts):
     """show changeset information per file line
diff -r 6dcbe191a9b5 -r b99693bd4386 mercurial/localrepo.py
--- a/mercurial/localrepo.py	Mon Aug 18 16:50:36 2008 -0500
+++ b/mercurial/localrepo.py	Thu Sep 11 22:37:36 2008 +0200
@@ -528,6 +528,9 @@

     def adddatafilter(self, name, filter):
         self._datafilters[name] = filter
+
+    def wfilesize(self, filename):
+        return os.stat(self.wjoin(filename)).st_size

     def wread(self, filename):
         if self._link(filename):
diff -r 6dcbe191a9b5 -r b99693bd4386 tests/test-addremove-similar
--- a/tests/test-addremove-similar	Mon Aug 18 16:50:36 2008 -0500
+++ b/tests/test-addremove-similar	Thu Sep 11 22:37:36 2008 +0200
@@ -26,19 +26,23 @@

 hg init rep2; cd rep2

-python -c 'for x in range(10000): print x' > large-file
-python -c 'for x in range(50): print x' > tiny-file
+python -c 'for x in range(10000): print x' > large.file
+python -c 'for x in range(50): print x' > tiny.file
+python -c 'for x in range(500): print x > 0 and 1 or "a"' > a.file
+python -c 'for x in range(500): print x > 0 and 1 or "b"' > b.file
+python -c 'for x in range(500): print x > 0 and 1 or "c"' > c.file

-hg addremove
-
+hg addremove -s50
 hg commit -m A

-python -c 'for x in range(70): print x' > small-file
-rm tiny-file
-rm large-file
+python -c 'for x in range(70): print x' > small.file
+rm tiny.file
+rm large.file
+python -c 'for x in range(500): print x > 0 and 1 or "d"' > d.file
+mkdir x
+mv *.* x

 hg addremove -s50
-
 hg commit -m B

 echo % should all fail
diff -r 6dcbe191a9b5 -r b99693bd4386 tests/test-addremove-similar.out
--- a/tests/test-addremove-similar.out	Mon Aug 18 16:50:36 2008 -0500
+++ b/tests/test-addremove-similar.out	Thu Sep 11 22:37:36 2008 +0200
@@ -8,12 +8,26 @@
 2 files updated, 0 files merged, 1 files removed, 0 files unresolved
 adding another-empty-file
 removing empty-file
-adding large-file
-adding tiny-file
-removing large-file
-adding small-file
-removing tiny-file
-recording removal of tiny-file as rename to small-file (82% similar)
+recording removal of empty-file as rename to another-empty-file (100% similar)
+adding a.file
+adding b.file
+adding c.file
+adding large.file
+adding tiny.file
+removing a.file
+removing b.file
+removing c.file
+removing large.file
+removing tiny.file
+adding x/a.file
+adding x/b.file
+adding x/c.file
+adding x/d.file
+adding x/small.file
+recording removal of c.file as rename to x/c.file (100% similar)
+recording removal of b.file as rename to x/b.file (100% similar)
+recording removal of a.file as rename to x/a.file (100% similar)
+recording removal of tiny.file as rename to x/small.file (82% similar)
 % should all fail
 abort: similarity must be a number
 abort: similarity must be between 0 and 100