[PATCH] auto rename: best matches and speed improvement UPDATE2

Fri Aug 15 16:52:41 CDT 2008

I have improved the estimation about which one should
go first in the loop. I hope that now everything is fine!

The matching algorithm itself is very poor. I get values for
very different files with 45% similarity!


# HG changeset patch
# User Herbert Griebel <herbertg at gmx.at>
# Date 1218836870 -7200
# Node ID fc9107dd09b86a5930a70f443f310e5e137735cd
# Parent  438e02b4be73ffd80d4a6cc45b7c9273d6b26604
auto rename: best matches and speed improvement

The auto rename feature of addremove did not rename to the best match.
The old implementation looped over "added files" and selected the best
match for each individually. The next "added file" could also have its
best match with the same removed file, but which is far lower than the
one before. Therefore the last "added file" wins which may not be the
best.
The new implementation fixes this. The rename list is also sorted in
reverse order to make it easier to find the wrong matches with low
similarity. For speed improvements the loops are dynamically
interchanged which can speed up the matching considerably.
There is also a very rough estimation of the matching time, and
output has been enhanced to give the user feedback about the
progress.

diff -r 438e02b4be73 -r fc9107dd09b8 mercurial/cmdutil.py

--- a/mercurial/cmdutil.py	Wed Aug 13 20:36:37 2008 -0500
+++ b/mercurial/cmdutil.py	Fri Aug 15 23:47:50 2008 +0200
@@ -7,7 +7,7 @@

 from node import hex, nullid, nullrev, short
 from i18n import _
-import os, sys, bisect, stat
+import os, sys, bisect, stat, operator, time
 import mdiff, bdiff, util, templater, templatefilters, patch, errno
 import match as _match

@@ -246,12 +246,54 @@
     if added is None or removed is None:
         added, removed = repo.status()[1:3]
     ctx = repo['.']
-    for a in added:
-        aa = repo.wread(a)
-        bestname, bestscore = None, threshold
+    repo.ui.status(_('\nfind renames:\n'))
+    repo.ui.status(_('%d added and %d removed') % (len(added), len(removed)))
+    if len(added) == 1:
+        addedfirst = True
+    elif len(removed) == 1:
+        addedfirst = False
+    else:
+        # estimate times to decide inner loop
+        t0 = time.clock()
+        for a in added:
+            repo.wread(a)
+        t1 = time.clock()
         for r in removed:
+            ctx.filectx(r).data()
+        t2 = time.clock()
+        d1 = (t1 - t0) + (t2 - t1)*len(added)
+        d2 = (t2 - t1) + (t1 - t0)*len(removed)
+        addedfirst = d1 <= d2
+        # assumption: matchtime is proportional to loading from repo
+        matchtime = len(added) * (t2 - t1) * 0.3
+        d = (d1 if addedfirst else d2) + matchtime
+        repo.ui.status(time.strftime(_(', estimated time: %H:%M:%S'),
+                                          time.gmtime(d)))
+    repo.ui.status(_('\n\n'))
+    if addedfirst:
+        l1, l2 = added, removed
+        s = _('%d') % len(added)
+    else:
+        l1, l2 = removed, added
+        s = _('%d') % len(removed)
+    fmt = _('[%%%dd/%s] checking rename with %%s\n') % (len(s), s)
+    bestmatches = []
+    t0 = time.clock()
+    for i, x1 in enumerate(l1):
+        repo.ui.status(fmt % (i+1, x1))
+        if addedfirst:
+            a = x1
+            aa = repo.wread(a)
+        else:
+            r = x1
             rr = ctx.filectx(r).data()
-
+        for x2 in l2:
+            if addedfirst:
+                r = x2
+                rr = ctx.filectx(r).data()
+            else:
+                a = x2
+                aa = repo.wread(a)
             # bdiff.blocks() returns blocks of matching lines
             # count the number of bytes in each
             equal = 0
@@ -260,14 +302,24 @@
             for x1,x2,y1,y2 in matches:
                 for line in alines[x1:x2]:
                     equal += len(line)
-
             lengths = len(aa) + len(rr)
             if lengths:
                 myscore = equal*2.0 / lengths
-                if myscore >= bestscore:
-                    bestname, bestscore = r, myscore
-        if bestname:
-            yield bestname, a, bestscore
+                if myscore >= threshold:
+                    bestmatches.append([myscore, a, r])
+    # assign best matches
+    bestmatches.sort(key = operator.itemgetter(0), reverse = True)
+    assigneda, assignedr, renames = [], [], []
+    rmax, amax = 0, 0
+    for bestscore, a, r in bestmatches:
+        if a not in assigneda and r not in assignedr:
+            renames.append([r, a, bestscore])
+            assigneda.append(a)
+            assignedr.append(r)
+            rmax, amax = max([rmax, len(r)]), max([amax, len(a)])
+    t1 = time.clock()
+    repo.ui.status(time.strftime(_('time: %H:%M:%S\n\n'), time.gmtime(t1 - t0)))
+    return renames, rmax, amax

 def addremove(repo, pats=[], opts={}, dry_run=None, similarity=None):
     if dry_run is None:
@@ -302,13 +354,14 @@
         repo.remove(remove)
         repo.add(add)
     if similarity > 0:
-        for old, new, score in findrenames(repo, add, remove, similarity):
+        renames, rmax, amax = findrenames(repo, add, remove, similarity)
+        fmt = _('recording removal of %%-%ds as rename to %%-%ds '
+                                 '%%3d%%%% similar\n') % (rmax, amax)
+        for old, new, score in renames:
             oldrel, oldexact = mapping[old]
             newrel, newexact = mapping[new]
             if repo.ui.verbose or not oldexact or not newexact:
-                repo.ui.status(_('recording removal of %s as rename to %s '
-                                 '(%d%% similar)\n') %
-                               (oldrel, newrel, score * 100))
+                repo.ui.status(fmt % (oldrel, newrel, score * 100))
             if not dry_run:
                 repo.copy(old, new)