[PATCH] auto rename: best matches and speed improvement

Fri Aug 15 15:49:33 CDT 2008

# HG changeset patch
# User Herbert Griebel <herbertg at gmx.at>
# Date 1218832974 -7200
# Node ID 2ca9be7241ec8ac8ca1f6fdbc4e7c7b1e700adb7
# Parent  438e02b4be73ffd80d4a6cc45b7c9273d6b26604
auto rename: best matches and speed improvement

The auto rename feature of addremove did not rename to the best match.
The old implementation looped over "added files" and selected the best
match for each individually. The next "added file" could also have its
best match with the same removed file, but which is far lower than the
one before. Therefore the last "added file" wins which may not be the
best.
The new implementation fixes this. The rename list is also sorted in
reverse order to make it easier to find the wrong matches with low
similarity. For speed improvements the loops are dynamically
interchanged which can speed up the matching considerably.

diff -r 438e02b4be73 -r 2ca9be7241ec mercurial/cmdutil.py

--- a/mercurial/cmdutil.py    Wed Aug 13 20:36:37 2008 -0500
+++ b/mercurial/cmdutil.py    Fri Aug 15 22:42:54 2008 +0200
@@ -7,7 +7,7 @@
 
 from node import hex, nullid, nullrev, short
 from i18n import _
-import os, sys, bisect, stat
+import os, sys, bisect, stat, operator, time
 import mdiff, bdiff, util, templater, templatefilters, patch, errno
 import match as _match
 
@@ -246,12 +246,56 @@
     if added is None or removed is None:
         added, removed = repo.status()[1:3]
     ctx = repo['.']
-    for a in added:
-        aa = repo.wread(a)
-        bestname, bestscore = None, threshold
+    print '\nfind renames:'
+    print '%d added and %d removed' % (len(added), len(removed)),
+    if len(added) == 1:
+        addedfirst = True
+    elif len(removed) == 1:
+        addedfirst = False
+    else:
+        # estimate times to decide inner loop
+        t0 = time.clock()
+        for a in added:
+            repo.wread(a)
+        t1 = time.clock()
         for r in removed:
+            ctx.filectx(r).data()
+        t2 = time.clock()
+        d1 = t1 - t0
+        d2 = t2 - t1
+        # assumption: matchtime is proportional to loading from repo
+        matchtime = len(added) * d2 * 0.36
+        addedfirst = d2 > d1
+        if addedfirst:
+            d = d1 + d2*len(added) + matchtime
+        else:
+            d = d2 + d1*len(removed) + matchtime
+        print time.strftime('- estimated time: %H:%M:%S', time.gmtime(d)),
+    print '\n'
+    if addedfirst:
+        l1, l2 = added, removed
+        s = '%d' % len(added)
+    else:
+        l1, l2 = removed, added
+        s = '%d' % len(removed)
+    fmt = '[%%%dd/%s] checking rename with %%s' % (len(s), s)
+    bestmatches = []
+    t0 = time.clock()
+    for i, x1 in enumerate(l1):
+        print fmt % (i+1, x1)
+        if addedfirst:
+            a = x1
+            aa = repo.wread(a)
+        else:
+            r = x1
             rr = ctx.filectx(r).data()
-
+        for x2 in l2:
+            if addedfirst:
+                r = x2
+                rr = ctx.filectx(r).data()
+            else:
+                a = x2
+                aa = repo.wread(a)
             # bdiff.blocks() returns blocks of matching lines
             # count the number of bytes in each
             equal = 0
@@ -260,14 +304,24 @@
             for x1,x2,y1,y2 in matches:
                 for line in alines[x1:x2]:
                     equal += len(line)
-
             lengths = len(aa) + len(rr)
             if lengths:
                 myscore = equal*2.0 / lengths
-                if myscore >= bestscore:
-                    bestname, bestscore = r, myscore
-        if bestname:
-            yield bestname, a, bestscore
+                if myscore >= threshold:
+                    bestmatches.append([myscore, a, r])
+    # assign best matches
+    bestmatches.sort(key = operator.itemgetter(0), reverse = True)
+    assigneda, assignedr, renames = [], [], []
+    rmax, amax = 0, 0
+    for bestscore, a, r in bestmatches:
+        if a not in assigneda and r not in assignedr:
+            renames.append([r, a, bestscore])
+            assigneda.append(a)
+            assignedr.append(r)
+            rmax, amax = max([rmax, len(r)]), max([amax, len(a)])
+    t1 = time.clock()
+    print time.strftime('time: %H:%M:%S\n', time.gmtime(t1 - t0))
+    return renames, rmax, amax
 
 def addremove(repo, pats=[], opts={}, dry_run=None, similarity=None):
     if dry_run is None:
@@ -302,13 +356,14 @@
         repo.remove(remove)
         repo.add(add)
     if similarity > 0:
-        for old, new, score in findrenames(repo, add, remove, similarity):
+        renames, rmax, amax = findrenames(repo, add, remove, similarity)
+        fmt = _('recording removal of %%-%ds as rename to %%-%ds '
+                                 '%%3d%%%% similar\n') % (rmax, amax)
+        for old, new, score in renames:
             oldrel, oldexact = mapping[old]
             newrel, newexact = mapping[new]
             if repo.ui.verbose or not oldexact or not newexact:
-                repo.ui.status(_('recording removal of %s as rename to %s '
-                                 '(%d%% similar)\n') %
-                               (oldrel, newrel, score * 100))
+                repo.ui.status(fmt % (oldrel, newrel, score * 100))
             if not dry_run:
                 repo.copy(old, new)