[PATCH] auto rename: best matches and speed improvement UPDATE4

Fri Aug 15 18:48:08 CDT 2008

Sorry, it's late, two typos fixed.


# HG changeset patch
# User Herbert Griebel <herbertg at gmx.at>
# Date 1218843942 -7200
# Node ID 6cf73b7c3671f0737e25c72d45fa3ca805426df6
# Parent  438e02b4be73ffd80d4a6cc45b7c9273d6b26604
auto rename: best matches and speed improvement

The auto rename feature of addremove did not rename to the best match.
The old implementation looped over "added files" and selected the best
match for each individually. The next "added file" could also have its
best match with the same removed file, but which is far lower than the
one before. Therefore the last "added file" wins which may not be the
best.
The new implementation fixes this. The rename list is also sorted in
reverse order to make it easier to find the wrong matches with low
similarity. For speed improvements the loops are dynamically
interchanged which can speed up the matching considerably.
There is also a very rough estimation of the matching time, and
output has been enhanced to give the user feedback about the
progress.

diff -r 438e02b4be73 -r 6cf73b7c3671 mercurial/cmdutil.py

--- a/mercurial/cmdutil.py	Wed Aug 13 20:36:37 2008 -0500
+++ b/mercurial/cmdutil.py	Sat Aug 16 01:45:42 2008 +0200
@@ -7,7 +7,7 @@

 from node import hex, nullid, nullrev, short
 from i18n import _
-import os, sys, bisect, stat
+import os, sys, bisect, stat, operator, time
 import mdiff, bdiff, util, templater, templatefilters, patch, errno
 import match as _match

@@ -246,28 +246,82 @@
     if added is None or removed is None:
         added, removed = repo.status()[1:3]
     ctx = repo['.']
-    for a in added:
-        aa = repo.wread(a)
-        bestname, bestscore = None, threshold
+    repo.ui.status('\n' + _('find renames') + ':\n')
+    repo.ui.status('%d %s %s %d %s' %
+        (len(added), _('added'), _('and'), len(removed), _('removed')))
+    if len(added) == 1:
+        addedfirst = True
+    elif len(removed) == 1:
+        addedfirst = False
+    else:
+        # estimate times to decide inner loop
+        t0 = time.clock()
+        for a in added:
+            repo.wread(a)
+        t1 = time.clock()
         for r in removed:
+            ctx.filectx(r).data()
+        t2 = time.clock()
+        d1 = (t1 - t0) + (t2 - t1)*len(added)
+        d2 = (t2 - t1) + (t1 - t0)*len(removed)
+        addedfirst = d1 <= d2
+        # assumption: matchtime is proportional to loading from repo
+        matchtime = len(added) * (t2 - t1) * 0.3
+        d = (d1 if addedfirst else d2) + matchtime
+        fmt = ', %s: %%H:%%M:%%S' % (_('estimated time'))
+        repo.ui.status(time.strftime(fmt, time.gmtime(d)))
+    repo.ui.status('\n\n')
+    if addedfirst:
+        l1, l2 = added, removed
+    else:
+        l1, l2 = removed, added
+    s = '%d' % len(l1)
+    fmt = '[%%%dd/%s] %s %%s\n' % (len(s), s, _('checking rename with'))
+    bestmatches = []
+    t0 = time.clock()
+    for i, x1 in enumerate(l1):
+        repo.ui.status(fmt % (i+1, x1))
+        if addedfirst:
+            a = x1
+            aa = repo.wread(a)
+            alines = mdiff.splitnewlines(aa)
+        else:
+            r = x1
             rr = ctx.filectx(r).data()
-
+        for x2 in l2:
+            if addedfirst:
+                r = x2
+                rr = ctx.filectx(r).data()
+            else:
+                a = x2
+                aa = repo.wread(a)
+                alines = mdiff.splitnewlines(aa)
             # bdiff.blocks() returns blocks of matching lines
             # count the number of bytes in each
             equal = 0
-            alines = mdiff.splitnewlines(aa)
             matches = bdiff.blocks(aa, rr)
             for x1,x2,y1,y2 in matches:
                 for line in alines[x1:x2]:
                     equal += len(line)
-
             lengths = len(aa) + len(rr)
             if lengths:
                 myscore = equal*2.0 / lengths
-                if myscore >= bestscore:
-                    bestname, bestscore = r, myscore
-        if bestname:
-            yield bestname, a, bestscore
+                if myscore >= threshold:
+                    bestmatches.append([myscore, a, r])
+    # assign best matches
+    bestmatches.sort(key = operator.itemgetter(0), reverse = True)
+    assigneda, assignedr, renames = [], [], []
+    rmax, amax = 0, 0
+    for bestscore, a, r in bestmatches:
+        if a not in assigneda and r not in assignedr:
+            renames.append([r, a, bestscore])
+            assigneda.append(a)
+            assignedr.append(r)
+            rmax, amax = max([rmax, len(r)]), max([amax, len(a)])
+    t1 = time.clock()
+    fmt = _('execution time') + ': %H:%M:%S\n\n'
+    repo.ui.status(time.strftime(fmt, time.gmtime(t1 - t0)))
+    return renames, rmax, amax

 def addremove(repo, pats=[], opts={}, dry_run=None, similarity=None):
     if dry_run is None:
@@ -302,13 +356,15 @@
         repo.remove(remove)
         repo.add(add)
     if similarity > 0:
-        for old, new, score in findrenames(repo, add, remove, similarity):
+        renames, rmax, amax = findrenames(repo, add, remove, similarity)
+        fmt = ('%s %%-%ds %s %%-%ds %%3d%%%% %s\n') \
+                % (_('recording removal of'), rmax,
+                   _('as rename to'), amax, _('similar'))
+        for old, new, score in renames:
             oldrel, oldexact = mapping[old]
             newrel, newexact = mapping[new]
             if repo.ui.verbose or not oldexact or not newexact:
-                repo.ui.status(_('recording removal of %s as rename to %s '
-                                 '(%d%% similar)\n') %
-                               (oldrel, newrel, score * 100))
+                repo.ui.status(fmt % (oldrel, newrel, score * 100))
             if not dry_run:
                 repo.copy(old, new)