[PATCH] auto rename: best matches and speed improvement UPDATE6

Herbert Griebel herbertg at gmx.at
Sat Aug 23 09:35:34 CDT 2008


# HG changeset patch
# User Herbert Griebel <herbertg at gmx.at>
# Date 1219500551 -7200
# Node ID c66e542b1a4cda44f00fbcf29e6aea4e2dc9d4da
# Parent  6dcbe191a9b51799a4cd40bd398e826bcd9b4dac
auto rename: best matches and speed improvements

The auto rename feature of addremove did not rename to the best
match and was very slow. The new implementation fixes this.

File name matching:

Added files with the same similarity for a removed file are also
matched against the pathname so directory structure can be preserved
with regard to renaming identical files in multiple directories.
Example:

x/z
y/z
both files are identical

move to a:
a/x/z
a/y/z

you expect:
rename x/z to a/x/z
rename y/z to a/y/z

if you don't match pathes too, you could get x/z to a/y/z
because the z files are identical.

The current algorithm compares two pathnames starting from the end,
the score is simply the number of equal characters. So the full path
from right is matched. This works very good if you are just moving.
If you rename the file name too and move it, things get much more
complicated and you need a more sophisticated matching algorithm which
matches all parts of two pathes. So you need a basic string comparer
which gives a similarity measure. This may be a future improvement.

diff -r 6dcbe191a9b5 -r c66e542b1a4c mercurial/cmdutil.py
--- a/mercurial/cmdutil.py	Mon Aug 18 16:50:36 2008 -0500
+++ b/mercurial/cmdutil.py	Sat Aug 23 16:09:11 2008 +0200
@@ -241,40 +241,267 @@
 def matchfiles(repo, files):
     return _match.exact(repo.root, repo.getcwd(), files)

-def findrenames(repo, added=None, removed=None, threshold=0.5):
-    '''find renamed files -- yields (before, after, score) tuples'''
+def matchending(s1, s2):
+    imax = min([len(s1), len(s2)])
+    i = 1
+    while i <= imax and s1[-i] == s2[-i]:
+        i += 1
+    return i - 1
+
+def findrenames(repo, added=None, removed=None, similarity=50):
+    '''find renamed files --
+       returns [renameold, ...], [renamenew, ...], [renamescore, ...]'''
     if added is None or removed is None:
         added, removed = repo.status()[1:3]
+    Na = len(added)
+    Nr = len(removed)
+    if not added or not removed:
+        return [], [], []
     ctx = repo['.']
-    for a in added:
-        aa = repo.wread(a)
-        bestname, bestscore = None, threshold
-        for r in removed:
-            rr = ctx.filectx(r).data()

-            # bdiff.blocks() returns blocks of matching lines
-            # count the number of bytes in each
-            equal = 0
-            alines = mdiff.splitnewlines(aa)
-            matches = bdiff.blocks(aa, rr)
-            for x1,x2,y1,y2 in matches:
-                for line in alines[x1:x2]:
-                    equal += len(line)
+    # preload attributes
+    rfiles = [()] * Nr
+    afiles = [()] * Na
+    for i,r in enumerate(removed):
+        rfiles[i] = (ctx.filectx(r).size(), os.path.split(r)[1], r,
+                     os.path.splitext(r)[1], i)
+    for i,a in enumerate(added):
+        afiles[i] = (repo.wfilesize(a), os.path.split(a)[1], a,
+                     os.path.splitext(a)[1], i)

-            lengths = len(aa) + len(rr)
-            if lengths:
-                myscore = equal*2.0 / lengths
-                if myscore >= bestscore:
-                    bestname, bestscore = r, myscore
-        if bestname:
-            yield bestname, a, bestscore
+    # sort files by size
+    rfiles.sort()
+    afiles.sort()
+
+    # try to find moves
+    rins, ains = 0, 0
+    na = len(afiles)
+    for ir in range(len(rfiles)):
+        rname, rsize = rfiles[ir][1], rfiles[ir][0]
+        rmove = False
+        ia = ains
+        while ia < na:
+            if rsize == afiles[ia][0] and rname == afiles[ia][1]:
+                t = afiles[ia]
+                del afiles[ia]
+                afiles.insert(ains, t)
+                ains += 1
+                rmove = True
+            ia += 1
+        if rmove:
+            t = rfiles[ir]
+            del rfiles[ir]
+            rfiles.insert(rins, t)
+            rins += 1
+
+    # check loop order
+    if Na == 1:
+        addedfirst = True
+    elif Nr == 1:
+        addedfirst = False
+    else:
+        # measure times to decide loops
+        measure = 0
+        if measure:
+            # this code is only for getting the estimator
+            import time
+            S1a = 0
+            S1r = 0
+            added1 = added[:Na/3]
+            removed1 = removed[:Nr/3]
+            N1a = len(added1)
+            N1r = len(removed1)
+            t0 = time.clock()
+            for a in added1:
+                S1a += len(repo.wread(a))
+            t1 = time.clock()
+            for r in removed1:
+                S1r += len(ctx.filectx(r).data())
+            t2 = time.clock()
+            d1a = t1 - t0
+            d1r = t2 - t1
+
+            S2a = 0
+            S2r = 0
+            added2 = added[Na/3:]
+            removed2 = removed[Nr/3:]
+            N2a = len(added2)
+            N2r = len(removed2)
+            t0 = time.clock()
+            for a in added2:
+                S2a += len(repo.wread(a))
+            t1 = time.clock()
+            for r in removed2:
+                S2r += len(ctx.filectx(r).data())
+            t2 = time.clock()
+            d2a = t1 - t0
+            d2r = t2 - t1
+
+            # calc estimator
+
+            def solve2d(a1,a2,b1,b2,e1,e2):
+                """a1*x + b1*y = e1
+                   a2*x + b2*y = e2"""
+                def det(a1,a2,b1,b2):
+                    return a1*b2 - b1*a2
+                d = det(a1,a2,b1,b2)
+                x = det(e1,e2,b1,b2) / d
+                y = det(a1,a2,e1,e2) / d
+                return x, y
+
+            try:
+                # S*c1 + N*c2 = duration
+                print S1a, S2a, N1a, N2a, d1a, d2a
+                xc1a, xc2a = solve2d(S1a, S2a, N1a, N2a, d1a, d2a)
+                xc1r, xc2r = solve2d(S1r, S2r, N1r, N2r, d1r, d2r)
+                print 'c1r, c2r, c1a, c2a = %g, %g, %g, %g' \
+                        % (xc1r, xc2r, xc1a, xc2a)
+            except:
+                print 'calc error'
+
+        # estimate times to decide loops
+        c1r, c2r, c1a, c2a = 1.60882e-008, 0.00145002, 9.03138e-009, 0.00110696
+        Sr = sum([f[0] for f in rfiles])
+        Sa = sum([f[0] for f in afiles])
+        dr = Sr*c1r + Nr*c2r
+        da = Sa*c1a + Na*c2a
+        d1 = da + dr*Na
+        d2 = dr + da*Nr
+        addedfirst = d1 <= d2
+
+    if addedfirst:
+        filelist1, filelist2 = afiles, rfiles
+    else:
+        filelist1, filelist2 = rfiles, afiles
+
+    # init
+    numbered = len(filelist1) >= 20
+    if numbered:
+        s = '%d' % len(filelist1)
+        fmt = '%%%dd/%s %s %%s\n' % (len(s), s, _('checking rename with'))
+    else:
+        fmt = '%s %%s\n' % _('checking rename with')
+    minscore = similarity/100
+    minscoresize = minscore
+    bestmatches = []
+    rbestscores = [0] * Nr
+
+    def getscore():
+        # bdiff.blocks() returns blocks of matching lines
+        # count the number of bytes in each
+        l = len(aa) + len(rr)
+        if not l:
+            return 1
+        equal = 0
+        matches = bdiff.blocks(aa, rr)
+        for x1,x2,y1,y2 in matches:
+            for line in alines[x1:x2]:
+                equal += len(line)
+        score = equal*2.0 / l
+        return score
+
+    # loop 1
+    for ifile1, f1 in enumerate(filelist1):
+        size1, name1, path1, ext1, ref1 = f1
+        if addedfirst:
+            a = path1
+        else:
+            r = path1
+            ir = ref1
+        loadfile1 = True
+
+        # loop 2
+        for ifile2, f2 in enumerate(filelist2):
+            size2, name2, path2, ext2, ref2 = f2
+            if addedfirst:
+                r = path2
+                ir = ref2
+            else:
+                a = path2
+
+            # special case: empty files
+            if not size1 or not size2:
+                if size1 == size2:
+                    bestmatches.append((1, r, a, False))
+                    rbestscores[ir] = 1
+                continue
+
+            # check for maximum score that could be reached because of sizes
+            maxscore = float(size1)/size2 if size2>size1 else float(size2)/size1
+            if maxscore < minscoresize:
+                continue
+
+            # deferred compares
+            if maxscore < rbestscores[ir]:
+                bestmatches.append((maxscore, r, a, True))
+                continue
+
+            # load file 1
+            if loadfile1:
+                if addedfirst:
+                    aa = repo.wread(a)
+                    alines = mdiff.splitnewlines(aa)
+                else:
+                    rr = ctx.filectx(r).data()
+                loadfile1 = False
+
+            # load file 2
+            if addedfirst:
+                rr = ctx.filectx(r).data()
+            else:
+                aa = repo.wread(a)
+                alines = mdiff.splitnewlines(aa)
+
+            # score it
+            score = getscore()
+            if score >= minscore:
+                bestmatches.append((score, r, a, False))
+                if rbestscores[ir] < score:
+                    rbestscores[ir] = score
+
+    # assign best matches
+    bestmatches.sort(reverse = True)
+    renamenew, renameold, renamescore = [], [], []
+    imatch, nmatches = 0, len(bestmatches)
+    while imatch < nmatches:
+        bestscore, r, a, deferred = bestmatches[imatch]
+        imatch += 1
+        if a not in renamenew and r not in renameold:
+            if deferred:
+                rr = ctx.filectx(r).data()
+                aa = repo.wread(a)
+                alines = mdiff.splitnewlines(aa)
+                score = getscore()
+                if score >= minscore:
+                    # insert sorted in list
+                    iins = imatch
+                    while iins < nmatches and score < bestmatches[iins][0]:
+                        iins += 1
+                    bestmatches.insert(iins, (score, r, a, False))
+                    nmatches += 1
+                continue
+            # check pathname for other matches with equal score
+            imatch2 = imatch
+            maxscore = matchending(r, a)
+            while imatch2 < nmatches and bestmatches[imatch2][0] == bestscore:
+                if r == bestmatches[imatch2][1]:
+                    score = matchending(r, bestmatches[imatch2][2])
+                    if maxscore < score:
+                        maxscore = score
+                        a = bestmatches[imatch2][2]
+                imatch2 += 1
+            # add the best match
+            renameold.append(r)
+            renamenew.append(a)
+            renamescore.append(bestscore)
+    return renameold, renamenew, renamescore

 def addremove(repo, pats=[], opts={}, dry_run=None, similarity=None):
     if dry_run is None:
         dry_run = opts.get('dry_run')
     if similarity is None:
         similarity = float(opts.get('similarity') or 0)
-    add, remove = [], []
+    add, remove, renamenew, renameold, renamescore = [], [], [], [], []
     mapping = {}
     audit_path = util.path_auditor(repo.root)
     m = match(repo, pats, opts)
@@ -289,28 +516,32 @@
         exact = m.exact(abs)
         if good and abs not in repo.dirstate:
             add.append(abs)
-            mapping[abs] = rel, m.exact(abs)
+            mapping[abs] = (pats and rel) or abs, exact
             if repo.ui.verbose or not exact:
                 repo.ui.status(_('adding %s\n') % ((pats and rel) or abs))
         if repo.dirstate[abs] != 'r' and (not good or not util.lexists(target)
             or (os.path.isdir(target) and not os.path.islink(target))):
             remove.append(abs)
-            mapping[abs] = rel, exact
+            mapping[abs] = (pats and rel) or abs, exact
             if repo.ui.verbose or not exact:
                 repo.ui.status(_('removing %s\n') % ((pats and rel) or abs))
     if not dry_run:
         repo.remove(remove)
         repo.add(add)
+    # find file renames
     if similarity > 0:
-        for old, new, score in findrenames(repo, add, remove, similarity):
-            oldrel, oldexact = mapping[old]
-            newrel, newexact = mapping[new]
-            if repo.ui.verbose or not oldexact or not newexact:
-                repo.ui.status(_('recording removal of %s as rename to %s '
-                                 '(%d%% similar)\n') %
-                               (oldrel, newrel, score * 100))
-            if not dry_run:
-                repo.copy(old, new)
+        renameold, renamenew, renamescore = findrenames( \
+            repo, add, remove, similarity)
+    # output
+    for old, new, score in zip(renameold, renamenew, renamescore):
+        oldrel, oldexact = mapping[old]
+        newrel, newexact = mapping[new]
+        if repo.ui.verbose or not oldexact or not newexact:
+            repo.ui.status(_('recording removal of %s as rename to %s '
+                             '(%d%% similar)\n') %
+                           (oldrel, newrel, score * 100))
+        if not dry_run:
+            repo.copy(old, new)

 def copy(ui, repo, pats, opts, rename=False):
     # called with the repo lock held
diff -r 6dcbe191a9b5 -r c66e542b1a4c mercurial/commands.py
--- a/mercurial/commands.py	Mon Aug 18 16:50:36 2008 -0500
+++ b/mercurial/commands.py	Sat Aug 23 16:09:11 2008 +0200
@@ -62,12 +62,12 @@
     parameter. Detecting renamed files this way can be expensive.
     """
     try:
-        sim = float(opts.get('similarity') or 0)
+        similarity = float(opts.get('similarity') or 0)
     except ValueError:
         raise util.Abort(_('similarity must be a number'))
-    if sim < 0 or sim > 100:
+    if similarity < 0 or similarity > 100:
         raise util.Abort(_('similarity must be between 0 and 100'))
-    return cmdutil.addremove(repo, pats, opts, similarity=sim/100.)
+    return cmdutil.addremove(repo, pats, opts, similarity=similarity)

 def annotate(ui, repo, *pats, **opts):
     """show changeset information per file line
diff -r 6dcbe191a9b5 -r c66e542b1a4c mercurial/localrepo.py
--- a/mercurial/localrepo.py	Mon Aug 18 16:50:36 2008 -0500
+++ b/mercurial/localrepo.py	Sat Aug 23 16:09:11 2008 +0200
@@ -528,6 +528,9 @@

     def adddatafilter(self, name, filter):
         self._datafilters[name] = filter
+
+    def wfilesize(self, filename):
+        return os.stat(self.wjoin(filename))[6]

     def wread(self, filename):
         if self._link(filename):
diff -r 6dcbe191a9b5 -r c66e542b1a4c tests/test-addremove-similar
--- a/tests/test-addremove-similar	Mon Aug 18 16:50:36 2008 -0500
+++ b/tests/test-addremove-similar	Sat Aug 23 16:09:11 2008 +0200
@@ -46,4 +46,20 @@
 hg addremove -s -1
 hg addremove -s 1e6

-true
+
+
+echo % name matching test
+
+touch a.z
+touch b.z
+touch c.z
+hg addremove -s50
+hg commit -m "adding 2nd file"
+
+echo % a.z to x/a.z
+echo % b.z to x/b.z
+echo % c.z to x/c.z
+mkdir x
+mv *.* x/
+hg addremove -s50
+hg commit -m "rename 3 moved files"
diff -r 6dcbe191a9b5 -r c66e542b1a4c tests/test-addremove-similar.out
--- a/tests/test-addremove-similar.out	Mon Aug 18 16:50:36 2008 -0500
+++ b/tests/test-addremove-similar.out	Sat Aug 23 16:09:11 2008 +0200
@@ -8,6 +8,7 @@
 2 files updated, 0 files merged, 1 files removed, 0 files unresolved
 adding another-empty-file
 removing empty-file
+recording removal of empty-file as rename to another-empty-file (100% similar)
 adding large-file
 adding tiny-file
 removing large-file
@@ -18,3 +19,19 @@
 abort: similarity must be a number
 abort: similarity must be between 0 and 100
 abort: similarity must be between 0 and 100
+% name matching test
+adding a.z
+adding b.z
+adding c.z
+% a.z to x/a.z
+% b.z to x/b.z
+% c.z to x/c.z
+removing a.z
+removing b.z
+removing c.z
+adding x/a.z
+adding x/b.z
+adding x/c.z
+recording removal of c.z as rename to x/c.z (100% similar)
+recording removal of b.z as rename to x/b.z (100% similar)
+recording removal of a.z as rename to x/a.z (100% similar)


More information about the Mercurial-devel mailing list