[PATCH] auto rename: best matches and speed improvements
Herbert Griebel
herbertg at gmx.at
Thu Sep 11 15:39:45 CDT 2008
# HG changeset patch
# User Herbert Griebel <herbertg at gmx.at>
# Date 1221165456 -7200
# Node ID b99693bd438686e62174098c8d0df05f43e8b1b9
# Parent 6dcbe191a9b51799a4cd40bd398e826bcd9b4dac
auto rename: best matches and speed improvements
The auto rename feature of addremove did not rename to the best match
and was very slow. The new implementation fixes this by considering all
candidate matches and by using byte statistics of the file contents to
speed up the matching.
diff -r 6dcbe191a9b5 -r b99693bd4386 mercurial/bdiff.c
--- a/mercurial/bdiff.c Mon Aug 18 16:50:36 2008 -0500
+++ b/mercurial/bdiff.c Thu Sep 11 22:37:36 2008 +0200
@@ -356,11 +356,51 @@
return result ? result : PyErr_NoMemory();
}
-static char mdiff_doc[] = "Efficient binary diff.";
+
+
+/****************************************/
+/*diff statistics */
+/****************************************/
+
+static PyObject* statistics(PyObject *self, PyObject *args)
+{
+ unsigned char *s;
+ int len, i;
+ int cmax = 0;
+ double sum = 0;
+ double hist[256];
+ PyObject *sumobj;
+ PyObject *maxobj;
+ PyObject *histtuple = PyTuple_New(256);
+ if (! PyArg_ParseTuple(args, "s#:diffstat", &s, &len))
+ return NULL;
+
+ /* build histo */
+ for (i=0; i<256; i++)
+ hist[i] = 0;
+ for (i=0; i<len; i++)
+ hist[s[i]] += 1;
+
+ /* max and sum */
+ for (i=0; i<256; i++)
+ {
+ if (hist[i])
+ cmax = i;
+ sum += i * hist[i];
+ }
+
+ sumobj = PyFloat_FromDouble(sum);
+ maxobj = PyInt_FromLong(cmax);
+ for (i=0; i<256; i++)
+ PyTuple_SetItem(histtuple, i, PyFloat_FromDouble(hist[i]));
+ return PyTuple_Pack(3, sumobj, maxobj, histtuple);
+}
+static char mdiff_doc[] = "Efficient binary diff, diff statistics.";
static PyMethodDef methods[] = {
{"bdiff", bdiff, METH_VARARGS, "calculate a binary diff\n"},
{"blocks", blocks, METH_VARARGS, "find a list of matching lines\n"},
+ {"statistics", statistics, METH_VARARGS, "get byte statistics\n"},
{NULL, NULL}
};
@@ -368,4 +408,3 @@
{
Py_InitModule3("bdiff", methods, mdiff_doc);
}
-
diff -r 6dcbe191a9b5 -r b99693bd4386 mercurial/cmdutil.py
--- a/mercurial/cmdutil.py Mon Aug 18 16:50:36 2008 -0500
+++ b/mercurial/cmdutil.py Thu Sep 11 22:37:36 2008 +0200
@@ -7,7 +7,7 @@
from node import hex, nullid, nullrev, short
from i18n import _
-import os, sys, bisect, stat
+import os, sys, math, bisect, stat
import mdiff, bdiff, util, templater, templatefilters, patch, errno
import match as _match
@@ -241,33 +241,314 @@
def matchfiles(repo, files):
return _match.exact(repo.root, repo.getcwd(), files)
-def findrenames(repo, added=None, removed=None, threshold=0.5):
- '''find renamed files -- yields (before, after, score) tuples'''
+def findrenames(repo, similarity, added=None, removed=None):
+ '''find renamed files --
+ returns [renameold, ...], [renamenew, ...], [renamescore, ...]'''
if added is None or removed is None:
added, removed = repo.status()[1:3]
+ if not added or not removed:
+ return [], [], []
+ na = len(added)
+ nr = len(removed)
ctx = repo['.']
- for a in added:
- aa = repo.wread(a)
- bestname, bestscore = None, threshold
- for r in removed:
- rr = ctx.filectx(r).data()
- # bdiff.blocks() returns blocks of matching lines
- # count the number of bytes in each
- equal = 0
- alines = mdiff.splitnewlines(aa)
- matches = bdiff.blocks(aa, rr)
- for x1,x2,y1,y2 in matches:
- for line in alines[x1:x2]:
- equal += len(line)
+ # preload attributes
+ rfiles = [()] * nr
+ afiles = [()] * na
+ for i, r in enumerate(removed):
+ rfiles[i] = (ctx.filectx(r).size(),
+ os.path.split(r)[1],
+ os.path.splitext(r)[1],
+ i)
+ for i, a in enumerate(added):
+ afiles[i] = (repo.wfilesize(a),
+ os.path.split(a)[1],
+ os.path.splitext(a)[1],
+ i)
- lengths = len(aa) + len(rr)
- if lengths:
- myscore = equal*2.0 / lengths
- if myscore >= bestscore:
- bestname, bestscore = r, myscore
- if bestname:
- yield bestname, a, bestscore
+ # check loop order
+ if na == 1:
+ addedfirst = True
+ elif nr == 1:
+ addedfirst = False
+ else:
+ # decide loop order
+
+ # # this code is only for getting the estimator coefficients
+ # import time
+ # S1a, S1r = 0, 0
+ # added1, removed1 = added[:na/3], removed[:nr/3]
+ # N1a, N1r = len(added1), len(removed1)
+ # t0 = time.clock()
+ # for a in added1:
+ # S1a += len(repo.wread(a))
+ # t1 = time.clock()
+ # for r in removed1:
+ # S1r += len(ctx.filectx(r).data())
+ # t2 = time.clock()
+ # d1a, d1r = t1 - t0, t2 - t1
+ #
+ # S2a, S2r = 0, 0
+ # added2, removed2 = added[na/3:], removed[nr/3:]
+ # N2a, N2r = len(added2), len(removed2)
+ # t0 = time.clock()
+ # for a in added2:
+ # S2a += len(repo.wread(a))
+ # t1 = time.clock()
+ # for r in removed2:
+ # S2r += len(ctx.filectx(r).data())
+ # t2 = time.clock()
+ # d2a, d2r = t1 - t0, t2 - t1
+ #
+ # # calc estimator
+ #
+ # def solve2d(a1,a2,b1,b2,e1,e2):
+ # """a1*x + b1*y = e1
+ # a2*x + b2*y = e2"""
+ # def det(a1,a2,b1,b2):
+ # return a1*b2 - b1*a2
+ # d = det(a1,a2,b1,b2)
+ # x = det(e1,e2,b1,b2) / d
+ # y = det(a1,a2,e1,e2) / d
+ # return x, y
+ #
+ # # S*c1 + N*c2 = duration
+ # print S1a, S2a, N1a, N2a, d1a, d2a
+ # xc1a, xc2a = solve2d(S1a, S2a, N1a, N2a, d1a, d2a)
+ # xc1r, xc2r = solve2d(S1r, S2r, N1r, N2r, d1r, d2r)
+ # print 'c1r, c2r, c1a, c2a = %g, %g, %g, %g' \
+ # % (xc1r, xc2r, xc1a, xc2a)
+
+ # estimate times to decide loops
+ c1r, c2r, c1a, c2a = 1.60882e-008, 0.00145002, 9.03138e-009, 0.00110696
+ Sr = sum([f[0] for f in rfiles])
+ Sa = sum([f[0] for f in afiles])
+ dr = Sr*c1r + nr*c2r
+ da = Sa*c1a + na*c2a
+ d1 = da + dr*na
+ d2 = dr + da*nr
+ addedfirst = d1 <= d2
+ if addedfirst:
+ filelist1, filelist2 = afiles, rfiles
+ else:
+ filelist1, filelist2 = rfiles, afiles
+
+ # init
+ minscore = similarity / 100
+ rbestscores = [0] * nr
+ rbestsumdiff = [-1] * nr
+ nhist = 256
+ sum1 = [-1] * len(filelist1)
+ sum2 = [-1] * len(filelist2)
+ maxbyte1 = [255] * len(filelist1)
+ maxbyte2 = [255] * len(filelist2)
+ hist1 = [(0)*nhist] * len(filelist1)
+ hist2 = [(0)*nhist] * len(filelist2)
+ bestmatches = []
+ notdeferred = 0
+ ignoredeferred = 1
+ isdeferred = 2
+
+ def _getscore():
+ '''bdiff.blocks() returns blocks of matching lines
+ count the number of bytes in each'''
+ equal = 0
+ matches = bdiff.blocks(aa, rr)
+ for x1,x2,y1,y2 in matches:
+ for line in alines[x1:x2]:
+ equal += len(line)
+ score = equal*2.0 / (len(aa) + len(rr))
+ return score
+
+ def _maxsumdiff(sim):
+ '''max sum difference based on similarity
+
+ example: 2 files, sim=40% (first 4 bytes), #=diff, o=equal
+
+ uuuuu <- upper bytes different
+ file1 oooo###### 40% 60% len=10
+ l <- lower bytes different
+ file2 oooo# 80% 20% len=5
+
+ maxscoresize = 50%
+ bounds: 0 <= actual sum upper bytes <= maxsumupper
+ 0 <= actual sum lower bytes <= maxsumlower
+ max sum diff possible: maxsumlower + maxsumupper
+ '''
+ maxbyte = max([maxbyte1[ifile1], maxbyte2[ifile2]])
+ if size1 > size2:
+ maxsumupper = (size1 - size2) * maxbyte1[ifile1]
+ maxsumlower = math.ceil(size1 * (maxscoresize - sim)) * maxbyte
+ else:
+ maxsumlower = math.ceil(size2 * (maxscoresize - sim)) * maxbyte
+ maxsumupper = (size2 - size1) * maxbyte2[ifile2]
+ dmax = maxsumlower + maxsumupper
+ return dmax
+
+ def _maxscoresum():
+ '''maximum score possible based on sum difference d
+
+ when expecting the maximum sum for lower and upper
+ you get the min numbers of bytes different
+
+ see also _maxsumdiff
+ '''
+ maxbyte = max([maxbyte1[ifile1], maxbyte2[ifile2]])
+ if size1 > size2:
+ minbytesupper = size1 - size2
+ maxsumupper = minbytesupper * maxbyte1[ifile1]
+ minbyteslower = max([0, math.ceil((d - maxsumupper) / maxbyte)])
+ maxscoresum = 1 - (minbyteslower + minbytesupper) / size1
+ else:
+ minbytesupper = size2 - size1
+ maxsumupper = minbytesupper * maxbyte2[ifile2]
+ minbyteslower = max([0, math.ceil((d - maxsumupper) / maxbyte)])
+ maxscoresum = 1 - (minbyteslower + minbytesupper) / size2
+ return maxscoresum
+
+ ##################
+ # loop 1
+ for size1, name1, ext1, ifile1 in filelist1:
+ if addedfirst:
+ ia = ifile1
+ a = added[ia]
+ else:
+ ir = ifile1
+ r = removed[ir]
+ loadfile1 = True
+
+ ##################
+ # loop 2
+ for size2, name2, ext2, ifile2 in filelist2:
+ if addedfirst:
+ ir = ifile2
+ r = removed[ir]
+ else:
+ ia = ifile2
+ a = added[ia]
+
+ # special case: empty files
+ if not size1 or not size2:
+ if size1 == size2:
+ bestmatches.append([1, ir, ia, notdeferred])
+ rbestscores[ir] = 1
+ rbestsumdiff[ir] = 0
+ continue
+
+ # maximum score that could be reached because of sizes
+ if size1 > size2:
+ maxscoresize = float(size2) / size1
+ else:
+ maxscoresize = float(size1) / size2
+ if maxscoresize < minscore:
+ continue
+
+ # maximum score that could be reached because of statistics
+ maxscore = maxscoresize
+ if sum1[ifile1] >= 0 and sum2[ifile2] >= 0:
+ # max delta of sums
+ d = abs(sum1[ifile1] - sum2[ifile2])
+ dmax = _maxsumdiff(minscore)
+ if d > dmax:
+ continue
+ maxscoresum = _maxscoresum()
+ if maxscoresum < minscore:
+ continue
+ # histogram delta
+ dhist = 0.0
+ for i in xrange(nhist):
+ dhist += abs(hist1[ifile1][i] - hist2[ifile2][i])
+ maxscorehist = 1 - dhist / (2 * max([size1, size2]))
+ if maxscorehist < minscore:
+ continue
+
+ # deferred compares
+ if maxscorehist < maxscore:
+ maxscore = maxscorehist
+ if maxscoresum < maxscore:
+ maxscore = maxscoresum
+ if rbestscores[ir] and rbestscores[ir] < maxscore:
+ # a better match has a smaller d
+ if rbestsumdiff[ir] > 0 and d > rbestsumdiff[ir]:
+ maxscore = rbestscores[ir] # defer compare
+ else:
+ dmax = _maxsumdiff(rbestscores[ir])
+ if d > dmax:
+ # current best score cannot be reached with d
+ maxscore = rbestscores[ir] # defer compare
+
+ # deferred compares
+ if maxscore <= rbestscores[ir]:
+ bestmatches.append([maxscore, ir, ia, isdeferred])
+ continue
+
+ # load file 1
+ if loadfile1:
+ if addedfirst:
+ aa = repo.wread(a)
+ alines = mdiff.splitnewlines(aa)
+ sum1[ifile1], maxbyte1[ifile1], hist1[ifile1] = \
+ bdiff.statistics(aa)
+ else:
+ rr = ctx.filectx(r).data()
+ sum1[ifile1], maxbyte1[ifile1], hist1[ifile1] = \
+ bdiff.statistics(rr)
+ loadfile1 = False
+
+ # load file 2
+ if addedfirst:
+ rr = ctx.filectx(r).data()
+ if sum2[ifile2] < 0:
+ sum2[ifile2], maxbyte2[ifile2], hist2[ifile2] = \
+ bdiff.statistics(rr)
+ else:
+ aa = repo.wread(a)
+ alines = mdiff.splitnewlines(aa)
+ if sum2[ifile2] < 0:
+ sum2[ifile2], maxbyte2[ifile2], hist2[ifile2] = \
+ bdiff.statistics(aa)
+
+ # score it
+ score = _getscore()
+ if score >= minscore:
+ bestmatches.append([score, ir, ia, notdeferred])
+ if rbestscores[ir] <= score:
+ rbestscores[ir] = score
+ rbestsumdiff[ir] = abs(sum1[ifile1] - sum2[ifile2])
+
+ def _deferredcompare(imatch, nmatches):
+ score, ir, ia, deferred = bestmatches[imatch]
+ bestmatches[imatch][3] = ignoredeferred
+ rr = ctx.filectx(removed[ir]).data()
+ aa = repo.wread(added[ia])
+ alines = mdiff.splitnewlines(aa)
+ score = _getscore()
+ if score >= minscore:
+ # insert sorted in list
+ iins = imatch + 1
+ while iins < nmatches and bestmatches[iins][0] > score:
+ iins += 1
+ bestmatches.insert(iins, [score, ir, ia, notdeferred])
+ nmatches += 1
+ return nmatches
+
+ # assign best matches
+ bestmatches.sort(reverse = True)
+ renamenew, renameold, renamescore = [], [], []
+ imatch = 0
+ nmatches = len(bestmatches)
+ while imatch < nmatches:
+ score, ir, ia, deferred = bestmatches[imatch]
+ imatch += 1
+ if ia in renamenew or ir in renameold:
+ continue
+ if deferred:
+ continue
+ # add the best match
+ renameold.append(ir)
+ renamenew.append(ia)
+ renamescore.append(score)
+ return renameold, renamenew, renamescore
def addremove(repo, pats=[], opts={}, dry_run=None, similarity=None):
if dry_run is None:
@@ -275,42 +556,47 @@
if similarity is None:
similarity = float(opts.get('similarity') or 0)
add, remove = [], []
- mapping = {}
- audit_path = util.path_auditor(repo.root)
+ renamenew, renameold, renamescore = [], [], []
+ sadd, sremove = [], []
+ auditpath = util.path_auditor(repo.root)
m = match(repo, pats, opts)
for abs in repo.walk(m):
target = repo.wjoin(abs)
good = True
try:
- audit_path(abs)
+ auditpath(abs)
except:
good = False
rel = m.rel(abs)
exact = m.exact(abs)
if good and abs not in repo.dirstate:
add.append(abs)
- mapping[abs] = rel, m.exact(abs)
+ sadd.append(((pats and rel) or abs, exact))
if repo.ui.verbose or not exact:
repo.ui.status(_('adding %s\n') % ((pats and rel) or abs))
if repo.dirstate[abs] != 'r' and (not good or not util.lexists(target)
or (os.path.isdir(target) and not os.path.islink(target))):
remove.append(abs)
- mapping[abs] = rel, exact
+ sremove.append(((pats and rel) or abs, exact))
if repo.ui.verbose or not exact:
repo.ui.status(_('removing %s\n') % ((pats and rel) or abs))
if not dry_run:
repo.remove(remove)
repo.add(add)
+ # find file renames
if similarity > 0:
- for old, new, score in findrenames(repo, add, remove, similarity):
- oldrel, oldexact = mapping[old]
- newrel, newexact = mapping[new]
- if repo.ui.verbose or not oldexact or not newexact:
- repo.ui.status(_('recording removal of %s as rename to %s '
- '(%d%% similar)\n') %
- (oldrel, newrel, score * 100))
- if not dry_run:
- repo.copy(old, new)
+ renameold, renamenew, renamescore = \
+ findrenames(repo, similarity, add, remove)
+ # output
+ for i in xrange(len(renameold)):
+ oldrel, oldexact = sremove[renameold[i]]
+ newrel, newexact = sadd[renamenew[i]]
+ if repo.ui.verbose or not oldexact or not newexact:
+ repo.ui.status(_('recording removal of %s as rename to %s '
+ '(%d%% similar)\n') %
+ (oldrel, newrel, renamescore[i] * 100))
+ if not dry_run:
+ repo.copy(remove[renameold[i]], add[renamenew[i]])
def copy(ui, repo, pats, opts, rename=False):
# called with the repo lock held
diff -r 6dcbe191a9b5 -r b99693bd4386 mercurial/commands.py
--- a/mercurial/commands.py Mon Aug 18 16:50:36 2008 -0500
+++ b/mercurial/commands.py Thu Sep 11 22:37:36 2008 +0200
@@ -62,12 +62,12 @@
parameter. Detecting renamed files this way can be expensive.
"""
try:
- sim = float(opts.get('similarity') or 0)
+ similarity = float(opts.get('similarity') or 0)
except ValueError:
raise util.Abort(_('similarity must be a number'))
- if sim < 0 or sim > 100:
+ if similarity < 0 or similarity > 100:
raise util.Abort(_('similarity must be between 0 and 100'))
- return cmdutil.addremove(repo, pats, opts, similarity=sim/100.)
+ return cmdutil.addremove(repo, pats, opts, similarity=similarity)
def annotate(ui, repo, *pats, **opts):
"""show changeset information per file line
diff -r 6dcbe191a9b5 -r b99693bd4386 mercurial/localrepo.py
--- a/mercurial/localrepo.py Mon Aug 18 16:50:36 2008 -0500
+++ b/mercurial/localrepo.py Thu Sep 11 22:37:36 2008 +0200
@@ -528,6 +528,9 @@
def adddatafilter(self, name, filter):
self._datafilters[name] = filter
+
+ def wfilesize(self, filename):
+ return os.stat(self.wjoin(filename))[6]
def wread(self, filename):
if self._link(filename):
diff -r 6dcbe191a9b5 -r b99693bd4386 tests/test-addremove-similar
--- a/tests/test-addremove-similar Mon Aug 18 16:50:36 2008 -0500
+++ b/tests/test-addremove-similar Thu Sep 11 22:37:36 2008 +0200
@@ -26,19 +26,23 @@
hg init rep2; cd rep2
-python -c 'for x in range(10000): print x' > large-file
-python -c 'for x in range(50): print x' > tiny-file
+python -c 'for x in range(10000): print x' > large.file
+python -c 'for x in range(50): print x' > tiny.file
+python -c 'for x in range(500): print 1 if x>0 else "a"' > a.file
+python -c 'for x in range(500): print 1 if x>0 else "b"' > b.file
+python -c 'for x in range(500): print 1 if x>0 else "c"' > c.file
-hg addremove
-
+hg addremove -s50
hg commit -m A
-python -c 'for x in range(70): print x' > small-file
-rm tiny-file
-rm large-file
+python -c 'for x in range(70): print x' > small.file
+rm tiny.file
+rm large.file
+python -c 'for x in range(500): print 1 if x>0 else "d"' > d.file
+mkdir x
+mv *.* x
hg addremove -s50
-
hg commit -m B
echo % should all fail
diff -r 6dcbe191a9b5 -r b99693bd4386 tests/test-addremove-similar.out
--- a/tests/test-addremove-similar.out Mon Aug 18 16:50:36 2008 -0500
+++ b/tests/test-addremove-similar.out Thu Sep 11 22:37:36 2008 +0200
@@ -8,12 +8,26 @@
2 files updated, 0 files merged, 1 files removed, 0 files unresolved
adding another-empty-file
removing empty-file
-adding large-file
-adding tiny-file
-removing large-file
-adding small-file
-removing tiny-file
-recording removal of tiny-file as rename to small-file (82% similar)
+recording removal of empty-file as rename to another-empty-file (100% similar)
+adding a.file
+adding b.file
+adding c.file
+adding large.file
+adding tiny.file
+removing a.file
+removing b.file
+removing c.file
+removing large.file
+removing tiny.file
+adding x/a.file
+adding x/b.file
+adding x/c.file
+adding x/d.file
+adding x/small.file
+recording removal of c.file as rename to x/c.file (100% similar)
+recording removal of b.file as rename to x/b.file (100% similar)
+recording removal of a.file as rename to x/a.file (100% similar)
+recording removal of tiny.file as rename to x/small.file (82% similar)
% should all fail
abort: similarity must be a number
abort: similarity must be between 0 and 100
-------------- next part --------------
An embedded and charset-unspecified text was scrubbed...
Name: mercurial-main_rev6908.patch
Url: http://selenic.com/pipermail/mercurial-devel/attachments/20080911/c6bb8227/attachment.txt
More information about the Mercurial-devel
mailing list