[PATCH] auto rename: best matches and speed improvements UPDATE1
Benoit Boissinot
bboissin at gmail.com
Wed Oct 1 10:57:31 CDT 2008
On Wed, Oct 01, 2008 at 02:50:04PM +0200, Herbert Griebel wrote:
> # HG changeset patch
> # User Herbert Griebel <herbertg at gmx.at>
> # Date 1222862946 -7200
> # Node ID 7b673a95a55d0e32df824830a8419d4c1a220b9a
> # Parent f29b674cc2210126c2899d94d882c367a8ea64bc
> auto rename: best matches and speed improvements
>
>
> diff -r f29b674cc221 -r 7b673a95a55d mercurial/addremove.py
> --- /dev/null Thu Jan 01 00:00:00 1970 +0000
> +++ b/mercurial/addremove.py Wed Oct 01 14:09:06 2008 +0200
> @@ -0,0 +1,356 @@
> +# addremove.py - command addremove
> +#
> +# Copyright 2008 Herbert Griebel <herbertg at gmx.at>
> +#
> +# This software may be used and distributed according to the terms
> +# of the GNU General Public License, incorporated herein by reference.
> +
> +from i18n import _
> +import os
> +import mdiff, bdiff, util
> +import copy
> +
> +def findrenames(repo, added=None, removed=None, threshold=0.5):
> + '''find renamed files -- yields (before, after, score) tuples'''
> + if added is None or removed is None:
> + added, removed = repo.status()[1:3]
> + if not added or not removed:
> + return
> + ctx = repo['.']
rctx = repo['.']
actx = repo[None]
> +
> + # cash file attributes
cache ?
> + nr = len(removed)
> + na = len(added)
> + rfiles = [()] * nr
> + afiles = [()] * na
> + for i, r in enumerate(removed):
> + rfiles[i] = (ctx.filectx(r).size(),
> + os.path.split(r)[1], os.path.splitext(r)[1], i)
rctx.filectx(r).size()
> + for i, a in enumerate(added):
> + afiles[i] = (repo.wfilesize(a),
actx.filectx(r).size()
> + os.path.split(a)[1], os.path.splitext(a)[1], i)
could be a simple list comprehension (or refactored, since the only
difference is actx/rctx)
rfiles = [(ctx.filectx(r).size(),
os.path.split(r)[1], os.path.splitext(r)[1], i)
for i, r in enumerate(removed)]
> + # check loop order
> + if na == 1:
> + addedfirst = True
> + elif nr == 1:
> + addedfirst = False
> + else:
> + # estimate times to decide loops
> + Sr = sum([f[0] for f in rfiles])
> + Sa = sum([f[0] for f in afiles])
> + dr = Sr*1.60882e-008 + nr*0.00145002
> + da = Sa*9.03138e-009 + na*0.00110696
magic values ?
> [...]
> +
> + def _getscore(rr, aa, alines):
> + '''bdiff.blocks() returns blocks of matching lines
> + count the number of bytes in each'''
> + equal = 0
> + matches = bdiff.blocks(aa, rr)
> + for x1,x2,y1,y2 in matches:
x1, x2, ... (space after comma)
> + for line in alines[x1:x2]:
> + equal += len(line)
> + score = float(equal) / max([len(aa), len(rr)])
> + return score
> +
> + def _addmatch(score, compared):
> + '''enters match into the 3 lists'''
> + namesim = bdiff.reversesim(removed[ir], added[ia])
> + m = [score, namesim, compared, ir, ia, False]
> + rmatches[ir].append(m)
> + amatches[ia].append(m)
> + if rbased:
> + if bestmatches[ir] < m:
> + bestmatches[ir][5] = False
> + bestmatches[ir] = m
> + m[5] = True
> + else:
> + if bestmatches[ia] < m:
> + bestmatches[ia][5] = False
> + bestmatches[ia] = m
> + m[5] = True
could be refactored:
if rbased:
idx = ir
else:
idx = ia
if bestmatches[idx] < m:
bestmatches[idx][5] = False
bestmatches[idx] = m
m[5] = True
> + if rbestscores[ir] < score:
> + rbestscores[ir] = score
rbestscores[ir] = max(rbestscores[ir], score)
> + if abestscores[ia] < score:
> + abestscores[ia] = score
rbestscores[ir] = max(rbestscores[ir], score)
> +
> + # loop 1
> + for size1, name1, ext1, ifile1 in filelist1:
> + if addedfirst:
> + ia = ifile1
> + else:
> + ir = ifile1
> + loadfile1 = True
> +
> + # check similar file names first (moves)
> + nmoves = 0
> + for i in xrange(len(filelist2)):
> + if name1 == filelist2[i][1]:
> + if i != nmoves:
> + # swap files
> + filelist2[nmoves], filelist2[i] = \
> + filelist2[i], filelist2[nmoves]
> + nmoves += 1
> +
> + # loop 2
> + for size2, name2, ext2, ifile2 in filelist2:
> + if addedfirst:
> + ir = ifile2
> + else:
> + ia = ifile2
> +
> + # special case: empty files
> + if not size1 or not size2:
> + score = 1 if size1 == size2 else 0
if ... else was introduced in 2.5 (iirc), please use:
score = (size1 == size2) and 1 or 0
or
int(size1 == size2) (I don't know if this one is guaranteed to work for
all python version)
> + if score >= minscore:
> + _addmatch(score, iscompared)
> + continue
> +
> + # maximum score that could be reached because of sizes
> + maxscoresize = \
> + float(size2)/size1 if size1>size2 else float(size1)/size2
ditto (although x and t or f won't work if size2 == 0, which shouldn't
happend (special case above))
> + if maxscoresize < minscore:
> + continue
> +
> + # maximum score that could be reached because of statistics
> + maxscore = maxscoresize
> + if rstat[ir] and astat[ia]:
> + # histogram delta
> + maxscorehist = bdiff.maxscorehist(rstat[ir], astat[ia])
> + if maxscorehist < minscore:
> + continue
> + # defer compares, take lowest upper bound
> + if maxscorehist < maxscore:
> + maxscore = maxscorehist
maxscore = min(maxscore, maxscorehist)
> +
> + # deferred compares
> + if ifile1 and (maxscore < rbestscores[ir] or \
> + maxscore < abestscores[ia]):
> + _addmatch(maxscore, notcompared)
> + continue
> +
> + # load file 1
> + if loadfile1:
> + if addedfirst:
> + aa = repo.wread(added[ia])
actx.filectx(...).data()
> + alines = mdiff.splitnewlines(aa)
> + astat[ia] = bdiff.statistics(aa)
> + else:
> + rr = ctx.filectx(removed[ir]).data()
rctx...
> + rstat[ir] = bdiff.statistics(rr)
> + loadfile1 = False
> +
> + # load file 2
> + if addedfirst:
> + rr = ctx.filectx(removed[ir]).data()
rctx.filectx(...).data()
> + if not rstat[ir]:
> + rstat[ir] = bdiff.statistics(rr)
> + else:
> + aa = repo.wread(added[ia])
actx.filectx(...).data()
> + alines = mdiff.splitnewlines(aa)
> + if not astat[ia]:
> + astat[ia] = bdiff.statistics(aa)
> +
> + # score it
> + score = _getscore(rr, aa, alines)
> + if score >= minscore:
> + _addmatch(score, iscompared)
> +
> + def _deferredcompare(m):
> + '''function returns 0 if maxscore was not reached
> + and updates bestmatches'''
> + maxscore, namesim, compared, ir, ia, isbest = m
> + if compared or not maxscore:
> + return maxscore
> + rr = aa = ''
> + if not rstat[ir]:
> + rr = ctx.filectx(removed[ir]).data()
> + rstat[ir] = bdiff.statistics(rr)
> + if not astat[ia]:
> + aa = repo.wread(added[ia])
actx... (you could refactor)
> + astat[ia] = bdiff.statistics(aa)
> + # check histogram diff before comparing
> + maxscorehist = bdiff.maxscorehist(rstat[ir], astat[ia])
> + if maxscorehist < minscore:
> + maxscorehist = 0
> + if maxscore > maxscorehist:
> + m[0] = maxscorehist
> + maxscore = 0 # do not compare, we need new max
> + if maxscore: # actually compare the files
> + if not rr:
> + rr = ctx.filectx(removed[ir]).data()
rctx
> + if not aa:
> + aa = repo.wread(added[ia])
actx
> + alines = mdiff.splitnewlines(aa)
> + score = _getscore(rr, aa, alines)
> + m[2] = iscompared # not deferred anymore
> + if score == maxscore:
> + return maxscore
> + m[0] = 0 if score < minscore else score
if ... else should be changed
> + if isbest: # get best score
> + m[5] = False # remove from best
> + if rbased:
> + l = rmatches[ir]
> + bm = max(l) if l else 0
ditto
> + bestmatches[ir] = bm
> + else:
> + l = amatches[ia]
> + bm = max(l) if l else 0
ditto
> + bestmatches[ir] = bm
> + bestmatches[ia] = bm
can it be refactored ? (why is it asymetrical ?)
> + if bm:
> + bm[5] = True # flag as best
> + return 0
> +
> + def _namematches(scorematches):
> + '''Find best matches for a group of removed and added files
> + Matching score (high to low): file extension equal
> + file name similarity (levenshtein)
> + full pathname similarity (levenshtein)
> + common substring length from right
> + Hence we assume that a file extension change is least likely.'''
> + if len(scorematches) == 1:
> + yield scorematches[0][0]
> + return
> + # "name-score" all matches
> + best = [0] * len(scorematches)
> + for i,sm in enumerate(scorematches):
space after comma
> + score, namesim, compared, ir, ia, isbest = sm[0]
> + r = removed[ir]
> + a = added[ia]
> + rname = rfiles[ir][1]
> + aname = afiles[ia][1]
> + rext = rfiles[ir][2]
> + aext = afiles[ia][2]
> + score1 = rext == aext
> + score2 = bdiff.strsim(rname, aname)
> + score3 = bdiff.strsim(r, a)
> + score4 = bdiff.reversesim(r, a)
> + ndepends = -len(sm[1]) + (0 if compared else -1)
if ... else
> + best[i] = (score1, score2, score3, score4, ndepends, sm)
> + # output best name matches
> + best.sort(reverse = True)
> + for score1, score2, score3, score4, n, sm in best:
> + m, depends = sm
> + score, namesim, compared, ir, ia, isbest = m
> + # depends on other deferred?
> + if depends:
> + dismissed = False
> + for d in depends:
> + if _deferredcompare(d) != score:
> + dismissed = True
> + break
> + if dismissed:
> + continue
> + # check if deferred
> + if not compared:
> + if _deferredcompare(m) != score:
> + continue
> + yield m
> +
> + # assign best matches
> + while True:
> + bm = max(bestmatches) # get best match
> + if not bm:
> + break
> + score, namesim, compared, ir, ia, isbest = bm
> + if not score:
> + break
> + if not compared:
> + if _deferredcompare(bm) != score:
> + continue
> +
> + # Find all files that have the same similarity to the two files.
> + # Examples when name matching is necessary:
> + #
> + # a -> x score basic case, a has the same score to x and y
> + # a -> y score
> + #
> + # a -> x score basic case, x has the same score to a and b
> + # b -> x score
> + #
> + # a -> x score general case, finding all matches must
> + # a -> y score be done iteratively; if the compare is
> + # b -> y score deferred, a dependency list is used,
> + # b -> z score because the score is only an upper bound
> + # ...
> + #
> + # a -> x score no need to follow "100% links"
> + # b -> x 1 because link a -> y exists
> + # b -> y 1
> + # a -> y score
> + rset, aset = [ir], [ia]
> + sm = [bm, []]
> + rsetnew, asetnew = [sm], [sm]
> + scorematches = [sm]
> + while rsetnew:
> + # get all added files that match rsetnew files
> + for newm, depends in rsetnew:
> + if newm[2] == notcompared:
> + depends = copy.copy(depends)
> + depends.append(newm)
> + l = [[m, depends] for m in rmatches[newm[3]] \
> + if m[0]==score and m[4] not in aset]
> + scorematches += l
> + asetnew += l
> + rset += [m[3] for m, d in rsetnew]
> + rsetnew = []
> + # get all removed files that match asetnew files
> + for newm, depends in asetnew:
> + if newm[2] == notcompared:
> + depends = copy.copy(depends)
> + depends.append(newm)
> + l = [[m, depends] for m in amatches[newm[4]] \
> + if m[0]==score and m[3] not in rset]
> + scorematches += l
> + rsetnew += l
> + aset += [m[4] for m, d in asetnew]
> + asetnew = []
> +
> + # find best name matches
> + for bm in _namematches(scorematches):
> + score, namesim, compared, ir, ia, isbest = bm
> + if not score:
> + continue
> + yield removed[ir], added[ia], score
> + # invalidate ir and ia
> + for m in rmatches[ir]:
> + m[0] = 0
> + for m in amatches[ia]:
> + m[0] = 0
> + rmatches[ir] = []
> + amatches[ia] = []
> + # get next best match for invalidated
> + for m in bestmatches:
> + if m and not m[0]:
> + score, namesim, compared, ir2, ia2, isbest = m
> + if rbased:
> + l = rmatches[ir2]
> + m = max(l) if l else 0
> + bestmatches[ir2] = m
> + else:
> + l = amatches[ia2]
> + m = max(l) if l else 0
> + bestmatches[ia2] = m
> + if m:
> + m[5] = True # flag as best
> diff -r f29b674cc221 -r 7b673a95a55d mercurial/bdiff.c
> --- a/mercurial/bdiff.c Wed Sep 17 11:34:37 2008 +0200
> +++ b/mercurial/bdiff.c Wed Oct 01 14:09:06 2008 +0200
> @@ -2,6 +2,7 @@
> bdiff.c - efficient binary diff extension for Mercurial
>
> Copyright 2005, 2006 Matt Mackall <mpm at selenic.com>
> + Copyright 2008 Herbert Griebel <herbertg at gmx.at>
>
> This software may be used and distributed according to the terms of
> the GNU General Public License, incorporated herein by reference.
> @@ -356,11 +357,153 @@
> return result ? result : PyErr_NoMemory();
> }
>
> -static char mdiff_doc[] = "Efficient binary diff.";
> +
> +typedef struct {
> + unsigned int hist[256];
> + unsigned int filelen;
> +} STATDATA;
I think we follow the kernel coding style for C code, so please don't
typedef structs:
struct statistics {
...
}
> +
> +static PyObject* statistics(PyObject *self, PyObject *args)
> +{
> + unsigned char *s;
> + int filelen, i;
> + PyObject *dataobj;
> + STATDATA *stat;
> + unsigned int *hist;
> + if (! PyArg_ParseTuple(args, "s#:statistics", &s, &filelen))
> + return NULL;
> + dataobj = PyString_FromStringAndSize(NULL, sizeof(STATDATA));
> + stat = (STATDATA*) PyString_AS_STRING(dataobj);
You could put the pointer inside a CObject instead of putting it in a string
(and use malloc)
(one problem will be that the equality test won't work anymore with the
CObjects)
> + hist = stat->hist;
> + stat->filelen = filelen;
> +
> + /* build histo */
> + for (i=0; i<256; i++)
> + hist[i] = 0;
> + for (i=0; i<filelen; i++)
> + hist[s[i]] += 1;
> +
> + return dataobj;
> +}
> +
> +static PyObject* maxscorehist(PyObject *self, PyObject *args)
> +{
> + int len1, len2, i;
> + STATDATA *stat1, *stat2;
> + unsigned int maxlen;
> + double maxscore;
> +
> + if (! PyArg_ParseTuple(args, "s#s#:maxscorehist",
> + &stat1, &len1, &stat2, &len2))
> + return NULL;
> + if (len1 != sizeof(STATDATA) || len2 != sizeof(STATDATA)) {
> + PyErr_SetString(PyExc_ValueError, "length of data invalid");
> + return NULL;
> + }
and use PyCObject_Check() here
> +
> + maxlen = stat1->filelen > stat2->filelen ? stat1->filelen : stat2->filelen;
> + if (maxlen == 0)
> + maxscore = 1.;
> + else if (stat1->filelen == 0 || stat2->filelen == 0)
> + maxscore = 0.;
> + else {
> + unsigned int *h1 = stat1->hist;
> + unsigned int *h2 = stat2->hist;
> + unsigned int delta = stat1->filelen > stat2->filelen ?
> + stat1->filelen - stat2->filelen : stat2->filelen - stat1->filelen;
> + for (i=0; i<256; i++)
> + delta += (h1[i] > h2[i]) ? (h1[i]-h2[i]) : (h2[i]-h1[i]);
> + maxscore = 1. - delta / (double)(2 * maxlen);
> + }
> +
> + return PyFloat_FromDouble(maxscore);
> +}
> +
> +int levenshtein_distance(char* s, int n, char* t, int m)
static
> +{
> + int k, i, j, cost, dmin, *d, d1, d2, d3, p;
> + if (n == 0)
> + return m;
> + if (m == 0)
> + return n;
> +
> + m++;
> + n++;
> + d = (int*) malloc(m * n * sizeof(int));
no need to cast
if (!d) -> memoryerror, return -1 ?
> + for (k=0; k<n; k++)
> + d[k] = k;
> + for (k=0; k<m; k++)
> + d[k*n] = k;
> +
> + for (i=1; i<n; i++) {
> + for (j=1; j<m; j++) {
> + p = j*n + i;
> + cost = (s[i-1] == t[j-1]) ? 0 : 1;
> + d3 = d[p-n-1] + cost;
> + d2 = d[p-1] + 1;
> + d1 = d[p-n] + 1;
> + if (d1 > d2)
> + d1 = d2;
> + if (d1 > d3)
> + d1 = d3;
> + d[p] = d1;
> + }
> + }
> + dmin = d[n*m-1];
> + free(d);
> +
> + return dmin;
> +}
> +
> +static PyObject* strsim(PyObject *self, PyObject *args)
> +{
> + int n, m, d;
> + char *s, *t;
> + double sim;
> + if (! PyArg_ParseTuple(args, "s#s#:strsim", &s, &n, &t, &m))
> + return NULL;
> + d = levenshtein_distance(s, n, t, m);
if d < 0 ...
> + if (n==0 && m==0)
> + sim = 1.0;
> + else if (n > m)
> + sim = (n-d) / (double)n;
> + else
> + sim = (m-d) / (double)m;
> + return PyFloat_FromDouble(sim);
> +}
> +
> +static PyObject* reversesim(PyObject *self, PyObject *args)
> +{
> + int n1, n2;
> + char *s1, *s2;
> + if (! PyArg_ParseTuple(args, "s#s#:reversesim", &s1, &n1, &s2, &n2))
> + return NULL;
> + if (n1 > n2) {
> + s1 += n1 - n2;
> + n1 = n2;
> + } else {
> + s2 += n2 - n1;
> + n2 = n1;
> + }
> + while (n1 > 0) {
> + n1--;
> + if (s1[n1] != s2[n1])
> + break;
> + }
> + return PyLong_FromLong(n2 - n1);
> +}
> +
> +
> +
> +static char mdiff_doc[] = "Efficient binary diff and string similarity.";
>
> static PyMethodDef methods[] = {
> {"bdiff", bdiff, METH_VARARGS, "calculate a binary diff\n"},
> {"blocks", blocks, METH_VARARGS, "find a list of matching lines\n"},
> + {"statistics", statistics, METH_VARARGS, "get byte statistics\n"},
> + {"maxscorehist", maxscorehist, METH_VARARGS, "calculate maximum score\n"},
> + {"strsim", strsim, METH_VARARGS, "get similarity measure of strings\n"},
> + {"reversesim", reversesim, METH_VARARGS, "equal characters from right\n"},
> {NULL, NULL}
> };
>
> @@ -368,4 +511,3 @@
> {
> Py_InitModule3("bdiff", methods, mdiff_doc);
> }
> -
> diff -r f29b674cc221 -r 7b673a95a55d mercurial/cmdutil.py
> --- a/mercurial/cmdutil.py Wed Sep 17 11:34:37 2008 +0200
> +++ b/mercurial/cmdutil.py Wed Oct 01 14:09:06 2008 +0200
> @@ -10,6 +10,7 @@
> import os, sys, bisect, stat
> import mdiff, bdiff, util, templater, templatefilters, patch, errno
> import match as _match
> +import addremove as _addremove
>
> revrangesep = ':'
>
> @@ -241,34 +242,6 @@
> def matchfiles(repo, files):
> return _match.exact(repo.root, repo.getcwd(), files)
>
> -def findrenames(repo, added=None, removed=None, threshold=0.5):
> - '''find renamed files -- yields (before, after, score) tuples'''
> - if added is None or removed is None:
> - added, removed = repo.status()[1:3]
> - ctx = repo['.']
> - for a in added:
> - aa = repo.wread(a)
> - bestname, bestscore = None, threshold
> - for r in removed:
> - rr = ctx.filectx(r).data()
> -
> - # bdiff.blocks() returns blocks of matching lines
> - # count the number of bytes in each
> - equal = 0
> - alines = mdiff.splitnewlines(aa)
> - matches = bdiff.blocks(aa, rr)
> - for x1,x2,y1,y2 in matches:
> - for line in alines[x1:x2]:
> - equal += len(line)
> -
> - lengths = len(aa) + len(rr)
> - if lengths:
> - myscore = equal*2.0 / lengths
> - if myscore >= bestscore:
> - bestname, bestscore = r, myscore
> - if bestname:
> - yield bestname, a, bestscore
> -
> def addremove(repo, pats=[], opts={}, dry_run=None, similarity=None):
> if dry_run is None:
> dry_run = opts.get('dry_run')
> @@ -302,7 +275,8 @@
> repo.remove(remove)
> repo.add(add)
> if similarity > 0:
> - for old, new, score in findrenames(repo, add, remove, similarity):
> + for old, new, score in _addremove.findrenames( \
> + repo, add, remove, similarity):
> oldrel, oldexact = mapping[old]
> newrel, newexact = mapping[new]
> if repo.ui.verbose or not oldexact or not newexact:
> diff -r f29b674cc221 -r 7b673a95a55d mercurial/localrepo.py
> --- a/mercurial/localrepo.py Wed Sep 17 11:34:37 2008 +0200
> +++ b/mercurial/localrepo.py Wed Oct 01 14:09:06 2008 +0200
> @@ -528,6 +528,9 @@
>
> def adddatafilter(self, name, filter):
> self._datafilters[name] = filter
> +
> + def wfilesize(self, filename):
> + return os.stat(self.wjoin(filename))[6]
not needed, if you use the context API
>
> def wread(self, filename):
> if self._link(filename):
> diff -r f29b674cc221 -r 7b673a95a55d tests/test-addremove-similar
> --- a/tests/test-addremove-similar Wed Sep 17 11:34:37 2008 +0200
> +++ b/tests/test-addremove-similar Wed Oct 01 14:09:06 2008 +0200
> @@ -1,49 +1,97 @@
> #!/bin/sh
>
> +echo % basic content-matching test
> hg init rep; cd rep
>
> +echo % add 2 files
> touch empty-file
> python -c 'for x in range(10000): print x' > large-file
> -
> hg addremove
> -
> hg commit -m A
>
> +echo % rename large-file to another-file, remove other
> rm large-file empty-file
> python -c 'for x in range(10,10000): print x' > another-file
> -
> -hg addremove -s50
> -
> +hg addremove -s 50
> hg commit -m B
>
> -echo % comparing two empty files caused ZeroDivisionError in the past
> -hg update -C 0
> -rm empty-file
> -touch another-empty-file
> -hg addremove -s50
> +echo % rename another-file to file1, add file2
> +rm another-file
> +python -c 'for x in range(20,10000): print x' > file1
> +python -c 'for x in range(30,10000): print x' > file2
> +hg addremove -s 50
> +hg commit -m C
>
> -cd ..
> -
> -hg init rep2; cd rep2
> -
> -python -c 'for x in range(10000): print x' > large-file
> -python -c 'for x in range(50): print x' > tiny-file
> -
> -hg addremove
> -
> -hg commit -m A
> -
> -python -c 'for x in range(70): print x' > small-file
> -rm tiny-file
> -rm large-file
> -
> -hg addremove -s50
> -
> -hg commit -m B
> -
> -echo % should all fail
> +echo % option-value test, should all fail
> hg addremove -s foo
> hg addremove -s -1
> hg addremove -s 1e6
>
> +echo % content-matching test
> +cd ..
> +hg init rep2; cd rep2
> +
> +echo % add 2 files
> +python -c 'for x in range(10000): print x' > large.file
> +python -c 'for x in range(50): print x' > tiny.file
> +hg addremove -s 50
> +hg commit -m A
> +
> +echo % rename tiny.file to x/small.file
> +python -c 'for x in range(70): print x' > small.file
> +rm tiny.file
> +rm large.file
> +mkdir x
> +mv *.file x
> +hg addremove -s 50
> +hg commit -m B
> +
> +echo % name-matching tests
> +cd ..
> +hg init rep3; cd rep3
> +
> +echo % add 5 files
> +touch a.file
> +touch b.file
> +touch c.file
> +mkdir dir1
> +mkdir dir2
> +touch dir1/z.file
> +touch dir2/z.file
> +hg addremove -s50
> +hg commit -m T1
> +
> +echo % move all to dir0, add z.file
> +mkdir dir0
> +mv *.file dir0
> +mv dir1 dir0
> +mv dir2 dir0
> +touch z.file
> +hg addremove -s50
> +hg commit -m T2
> +
> +echo % rename */z.file to */o.file
> +mv dir0/dir1/z.file dir0/dir1/o.file
> +mv dir0/dir2/z.file dir0/dir2/o.file
> +hg addremove -s50
> +hg commit -m T3
> +
> +echo % rename dir1 to dirA, dir2 to dirB
> +mv dir0/dir1 dir0/dirA
> +mv dir0/dir2 dir0/dirB
> +hg addremove -s50
> +hg commit -m T4
> +
> +echo % remove dir0 files
> +rm dir0/*.file
> +hg commit -A -m T5
> +
> +echo % rename dir0/dirA/o.file to dirX/dirA/x.file
> +echo % rename dir0/dirB/o.file to dirX/dirB/x.file
> +mv dir0/dirA/o.file dir0/dirA/x.file
> +mv dir0/dirB/o.file dir0/dirB/x.file
> +mv dir0 dirX
> +hg addremove -s50
> +hg commit -m T6
> +
> true
> diff -r f29b674cc221 -r 7b673a95a55d tests/test-addremove-similar.out
> --- a/tests/test-addremove-similar.out Wed Sep 17 11:34:37 2008 +0200
> +++ b/tests/test-addremove-similar.out Wed Oct 01 14:09:06 2008 +0200
> @@ -1,20 +1,77 @@
> +% basic content-matching test
> +% add 2 files
> adding empty-file
> adding large-file
> +% rename large-file to another-file, remove other
> adding another-file
> removing empty-file
> removing large-file
> recording removal of large-file as rename to another-file (99% similar)
> -% comparing two empty files caused ZeroDivisionError in the past
> -2 files updated, 0 files merged, 1 files removed, 0 files unresolved
> -adding another-empty-file
> -removing empty-file
> -adding large-file
> -adding tiny-file
> -removing large-file
> -adding small-file
> -removing tiny-file
> -recording removal of tiny-file as rename to small-file (82% similar)
> -% should all fail
> +% rename another-file to file1, add file2
> +removing another-file
> +adding file1
> +adding file2
> +recording removal of another-file as rename to file1 (99% similar)
> +% option-value test, should all fail
> abort: similarity must be a number
> abort: similarity must be between 0 and 100
> abort: similarity must be between 0 and 100
> +% content-matching test
> +% add 2 files
> +adding large.file
> +adding tiny.file
> +% rename tiny.file to x/small.file
> +removing large.file
> +removing tiny.file
> +adding x/small.file
> +recording removal of tiny.file as rename to x/small.file (70% similar)
> +% name-matching tests
> +% add 5 files
> +adding a.file
> +adding b.file
> +adding c.file
> +adding dir1/z.file
> +adding dir2/z.file
> +% move all to dir0, add z.file
> +removing a.file
> +removing b.file
> +removing c.file
> +adding dir0/a.file
> +adding dir0/b.file
> +adding dir0/c.file
> +adding dir0/dir1/z.file
> +adding dir0/dir2/z.file
> +removing dir1/z.file
> +removing dir2/z.file
> +adding z.file
> +recording removal of dir2/z.file as rename to dir0/dir2/z.file (100% similar)
> +recording removal of dir1/z.file as rename to dir0/dir1/z.file (100% similar)
> +recording removal of c.file as rename to dir0/c.file (100% similar)
> +recording removal of b.file as rename to dir0/b.file (100% similar)
> +recording removal of a.file as rename to dir0/a.file (100% similar)
> +% rename */z.file to */o.file
> +adding dir0/dir1/o.file
> +removing dir0/dir1/z.file
> +adding dir0/dir2/o.file
> +removing dir0/dir2/z.file
> +recording removal of dir0/dir2/z.file as rename to dir0/dir2/o.file (100% similar)
> +recording removal of dir0/dir1/z.file as rename to dir0/dir1/o.file (100% similar)
> +% rename dir1 to dirA, dir2 to dirB
> +removing dir0/dir1/o.file
> +removing dir0/dir2/o.file
> +adding dir0/dirA/o.file
> +adding dir0/dirB/o.file
> +recording removal of dir0/dir2/o.file as rename to dir0/dirB/o.file (100% similar)
> +recording removal of dir0/dir1/o.file as rename to dir0/dirA/o.file (100% similar)
> +% remove dir0 files
> +removing dir0/a.file
> +removing dir0/b.file
> +removing dir0/c.file
> +% rename dir0/dirA/o.file to dirX/dirA/x.file
> +% rename dir0/dirB/o.file to dirX/dirB/x.file
> +removing dir0/dirA/o.file
> +removing dir0/dirB/o.file
> +adding dirX/dirA/x.file
> +adding dirX/dirB/x.file
> +recording removal of dir0/dirB/o.file as rename to dirX/dirB/x.file (100% similar)
> +recording removal of dir0/dirA/o.file as rename to dirX/dirA/x.file (100% similar)
> diff -r f29b674cc221 -r 7b673a95a55d tests/test-bdiff
> --- a/tests/test-bdiff Wed Sep 17 11:34:37 2008 +0200
> +++ b/tests/test-bdiff Wed Oct 01 14:09:06 2008 +0200
> @@ -39,4 +39,57 @@
> test("a\n", "a\n")
> test("a\nb", "a\nb")
>
> +
> +def test(a, b, r, show=True):
> + s1 = bdiff.statistics(a)
> + s2 = bdiff.statistics(b)
> + s = bdiff.maxscorehist(s1, s2)
> + if not show:
> + a = b = '?'
> + print 'maxscorehist:', a, b, 'assert', s, '=', r, s==r
> + if r == 1.0:
> + print ' equal:', s1 == s2
> +
> +str1 = str2 = ''
> +for i in xrange(256):
> + str1 += chr(i)
> + str2 += chr(255 - i)
> +test(str1, str2, 1.0, show=False)
> +test('a', 'a', 1.0)
> +test('aa', 'ab', 0.5)
> +test('aa', 'ba', 0.5)
> +test('aa', 'bbba', 0.25)
> +test('a', 'b', 0.0)
> +test('a', '', 0.0)
> +test('', 'a', 0.0)
> +test('', '', 1.0)
> +
> +
> +def test(a, b, r):
> + s = bdiff.strsim(a, b)
> + print 'strsim:', a, b, 'assert', s, '=', r, s==r
> +
> +test('a', 'a', 1.0)
> +test('a', 'b', 0.0)
> +test('a', 'aa', 0.5)
> +test('aa', 'a', 0.5)
> +test('aaab', 'aaaa', 0.75)
> +test('aaaa', 'aabb', 0.5)
> +test('aabb', 'aaaa', 0.5)
> +test('a', '', 0.0)
> +test('', 'a', 0.0)
> +test('', '', 1.0)
> +
> +
> +def test(a, b, r):
> + s = bdiff.reversesim(a, b)
> + print 'reversesim:', a, b, 'assert', s, '=', r, s==r
> +
> +test('a', 'a', 1)
> +test('aa', 'a', 1)
> +test('a', 'aa', 1)
> +test('a', '', 0)
> +test('', 'a', 0)
> +test('', '', 0)
> +
> print "done"
> diff -r f29b674cc221 -r 7b673a95a55d tests/test-bdiff.out
> --- a/tests/test-bdiff.out Wed Sep 17 11:34:37 2008 +0200
> +++ b/tests/test-bdiff.out Wed Oct 01 14:09:06 2008 +0200
> @@ -17,4 +17,32 @@
> *** 'abc' 'abc'
> *** 'a\n' 'a\n'
> *** 'a\nb' 'a\nb'
> +maxscorehist: ? ? assert 1.0 = 1.0 True
> + equal: True
> +maxscorehist: a a assert 1.0 = 1.0 True
> + equal: True
> +maxscorehist: aa ab assert 0.5 = 0.5 True
> +maxscorehist: aa ba assert 0.5 = 0.5 True
> +maxscorehist: aa bbba assert 0.25 = 0.25 True
> +maxscorehist: a b assert 0.0 = 0.0 True
> +maxscorehist: a assert 0.0 = 0.0 True
> +maxscorehist: a assert 0.0 = 0.0 True
> +maxscorehist: assert 1.0 = 1.0 True
> + equal: True
> +strsim: a a assert 1.0 = 1.0 True
> +strsim: a b assert 0.0 = 0.0 True
> +strsim: a aa assert 0.5 = 0.5 True
> +strsim: aa a assert 0.5 = 0.5 True
> +strsim: aaab aaaa assert 0.75 = 0.75 True
> +strsim: aaaa aabb assert 0.5 = 0.5 True
> +strsim: aabb aaaa assert 0.5 = 0.5 True
> +strsim: a assert 0.0 = 0.0 True
> +strsim: a assert 0.0 = 0.0 True
> +strsim: assert 1.0 = 1.0 True
> +reversesim: a a assert 1 = 1 True
> +reversesim: aa a assert 1 = 1 True
> +reversesim: a aa assert 1 = 1 True
> +reversesim: a assert 0 = 0 True
> +reversesim: a assert 0 = 0 True
> +reversesim: assert 0 = 0 True
> done
>
--
:wq
More information about the Mercurial-devel
mailing list