[PATCH] auto rename: best matches and speed improvements UPDATE1

Wed Oct 1 10:57:31 CDT 2008

On Wed, Oct 01, 2008 at 02:50:04PM +0200, Herbert Griebel wrote:
> # HG changeset patch
> # User Herbert Griebel <herbertg at gmx.at>
> # Date 1222862946 -7200
> # Node ID 7b673a95a55d0e32df824830a8419d4c1a220b9a
> # Parent  f29b674cc2210126c2899d94d882c367a8ea64bc
> auto rename: best matches and speed improvements
> 
> 
> diff -r f29b674cc221 -r 7b673a95a55d mercurial/addremove.py
> --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
> +++ b/mercurial/addremove.py	Wed Oct 01 14:09:06 2008 +0200
> @@ -0,0 +1,356 @@
> +# addremove.py - command addremove
> +#
> +# Copyright 2008 Herbert Griebel <herbertg at gmx.at>
> +#
> +# This software may be used and distributed according to the terms
> +# of the GNU General Public License, incorporated herein by reference.
> +
> +from i18n import _
> +import os
> +import mdiff, bdiff, util
> +import copy
> +
> +def findrenames(repo, added=None, removed=None, threshold=0.5):
> +    '''find renamed files -- yields (before, after, score) tuples'''
> +    if added is None or removed is None:
> +        added, removed = repo.status()[1:3]
> +    if not added or not removed:
> +        return
> +    ctx = repo['.']
rctx = repo['.']
actx = repo[None]
> +
> +    # cash file attributes

cache ?

> +    nr = len(removed)
> +    na = len(added)
> +    rfiles = [()] * nr
> +    afiles = [()] * na
> +    for i, r in enumerate(removed):
> +        rfiles[i] = (ctx.filectx(r).size(),
> +                     os.path.split(r)[1], os.path.splitext(r)[1], i)
rctx.filectx(r).size()
> +    for i, a in enumerate(added):
> +        afiles[i] = (repo.wfilesize(a),
actx.filectx(r).size()
> +                     os.path.split(a)[1], os.path.splitext(a)[1], i)

could be a simple list comprehension (or refactored, since the only
difference is actx/rctx)
rfiles = [(ctx.filectx(r).size(),
           os.path.split(r)[1], os.path.splitext(r)[1], i)
	  for i, r in enumerate(removed)]

> +    # check loop order
> +    if na == 1:
> +        addedfirst = True
> +    elif nr == 1:
> +        addedfirst = False
> +    else:
> +        # estimate times to decide loops
> +        Sr = sum([f[0] for f in rfiles])
> +        Sa = sum([f[0] for f in afiles])
> +        dr = Sr*1.60882e-008 + nr*0.00145002
> +        da = Sa*9.03138e-009 + na*0.00110696

magic values ?

> [...]
> +
> +    def _getscore(rr, aa, alines):
> +        '''bdiff.blocks() returns blocks of matching lines
> +           count the number of bytes in each'''
> +        equal = 0
> +        matches = bdiff.blocks(aa, rr)
> +        for x1,x2,y1,y2 in matches:
x1, x2, ... (space after comma)

> +            for line in alines[x1:x2]:
> +                equal += len(line)
> +        score = float(equal)  / max([len(aa), len(rr)])
> +        return score
> +
> +    def _addmatch(score, compared):
> +        '''enters match into the 3 lists'''
> +        namesim = bdiff.reversesim(removed[ir], added[ia])
> +        m = [score, namesim, compared, ir, ia, False]
> +        rmatches[ir].append(m)
> +        amatches[ia].append(m)
> +        if rbased:
> +            if bestmatches[ir] < m:
> +                bestmatches[ir][5] = False
> +                bestmatches[ir] = m
> +                m[5] = True
> +        else:
> +            if bestmatches[ia] < m:
> +                bestmatches[ia][5] = False
> +                bestmatches[ia] = m
> +                m[5] = True
could be refactored:
if rbased:
    idx = ir
else:
    idx = ia
if bestmatches[idx] < m:
    bestmatches[idx][5] = False
    bestmatches[idx] = m
    m[5] = True


> +        if rbestscores[ir] < score:
> +            rbestscores[ir] = score
rbestscores[ir] = max(rbestscores[ir], score)
> +        if abestscores[ia] < score:
> +            abestscores[ia] = score
rbestscores[ir] = max(rbestscores[ir], score)
> +
> +    # loop 1
> +    for size1, name1, ext1, ifile1 in filelist1:
> +        if addedfirst:
> +            ia = ifile1
> +        else:
> +            ir = ifile1
> +        loadfile1 = True
> +
> +        # check similar file names first (moves)
> +        nmoves = 0
> +        for i in xrange(len(filelist2)):
> +            if name1 == filelist2[i][1]:
> +                if i != nmoves:
> +                    # swap files
> +                    filelist2[nmoves], filelist2[i] = \
> +                            filelist2[i], filelist2[nmoves]
> +                nmoves += 1
> +
> +        # loop 2
> +        for size2, name2, ext2, ifile2 in filelist2:
> +            if addedfirst:
> +                ir = ifile2
> +            else:
> +                ia = ifile2
> +
> +            # special case: empty files
> +            if not size1 or not size2:
> +                score = 1 if size1 == size2 else 0
if ... else was introduced in 2.5 (iirc), please use:
score = (size1 == size2) and 1 or 0
or
int(size1 == size2) (I don't know if this one is guaranteed to work for
all python version)

> +                if score >= minscore:
> +                    _addmatch(score, iscompared)
> +                continue
> +
> +            # maximum score that could be reached because of sizes
> +            maxscoresize = \
> +                float(size2)/size1 if size1>size2 else float(size1)/size2
ditto (although x and t or f won't work if size2 == 0, which shouldn't
happend (special case above))

> +            if maxscoresize < minscore:
> +                continue
> +
> +            # maximum score that could be reached because of statistics
> +            maxscore = maxscoresize
> +            if rstat[ir] and astat[ia]:
> +                # histogram delta
> +                maxscorehist = bdiff.maxscorehist(rstat[ir], astat[ia])
> +                if maxscorehist < minscore:
> +                    continue
> +                # defer compares, take lowest upper bound
> +                if maxscorehist < maxscore:
> +                    maxscore = maxscorehist
maxscore = min(maxscore, maxscorehist)
> +
> +            # deferred compares
> +            if ifile1 and (maxscore < rbestscores[ir] or \
> +                        maxscore < abestscores[ia]):
> +                _addmatch(maxscore, notcompared)
> +                continue
> +
> +            # load file 1
> +            if loadfile1:
> +                if addedfirst:
> +                    aa = repo.wread(added[ia])
actx.filectx(...).data()
> +                    alines = mdiff.splitnewlines(aa)
> +                    astat[ia] = bdiff.statistics(aa)
> +                else:
> +                    rr = ctx.filectx(removed[ir]).data()
rctx...
> +                    rstat[ir] = bdiff.statistics(rr)
> +                loadfile1 = False
> +
> +            # load file 2
> +            if addedfirst:
> +                rr = ctx.filectx(removed[ir]).data()
rctx.filectx(...).data()
> +                if not rstat[ir]:
> +                    rstat[ir] = bdiff.statistics(rr)
> +            else:
> +                aa = repo.wread(added[ia])
actx.filectx(...).data()
> +                alines = mdiff.splitnewlines(aa)
> +                if not astat[ia]:
> +                    astat[ia] = bdiff.statistics(aa)
> +
> +            # score it
> +            score = _getscore(rr, aa, alines)
> +            if score >= minscore:
> +                _addmatch(score, iscompared)
> +
> +    def _deferredcompare(m):
> +        '''function returns 0 if maxscore was not reached
> +        and updates bestmatches'''
> +        maxscore, namesim, compared, ir, ia, isbest = m
> +        if compared or not maxscore:
> +            return maxscore
> +        rr = aa = ''
> +        if not rstat[ir]:
> +            rr = ctx.filectx(removed[ir]).data()
> +            rstat[ir] = bdiff.statistics(rr)
> +        if not astat[ia]:
> +            aa = repo.wread(added[ia])
actx... (you could refactor)
> +            astat[ia] = bdiff.statistics(aa)
> +        # check histogram diff before comparing
> +        maxscorehist = bdiff.maxscorehist(rstat[ir], astat[ia])
> +        if maxscorehist < minscore:
> +            maxscorehist = 0
> +        if maxscore > maxscorehist:
> +            m[0] = maxscorehist
> +            maxscore = 0 # do not compare, we need new max
> +        if maxscore: # actually compare the files
> +            if not rr:
> +                rr = ctx.filectx(removed[ir]).data()
rctx
> +            if not aa:
> +                aa = repo.wread(added[ia])
actx
> +            alines = mdiff.splitnewlines(aa)
> +            score = _getscore(rr, aa, alines)
> +            m[2] = iscompared # not deferred anymore
> +            if score == maxscore:
> +                return maxscore
> +            m[0] = 0 if score < minscore else score
if ... else should be changed

> +        if isbest: # get best score
> +            m[5] = False # remove from best
> +            if rbased:
> +                l = rmatches[ir]
> +                bm = max(l) if l else 0
ditto
> +                bestmatches[ir] = bm
> +            else:
> +                l = amatches[ia]
> +                bm = max(l) if l else 0
ditto
> +                bestmatches[ir] = bm
> +                bestmatches[ia] = bm
can it be refactored ? (why is it asymetrical ?)
> +            if bm:
> +                bm[5] = True # flag as best
> +        return 0
> +
> +    def _namematches(scorematches):
> +        '''Find best matches for a group of removed and added files
> +        Matching score (high to low): file extension equal
> +                                      file name similarity (levenshtein)
> +                                      full pathname similarity (levenshtein)
> +                                      common substring length from right
> +        Hence we assume that a file extension change is least likely.'''
> +        if len(scorematches) == 1:
> +            yield scorematches[0][0]
> +            return
> +        # "name-score" all matches
> +        best = [0] * len(scorematches)
> +        for i,sm in enumerate(scorematches):
space after comma
> +            score, namesim, compared, ir, ia, isbest = sm[0]
> +            r = removed[ir]
> +            a = added[ia]
> +            rname = rfiles[ir][1]
> +            aname = afiles[ia][1]
> +            rext = rfiles[ir][2]
> +            aext = afiles[ia][2]
> +            score1 = rext == aext
> +            score2 = bdiff.strsim(rname, aname)
> +            score3 = bdiff.strsim(r, a)
> +            score4 = bdiff.reversesim(r, a)
> +            ndepends = -len(sm[1]) + (0 if compared else -1)
if ... else
> +            best[i] = (score1, score2, score3, score4, ndepends, sm)
> +        # output best name matches
> +        best.sort(reverse = True)
> +        for score1, score2, score3, score4, n, sm in best:
> +            m, depends = sm
> +            score, namesim, compared, ir, ia, isbest = m
> +            # depends on other deferred?
> +            if depends:
> +                dismissed = False
> +                for d in depends:
> +                    if _deferredcompare(d) != score:
> +                        dismissed = True
> +                        break
> +                if dismissed:
> +                    continue
> +            # check if deferred
> +            if not compared:
> +                if _deferredcompare(m) != score:
> +                    continue
> +            yield m
> +
> +    # assign best matches
> +    while True:
> +        bm = max(bestmatches) # get best match
> +        if not bm:
> +            break
> +        score, namesim, compared, ir, ia, isbest = bm
> +        if not score:
> +            break
> +        if not compared:
> +            if _deferredcompare(bm) != score:
> +                continue
> +
> +        # Find all files that have the same similarity to the two files.
> +        # Examples when name matching is necessary:
> +        #
> +        # a -> x    score       basic case, a has the same score to x and y
> +        # a -> y    score
> +        #
> +        # a -> x    score       basic case, x has the same score to a and b
> +        # b -> x    score
> +        #
> +        # a -> x    score       general case, finding all matches must
> +        # a -> y    score       be done iteratively; if the compare is
> +        # b -> y    score       deferred, a dependency list is used,
> +        # b -> z    score       because the score is only an upper bound
> +        # ...
> +        #
> +        # a -> x    score       no need to follow "100% links"
> +        # b -> x    1           because link a -> y exists
> +        # b -> y    1
> +        # a -> y    score
> +        rset, aset = [ir], [ia]
> +        sm = [bm, []]
> +        rsetnew, asetnew = [sm], [sm]
> +        scorematches = [sm]
> +        while rsetnew:
> +            # get all added files that match rsetnew files
> +            for newm, depends in rsetnew:
> +                if newm[2] == notcompared:
> +                    depends = copy.copy(depends)
> +                    depends.append(newm)
> +                l = [[m, depends] for m in rmatches[newm[3]] \
> +                        if m[0]==score and m[4] not in aset]
> +                scorematches += l
> +                asetnew += l
> +            rset += [m[3] for m, d in rsetnew]
> +            rsetnew = []
> +            # get all removed files that match asetnew files
> +            for newm, depends in asetnew:
> +                if newm[2] == notcompared:
> +                    depends = copy.copy(depends)
> +                    depends.append(newm)
> +                l = [[m, depends] for m in amatches[newm[4]] \
> +                        if m[0]==score and m[3] not in rset]
> +                scorematches += l
> +                rsetnew += l
> +            aset += [m[4] for m, d in asetnew]
> +            asetnew = []
> +
> +        # find best name matches
> +        for bm in _namematches(scorematches):
> +            score, namesim, compared, ir, ia, isbest = bm
> +            if not score:
> +                continue
> +            yield removed[ir], added[ia], score
> +            # invalidate ir and ia
> +            for m in rmatches[ir]:
> +                m[0] = 0
> +            for m in amatches[ia]:
> +                m[0] = 0
> +            rmatches[ir] = []
> +            amatches[ia] = []
> +            # get next best match for invalidated
> +            for m in bestmatches:
> +                if m and not m[0]:
> +                    score, namesim, compared, ir2, ia2, isbest = m
> +                    if rbased:
> +                        l = rmatches[ir2]
> +                        m = max(l) if l else 0
> +                        bestmatches[ir2] = m
> +                    else:
> +                        l = amatches[ia2]
> +                        m = max(l) if l else 0
> +                        bestmatches[ia2] = m
> +                    if m:
> +                        m[5] = True # flag as best
> diff -r f29b674cc221 -r 7b673a95a55d mercurial/bdiff.c
> --- a/mercurial/bdiff.c	Wed Sep 17 11:34:37 2008 +0200
> +++ b/mercurial/bdiff.c	Wed Oct 01 14:09:06 2008 +0200
> @@ -2,6 +2,7 @@
>   bdiff.c - efficient binary diff extension for Mercurial
> 
>   Copyright 2005, 2006 Matt Mackall <mpm at selenic.com>
> + Copyright 2008 Herbert Griebel <herbertg at gmx.at>
> 
>   This software may be used and distributed according to the terms of
>   the GNU General Public License, incorporated herein by reference.
> @@ -356,11 +357,153 @@
>  	return result ? result : PyErr_NoMemory();
>  }
> 
> -static char mdiff_doc[] = "Efficient binary diff.";
> +
> +typedef struct {
> +	unsigned int hist[256];
> +	unsigned int filelen;
> +} STATDATA;

I think we follow the kernel coding style for C code, so please don't
typedef structs:
 struct statistics {
 ...
 }
> +
> +static PyObject* statistics(PyObject *self, PyObject *args)
> +{
> +	unsigned char *s;
> +	int filelen, i;
> +	PyObject *dataobj;
> +	STATDATA *stat;
> +	unsigned int *hist;
> +	if (! PyArg_ParseTuple(args, "s#:statistics", &s, &filelen))
> +		return NULL;
> +	dataobj = PyString_FromStringAndSize(NULL, sizeof(STATDATA));
> +	stat = (STATDATA*) PyString_AS_STRING(dataobj);

You could put the pointer inside a CObject instead of putting it in a string
(and use malloc)

(one problem will be that the equality test won't work anymore with the
CObjects)
> +	hist = stat->hist;
> +	stat->filelen = filelen;
> +
> +	/*  build histo */
> +	for (i=0; i<256; i++)
> +		hist[i] = 0;
> +	for (i=0; i<filelen; i++)
> +		hist[s[i]] += 1;
> +
> +	return dataobj;
> +}
> +
> +static PyObject* maxscorehist(PyObject *self, PyObject *args)
> +{
> +	int len1, len2, i;
> +	STATDATA *stat1, *stat2;
> +	unsigned int maxlen;
> +	double maxscore;
> +
> +	if (! PyArg_ParseTuple(args, "s#s#:maxscorehist",
> +			&stat1, &len1, &stat2, &len2))
> +		return NULL;
> +	if (len1 != sizeof(STATDATA) || len2 != sizeof(STATDATA)) {
> +		PyErr_SetString(PyExc_ValueError, "length of data invalid");
> +		return NULL;
> +	}

and use PyCObject_Check() here
> +
> +	maxlen = stat1->filelen > stat2->filelen ? stat1->filelen : stat2->filelen;
> +	if (maxlen == 0)
> +		maxscore = 1.;
> +	else if (stat1->filelen == 0 || stat2->filelen == 0)
> +		maxscore = 0.;
> +	else {
> +		unsigned int *h1 = stat1->hist;
> +		unsigned int *h2 = stat2->hist;
> +		unsigned int delta = stat1->filelen > stat2->filelen ?
> +			stat1->filelen - stat2->filelen : stat2->filelen - stat1->filelen;
> +		for (i=0; i<256; i++)
> +			delta += (h1[i] > h2[i]) ? (h1[i]-h2[i]) : (h2[i]-h1[i]);
> +		maxscore = 1. - delta / (double)(2 * maxlen);
> +	}
> +
> +	return PyFloat_FromDouble(maxscore);
> +}
> +
> +int levenshtein_distance(char* s, int n, char* t, int m)
static
> +{
> +	int k, i, j, cost, dmin, *d, d1, d2, d3, p;
> +	if (n == 0)
> +		return m;
> +	if (m == 0)
> +		return n;
> +
> +	m++;
> +	n++;
> +	d = (int*) malloc(m * n * sizeof(int));
no need to cast

if (!d) -> memoryerror, return -1 ?

> +	for (k=0; k<n; k++)
> +		d[k] = k;
> +	for (k=0; k<m; k++)
> +		d[k*n] = k;
> +
> +	for (i=1; i<n; i++) {
> +		for (j=1; j<m; j++) {
> +			p = j*n + i;
> +			cost = (s[i-1] == t[j-1]) ? 0 : 1;
> +			d3 = d[p-n-1] + cost;
> +			d2 = d[p-1] + 1;
> +			d1 = d[p-n] + 1;
> +			if (d1 > d2)
> +				d1 = d2;
> +			if (d1 > d3)
> +				d1 = d3;
> +			d[p] = d1;
> +		}
> +	}
> +	dmin = d[n*m-1];
> +	free(d);
> +
> +	return dmin;
> +}
> +
> +static PyObject* strsim(PyObject *self, PyObject *args)
> +{
> +	int n, m, d;
> +	char *s, *t;
> +	double sim;
> +	if (! PyArg_ParseTuple(args, "s#s#:strsim", &s, &n, &t, &m))
> +		return NULL;
> +	d = levenshtein_distance(s, n, t, m);
if d < 0 ...
> +	if (n==0 && m==0)
> +		sim = 1.0;
> +	else if (n > m)
> +		sim = (n-d) / (double)n;
> +	else
> +		sim = (m-d) / (double)m;
> +	return PyFloat_FromDouble(sim);
> +}
> +
> +static PyObject* reversesim(PyObject *self, PyObject *args)
> +{
> +	int n1, n2;
> +	char *s1, *s2;
> +	if (! PyArg_ParseTuple(args, "s#s#:reversesim", &s1, &n1, &s2, &n2))
> +		return NULL;
> +	if (n1 > n2) {
> +		s1 += n1 - n2;
> +		n1 = n2;
> +	} else {
> +		s2 += n2 - n1;
> +		n2 = n1;
> +	}
> +	while (n1 > 0) {
> +		n1--;
> +		if (s1[n1] != s2[n1])
> +			break;
> +	}
> +	return PyLong_FromLong(n2 - n1);
> +}
> +
> +
> +
> +static char mdiff_doc[] = "Efficient binary diff and string similarity.";
> 
>  static PyMethodDef methods[] = {
>  	{"bdiff", bdiff, METH_VARARGS, "calculate a binary diff\n"},
>  	{"blocks", blocks, METH_VARARGS, "find a list of matching lines\n"},
> +	{"statistics", statistics, METH_VARARGS, "get byte statistics\n"},
> +	{"maxscorehist", maxscorehist, METH_VARARGS, "calculate maximum score\n"},
> +	{"strsim", strsim, METH_VARARGS, "get similarity measure of strings\n"},
> +	{"reversesim", reversesim, METH_VARARGS, "equal characters from right\n"},
>  	{NULL, NULL}
>  };
> 
> @@ -368,4 +511,3 @@
>  {
>  	Py_InitModule3("bdiff", methods, mdiff_doc);
>  }
> -
> diff -r f29b674cc221 -r 7b673a95a55d mercurial/cmdutil.py
> --- a/mercurial/cmdutil.py	Wed Sep 17 11:34:37 2008 +0200
> +++ b/mercurial/cmdutil.py	Wed Oct 01 14:09:06 2008 +0200
> @@ -10,6 +10,7 @@
>  import os, sys, bisect, stat
>  import mdiff, bdiff, util, templater, templatefilters, patch, errno
>  import match as _match
> +import addremove as _addremove
> 
>  revrangesep = ':'
> 
> @@ -241,34 +242,6 @@
>  def matchfiles(repo, files):
>      return _match.exact(repo.root, repo.getcwd(), files)
> 
> -def findrenames(repo, added=None, removed=None, threshold=0.5):
> -    '''find renamed files -- yields (before, after, score) tuples'''
> -    if added is None or removed is None:
> -        added, removed = repo.status()[1:3]
> -    ctx = repo['.']
> -    for a in added:
> -        aa = repo.wread(a)
> -        bestname, bestscore = None, threshold
> -        for r in removed:
> -            rr = ctx.filectx(r).data()
> -
> -            # bdiff.blocks() returns blocks of matching lines
> -            # count the number of bytes in each
> -            equal = 0
> -            alines = mdiff.splitnewlines(aa)
> -            matches = bdiff.blocks(aa, rr)
> -            for x1,x2,y1,y2 in matches:
> -                for line in alines[x1:x2]:
> -                    equal += len(line)
> -
> -            lengths = len(aa) + len(rr)
> -            if lengths:
> -                myscore = equal*2.0 / lengths
> -                if myscore >= bestscore:
> -                    bestname, bestscore = r, myscore
> -        if bestname:
> -            yield bestname, a, bestscore
> -
>  def addremove(repo, pats=[], opts={}, dry_run=None, similarity=None):
>      if dry_run is None:
>          dry_run = opts.get('dry_run')
> @@ -302,7 +275,8 @@
>          repo.remove(remove)
>          repo.add(add)
>      if similarity > 0:
> -        for old, new, score in findrenames(repo, add, remove, similarity):
> +        for old, new, score in _addremove.findrenames( \
> +                    repo, add, remove, similarity):
>              oldrel, oldexact = mapping[old]
>              newrel, newexact = mapping[new]
>              if repo.ui.verbose or not oldexact or not newexact:
> diff -r f29b674cc221 -r 7b673a95a55d mercurial/localrepo.py
> --- a/mercurial/localrepo.py	Wed Sep 17 11:34:37 2008 +0200
> +++ b/mercurial/localrepo.py	Wed Oct 01 14:09:06 2008 +0200
> @@ -528,6 +528,9 @@
> 
>      def adddatafilter(self, name, filter):
>          self._datafilters[name] = filter
> +
> +    def wfilesize(self, filename):
> +        return os.stat(self.wjoin(filename))[6]
not needed, if you use the context API
> 
>      def wread(self, filename):
>          if self._link(filename):
> diff -r f29b674cc221 -r 7b673a95a55d tests/test-addremove-similar
> --- a/tests/test-addremove-similar	Wed Sep 17 11:34:37 2008 +0200
> +++ b/tests/test-addremove-similar	Wed Oct 01 14:09:06 2008 +0200
> @@ -1,49 +1,97 @@
>  #!/bin/sh
> 
> +echo % basic content-matching test
>  hg init rep; cd rep
> 
> +echo % add 2 files
>  touch empty-file
>  python -c 'for x in range(10000): print x' > large-file
> -
>  hg addremove
> -
>  hg commit -m A
> 
> +echo % rename large-file to another-file, remove other
>  rm large-file empty-file
>  python -c 'for x in range(10,10000): print x' > another-file
> -
> -hg addremove -s50
> -
> +hg addremove -s 50
>  hg commit -m B
> 
> -echo % comparing two empty files caused ZeroDivisionError in the past
> -hg update -C 0
> -rm empty-file
> -touch another-empty-file
> -hg addremove -s50
> +echo % rename another-file to file1, add file2
> +rm another-file
> +python -c 'for x in range(20,10000): print x' > file1
> +python -c 'for x in range(30,10000): print x' > file2
> +hg addremove -s 50
> +hg commit -m C
> 
> -cd ..
> -
> -hg init rep2; cd rep2
> -
> -python -c 'for x in range(10000): print x' > large-file
> -python -c 'for x in range(50): print x' > tiny-file
> -
> -hg addremove
> -
> -hg commit -m A
> -
> -python -c 'for x in range(70): print x' > small-file
> -rm tiny-file
> -rm large-file
> -
> -hg addremove -s50
> -
> -hg commit -m B
> -
> -echo % should all fail
> +echo % option-value test, should all fail
>  hg addremove -s foo
>  hg addremove -s -1
>  hg addremove -s 1e6
> 
> +echo % content-matching test
> +cd ..
> +hg init rep2; cd rep2
> +
> +echo % add 2 files
> +python -c 'for x in range(10000): print x' > large.file
> +python -c 'for x in range(50): print x' > tiny.file
> +hg addremove -s 50
> +hg commit -m A
> +
> +echo % rename tiny.file to x/small.file
> +python -c 'for x in range(70): print x' > small.file
> +rm tiny.file
> +rm large.file
> +mkdir x
> +mv *.file x
> +hg addremove -s 50
> +hg commit -m B
> +
> +echo % name-matching tests
> +cd ..
> +hg init rep3; cd rep3
> +
> +echo % add 5 files
> +touch a.file
> +touch b.file
> +touch c.file
> +mkdir dir1
> +mkdir dir2
> +touch dir1/z.file
> +touch dir2/z.file
> +hg addremove -s50
> +hg commit -m T1
> +
> +echo % move all to dir0, add z.file
> +mkdir dir0
> +mv *.file dir0
> +mv dir1 dir0
> +mv dir2 dir0
> +touch z.file
> +hg addremove -s50
> +hg commit -m T2
> +
> +echo % rename */z.file to */o.file
> +mv dir0/dir1/z.file dir0/dir1/o.file
> +mv dir0/dir2/z.file dir0/dir2/o.file
> +hg addremove -s50
> +hg commit -m T3
> +
> +echo % rename dir1 to dirA, dir2 to dirB
> +mv dir0/dir1 dir0/dirA
> +mv dir0/dir2 dir0/dirB
> +hg addremove -s50
> +hg commit -m T4
> +
> +echo % remove dir0 files
> +rm dir0/*.file
> +hg commit -A -m T5
> +
> +echo % rename dir0/dirA/o.file to dirX/dirA/x.file
> +echo % rename dir0/dirB/o.file to dirX/dirB/x.file
> +mv dir0/dirA/o.file dir0/dirA/x.file
> +mv dir0/dirB/o.file dir0/dirB/x.file
> +mv dir0 dirX
> +hg addremove -s50
> +hg commit -m T6
> +
>  true
> diff -r f29b674cc221 -r 7b673a95a55d tests/test-addremove-similar.out
> --- a/tests/test-addremove-similar.out	Wed Sep 17 11:34:37 2008 +0200
> +++ b/tests/test-addremove-similar.out	Wed Oct 01 14:09:06 2008 +0200
> @@ -1,20 +1,77 @@
> +% basic content-matching test
> +% add 2 files
>  adding empty-file
>  adding large-file
> +% rename large-file to another-file, remove other
>  adding another-file
>  removing empty-file
>  removing large-file
>  recording removal of large-file as rename to another-file (99% similar)
> -% comparing two empty files caused ZeroDivisionError in the past
> -2 files updated, 0 files merged, 1 files removed, 0 files unresolved
> -adding another-empty-file
> -removing empty-file
> -adding large-file
> -adding tiny-file
> -removing large-file
> -adding small-file
> -removing tiny-file
> -recording removal of tiny-file as rename to small-file (82% similar)
> -% should all fail
> +% rename another-file to file1, add file2
> +removing another-file
> +adding file1
> +adding file2
> +recording removal of another-file as rename to file1 (99% similar)
> +% option-value test, should all fail
>  abort: similarity must be a number
>  abort: similarity must be between 0 and 100
>  abort: similarity must be between 0 and 100
> +% content-matching test
> +% add 2 files
> +adding large.file
> +adding tiny.file
> +% rename tiny.file to x/small.file
> +removing large.file
> +removing tiny.file
> +adding x/small.file
> +recording removal of tiny.file as rename to x/small.file (70% similar)
> +% name-matching tests
> +% add 5 files
> +adding a.file
> +adding b.file
> +adding c.file
> +adding dir1/z.file
> +adding dir2/z.file
> +% move all to dir0, add z.file
> +removing a.file
> +removing b.file
> +removing c.file
> +adding dir0/a.file
> +adding dir0/b.file
> +adding dir0/c.file
> +adding dir0/dir1/z.file
> +adding dir0/dir2/z.file
> +removing dir1/z.file
> +removing dir2/z.file
> +adding z.file
> +recording removal of dir2/z.file as rename to dir0/dir2/z.file (100% similar)
> +recording removal of dir1/z.file as rename to dir0/dir1/z.file (100% similar)
> +recording removal of c.file as rename to dir0/c.file (100% similar)
> +recording removal of b.file as rename to dir0/b.file (100% similar)
> +recording removal of a.file as rename to dir0/a.file (100% similar)
> +% rename */z.file to */o.file
> +adding dir0/dir1/o.file
> +removing dir0/dir1/z.file
> +adding dir0/dir2/o.file
> +removing dir0/dir2/z.file
> +recording removal of dir0/dir2/z.file as rename to dir0/dir2/o.file (100% similar)
> +recording removal of dir0/dir1/z.file as rename to dir0/dir1/o.file (100% similar)
> +% rename dir1 to dirA, dir2 to dirB
> +removing dir0/dir1/o.file
> +removing dir0/dir2/o.file
> +adding dir0/dirA/o.file
> +adding dir0/dirB/o.file
> +recording removal of dir0/dir2/o.file as rename to dir0/dirB/o.file (100% similar)
> +recording removal of dir0/dir1/o.file as rename to dir0/dirA/o.file (100% similar)
> +% remove dir0 files
> +removing dir0/a.file
> +removing dir0/b.file
> +removing dir0/c.file
> +% rename dir0/dirA/o.file to dirX/dirA/x.file
> +% rename dir0/dirB/o.file to dirX/dirB/x.file
> +removing dir0/dirA/o.file
> +removing dir0/dirB/o.file
> +adding dirX/dirA/x.file
> +adding dirX/dirB/x.file
> +recording removal of dir0/dirB/o.file as rename to dirX/dirB/x.file (100% similar)
> +recording removal of dir0/dirA/o.file as rename to dirX/dirA/x.file (100% similar)
> diff -r f29b674cc221 -r 7b673a95a55d tests/test-bdiff
> --- a/tests/test-bdiff	Wed Sep 17 11:34:37 2008 +0200
> +++ b/tests/test-bdiff	Wed Oct 01 14:09:06 2008 +0200
> @@ -39,4 +39,57 @@
>  test("a\n", "a\n")
>  test("a\nb", "a\nb")
> 
> +
> +def test(a, b, r, show=True):
> +    s1 = bdiff.statistics(a)
> +    s2 = bdiff.statistics(b)
> +    s = bdiff.maxscorehist(s1, s2)
> +    if not show:
> +        a = b = '?'
> +    print 'maxscorehist:', a, b, 'assert', s, '=', r, s==r
> +    if r == 1.0:
> +        print ' equal:', s1 == s2
> +
> +str1 = str2 = ''
> +for i in xrange(256):
> +    str1 += chr(i)
> +    str2 += chr(255 - i)
> +test(str1, str2, 1.0, show=False)
> +test('a', 'a', 1.0)
> +test('aa', 'ab', 0.5)
> +test('aa', 'ba', 0.5)
> +test('aa', 'bbba', 0.25)
> +test('a', 'b', 0.0)
> +test('a', '', 0.0)
> +test('', 'a', 0.0)
> +test('', '', 1.0)
> +
> +
> +def test(a, b, r):
> +    s = bdiff.strsim(a, b)
> +    print 'strsim:', a, b, 'assert', s, '=', r, s==r
> +
> +test('a', 'a', 1.0)
> +test('a', 'b', 0.0)
> +test('a', 'aa', 0.5)
> +test('aa', 'a', 0.5)
> +test('aaab', 'aaaa', 0.75)
> +test('aaaa', 'aabb', 0.5)
> +test('aabb', 'aaaa', 0.5)
> +test('a', '', 0.0)
> +test('', 'a', 0.0)
> +test('', '', 1.0)
> +
> +
> +def test(a, b, r):
> +    s = bdiff.reversesim(a, b)
> +    print 'reversesim:', a, b, 'assert', s, '=', r, s==r
> +
> +test('a', 'a', 1)
> +test('aa', 'a', 1)
> +test('a', 'aa', 1)
> +test('a', '', 0)
> +test('', 'a', 0)
> +test('', '', 0)
> +
>  print "done"
> diff -r f29b674cc221 -r 7b673a95a55d tests/test-bdiff.out
> --- a/tests/test-bdiff.out	Wed Sep 17 11:34:37 2008 +0200
> +++ b/tests/test-bdiff.out	Wed Oct 01 14:09:06 2008 +0200
> @@ -17,4 +17,32 @@
>  *** 'abc' 'abc'
>  *** 'a\n' 'a\n'
>  *** 'a\nb' 'a\nb'
> +maxscorehist: ? ? assert 1.0 = 1.0 True
> + equal: True
> +maxscorehist: a a assert 1.0 = 1.0 True
> + equal: True
> +maxscorehist: aa ab assert 0.5 = 0.5 True
> +maxscorehist: aa ba assert 0.5 = 0.5 True
> +maxscorehist: aa bbba assert 0.25 = 0.25 True
> +maxscorehist: a b assert 0.0 = 0.0 True
> +maxscorehist: a  assert 0.0 = 0.0 True
> +maxscorehist:  a assert 0.0 = 0.0 True
> +maxscorehist:   assert 1.0 = 1.0 True
> + equal: True
> +strsim: a a assert 1.0 = 1.0 True
> +strsim: a b assert 0.0 = 0.0 True
> +strsim: a aa assert 0.5 = 0.5 True
> +strsim: aa a assert 0.5 = 0.5 True
> +strsim: aaab aaaa assert 0.75 = 0.75 True
> +strsim: aaaa aabb assert 0.5 = 0.5 True
> +strsim: aabb aaaa assert 0.5 = 0.5 True
> +strsim: a  assert 0.0 = 0.0 True
> +strsim:  a assert 0.0 = 0.0 True
> +strsim:   assert 1.0 = 1.0 True
> +reversesim: a a assert 1 = 1 True
> +reversesim: aa a assert 1 = 1 True
> +reversesim: a aa assert 1 = 1 True
> +reversesim: a  assert 0 = 0 True
> +reversesim:  a assert 0 = 0 True
> +reversesim:   assert 0 = 0 True
>  done
> 

-- 
:wq