[PATCH 5 of 8 git-diff] similar: move score function to module level
Pierre-Yves David
pierre-yves.david at ens-lyon.org
Fri Jan 13 12:56:13 EST 2017
On 01/09/2017 08:49 PM, Sean Farley wrote:
> # HG changeset patch
> # User Sean Farley <sean at farley.io>
> # Date 1483850877 28800
> # Sat Jan 07 20:47:57 2017 -0800
> # Node ID 8561e240f25e851476abde452b850fc09a7fdf06
> # Parent 9af4ef2f80a8c5cb69ad4ff5d291ce81a35e4f39
> similar: move score function to module level
>
> Future patches will use this to report the similarity of a rename / copy
> in the patch output.
Martin spotted an interesting issue. This move a @cachefunc decorator to
the module level, so every data accessed to compute similarity will end
up cached for ever :-/. There is also a second caching added on the
score function that would keep filectx alive for ever (and was not
initially cached at all before this patch)
I played around a bit and made the following change to allow the top
level function to state cache free while not regressing about the
caching in the loop itself. Can you send a V2 (with my change or another
one as long as the caching issue is solved)
--- a/mercurial/similar.py
+++ b/mercurial/similar.py
@@ -43,16 +43,17 @@ def _findexactmatches(repo, added, remov
# Done
repo.ui.progress(_('searching for exact renames'), None)
- at util.cachefunc
def _ctxdata(fctx):
# lazily load text
orig = fctx.data()
return orig, mdiff.splitnewlines(orig)
- at util.cachefunc
def score(fctx1, fctx2):
- text = fctx1.data()
- orig, lines = _ctxdata(fctx2)
+ return _score(fcx1, _ctxdata(fctx2))
+
+def _score(fctx, otherdata):
+ orig, lines = otherdata
+ text = fctx.data()
# bdiff.blocks() returns blocks of matching lines
# count the number of bytes in each
equal = 0
@@ -74,10 +75,12 @@ def _findsimilarmatches(repo, added, rem
for i, r in enumerate(removed):
repo.ui.progress(_('searching for similar files'), i,
total=len(removed), unit=_('files'))
-
+ data = None
for a in added:
bestscore = copies.get(a, (None, threshold))[1]
- myscore = score(a, r)
+ if data is None:
+ data = _ctxdata(r)
+ myscore = _score(a, data)
if myscore >= bestscore:
copies[a] = (r, myscore)
repo.ui.progress(_('searching'), None)
>
> diff --git a/mercurial/similar.py b/mercurial/similar.py
> --- a/mercurial/similar.py
> +++ b/mercurial/similar.py
> @@ -41,10 +41,31 @@ def _findexactmatches(repo, added, remov
> yield (hashes[h], fctx)
>
> # Done
> repo.ui.progress(_('searching for exact renames'), None)
>
> + at util.cachefunc
> +def _ctxdata(fctx):
> + # lazily load text
> + orig = fctx.data()
> + return orig, mdiff.splitnewlines(orig)
> +
> + at util.cachefunc
> +def score(fctx1, fctx2):
> + text = fctx1.data()
> + orig, lines = _ctxdata(fctx2)
> + # bdiff.blocks() returns blocks of matching lines
> + # count the number of bytes in each
> + equal = 0
> + matches = bdiff.blocks(text, orig)
> + for x1, x2, y1, y2 in matches:
> + for line in lines[y1:y2]:
> + equal += len(line)
> +
> + lengths = len(text) + len(orig)
> + return equal * 2.0 / lengths
> +
> def _findsimilarmatches(repo, added, removed, threshold):
> '''find potentially renamed files based on similar file content
>
> Takes a list of new filectxs and a list of removed filectxs, and yields
> (before, after, score) tuples of partial matches.
> @@ -52,32 +73,13 @@ def _findsimilarmatches(repo, added, rem
> copies = {}
> for i, r in enumerate(removed):
> repo.ui.progress(_('searching for similar files'), i,
> total=len(removed), unit=_('files'))
>
> - # lazily load text
> - @util.cachefunc
> - def data():
> - orig = r.data()
> - return orig, mdiff.splitnewlines(orig)
> -
> - def score(text):
> - orig, lines = data()
> - # bdiff.blocks() returns blocks of matching lines
> - # count the number of bytes in each
> - equal = 0
> - matches = bdiff.blocks(text, orig)
> - for x1, x2, y1, y2 in matches:
> - for line in lines[y1:y2]:
> - equal += len(line)
> -
> - lengths = len(text) + len(orig)
> - return equal * 2.0 / lengths
> -
> for a in added:
> bestscore = copies.get(a, (None, threshold))[1]
> - myscore = score(a.data())
> + myscore = score(a, r)
> if myscore >= bestscore:
> copies[a] = (r, myscore)
> repo.ui.progress(_('searching'), None)
>
> for dest, v in copies.iteritems():
Cheers,
--
Pierre-Yves David
More information about the Mercurial-devel
mailing list