[PATCH 5 of 8 git-diff] similar: move score function to module level

Pierre-Yves David pierre-yves.david at ens-lyon.org
Fri Jan 13 12:56:13 EST 2017



On 01/09/2017 08:49 PM, Sean Farley wrote:
> # HG changeset patch
> # User Sean Farley <sean at farley.io>
> # Date 1483850877 28800
> #      Sat Jan 07 20:47:57 2017 -0800
> # Node ID 8561e240f25e851476abde452b850fc09a7fdf06
> # Parent  9af4ef2f80a8c5cb69ad4ff5d291ce81a35e4f39
> similar: move score function to module level
>
> Future patches will use this to report the similarity of a rename / copy
> in the patch output.

Martin spotted an interesting issue. This moves a @cachefunc decorator to 
the module level, so all data accessed to compute similarity will end 
up cached forever :-/. There is also a second cache added on the 
score function that would keep filectx objects alive forever (and it was 
not cached at all before this patch).

I played around a bit and made the following change to allow the top 
level function to stay cache-free while not regressing on the 
caching in the loop itself. Can you send a V2 (with my change or another 
one, as long as the caching issue is solved)?

--- a/mercurial/similar.py
+++ b/mercurial/similar.py
@@ -43,16 +43,17 @@ def _findexactmatches(repo, added, remov
      # Done
      repo.ui.progress(_('searching for exact renames'), None)

-@util.cachefunc
  def _ctxdata(fctx):
      # lazily load text
      orig = fctx.data()
      return orig, mdiff.splitnewlines(orig)

-@util.cachefunc
  def score(fctx1, fctx2):
-    text = fctx1.data()
-    orig, lines = _ctxdata(fctx2)
+    return _score(fctx1, _ctxdata(fctx2))
+
+def _score(fctx, otherdata):
+    orig, lines = otherdata
+    text = fctx.data()
      # bdiff.blocks() returns blocks of matching lines
      # count the number of bytes in each
      equal = 0
@@ -74,10 +75,12 @@ def _findsimilarmatches(repo, added, rem
      for i, r in enumerate(removed):
          repo.ui.progress(_('searching for similar files'), i,
                           total=len(removed), unit=_('files'))
-
+        data = None
          for a in added:
              bestscore = copies.get(a, (None, threshold))[1]
-            myscore = score(a, r)
+            if data is None:
+                data = _ctxdata(r)
+            myscore = _score(a, data)
              if myscore >= bestscore:
                  copies[a] = (r, myscore)
      repo.ui.progress(_('searching'), None)



>
> diff --git a/mercurial/similar.py b/mercurial/similar.py
> --- a/mercurial/similar.py
> +++ b/mercurial/similar.py
> @@ -41,10 +41,31 @@ def _findexactmatches(repo, added, remov
>              yield (hashes[h], fctx)
>
>      # Done
>      repo.ui.progress(_('searching for exact renames'), None)
>
> +@util.cachefunc
> +def _ctxdata(fctx):
> +    # lazily load text
> +    orig = fctx.data()
> +    return orig, mdiff.splitnewlines(orig)
> +
> +@util.cachefunc
> +def score(fctx1, fctx2):
> +    text = fctx1.data()
> +    orig, lines = _ctxdata(fctx2)
> +    # bdiff.blocks() returns blocks of matching lines
> +    # count the number of bytes in each
> +    equal = 0
> +    matches = bdiff.blocks(text, orig)
> +    for x1, x2, y1, y2 in matches:
> +        for line in lines[y1:y2]:
> +            equal += len(line)
> +
> +    lengths = len(text) + len(orig)
> +    return equal * 2.0 / lengths
> +
>  def _findsimilarmatches(repo, added, removed, threshold):
>      '''find potentially renamed files based on similar file content
>
>      Takes a list of new filectxs and a list of removed filectxs, and yields
>      (before, after, score) tuples of partial matches.
> @@ -52,32 +73,13 @@ def _findsimilarmatches(repo, added, rem
>      copies = {}
>      for i, r in enumerate(removed):
>          repo.ui.progress(_('searching for similar files'), i,
>                           total=len(removed), unit=_('files'))
>
> -        # lazily load text
> -        @util.cachefunc
> -        def data():
> -            orig = r.data()
> -            return orig, mdiff.splitnewlines(orig)
> -
> -        def score(text):
> -            orig, lines = data()
> -            # bdiff.blocks() returns blocks of matching lines
> -            # count the number of bytes in each
> -            equal = 0
> -            matches = bdiff.blocks(text, orig)
> -            for x1, x2, y1, y2 in matches:
> -                for line in lines[y1:y2]:
> -                    equal += len(line)
> -
> -            lengths = len(text) + len(orig)
> -            return equal * 2.0 / lengths
> -
>          for a in added:
>              bestscore = copies.get(a, (None, threshold))[1]
> -            myscore = score(a.data())
> +            myscore = score(a, r)
>              if myscore >= bestscore:
>                  copies[a] = (r, myscore)
>      repo.ui.progress(_('searching'), None)
>
>      for dest, v in copies.iteritems():

Cheers,

-- 
Pierre-Yves David


More information about the Mercurial-devel mailing list