[PATCH] mdiff: speed up showfunc for large diffs

Brodie Rao brodie at bitheap.org
Mon Sep 19 17:58:15 CDT 2011


# HG changeset patch
# User Brodie Rao <brodie at bitheap.org>
# Date 1316473083 25200
# Node ID 18b106673c3a2ae7395beb33c4278f238c786a93
# Parent  8542a9c9f67997a1faafc385f11a2fcab458772c
mdiff: speed up showfunc for large diffs

This addresses the following issues with showfunc:

- Silly usage of regular expressions.
- Doing str.rstrip() needlessly in an inner loop.
- Doing catastrophic backtracking when trying to find a function line.

Finding function text is now at worst O(n lines in the old file), and
at best close to O(n hunks).

Given a diff like this[1]:

 src/main/antlr3/uk/ac/cam/ch/wwmm/pregenerated/ChemicalChunker.g        |      4 +-
 src/main/java/uk/ac/cam/ch/wwmm/pregenerated/ChemicalChunkerLexer.java  |      2 +-
 src/main/java/uk/ac/cam/ch/wwmm/pregenerated/ChemicalChunkerParser.java |  29189 +++++----
 3 files changed, 14741 insertions(+), 14454 deletions(-)

[1]: https://bitbucket.org/wwmm/chemicaltagger/changeset/d2bfbaecd4fc/raw

Without this change, hg log --stat --config diff.showfunc=1 takes an
absurdly long time to complete:

   CallCount    Recursive    Total(ms)   Inline(ms) module:lineno(function)
       32813            0     80.3546     40.6086   mercurial.mdiff:160(yieldhunk)
   +65062746            0     25.7227     25.7227   +<method 'match' of '_sre.SRE_Pattern' objects>
   +65062746            0     14.0221     14.0221   +<method 'rstrip' of 'str' objects>
       +1809            0      0.0009      0.0009   +mercurial.mdiff:148(contextend)
       +1809            0      0.0003      0.0003   +<len>
    65062746            0     25.7227     25.7227   <method 'match' of '_sre.SRE_Pattern' objects>
    65062763            0     14.0221     14.0221   <method 'rstrip' of 'str' objects>
         543            0      0.1631      0.1631   <zlib.decompress>
           3            0      0.0505      0.0505   <mercurial.bdiff.blocks>
       31007            0     80.4564      0.0477   mercurial.mdiff:147(_unidiff)
      +32813            0     80.3546     40.6086   +mercurial.mdiff:160(yieldhunk)
          +3            0      0.0505      0.0505   +<mercurial.bdiff.blocks>
       +3618            0      0.0022      0.0022   +mercurial.mdiff:154(contextstart)
       +5427            0      0.0013      0.0013   +<len>
          +3            0      0.0001      0.0000   +re:188(compile)
           1            0     80.8381      0.0322   mercurial.patch:1777(diffstatdata)
     +107499            0      0.0235      0.0235   +<method 'startswith' of 'str' objects>
      +31014            0     80.7820      0.0071   +mercurial.util:1284(iterlines)
          +3            0      0.0000      0.0000   +<method 'search' of '_sre.SRE_Pattern' objects>
          +4            0      0.0000      0.0000   +mercurial.patch:1783(addresult)
          +3            0      0.0000      0.0000   +<method 'group' of '_sre.SRE_Match' objects>
           6            0      0.0444      0.0283   mercurial.mdiff:12(splitnewlines)
          +6            0      0.0160      0.0160   +<method 'split' of 'str' objects>
          32            0      0.0246      0.0246   <method 'update' of '_hashlib.HASH' objects>
          11            0      0.0236      0.0236   <method 'read' of 'file' objects>
Time: real 80.880 secs (user 80.200+0.000 sys 0.380+0.000)

With this change, it's almost as fast as not using showfunc at all:

   CallCount    Recursive    Total(ms)   Inline(ms) module:lineno(function)
         543            0      0.1699      0.1699   <zlib.decompress>
           3            0      0.0501      0.0501   <mercurial.bdiff.blocks>
       32813            0      0.0415      0.0348   mercurial.mdiff:161(yieldhunk)
      +70837            0      0.0058      0.0058   +<method 'isalnum' of 'str' objects>
       +1809            0      0.0006      0.0006   +mercurial.mdiff:148(contextend)
       +1809            0      0.0002      0.0002   +<len>
           1            0      0.4879      0.0310   mercurial.patch:1777(diffstatdata)
     +107499            0      0.0230      0.0230   +<method 'startswith' of 'str' objects>
      +31014            0      0.4335      0.0065   +mercurial.util:1284(iterlines)
          +3            0      0.0000      0.0000   +<method 'search' of '_sre.SRE_Pattern' objects>
          +4            0      0.0000      0.0000   +mercurial.patch:1783(addresult)
          +1            0      0.0004      0.0000   +re:188(compile)
          32            0      0.0293      0.0293   <method 'update' of '_hashlib.HASH' objects>
           6            0      0.0427      0.0279   mercurial.mdiff:12(splitnewlines)
          +6            0      0.0147      0.0147   +<method 'split' of 'str' objects>
       31007            0      0.1169      0.0235   mercurial.mdiff:147(_unidiff)
          +3            0      0.0501      0.0501   +<mercurial.bdiff.blocks>
      +32813            0      0.0415      0.0348   +mercurial.mdiff:161(yieldhunk)
       +3618            0      0.0012      0.0012   +mercurial.mdiff:154(contextstart)
       +5427            0      0.0006      0.0006   +<len>
      107597            0      0.0230      0.0230   <method 'startswith' of 'str' objects>
          16            0      0.0213      0.0213   <mercurial.mpatch.patches>
         194            0      0.0149      0.0149   <method 'split' of 'str' objects>
Time: real 0.530 secs (user 0.450+0.000 sys 0.070+0.000)

diff --git a/mercurial/mdiff.py b/mercurial/mdiff.py
--- a/mercurial/mdiff.py
+++ b/mercurial/mdiff.py
@@ -157,6 +157,7 @@ def _unidiff(t1, t2, l1, l2, opts=defaul
             return 0
         return ret
 
+    lastfunc = [0, '']
     def yieldhunk(hunk):
         (astart, a2, bstart, b2, delta) = hunk
         aend = contextend(a2, len(l1))
@@ -165,13 +166,19 @@ def _unidiff(t1, t2, l1, l2, opts=defaul
 
         func = ""
         if opts.showfunc:
-            # walk backwards from the start of the context
-            # to find a line starting with an alphanumeric char.
-            for x in xrange(astart - 1, -1, -1):
-                t = l1[x].rstrip()
-                if funcre.match(t):
-                    func = ' ' + t[:40]
+            lastpos, func = lastfunc
+            # walk backwards from the start of the context up to the start of
+            # the previous hunk context until we find a line starting with an
+            # alphanumeric char.
+            for i in xrange(astart - 1, lastpos - 1, -1):
+                if l1[i].isalnum():
+                    func = ' ' + l1[i].rstrip()[:40]
+                    lastfunc[1] = func
                     break
+            # by recording this hunk's starting point as the next place to
+            # start looking for function lines, we avoid reading any line in
+            # the file more than once.
+            lastfunc[0] = astart
 
         yield "@@ -%d,%d +%d,%d @@%s\n" % (astart + 1, alen,
                                            bstart + 1, blen, func)
@@ -180,9 +187,6 @@ def _unidiff(t1, t2, l1, l2, opts=defaul
         for x in xrange(a2, aend):
             yield ' ' + l1[x]
 
-    if opts.showfunc:
-        funcre = re.compile('\w')
-
     # bdiff.blocks gives us the matching sequences in the files.  The loop
     # below finds the spaces between those matching sequences and translates
     # them into diff output.


More information about the Mercurial-devel mailing list