[PATCH] revised manifest speedups

Chris Mason mason at suse.com
Wed Aug 24 10:04:36 CDT 2005


Hello everyone,

Last night I reworked my latest manifest speedups to use the python array 
type instead of my nlbuf extension.  I think the code is much less complex 
now, and this patch is only 3 or 4 seconds slower than my C extension was.

The algorithm is still based on bisect.  The basic idea is to forget about 
maintaining a line based representation of the manifest and store it 
all in a big char array.  The manifestsearch function does the heavy
lifting of finding offsets into the buffer for delta generation.

This has been through light testing here.  Matt, if it seems reasonable, I'll
hammer on it.

-------------
# HG changeset patch
# User mason at suse.com
Optimize manifest.add

Testing shows that manifest.add is spending a significant percentage of
its time running calcoffsets and doing text = "".join(addlist).  This
patch removes the need for both of these by storying the manifest in a
character array, and using a modified bisect search to find lines without
the help of a separate index of line offsets.

manifest.add was also reworked to push delta construction/combination into the 
main loop.  

Time to apply 2751 patches (without psyco, ext3 noatime,data=writeback):

Stock hg: 4m45s real 3m32s user 55s sys
patched:  2m48s real 1m53s user 43s sys
quilt:    2m30s real   45s user 50s sys

(quilt does much more io...)

Some changes to the compression code were required because it was trying
to concatenate a string with the buffer from the new array.  The fix also
reduces string duplication in general.

Index: mine/mercurial/hg.py
===================================================================
--- mine.orig/mercurial/hg.py	2005-08-24 10:26:52.000000000 -0400
+++ mine/mercurial/hg.py	2005-08-24 10:27:25.000000000 -0400
@@ -10,7 +10,7 @@ import util
 from revlog import *
 from demandload import *
 demandload(globals(), "re lock urllib urllib2 transaction time socket")
-demandload(globals(), "tempfile httprangereader bdiff urlparse")
+demandload(globals(), "tempfile httprangereader bdiff urlparse array")
 demandload(globals(), "bisect errno select stat")
 
 class filelog(revlog):
@@ -104,7 +104,6 @@ class manifest(revlog):
     def __init__(self, opener):
         self.mapcache = None
         self.listcache = None
-        self.addlist = None
         revlog.__init__(self, opener, "00manifest.i", "00manifest.d")
 
     def read(self, node):
@@ -114,8 +113,9 @@ class manifest(revlog):
         text = self.revision(node)
         map = {}
         flag = {}
-        self.listcache = (text, text.splitlines(1))
-        for l in self.listcache[1]:
+        self.listcache = array.array('c', text)
+        lines = text.splitlines(1)
+        for l in lines:
             (f, n) = l.split('\0')
             map[f] = bin(n[:40])
             flag[f] = (n[40:-1] == "x")
@@ -129,67 +129,66 @@ class manifest(revlog):
         return self.mapcache[2]
 
     def diff(self, a, b):
-        # this is sneaky, as we're not actually using a and b
-        if self.listcache and self.addlist and self.listcache[0] == a:
-            d = mdiff.diff(self.listcache[1], self.addlist, 1)
-            if mdiff.patch(a, d) != b:
-                sys.stderr.write("*** sortdiff failed, falling back ***\n")
-                return mdiff.textdiff(a, b)
-            return d
-        else:
-            return mdiff.textdiff(a, b)
+        return mdiff.textdiff(a, b)
 
     def add(self, map, flags, transaction, link, p1=None, p2=None,
             changed=None):
-        # directly generate the mdiff delta from the data collected during
-        # the bisect loop below
-        def gendelta(delta):
-            i = 0
-            result = []
-            while i < len(delta):
-                start = delta[i][2]
-                end = delta[i][3]
-                l = delta[i][4]
-                if l == None:
-                    l = ""
-                while i < len(delta) - 1 and start <= delta[i+1][2] \
-                          and end >= delta[i+1][2]:
-                    if delta[i+1][3] > end:
-                        end = delta[i+1][3]
-                    if delta[i+1][4]:
-                        l += delta[i+1][4]
+
+        # returns a tuple (start, end).  If the string is found
+        # m[start:end] are the line containing that string.  If start == end
+        # the string was not found and they indicate the proper sorted 
+        # insertion point.  This was taken from bisect_left, and modified
+        # to find line start/end as it goes along.
+        #
+        # m should be a buffer or a string
+        # s is a string
+        #
+        def manifestsearch(m, s, lo=0, hi=None):
+            def advance(i, c):
+                while i < lenm and m[i] != c:
                     i += 1
-                result.append(struct.pack(">lll", start, end, len(l)) +  l)
-                i += 1
-            return result
+                return i
+            lenm = len(m)
+            if not hi:
+                hi = lenm
+            while lo < hi:
+                mid = (lo + hi) // 2
+                start = mid
+                while start > 0 and m[start-1] != '\n':
+                    start -= 1
+                end = advance(start, '\0')
+                if m[start:end] < s:
+                    # we know that after the null there are 40 bytes of sha1
+                    # this translates to the bisect lo = mid + 1
+                    lo = advance(end + 40, '\n') + 1
+                else:
+                    # this translates to the bisect hi = mid
+                    hi = start
+            end = advance(lo, '\0')
+            found = m[lo:end]
+            if cmp(s, found) == 0: 
+                # we know that after the null there are 40 bytes of sha1
+                end = advance(end + 40, '\n')
+                return (lo, end+1)
+            else:
+                return (lo, lo)
 
         # apply the changes collected during the bisect loop to our addlist
-        def addlistdelta(addlist, delta):
-            # apply the deltas to the addlist.  start from the bottom up
+        # return a delta suitable for addrevision
+        def addlistdelta(addlist, x):
+            # start from the bottom up
             # so changes to the offsets don't mess things up.
-            i = len(delta)
+            i = len(x)
             while i > 0:
                 i -= 1
-                start = delta[i][0]
-                end = delta[i][1]
-                if delta[i][4]:
-                    addlist[start:end] = [delta[i][4]]
+                start = x[i][0]
+                end = x[i][1]
+                if x[i][2]:
+                    addlist[start:end] = array.array('c', x[i][2])
                 else:
                     del addlist[start:end]
-            return addlist
-
-        # calculate the byte offset of the start of each line in the
-        # manifest
-        def calcoffsets(addlist):
-            offsets = [0] * (len(addlist) + 1)
-            offset = 0
-            i = 0
-            while i < len(addlist):
-                offsets[i] = offset
-                offset += len(addlist[i])
-                i += 1
-            offsets[i] = offset
-            return offsets
+            return "".join([struct.pack(">lll", d[0], d[1], len(d[2])) + d[2] \
+                            for d in x ])
 
         # if we're using the listcache, make sure it is valid and
         # parented by the same node we're diffing against
@@ -198,15 +197,13 @@ class manifest(revlog):
             files = map.keys()
             files.sort()
 
-            self.addlist = ["%s\000%s%s\n" %
+            text = ["%s\000%s%s\n" %
                             (f, hex(map[f]), flags[f] and "x" or '')
                             for f in files]
+            self.listcache = array.array('c', "".join(text))
             cachedelta = None
         else:
-            addlist = self.listcache[1]
-
-            # find the starting offset for each line in the add list
-            offsets = calcoffsets(addlist)
+            addlist = self.listcache
 
             # combine the changed lists into one list for sorting
             work = [[x, 0] for x in changed[0]]
@@ -214,48 +211,53 @@ class manifest(revlog):
             work.sort()
 
             delta = []
-            bs = 0
+            dstart = None
+            dend = None
+            dline = [""]
+            start = 0
+            addbuf = buffer(addlist)
 
+            # start with a readonly loop that finds the offset of
+            # each line and creates the deltas
             for w in work:
                 f = w[0]
                 # bs will either be the index of the item or the insert point
-                bs = bisect.bisect(addlist, f, bs)
-                if bs < len(addlist):
-                    fn = addlist[bs][:addlist[bs].index('\0')]
-                else:
-                    fn = None
+                start, end = manifestsearch(addbuf, f, start)
                 if w[1] == 0:
                     l = "%s\000%s%s\n" % (f, hex(map[f]),
                                           flags[f] and "x" or '')
                 else:
-                    l = None
-                start = bs
-                if fn != f:
-                    # item not found, insert a new one
-                    end = bs
-                    if w[1] == 1:
-                        sys.stderr.write("failed to remove %s from manifest\n"
-                                         % f)
-                        sys.exit(1)
+                    l = ""
+                if start == end and w[1] == 1:
+                    # item we want to delete was not found, error out
+                    sys.stderr.write("failed to remove %s from manifest\n" % f)
+                    sys.exit(1)
+                if dstart != None and dstart <= start and dend >= start:
+                    if dend < end:
+                        dend = end
+                    if l:
+                        dline.append(l)
                 else:
-                    # item is found, replace/delete the existing line
-                    end = bs + 1
-                delta.append([start, end, offsets[start], offsets[end], l])
-
-            self.addlist = addlistdelta(addlist, delta)
-            if self.mapcache[0] == self.tip():
-                cachedelta = "".join(gendelta(delta))
-            else:
+                    if dstart != None:
+                        delta.append([dstart, dend, "".join(dline)])
+                    dstart = start
+                    dend = end
+                    dline = [l]
+
+            if dstart != None:
+                delta.append([dstart, dend, "".join(dline)])
+               
+            # apply the delta to the addlist, and get a delta for addrevision
+            cachedelta = addlistdelta(addlist, delta)
+
+            # the delta is only valid if we've been processing the tip revision
+            if self.mapcache[0] != self.tip():
                 cachedelta = None
+            self.listcache = addlist
 
-        text = "".join(self.addlist)
-        if cachedelta and mdiff.patch(self.listcache[0], cachedelta) != text:
-            sys.stderr.write("manifest delta failure\n")
-            sys.exit(1)
-        n = self.addrevision(text, transaction, link, p1, p2, cachedelta)
+        n = self.addrevision(buffer(self.listcache), transaction, link, p1,  \
+                             p2, cachedelta)
         self.mapcache = (n, map, flags)
-        self.listcache = (text, self.addlist)
-        self.addlist = None
 
         return n
 
Index: mine/mercurial/revlog.py
===================================================================
--- mine.orig/mercurial/revlog.py	2005-08-15 21:03:06.000000000 -0400
+++ mine/mercurial/revlog.py	2005-08-24 10:27:23.000000000 -0400
@@ -16,15 +16,15 @@ def bin(node): return binascii.unhexlify
 def short(node): return hex(node[:6])
 
 def compress(text):
-    if not text: return text
+    if not text: return ("", text)
     if len(text) < 44:
-        if text[0] == '\0': return text
-        return 'u' + text
+        if text[0] == '\0': return ("", text)
+        return ('u', text)
     bin = zlib.compress(text)
     if len(bin) > len(text):
-        if text[0] == '\0': return text
-        return 'u' + text
-    return bin
+        if text[0] == '\0': return ("", text)
+        return ('u', text)
+    return ("", bin)
 
 def decompress(bin):
     if not bin: return bin
@@ -294,14 +294,16 @@ class revlog:
             end = self.end(t)
             if not d:
                 prev = self.revision(self.tip())
-                d = self.diff(prev, text)
+                d = self.diff(prev, str(text))
             data = compress(d)
-            dist = end - start + len(data)
+            l = len(data[1]) + len(data[0])
+            dist = end - start + l
 
         # full versions are inserted when the needed deltas
         # become comparable to the uncompressed text
         if not n or dist > len(text) * 2:
             data = compress(text)
+            l = len(data[1]) + len(data[0])
             base = n
         else:
             base = self.base(t)
@@ -310,14 +312,17 @@ class revlog:
         if t >= 0:
             offset = self.end(t)
 
-        e = (offset, len(data), base, link, p1, p2, node)
+        e = (offset, l, base, link, p1, p2, node)
 
         self.index.append(e)
         self.nodemap[node] = n
         entry = struct.pack(indexformat, *e)
 
         transaction.add(self.datafile, e[0])
-        self.opener(self.datafile, "a").write(data)
+        f = self.opener(self.datafile, "a")
+        if data[0]:
+            f.write(data[0])
+        f.write(data[1])
         transaction.add(self.indexfile, n * len(entry))
         self.opener(self.indexfile, "a").write(entry)
 
@@ -523,7 +528,8 @@ class revlog:
             # current size.
 
             if chain == prev:
-                cdelta = compress(delta)
+                tempd = compress(delta)
+                cdelta = tempd[0] + tempd[1]
 
             if chain != prev or (end - start + len(cdelta)) > measure * 2:
                 # flush our writes here so we can read it in revision


More information about the Mercurial mailing list