[patch 2/2] Optimize manifest.add

Chris Mason mason at suse.com
Thu Sep 1 08:47:26 CDT 2005


# HG changeset patch
# User mason at suse.com
Optimize manifest.add

Testing shows that manifest.add is spending a significant percentage of
its time running calcoffsets and doing text = "".join(addlist).  This
patch removes the need for both of these by storying the manifest in a
character array, and using a modified bisect search to find lines without
the help of a separate index of line offsets.

manifest.add was also reworked to push delta construction/combination into the 
main loop.  

Time to apply 2751 patches (without psyco, ext3 noatime,data=writeback):

Stock hg: 4m45s real 3m32s user 55s sys
patched:  2m48s real 1m53s user 43s sys
quilt:    2m30s real   45s user 50s sys

(quilt does much more io...)

Some changes to the compression code were required because it was trying
to concatenate a string with the buffer from the new array.  The fix also
reduces string duplication in general.

Index: crew/mercurial/manifest.py
===================================================================
--- crew.orig/mercurial/manifest.py	2005-09-01 08:07:08.000000000 -0400
+++ crew/mercurial/manifest.py	2005-09-01 08:20:15.000000000 -0400
@@ -8,13 +8,12 @@
 import sys, struct
 from revlog import *
 from demandload import *
-demandload(globals(), "bisect")
+demandload(globals(), "bisect array")
 
 class manifest(revlog):
     def __init__(self, opener):
         self.mapcache = None
         self.listcache = None
-        self.addlist = None
         revlog.__init__(self, opener, "00manifest.i", "00manifest.d")
 
     def read(self, node):
@@ -24,8 +23,9 @@ class manifest(revlog):
         text = self.revision(node)
         map = {}
         flag = {}
-        self.listcache = (text, text.splitlines(1))
-        for l in self.listcache[1]:
+        self.listcache = array.array('c', text)
+        lines = text.splitlines(1)
+        for l in lines:
             (f, n) = l.split('\0')
             map[f] = bin(n[:40])
             flag[f] = (n[40:-1] == "x")
@@ -39,66 +39,66 @@ class manifest(revlog):
         return self.mapcache[2]
 
     def diff(self, a, b):
-        # this is sneaky, as we're not actually using a and b
-        if self.listcache and self.addlist and self.listcache[0] == a:
-            d = mdiff.diff(self.listcache[1], self.addlist, 1)
-            if mdiff.patch(a, d) != b:
-                raise AssertionError("sortdiff failed!")
-            return d
-        else:
-            return mdiff.textdiff(a, b)
+        return mdiff.textdiff(str(a), str(b))
 
     def add(self, map, flags, transaction, link, p1=None, p2=None,
             changed=None):
-        # directly generate the mdiff delta from the data collected during
-        # the bisect loop below
-        def gendelta(delta):
-            i = 0
-            result = []
-            while i < len(delta):
-                start = delta[i][2]
-                end = delta[i][3]
-                l = delta[i][4]
-                if l == None:
-                    l = ""
-                while i < len(delta) - 1 and start <= delta[i+1][2] \
-                          and end >= delta[i+1][2]:
-                    if delta[i+1][3] > end:
-                        end = delta[i+1][3]
-                    if delta[i+1][4]:
-                        l += delta[i+1][4]
+
+        # returns a tuple (start, end).  If the string is found
+        # m[start:end] are the line containing that string.  If start == end
+        # the string was not found and they indicate the proper sorted 
+        # insertion point.  This was taken from bisect_left, and modified
+        # to find line start/end as it goes along.
+        #
+        # m should be a buffer or a string
+        # s is a string
+        #
+        def manifestsearch(m, s, lo=0, hi=None):
+            def advance(i, c):
+                while i < lenm and m[i] != c:
                     i += 1
-                result.append(struct.pack(">lll", start, end, len(l)) +  l)
-                i += 1
-            return result
+                return i
+            lenm = len(m)
+            if not hi:
+                hi = lenm
+            while lo < hi:
+                mid = (lo + hi) // 2
+                start = mid
+                while start > 0 and m[start-1] != '\n':
+                    start -= 1
+                end = advance(start, '\0')
+                if m[start:end] < s:
+                    # we know that after the null there are 40 bytes of sha1
+                    # this translates to the bisect lo = mid + 1
+                    lo = advance(end + 40, '\n') + 1
+                else:
+                    # this translates to the bisect hi = mid
+                    hi = start
+            end = advance(lo, '\0')
+            found = m[lo:end]
+            if cmp(s, found) == 0: 
+                # we know that after the null there are 40 bytes of sha1
+                end = advance(end + 40, '\n')
+                return (lo, end+1)
+            else:
+                return (lo, lo)
 
         # apply the changes collected during the bisect loop to our addlist
-        def addlistdelta(addlist, delta):
-            # apply the deltas to the addlist.  start from the bottom up
+        # return a delta suitable for addrevision
+        def addlistdelta(addlist, x):
+            # start from the bottom up
             # so changes to the offsets don't mess things up.
-            i = len(delta)
+            i = len(x)
             while i > 0:
                 i -= 1
-                start = delta[i][0]
-                end = delta[i][1]
-                if delta[i][4]:
-                    addlist[start:end] = [delta[i][4]]
+                start = x[i][0]
+                end = x[i][1]
+                if x[i][2]:
+                    addlist[start:end] = array.array('c', x[i][2])
                 else:
                     del addlist[start:end]
-            return addlist
-
-        # calculate the byte offset of the start of each line in the
-        # manifest
-        def calcoffsets(addlist):
-            offsets = [0] * (len(addlist) + 1)
-            offset = 0
-            i = 0
-            while i < len(addlist):
-                offsets[i] = offset
-                offset += len(addlist[i])
-                i += 1
-            offsets[i] = offset
-            return offsets
+            return "".join([struct.pack(">lll", d[0], d[1], len(d[2])) + d[2] \
+                            for d in x ])
 
         # if we're using the listcache, make sure it is valid and
         # parented by the same node we're diffing against
@@ -107,15 +107,13 @@ class manifest(revlog):
             files = map.keys()
             files.sort()
 
-            self.addlist = ["%s\000%s%s\n" %
+            text = ["%s\000%s%s\n" %
                             (f, hex(map[f]), flags[f] and "x" or '')
                             for f in files]
+            self.listcache = array.array('c', "".join(text))
             cachedelta = None
         else:
-            addlist = self.listcache[1]
-
-            # find the starting offset for each line in the add list
-            offsets = calcoffsets(addlist)
+            addlist = self.listcache
 
             # combine the changed lists into one list for sorting
             work = [[x, 0] for x in changed[0]]
@@ -123,45 +121,53 @@ class manifest(revlog):
             work.sort()
 
             delta = []
-            bs = 0
+            dstart = None
+            dend = None
+            dline = [""]
+            start = 0
+            # zero copy representation of addlist as a buffer
+            addbuf = buffer(addlist)
 
+            # start with a readonly loop that finds the offset of
+            # each line and creates the deltas
             for w in work:
                 f = w[0]
                 # bs will either be the index of the item or the insert point
-                bs = bisect.bisect(addlist, f, bs)
-                if bs < len(addlist):
-                    fn = addlist[bs][:addlist[bs].index('\0')]
-                else:
-                    fn = None
+                start, end = manifestsearch(addbuf, f, start)
                 if w[1] == 0:
                     l = "%s\000%s%s\n" % (f, hex(map[f]),
                                           flags[f] and "x" or '')
                 else:
-                    l = None
-                start = bs
-                if fn != f:
-                    # item not found, insert a new one
-                    end = bs
-                    if w[1] == 1:
-                        raise AssertionError(
+                    l = ""
+                if start == end and w[1] == 1:
+                    # item we want to delete was not found, error out
+                    raise AssertionError(
                             "failed to remove %s from manifest\n" % f)
+                if dstart != None and dstart <= start and dend >= start:
+                    if dend < end:
+                        dend = end
+                    if l:
+                        dline.append(l)
                 else:
-                    # item is found, replace/delete the existing line
-                    end = bs + 1
-                delta.append([start, end, offsets[start], offsets[end], l])
-
-            self.addlist = addlistdelta(addlist, delta)
-            if self.mapcache[0] == self.tip():
-                cachedelta = "".join(gendelta(delta))
-            else:
+                    if dstart != None:
+                        delta.append([dstart, dend, "".join(dline)])
+                    dstart = start
+                    dend = end
+                    dline = [l]
+
+            if dstart != None:
+                delta.append([dstart, dend, "".join(dline)])
+               
+            # apply the delta to the addlist, and get a delta for addrevision
+            cachedelta = addlistdelta(addlist, delta)
+
+            # the delta is only valid if we've been processing the tip revision
+            if self.mapcache[0] != self.tip():
                 cachedelta = None
+            self.listcache = addlist
 
-        text = "".join(self.addlist)
-        if cachedelta and mdiff.patch(self.listcache[0], cachedelta) != text:
-            raise AssertionError("manifest delta failure\n")
-        n = self.addrevision(text, transaction, link, p1, p2, cachedelta)
+        n = self.addrevision(buffer(self.listcache), transaction, link, p1,  \
+                             p2, cachedelta)
         self.mapcache = (n, map, flags)
-        self.listcache = (text, self.addlist)
-        self.addlist = None
 
         return n
Index: crew/mercurial/revlog.py
===================================================================
--- crew.orig/mercurial/revlog.py	2005-08-31 21:30:47.000000000 -0400
+++ crew/mercurial/revlog.py	2005-09-01 08:18:14.000000000 -0400
@@ -30,15 +30,15 @@ def hash(text, p1, p2):
 
 def compress(text):
     """ generate a possibly-compressed representation of text """
-    if not text: return text
+    if not text: return ("", text)
     if len(text) < 44:
-        if text[0] == '\0': return text
-        return 'u' + text
+        if text[0] == '\0': return ("", text)
+        return ('u', text)
     bin = zlib.compress(text)
     if len(bin) > len(text):
-        if text[0] == '\0': return text
-        return 'u' + text
-    return bin
+        if text[0] == '\0': return ("", text)
+        return ('u', text)
+    return ("", bin)
 
 def decompress(bin):
     """ decompress the given input """
@@ -382,14 +382,16 @@ class revlog:
             end = self.end(t)
             if not d:
                 prev = self.revision(self.tip())
-                d = self.diff(prev, text)
+                d = self.diff(prev, str(text))
             data = compress(d)
-            dist = end - start + len(data)
+            l = len(data[1]) + len(data[0])
+            dist = end - start + l
 
         # full versions are inserted when the needed deltas
         # become comparable to the uncompressed text
         if not n or dist > len(text) * 2:
             data = compress(text)
+            l = len(data[1]) + len(data[0])
             base = n
         else:
             base = self.base(t)
@@ -398,14 +400,17 @@ class revlog:
         if t >= 0:
             offset = self.end(t)
 
-        e = (offset, len(data), base, link, p1, p2, node)
+        e = (offset, l, base, link, p1, p2, node)
 
         self.index.append(e)
         self.nodemap[node] = n
         entry = struct.pack(indexformat, *e)
 
         transaction.add(self.datafile, e[0])
-        self.opener(self.datafile, "a").write(data)
+        f = self.opener(self.datafile, "a")
+        if data[0]:
+            f.write(data[0])
+        f.write(data[1])
         transaction.add(self.indexfile, n * len(entry))
         self.opener(self.indexfile, "a").write(entry)
 
@@ -618,7 +623,8 @@ class revlog:
             # current size.
 
             if chain == prev:
-                cdelta = compress(delta)
+                tempd = compress(delta)
+                cdelta = tempd[0] + tempd[1]
 
             if chain != prev or (end - start + len(cdelta)) > measure * 2:
                 # flush our writes here so we can read it in revision

--


More information about the Mercurial mailing list