[PATCH 3 of 6] treemanifest: create treemanifest class

Martin von Zweigbergk martinvonz at google.com
Tue Mar 10 23:23:35 CDT 2015


# HG changeset patch
# User Martin von Zweigbergk <martinvonz at google.com>
# Date 1425931076 25200
#      Mon Mar 09 12:57:56 2015 -0700
# Node ID 34ad94f42d39e9ad3e7a7b8695f419bc2ff8b37f
# Parent  5671d3b1883e8fdce48e43aabce6ca5c86f6ea70
treemanifest: create treemanifest class

There are a number of problems with large and flat manifests. Copying
from http://mercurial.selenic.com/wiki/ManifestShardingPlan:

 * manifest too large for RAM

 * manifest resolution too much CPU (long delta chains)

 * committing is slow because entire manifest has to be hashed

 * impossible for narrow clone to leave out part of manifest as all is
   needed to calculate new hash

 * diffing two revisions involves traversing entire subdirectories
   even if identical

This is a first step in a series introducing a manifest revlog per
directory.

This change adds boolean configuration option
experimental.treemanifest. When the option is enabled, manifests are
parsed into a new tree data structure with one tree node per
directory. At this point, it is just a different data structure in
memory; there is still just a single manifest revlog on disk.

The code is not yet optimized. Enabling the option makes most or all
operations slower. Once we start storing manifest revlogs for every
directory, it should be possible to make many of these operations much
faster. The fastdelta() optimization has been explicitly disabled (and
not implemented) for the treemanifests.

Tests can be run using the new data structure by switching the config
option default in localrepo._applyrequirements(). All tests pass.

diff -r 5671d3b1883e -r 34ad94f42d39 mercurial/localrepo.py
--- a/mercurial/localrepo.py	Tue Mar 10 16:26:13 2015 -0700
+++ b/mercurial/localrepo.py	Mon Mar 09 12:57:56 2015 -0700
@@ -326,6 +326,9 @@
         manifestcachesize = self.ui.configint('format', 'manifestcachesize')
         if manifestcachesize is not None:
             self.svfs.options['manifestcachesize'] = manifestcachesize
+        usetreemanifest = self.ui.configbool('experimental', 'treemanifest')
+        if usetreemanifest is not None:
+            self.svfs.options['usetreemanifest'] = usetreemanifest
 
     def _writerequirements(self):
         reqfile = self.vfs("requires", "w")
diff -r 5671d3b1883e -r 34ad94f42d39 mercurial/manifest.py
--- a/mercurial/manifest.py	Tue Mar 10 16:26:13 2015 -0700
+++ b/mercurial/manifest.py	Mon Mar 09 12:57:56 2015 -0700
@@ -311,22 +311,240 @@
                    + content for start, end, content in x)
     return deltatext, newaddlist
 
+def _splittopdir(f):
+    if '/' in f:
+        dir, subpath = f.split('/', 1)
+        return dir + '/', subpath
+    else:
+        return '', f
+
+class treemanifest(object):
+    def __init__(self, text=''):
+        self._dirs = {}
+        self._files = {}
+        self._flags = {}
+        mfdict = manifestdict(text)
+        for f, n in mfdict.iteritems():
+            self[f] = n
+            if mfdict.flags(f):
+                self.setflag(f, mfdict.flags(f))
+
+    def __len__(self):
+        size = len(self._files)
+        for m in self._dirs.values():
+            size += m.__len__()
+        return size
+
+    def iteritems(self):
+        for p, n in sorted(self._dirs.items() + self._files.items()):
+            if p in self._files:
+                yield p, n
+            else:
+                for sf, sn in n.iteritems():
+                    yield p + sf, sn
+
+    def iterkeys(self):
+        for p in sorted(self._dirs.keys() + self._files.keys()):
+            if p in self._files:
+                yield p
+            else:
+                for f in self._dirs[p].iterkeys():
+                    yield p + f
+
+    def keys(self):
+        return list(self.iterkeys())
+
+    def __iter__(self):
+        return (f for f in self.iterkeys())
+
+    def __contains__(self, f):
+        if f is None:
+            return False
+        dir, subpath = _splittopdir(f)
+        if dir:
+            if dir not in self._dirs:
+                return False
+            return self._dirs[dir].__contains__(subpath)
+        else:
+            return f in self._files
+
+    def get(self, f, default=None):
+        dir, subpath = _splittopdir(f)
+        if dir:
+            if dir not in self._dirs:
+                return default
+            return self._dirs[dir].get(subpath, default)
+        else:
+            return self._files.get(f, default)
+
+    def __getitem__(self, f):
+        dir, subpath = _splittopdir(f)
+        if dir:
+            return self._dirs[dir].__getitem__(subpath)
+        else:
+            return self._files[f]
+
+    def flags(self, f):
+        dir, subpath = _splittopdir(f)
+        if dir:
+            if dir not in self._dirs:
+                return ''
+            return self._dirs[dir].flags(subpath)
+        else:
+            if f in self._dirs:
+                return ''
+            return self._flags.get(f, '')
+
+    def find(self, f):
+        dir, subpath = _splittopdir(f)
+        if dir:
+            return self._dirs[dir].find(subpath)
+        else:
+            return self._files[f], self._flags.get(f, '')
+
+    def __delitem__(self, f):
+        dir, subpath = _splittopdir(f)
+        if dir:
+            self._dirs[dir].__delitem__(subpath)
+            if not self._dirs[dir]._dirs and not self._dirs[dir]._files:
+                del self._dirs[dir]
+        else:
+            del self._files[f]
+            if f in self._flags:
+                del self._flags[f]
+
+    def __setitem__(self, f, n):
+        assert n is not None
+        dir, subpath = _splittopdir(f)
+        if dir:
+            if dir not in self._dirs:
+                self._dirs[dir] = treemanifest()
+            self._dirs[dir].__setitem__(subpath, n)
+        else:
+            self._files[f] = n
+
+    def setflag(self, f, flags):
+        """Set the flags (symlink, executable) for path f."""
+        dir, subpath = _splittopdir(f)
+        if dir:
+            if dir not in self._dirs:
+                self._dirs[dir] = treemanifest()
+            self._dirs[dir].setflag(subpath, flags)
+        else:
+            self._flags[f] = flags
+
+    def copy(self):
+        copy = treemanifest()
+        for d in self._dirs:
+            copy._dirs[d] = self._dirs[d].copy()
+        copy._files = dict.copy(self._files)
+        copy._flags = dict.copy(self._flags)
+        return copy
+
+    def intersectfiles(self, files):
+        '''make a new treemanifest with the intersection of self with files
+
+        The algorithm assumes that files is much smaller than self.'''
+        ret = treemanifest()
+        for fn in files:
+            if fn in self:
+                ret[fn] = self[fn]
+                flags = self.flags(fn)
+                if flags:
+                    ret.setflag(fn, flags)
+        return ret
+
+    def filesnotin(self, m2):
+        '''Set of files in this manifest that are not in the other'''
+        files = set(self.iterkeys())
+        files.difference_update(m2.iterkeys())
+        return files
+
+    def matches(self, match):
+        '''generate a new manifest filtered by the match argument'''
+        if match.always():
+            return self.copy()
+
+        files = match.files()
+        if (match.matchfn == match.exact or
+            (not match.anypats() and util.all(fn in self for fn in files))):
+            return self.intersectfiles(files)
+
+        m = self.copy()
+        for fn in m.keys():
+            if not match(fn):
+                del m[fn]
+        return m
+
+    def diff(self, m2, clean=False):
+        '''Finds changes between the current manifest and m2.
+
+        Args:
+          m2: the manifest to which this manifest should be compared.
+          clean: if true, include files unchanged between these manifests
+                 with a None value in the returned dictionary.
+
+        The result is returned as a dict with filename as key and
+        values of the form ((n1,fl1),(n2,fl2)), where n1/n2 is the
+        nodeid in the current/other manifest and fl1/fl2 is the flag
+        in the current/other manifest. Where the file does not exist,
+        the nodeid will be None and the flags will be the empty
+        string.
+        '''
+        diff = {}
+
+        for fn, n1 in self.iteritems():
+            fl1 = self.flags(fn)
+            n2 = m2.get(fn, None)
+            fl2 = m2.flags(fn)
+            if n2 is None:
+                fl2 = ''
+            if n1 != n2 or fl1 != fl2:
+                diff[fn] = ((n1, fl1), (n2, fl2))
+            elif clean:
+                diff[fn] = None
+
+        for fn, n2 in m2.iteritems():
+            if fn not in self:
+                fl2 = m2.flags(fn)
+                diff[fn] = ((None, ''), (n2, fl2))
+
+        return diff
+
+    def text(self):
+        """Get the full data of this manifest as a bytestring."""
+        fl = self.keys()
+        _checkforbidden(fl)
+
+        hex, flags = revlog.hex, self.flags
+        # if this is changed to support newlines in filenames,
+        # be sure to check the templates/ dir again (especially *-raw.tmpl)
+        return ''.join("%s\0%s%s\n" % (f, hex(self[f]), flags(f)) for f in fl)
+
 class manifest(revlog.revlog):
     def __init__(self, opener):
         # During normal operations, we expect to deal with not more than four
         # revs at a time (such as during commit --amend). When rebasing large
         # stacks of commits, the number can go up, hence the config knob below.
         cachesize = 4
+        usetreemanifest = False
         opts = getattr(opener, 'options', None)
         if opts is not None:
             cachesize = opts.get('manifestcachesize', cachesize)
+            usetreemanifest = opts.get('usetreemanifest', usetreemanifest)
         self._mancache = util.lrucachedict(cachesize)
         revlog.revlog.__init__(self, opener, "00manifest.i")
+        self._usetreemanifest = usetreemanifest
+
+    def _newmanifest(self, data=''):
+        if self._usetreemanifest:
+            return treemanifest(data)
+        return manifestdict(data)
 
     def readdelta(self, node):
         r = self.rev(node)
         d = mdiff.patchtext(self.revdiff(self.deltaparent(r), r))
-        return manifestdict(d)
+        return self._newmanifest(d)
 
     def readfast(self, node):
         '''use the faster of readdelta or read'''
@@ -338,12 +556,12 @@
 
     def read(self, node):
         if node == revlog.nullid:
-            return manifestdict() # don't upset local cache
+            return self._newmanifest() # don't upset local cache
         if node in self._mancache:
             return self._mancache[node][0]
         text = self.revision(node)
         arraytext = array.array('c', text)
-        m = manifestdict(text)
+        m = self._newmanifest(text)
         self._mancache[node] = (m, arraytext)
         return m
 
@@ -355,12 +573,12 @@
             return m.get(f), m.flags(f)
         text = self.revision(node)
         try:
-            return manifestdict(text).find(f)
+            return self._newmanifest(text).find(f)
         except KeyError:
             return None, None
 
     def add(self, m, transaction, link, p1, p2, added, removed):
-        if p1 in self._mancache:
+        if p1 in self._mancache and not self._usetreemanifest:
             # If our first parent is in the manifest cache, we can
             # compute a delta here using properties we know about the
             # manifest up-front, which may save time later for the


More information about the Mercurial-devel mailing list