[PATCH 3 of 6] treemanifest: create treemanifest class

Martin von Zweigbergk martinvonz at google.com
Wed Mar 11 16:05:37 CDT 2015


Yep, working on it! :-) As I said on IRC, there'll be a V2 soon. If someone
feels like queuing the first 2 patches, go ahead (they're not changing in
V2).

On Wed, Mar 11, 2015 at 1:59 PM Augie Fackler <raf at durin42.com> wrote:

> On Tue, Mar 10, 2015 at 09:23:35PM -0700, Martin von Zweigbergk wrote:
> > # HG changeset patch
> > # User Martin von Zweigbergk <martinvonz at google.com>
> > # Date 1425931076 25200
> > #      Mon Mar 09 12:57:56 2015 -0700
> > # Node ID 34ad94f42d39e9ad3e7a7b8695f419bc2ff8b37f
> > # Parent  5671d3b1883e8fdce48e43aabce6ca5c86f6ea70
> > treemanifest: create treemanifest class
> >
> > There are a number of problems with large and flat manifests. Copying
> > from http://mercurial.selenic.com/wiki/ManifestShardingPlan:
> >
> >  * manifest too large for RAM
> >
> >  * manifest resolution too much CPU (long delta chains)
> >
> >  * committing is slow because entire manifest has to be hashed
> >
> >  * impossible for narrow clone to leave out part of manifest as all is
> >    needed to calculate new hash
> >
> >  * diffing two revisions involves traversing entire subdirectories
> >    even if identical
> >
> > This is a first step in a series introducing a manifest revlog per
> > directory.
> >
> > This change adds boolean configuration option
> > experimental.treemanifest. When the option is enabled, manifests are
> > parsed into a new tree data structure with one tree node per
> > directory. At this point, it is just a different data structure in
> > memory; there is still just a single manifest revlog on disk.
> >
> > The code is not yet optimized. Enabling the option makes most or all
> > operations slower. Once we start storing manifest revlogs for every
> > directory, it should be possible to make many of these operations much
> > faster. The fastdelta() optimization has been explicitly disabled (and
> > not implemented) for the treemanifests.
> >
> > Tests can be run using the new data structure by switching the config
> > option default in localrepo._applyrequirements(). All tests pass.
> >
> > diff -r 5671d3b1883e -r 34ad94f42d39 mercurial/localrepo.py
> > --- a/mercurial/localrepo.py  Tue Mar 10 16:26:13 2015 -0700
> > +++ b/mercurial/localrepo.py  Mon Mar 09 12:57:56 2015 -0700
> > @@ -326,6 +326,9 @@
> >          manifestcachesize = self.ui.configint('format',
> 'manifestcachesize')
> >          if manifestcachesize is not None:
> >              self.svfs.options['manifestcachesize'] = manifestcachesize
> > +        usetreemanifest = self.ui.configbool('experimental',
> 'treemanifest')
> > +        if usetreemanifest is not None:
> > +            self.svfs.options['usetreemanifest'] = usetreemanifest
> >
> >      def _writerequirements(self):
> >          reqfile = self.vfs("requires", "w")
> > diff -r 5671d3b1883e -r 34ad94f42d39 mercurial/manifest.py
> > --- a/mercurial/manifest.py   Tue Mar 10 16:26:13 2015 -0700
> > +++ b/mercurial/manifest.py   Mon Mar 09 12:57:56 2015 -0700
> > @@ -311,22 +311,240 @@
> >                     + content for start, end, content in x)
> >      return deltatext, newaddlist
> >
> > +def _splittopdir(f):
> > +    if '/' in f:
> > +        dir, subpath = f.split('/', 1)
> > +        return dir + '/', subpath
> > +    else:
> > +        return '', f
> > +
> > +class treemanifest(object):
>
> This class can (and probably should) use _lazymanifest directly,
> rather than proxying through the higher-level dict-like abstraction.
>
> > +    def __init__(self, text=''):
> > +        self._dirs = {}
> > +        self._files = {}
> > +        self._flags = {}
> > +        mfdict = manifestdict(text)
> > +        for f, n in mfdict.iteritems():
> > +            self[f] = n
> > +            if mfdict.flags(f):
> > +                self.setflag(f, mfdict.flags(f))
> > +
> > +    def __len__(self):
> > +        size = len(self._files)
> > +        for m in self._dirs.values():
> > +            size += m.__len__()
> > +        return size
> > +
> > +    def iteritems(self):
> > +        for p, n in sorted(self._dirs.items() + self._files.items()):
> > +            if p in self._files:
> > +                yield p, n
> > +            else:
> > +                for sf, sn in n.iteritems():
> > +                    yield p + sf, sn
> > +
> > +    def iterkeys(self):
> > +        for p in sorted(self._dirs.keys() + self._files.keys()):
> > +            if p in self._files:
> > +                yield p
> > +            else:
> > +                for f in self._dirs[p].iterkeys():
> > +                    yield p + f
> > +
> > +    def keys(self):
> > +        return list(self.iterkeys())
> > +
> > +    def __iter__(self):
> > +        return (f for f in self.iterkeys())
> > +
> > +    def __contains__(self, f):
> > +        if f is None:
> > +            return False
> > +        dir, subpath = _splittopdir(f)
> > +        if dir:
> > +            if dir not in self._dirs:
> > +                return False
> > +            return self._dirs[dir].__contains__(subpath)
> > +        else:
> > +            return f in self._files
> > +
> > +    def get(self, f, default=None):
> > +        dir, subpath = _splittopdir(f)
> > +        if dir:
> > +            if dir not in self._dirs:
> > +                return default
> > +            return self._dirs[dir].get(subpath, default)
> > +        else:
> > +            return self._files.get(f, default)
> > +
> > +    def __getitem__(self, f):
> > +        dir, subpath = _splittopdir(f)
> > +        if dir:
> > +            return self._dirs[dir].__getitem__(subpath)
> > +        else:
> > +            return self._files[f]
> > +
> > +    def flags(self, f):
> > +        dir, subpath = _splittopdir(f)
> > +        if dir:
> > +            if dir not in self._dirs:
> > +                return ''
> > +            return self._dirs[dir].flags(subpath)
> > +        else:
> > +            if f in self._dirs:
> > +                return ''
> > +            return self._flags.get(f, '')
> > +
> > +    def find(self, f):
> > +        dir, subpath = _splittopdir(f)
> > +        if dir:
> > +            return self._dirs[dir].find(subpath)
> > +        else:
> > +            return self._files[f], self._flags.get(f, '')
> > +
> > +    def __delitem__(self, f):
> > +        dir, subpath = _splittopdir(f)
> > +        if dir:
> > +            self._dirs[dir].__delitem__(subpath)
> > +            if not self._dirs[dir]._dirs and not self._dirs[dir]._files:
> > +                del self._dirs[dir]
> > +        else:
> > +            del self._files[f]
> > +            if f in self._flags:
> > +                del self._flags[f]
> > +
> > +    def __setitem__(self, f, n):
> > +        assert n is not None
> > +        dir, subpath = _splittopdir(f)
> > +        if dir:
> > +            if dir not in self._dirs:
> > +                self._dirs[dir] = treemanifest()
> > +            self._dirs[dir].__setitem__(subpath, n)
> > +        else:
> > +            self._files[f] = n
> > +
> > +    def setflag(self, f, flags):
> > +        """Set the flags (symlink, executable) for path f."""
> > +        dir, subpath = _splittopdir(f)
> > +        if dir:
> > +            if dir not in self._dirs:
> > +                self._dirs[dir] = treemanifest()
> > +            self._dirs[dir].setflag(subpath, flags)
> > +        else:
> > +            self._flags[f] = flags
> > +
> > +    def copy(self):
> > +        copy = treemanifest()
> > +        for d in self._dirs:
> > +            copy._dirs[d] = self._dirs[d].copy()
> > +        copy._files = dict.copy(self._files)
> > +        copy._flags = dict.copy(self._flags)
> > +        return copy
> > +
> > +    def intersectfiles(self, files):
> > +        '''make a new treemanifest with the intersection of self with
> files
> > +
> > +        The algorithm assumes that files is much smaller than self.'''
> > +        ret = treemanifest()
> > +        for fn in files:
> > +            if fn in self:
> > +                ret[fn] = self[fn]
> > +                flags = self.flags(fn)
> > +                if flags:
> > +                    ret.setflag(fn, flags)
> > +        return ret
> > +
> > +    def filesnotin(self, m2):
> > +        '''Set of files in this manifest that are not in the other'''
> > +        files = set(self.iterkeys())
> > +        files.difference_update(m2.iterkeys())
> > +        return files
> > +
> > +    def matches(self, match):
> > +        '''generate a new manifest filtered by the match argument'''
> > +        if match.always():
> > +            return self.copy()
> > +
> > +        files = match.files()
> > +        if (match.matchfn == match.exact or
> > +            (not match.anypats() and util.all(fn in self for fn in
> files))):
> > +            return self.intersectfiles(files)
> > +
> > +        m = self.copy()
> > +        for fn in m.keys():
> > +            if not match(fn):
> > +                del m[fn]
> > +        return m
> > +
> > +    def diff(self, m2, clean=False):
> > +        '''Finds changes between the current manifest and m2.
> > +
> > +        Args:
> > +          m2: the manifest to which this manifest should be compared.
> > +          clean: if true, include files unchanged between these
> manifests
> > +                 with a None value in the returned dictionary.
> > +
> > +        The result is returned as a dict with filename as key and
> > +        values of the form ((n1,fl1),(n2,fl2)), where n1/n2 is the
> > +        nodeid in the current/other manifest and fl1/fl2 is the flag
> > +        in the current/other manifest. Where the file does not exist,
> > +        the nodeid will be None and the flags will be the empty
> > +        string.
> > +        '''
> > +        diff = {}
> > +
> > +        for fn, n1 in self.iteritems():
> > +            fl1 = self.flags(fn)
> > +            n2 = m2.get(fn, None)
> > +            fl2 = m2.flags(fn)
> > +            if n2 is None:
> > +                fl2 = ''
> > +            if n1 != n2 or fl1 != fl2:
> > +                diff[fn] = ((n1, fl1), (n2, fl2))
> > +            elif clean:
> > +                diff[fn] = None
> > +
> > +        for fn, n2 in m2.iteritems():
> > +            if fn not in self:
> > +                fl2 = m2.flags(fn)
> > +                diff[fn] = ((None, ''), (n2, fl2))
> > +
> > +        return diff
> > +
> > +    def text(self):
> > +        """Get the full data of this manifest as a bytestring."""
> > +        fl = self.keys()
> > +        _checkforbidden(fl)
> > +
> > +        hex, flags = revlog.hex, self.flags
> > +        # if this is changed to support newlines in filenames,
> > +        # be sure to check the templates/ dir again (especially
> *-raw.tmpl)
> > +        return ''.join("%s\0%s%s\n" % (f, hex(self[f]), flags(f)) for f
> in fl)
> > +
> >  class manifest(revlog.revlog):
> >      def __init__(self, opener):
> >          # During normal operations, we expect to deal with not more
> than four
> >          # revs at a time (such as during commit --amend). When rebasing
> large
> >          # stacks of commits, the number can go up, hence the config
> knob below.
> >          cachesize = 4
> > +        usetreemanifest = False
> >          opts = getattr(opener, 'options', None)
> >          if opts is not None:
> >              cachesize = opts.get('manifestcachesize', cachesize)
> > +            usetreemanifest = opts.get('usetreemanifest',
> usetreemanifest)
> >          self._mancache = util.lrucachedict(cachesize)
> >          revlog.revlog.__init__(self, opener, "00manifest.i")
> > +        self._usetreemanifest = usetreemanifest
> > +
> > +    def _newmanifest(self, data=''):
> > +        if self._usetreemanifest:
> > +            return treemanifest(data)
> > +        return manifestdict(data)
> >
> >      def readdelta(self, node):
> >          r = self.rev(node)
> >          d = mdiff.patchtext(self.revdiff(self.deltaparent(r), r))
> > -        return manifestdict(d)
> > +        return self._newmanifest(d)
> >
> >      def readfast(self, node):
> >          '''use the faster of readdelta or read'''
> > @@ -338,12 +556,12 @@
> >
> >      def read(self, node):
> >          if node == revlog.nullid:
> > -            return manifestdict() # don't upset local cache
> > +            return self._newmanifest() # don't upset local cache
> >          if node in self._mancache:
> >              return self._mancache[node][0]
> >          text = self.revision(node)
> >          arraytext = array.array('c', text)
> > -        m = manifestdict(text)
> > +        m = self._newmanifest(text)
> >          self._mancache[node] = (m, arraytext)
> >          return m
> >
> > @@ -355,12 +573,12 @@
> >              return m.get(f), m.flags(f)
> >          text = self.revision(node)
> >          try:
> > -            return manifestdict(text).find(f)
> > +            return self._newmanifest(text).find(f)
> >          except KeyError:
> >              return None, None
> >
> >      def add(self, m, transaction, link, p1, p2, added, removed):
> > -        if p1 in self._mancache:
> > +        if p1 in self._mancache and not self._usetreemanifest:
> >              # If our first parent is in the manifest cache, we can
> >              # compute a delta here using properties we know about the
> >              # manifest up-front, which may save time later for the
> > _______________________________________________
> > Mercurial-devel mailing list
> > Mercurial-devel at selenic.com
> > http://selenic.com/mailman/listinfo/mercurial-devel
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://selenic.com/pipermail/mercurial-devel/attachments/20150311/d587afb7/attachment.html>


More information about the Mercurial-devel mailing list