[PATCH 3 of 6] treemanifest: create treemanifest class
Martin von Zweigbergk
martinvonz at google.com
Wed Mar 11 16:05:37 CDT 2015
Yep, working on it! :-) As I said on IRC, there'll be a V2 soon. If someone
feels like queuing the first 2 patches, go ahead (they're not changing in
V2).
On Wed, Mar 11, 2015 at 1:59 PM Augie Fackler <raf at durin42.com> wrote:
> On Tue, Mar 10, 2015 at 09:23:35PM -0700, Martin von Zweigbergk wrote:
> > # HG changeset patch
> > # User Martin von Zweigbergk <martinvonz at google.com>
> > # Date 1425931076 25200
> > # Mon Mar 09 12:57:56 2015 -0700
> > # Node ID 34ad94f42d39e9ad3e7a7b8695f419bc2ff8b37f
> > # Parent 5671d3b1883e8fdce48e43aabce6ca5c86f6ea70
> > treemanifest: create treemanifest class
> >
> > There are a number of problems with large and flat manifests. Copying
> > from http://mercurial.selenic.com/wiki/ManifestShardingPlan:
> >
> > * manifest too large for RAM
> >
> > * manifest resolution too much CPU (long delta chains)
> >
> > * committing is slow because entire manifest has to be hashed
> >
> > * impossible for narrow clone to leave out part of manifest as all is
> > needed to calculate new hash
> >
> > * diffing two revisions involves traversing entire subdirectories
> > even if identical
> >
> > This is a first step in a series introducing a manifest revlog per
> > directory.
> >
> > This change adds boolean configuration option
> > experimental.treemanifest. When the option is enabled, manifests are
> > parsed into a new tree data structure with one tree node per
> > directory. At this point, it is just a different data structure in
> > memory; there is still just a single manifest revlog on disk.
> >
> > The code is not yet optimized. Enabling the option makes most or all
> > operations slower. Once we start storing manifest revlogs for every
> > directory, it should be possible to make many of these operations much
> > faster. The fastdelta() optimization has been explicitly disabled (and
> > not implemented) for the treemanifests.
> >
> > Tests can be run using the new data structure by switching the config
> > option default in localrepo._applyrequirements(). All tests pass.
> >
> > diff -r 5671d3b1883e -r 34ad94f42d39 mercurial/localrepo.py
> > --- a/mercurial/localrepo.py Tue Mar 10 16:26:13 2015 -0700
> > +++ b/mercurial/localrepo.py Mon Mar 09 12:57:56 2015 -0700
> > @@ -326,6 +326,9 @@
> > manifestcachesize = self.ui.configint('format',
> 'manifestcachesize')
> > if manifestcachesize is not None:
> > self.svfs.options['manifestcachesize'] = manifestcachesize
> > + usetreemanifest = self.ui.configbool('experimental',
> 'treemanifest')
> > + if usetreemanifest is not None:
> > + self.svfs.options['usetreemanifest'] = usetreemanifest
> >
> > def _writerequirements(self):
> > reqfile = self.vfs("requires", "w")
> > diff -r 5671d3b1883e -r 34ad94f42d39 mercurial/manifest.py
> > --- a/mercurial/manifest.py Tue Mar 10 16:26:13 2015 -0700
> > +++ b/mercurial/manifest.py Mon Mar 09 12:57:56 2015 -0700
> > @@ -311,22 +311,240 @@
> > + content for start, end, content in x)
> > return deltatext, newaddlist
> >
> > +def _splittopdir(f):
> > + if '/' in f:
> > + dir, subpath = f.split('/', 1)
> > + return dir + '/', subpath
> > + else:
> > + return '', f
> > +
> > +class treemanifest(object):
>
> This class can (and probably should) use _lazymanifest directly,
> rather than proxying through the higher-level dict-like abstraction.
>
> > + def __init__(self, text=''):
> > + self._dirs = {}
> > + self._files = {}
> > + self._flags = {}
> > + mfdict = manifestdict(text)
> > + for f, n in mfdict.iteritems():
> > + self[f] = n
> > + if mfdict.flags(f):
> > + self.setflag(f, mfdict.flags(f))
> > +
> > + def __len__(self):
> > + size = len(self._files)
> > + for m in self._dirs.values():
> > + size += m.__len__()
> > + return size
> > +
> > + def iteritems(self):
> > + for p, n in sorted(self._dirs.items() + self._files.items()):
> > + if p in self._files:
> > + yield p, n
> > + else:
> > + for sf, sn in n.iteritems():
> > + yield p + sf, sn
> > +
> > + def iterkeys(self):
> > + for p in sorted(self._dirs.keys() + self._files.keys()):
> > + if p in self._files:
> > + yield p
> > + else:
> > + for f in self._dirs[p].iterkeys():
> > + yield p + f
> > +
> > + def keys(self):
> > + return list(self.iterkeys())
> > +
> > + def __iter__(self):
> > + return (f for f in self.iterkeys())
> > +
> > + def __contains__(self, f):
> > + if f is None:
> > + return False
> > + dir, subpath = _splittopdir(f)
> > + if dir:
> > + if dir not in self._dirs:
> > + return False
> > + return self._dirs[dir].__contains__(subpath)
> > + else:
> > + return f in self._files
> > +
> > + def get(self, f, default=None):
> > + dir, subpath = _splittopdir(f)
> > + if dir:
> > + if dir not in self._dirs:
> > + return default
> > + return self._dirs[dir].get(subpath, default)
> > + else:
> > + return self._files.get(f, default)
> > +
> > + def __getitem__(self, f):
> > + dir, subpath = _splittopdir(f)
> > + if dir:
> > + return self._dirs[dir].__getitem__(subpath)
> > + else:
> > + return self._files[f]
> > +
> > + def flags(self, f):
> > + dir, subpath = _splittopdir(f)
> > + if dir:
> > + if dir not in self._dirs:
> > + return ''
> > + return self._dirs[dir].flags(subpath)
> > + else:
> > + if f in self._dirs:
> > + return ''
> > + return self._flags.get(f, '')
> > +
> > + def find(self, f):
> > + dir, subpath = _splittopdir(f)
> > + if dir:
> > + return self._dirs[dir].find(subpath)
> > + else:
> > + return self._files[f], self._flags.get(f, '')
> > +
> > + def __delitem__(self, f):
> > + dir, subpath = _splittopdir(f)
> > + if dir:
> > + self._dirs[dir].__delitem__(subpath)
> > + if not self._dirs[dir]._dirs and not self._dirs[dir]._files:
> > + del self._dirs[dir]
> > + else:
> > + del self._files[f]
> > + if f in self._flags:
> > + del self._flags[f]
> > +
> > + def __setitem__(self, f, n):
> > + assert n is not None
> > + dir, subpath = _splittopdir(f)
> > + if dir:
> > + if dir not in self._dirs:
> > + self._dirs[dir] = treemanifest()
> > + self._dirs[dir].__setitem__(subpath, n)
> > + else:
> > + self._files[f] = n
> > +
> > + def setflag(self, f, flags):
> > + """Set the flags (symlink, executable) for path f."""
> > + dir, subpath = _splittopdir(f)
> > + if dir:
> > + if dir not in self._dirs:
> > + self._dirs[dir] = treemanifest()
> > + self._dirs[dir].setflag(subpath, flags)
> > + else:
> > + self._flags[f] = flags
> > +
> > + def copy(self):
> > + copy = treemanifest()
> > + for d in self._dirs:
> > + copy._dirs[d] = self._dirs[d].copy()
> > + copy._files = dict.copy(self._files)
> > + copy._flags = dict.copy(self._flags)
> > + return copy
> > +
> > + def intersectfiles(self, files):
> > + '''make a new treemanifest with the intersection of self with
> files
> > +
> > + The algorithm assumes that files is much smaller than self.'''
> > + ret = treemanifest()
> > + for fn in files:
> > + if fn in self:
> > + ret[fn] = self[fn]
> > + flags = self.flags(fn)
> > + if flags:
> > + ret.setflag(fn, flags)
> > + return ret
> > +
> > + def filesnotin(self, m2):
> > + '''Set of files in this manifest that are not in the other'''
> > + files = set(self.iterkeys())
> > + files.difference_update(m2.iterkeys())
> > + return files
> > +
> > + def matches(self, match):
> > + '''generate a new manifest filtered by the match argument'''
> > + if match.always():
> > + return self.copy()
> > +
> > + files = match.files()
> > + if (match.matchfn == match.exact or
> > + (not match.anypats() and util.all(fn in self for fn in
> files))):
> > + return self.intersectfiles(files)
> > +
> > + m = self.copy()
> > + for fn in m.keys():
> > + if not match(fn):
> > + del m[fn]
> > + return m
> > +
> > + def diff(self, m2, clean=False):
> > + '''Finds changes between the current manifest and m2.
> > +
> > + Args:
> > + m2: the manifest to which this manifest should be compared.
> > + clean: if true, include files unchanged between these
> manifests
> > + with a None value in the returned dictionary.
> > +
> > + The result is returned as a dict with filename as key and
> > + values of the form ((n1,fl1),(n2,fl2)), where n1/n2 is the
> > + nodeid in the current/other manifest and fl1/fl2 is the flag
> > + in the current/other manifest. Where the file does not exist,
> > + the nodeid will be None and the flags will be the empty
> > + string.
> > + '''
> > + diff = {}
> > +
> > + for fn, n1 in self.iteritems():
> > + fl1 = self.flags(fn)
> > + n2 = m2.get(fn, None)
> > + fl2 = m2.flags(fn)
> > + if n2 is None:
> > + fl2 = ''
> > + if n1 != n2 or fl1 != fl2:
> > + diff[fn] = ((n1, fl1), (n2, fl2))
> > + elif clean:
> > + diff[fn] = None
> > +
> > + for fn, n2 in m2.iteritems():
> > + if fn not in self:
> > + fl2 = m2.flags(fn)
> > + diff[fn] = ((None, ''), (n2, fl2))
> > +
> > + return diff
> > +
> > + def text(self):
> > + """Get the full data of this manifest as a bytestring."""
> > + fl = self.keys()
> > + _checkforbidden(fl)
> > +
> > + hex, flags = revlog.hex, self.flags
> > + # if this is changed to support newlines in filenames,
> > + # be sure to check the templates/ dir again (especially
> *-raw.tmpl)
> > + return ''.join("%s\0%s%s\n" % (f, hex(self[f]), flags(f)) for f
> in fl)
> > +
> > class manifest(revlog.revlog):
> > def __init__(self, opener):
> > # During normal operations, we expect to deal with not more
> than four
> > # revs at a time (such as during commit --amend). When rebasing
> large
> > # stacks of commits, the number can go up, hence the config
> knob below.
> > cachesize = 4
> > + usetreemanifest = False
> > opts = getattr(opener, 'options', None)
> > if opts is not None:
> > cachesize = opts.get('manifestcachesize', cachesize)
> > + usetreemanifest = opts.get('usetreemanifest',
> usetreemanifest)
> > self._mancache = util.lrucachedict(cachesize)
> > revlog.revlog.__init__(self, opener, "00manifest.i")
> > + self._usetreemanifest = usetreemanifest
> > +
> > + def _newmanifest(self, data=''):
> > + if self._usetreemanifest:
> > + return treemanifest(data)
> > + return manifestdict(data)
> >
> > def readdelta(self, node):
> > r = self.rev(node)
> > d = mdiff.patchtext(self.revdiff(self.deltaparent(r), r))
> > - return manifestdict(d)
> > + return self._newmanifest(d)
> >
> > def readfast(self, node):
> > '''use the faster of readdelta or read'''
> > @@ -338,12 +556,12 @@
> >
> > def read(self, node):
> > if node == revlog.nullid:
> > - return manifestdict() # don't upset local cache
> > + return self._newmanifest() # don't upset local cache
> > if node in self._mancache:
> > return self._mancache[node][0]
> > text = self.revision(node)
> > arraytext = array.array('c', text)
> > - m = manifestdict(text)
> > + m = self._newmanifest(text)
> > self._mancache[node] = (m, arraytext)
> > return m
> >
> > @@ -355,12 +573,12 @@
> > return m.get(f), m.flags(f)
> > text = self.revision(node)
> > try:
> > - return manifestdict(text).find(f)
> > + return self._newmanifest(text).find(f)
> > except KeyError:
> > return None, None
> >
> > def add(self, m, transaction, link, p1, p2, added, removed):
> > - if p1 in self._mancache:
> > + if p1 in self._mancache and not self._usetreemanifest:
> > # If our first parent is in the manifest cache, we can
> > # compute a delta here using properties we know about the
> > # manifest up-front, which may save time later for the
> > _______________________________________________
> > Mercurial-devel mailing list
> > Mercurial-devel at selenic.com
> > http://selenic.com/mailman/listinfo/mercurial-devel
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://selenic.com/pipermail/mercurial-devel/attachments/20150311/d587afb7/attachment.html>
More information about the Mercurial-devel
mailing list