[PATCH 2 of 4 v3 lazy-manifest] manifest: use lazymanifest instead of manifestdict
Augie Fackler
raf at durin42.com
Fri Feb 6 17:24:27 CST 2015
# HG changeset patch
# User Augie Fackler <augie at google.com>
# Date 1420746668 18000
# Thu Jan 08 14:51:08 2015 -0500
# Node ID 8f5634aec154113442966c0dd78d74f758cc1e41
# Parent fcbc20ed057e63c23f5f5bf385c9d75a9ec21dbe
manifest: use lazymanifest instead of manifestdict
Before any of my related changes on mozilla-central:
perfmanifest tip
! wall 0.268805 comb 0.260000 user 0.260000 sys 0.000000 (best of 37)
perftags
! result: 162
! wall 0.007099 comb 0.000000 user 0.000000 sys 0.000000 (best of 401)
perfstatus
! wall 0.415680 comb 0.420000 user 0.260000 sys 0.160000 (best of 24)
hgperf export tip
! wall 0.142118 comb 0.140000 user 0.140000 sys 0.000000 (best of 67)
after all of my changes on mozilla-central:
./hg:
perfmanifest tip
! wall 0.232640 comb 0.230000 user 0.220000 sys 0.010000 (best of 43)
perftags
! result: 162
! wall 0.007057 comb 0.010000 user 0.000000 sys 0.010000 (best of 395)
perfstatus
! wall 0.415503 comb 0.420000 user 0.280000 sys 0.140000 (best of 24)
hgperf export tip
! wall 0.025096 comb 0.030000 user 0.030000 sys 0.000000 (best of 102)
so it's no real change in performance on perf{manifest,tags,status},
but is a huge win on 'hgperf export tip'.
This should open the door to doing fast hashing on a per-directory
basis for the manifest type, which is the current plan for supporting
narrow-clone-friendly manifests.
There's a little performance work that could still be done here:
fastdelta() could be done significantly more intelligently by using
the internal state of the lazymanifest type in C, but that seems like
good future work.
Users of pure-Python Mercurial will likely want to spend time
optimizing the version of lazymanifest in pure/parsers.py. Right now
it's the smallest amount of work I could do to make Mercurial still
work in pure-Python, and isn't at all lazy.
diff --git a/hgext/largefiles/reposetup.py b/hgext/largefiles/reposetup.py
--- a/hgext/largefiles/reposetup.py
+++ b/hgext/largefiles/reposetup.py
@@ -35,17 +35,18 @@ def reposetup(ui, repo):
def __getitem__(self, changeid):
ctx = super(lfilesrepo, self).__getitem__(changeid)
if self.lfstatus:
- class lfilesmanifestdict(manifest.manifestdict):
+ class lfilesmanifest(manifest.lazymanifest):
def __contains__(self, filename):
- orig = super(lfilesmanifestdict, self).__contains__
+ orig = super(lfilesmanifest, self).__contains__
return orig(filename) or orig(lfutil.standin(filename))
+
class lfilesctx(ctx.__class__):
def files(self):
filenames = super(lfilesctx, self).files()
return [lfutil.splitstandin(f) or f for f in filenames]
def manifest(self):
man1 = super(lfilesctx, self).manifest()
- man1.__class__ = lfilesmanifestdict
+ man1.__class__ = lfilesmanifest
return man1
def filectx(self, path, fileid=None, filelog=None):
orig = super(lfilesctx, self).filectx
diff --git a/mercurial/manifest.py b/mercurial/manifest.py
--- a/mercurial/manifest.py
+++ b/mercurial/manifest.py
@@ -9,35 +9,40 @@ from i18n import _
import mdiff, parsers, error, revlog, util
import array, struct
-class manifestdict(dict):
- def __init__(self, mapping=None, flags=None):
- if mapping is None:
- mapping = {}
- if flags is None:
- flags = {}
- dict.__init__(self, mapping)
- self._flags = flags
- def __setitem__(self, k, v):
- assert v is not None
- dict.__setitem__(self, k, v)
- def flags(self, f):
- return self._flags.get(f, "")
- def setflag(self, f, flags):
- """Set the flags (symlink, executable) for path f."""
- self._flags[f] = flags
- def copy(self):
- return manifestdict(self, dict.copy(self._flags))
+class lazymanifest(object):
+ def __init__(self, data):
+ self._lm = parsers.lazymanifest(data)
+
+ def __getitem__(self, key):
+ return self._lm[key][0]
+
+ def __len__(self):
+ return len(self._lm)
+
+ def __setitem__(self, key, node):
+ self._lm[key] = node, self.flags(key, '')
+
+ def __contains__(self, key):
+ return key in self._lm
+
+ def __delitem__(self, key):
+ del self._lm[key]
+
+ def keys(self):
+ return [x[0] for x in self._lm]
+
+ def iterkeys(self):
+ return iter(self.keys())
+
def intersectfiles(self, files):
- '''make a new manifestdict with the intersection of self with files
+ '''make a new lazymanifest with the intersection of self with files
The algorithm assumes that files is much smaller than self.'''
- ret = manifestdict()
+ ret = lazymanifest('')
+ lm = self._lm
for fn in files:
- if fn in self:
- ret[fn] = self[fn]
- flags = self._flags.get(fn, None)
- if flags:
- ret._flags[fn] = flags
+ if fn in lm:
+ ret._lm[fn] = self._lm[fn]
return ret
def matches(self, match):
@@ -50,11 +55,9 @@ class manifestdict(dict):
(not match.anypats() and util.all(fn in self for fn in files))):
return self.intersectfiles(files)
- mf = self.copy()
- for fn in mf.keys():
- if not match(fn):
- del mf[fn]
- return mf
+ lm = lazymanifest('')
+ lm._lm = self._lm.filtercopy(match)
+ return lm
def diff(self, m2, clean=False):
'''Finds changes between the current manifest and m2.
@@ -71,35 +74,36 @@ class manifestdict(dict):
the nodeid will be None and the flags will be the empty
string.
'''
- diff = {}
+ return self._lm.diff(m2._lm, clean)
- for fn, n1 in self.iteritems():
- fl1 = self._flags.get(fn, '')
- n2 = m2.get(fn, None)
- fl2 = m2._flags.get(fn, '')
- if n2 is None:
- fl2 = ''
- if n1 != n2 or fl1 != fl2:
- diff[fn] = ((n1, fl1), (n2, fl2))
- elif clean:
- diff[fn] = None
+ def setflag(self, key, flag):
+ self._lm[key] = self[key], flag
- for fn, n2 in m2.iteritems():
- if fn not in self:
- fl2 = m2._flags.get(fn, '')
- diff[fn] = ((None, ''), (n2, fl2))
+ def get(self, key, default=None):
+ try:
+ return self._lm[key][0]
+ except KeyError:
+ return default
- return diff
+ def flags(self, key, default=''):
+ try:
+ return self._lm[key][1]
+ except KeyError:
+ return default
+
+ def __iter__(self):
+ return (x[0] for x in self._lm)
+
+ def copy(self):
+ c = lazymanifest('')
+ c._lm = self._lm.copy()
+ return c
+
+ def iteritems(self):
+ return (x[:2] for x in self._lm)
def text(self):
- """Get the full data of this manifest as a bytestring."""
- fl = sorted(self)
- _checkforbidden(fl)
-
- hex, flags = revlog.hex, self.flags
- # if this is changed to support newlines in filenames,
- # be sure to check the templates/ dir again (especially *-raw.tmpl)
- return ''.join("%s\0%s%s\n" % (f, hex(self[f]), flags(f)) for f in fl)
+ return self._lm.text()
def fastdelta(self, base, changes):
"""Given a base manifest text as an array.array and a list of changes
@@ -119,7 +123,8 @@ class manifestdict(dict):
# bs will either be the index of the item or the insert point
start, end = _msearch(addbuf, f, start)
if not todelete:
- l = "%s\0%s%s\n" % (f, revlog.hex(self[f]), self.flags(f))
+ h, fl = self._lm[f]
+ l = "%s\0%s%s\n" % (f, revlog.hex(h), fl)
else:
if start == end:
# item we want to delete was not found, error out
@@ -213,11 +218,6 @@ def _addlistdelta(addlist, x):
+ content for start, end, content in x)
return deltatext, newaddlist
-def _parse(lines):
- mfdict = manifestdict()
- parsers.parse_manifest(mfdict, mfdict._flags, lines)
- return mfdict
-
class manifest(revlog.revlog):
def __init__(self, opener):
# During normal operations, we expect to deal with not more than four
@@ -232,7 +232,8 @@ class manifest(revlog.revlog):
def readdelta(self, node):
r = self.rev(node)
- return _parse(mdiff.patchtext(self.revdiff(self.deltaparent(r), r)))
+ data = mdiff.patchtext(self.revdiff(self.deltaparent(r), r))
+ return lazymanifest(data)
def readfast(self, node):
'''use the faster of readdelta or read'''
@@ -244,12 +245,12 @@ class manifest(revlog.revlog):
def read(self, node):
if node == revlog.nullid:
- return manifestdict() # don't upset local cache
+ return lazymanifest('') # don't upset local cache
if node in self._mancache:
return self._mancache[node][0]
text = self.revision(node)
arraytext = array.array('c', text)
- mapping = _parse(text)
+ mapping = lazymanifest(text)
self._mancache[node] = (mapping, arraytext)
return mapping
@@ -260,12 +261,10 @@ class manifest(revlog.revlog):
mapping = self._mancache[node][0]
return mapping.get(f), mapping.flags(f)
text = self.revision(node)
- start, end = _msearch(text, f)
- if start == end:
+ try:
+ return lazymanifest(text)._lm[f]
+ except KeyError:
return None, None
- l = text[start:end]
- f, n = l.split('\0')
- return revlog.bin(n[:40]), n[40:-1]
def add(self, map, transaction, link, p1, p2, added, removed):
if p1 in self._mancache:
diff --git a/mercurial/pure/parsers.py b/mercurial/pure/parsers.py
--- a/mercurial/pure/parsers.py
+++ b/mercurial/pure/parsers.py
@@ -5,7 +5,7 @@
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.
-from mercurial.node import bin, nullid
+from mercurial.node import bin, nullid, hex
from mercurial import util
import struct, zlib, cStringIO
@@ -21,15 +21,6 @@ def dirstatetuple(*x):
# x is a tuple
return x
-def parse_manifest(mfdict, fdict, lines):
- for l in lines.splitlines():
- f, n = l.split('\0')
- if len(n) > 40:
- fdict[f] = n[40:]
- mfdict[f] = bin(n[:40])
- else:
- mfdict[f] = bin(n)
-
def parse_index2(data, inline):
def gettype(q):
return int(q & 0xFFFF)
@@ -119,3 +110,80 @@ def pack_dirstate(dmap, copymap, pl, now
write(e)
write(f)
return cs.getvalue()
+
+class lazymanifest(dict):
+ """This is the pure implementation of lazymanifest.
+
+ It has not been optimized *at all* and is not lazy.
+ """
+
+ def __init__(self, data):
+ # This init method does a little bit of excessive-looking
+ # precondition checking. This is so that the behavior of this
+ # class exactly matches its C counterpart to try and help
+ # prevent surprise breakage for anyone that develops against
+ # the pure version.
+ if data and data[-1] != '\n':
+ raise ValueError('Manifest did not end in a newline.')
+ dict.__init__(self)
+ prev = None
+ for l in data.splitlines():
+ if prev is not None and prev > l:
+ raise ValueError('Manifest lines not in sorted order.')
+ prev = l
+ f, n = l.split('\0')
+ if len(n) > 40:
+ self[f] = bin(n[:40]), n[40:]
+ else:
+ self[f] = bin(n), ''
+
+ def __setitem__(self, k, v):
+ node, flag = v
+ assert node is not None
+ if len(node) > 21:
+ node = node[:21] # match c implementation behavior
+ dict.__setitem__(self, k, (node, flag))
+
+ def __iter__(self):
+ return ((f, e[0], e[1]) for f, e in sorted(self.iteritems()))
+
+ def copy(self):
+ c = lazymanifest('')
+ c.update(self)
+ return c
+
+ def diff(self, m2, clean=False):
+ '''Finds changes between the current manifest and m2.'''
+ diff = {}
+
+ for fn, e1 in self.iteritems():
+ if fn not in m2:
+ diff[fn] = e1, (None, '')
+ else:
+ e2 = m2[fn]
+ if e1 != e2:
+ diff[fn] = e1, e2
+ elif clean:
+ diff[fn] = None
+
+ for fn, e2 in m2.iteritems():
+ if fn not in self:
+ diff[fn] = (None, ''), e2
+
+ return diff
+
+ def filtercopy(self, filterfn):
+ c = lazymanifest('')
+ for f, n, fl in self:
+ if filterfn(f):
+ c[f] = n, fl
+ return c
+
+ def text(self):
+ """Get the full data of this manifest as a bytestring."""
+ fl = sorted(self)
+
+ _hex = hex
+ # if this is changed to support newlines in filenames,
+ # be sure to check the templates/ dir again (especially *-raw.tmpl)
+ return ''.join("%s\0%s%s\n" % (f, _hex(n), flag) for f, n, flag in fl)
diff --git a/tests/test-manifest.py b/tests/test-manifest.py
--- a/tests/test-manifest.py
+++ b/tests/test-manifest.py
@@ -5,6 +5,7 @@ import itertools
import silenttestrunner
from mercurial import parsers
+from mercurial import manifest as manifestmod
HASH_1 = '1' * 40
HASH_2 = 'f' * 40
@@ -216,6 +217,13 @@ class testmanifest(unittest.TestCase):
self.assertEqual(HUGE_MANIFEST_ENTRIES, len(m))
self.assertEqual(len(m), len(list(m)))
+ def testIntersectFiles(self):
+ m = manifestmod.lazymanifest(A_HUGE_MANIFEST)
+ m2 = m.intersectfiles(['file1', 'file200', 'file300'])
+ w = ('file1\0%sx\n'
+ 'file200\0%sl\n'
+ 'file300\0%s\n') % (HASH_2, HASH_1, HASH_1)
+ self.assertEqual(w, m2.text())
if __name__ == '__main__':
silenttestrunner.main(__name__)
More information about the Mercurial-devel
mailing list