[PATCH 3 of 6 v6 lazy-manifest] manifest: split manifestdict into high-level and low-level logic

Martin von Zweigbergk martinvonz at google.com
Sun Mar 8 00:43:17 CST 2015


On Sat, Mar 7, 2015 at 9:25 AM Augie Fackler <raf at durin42.com> wrote:

> # HG changeset patch
> # User Augie Fackler <augie at google.com>
> # Date 1425747879 18000
> #      Sat Mar 07 12:04:39 2015 -0500
> # Node ID 74e64852b07f9cfb5a7b89d827dd9e1f01314b1b
> # Parent  2524c117ce836e18402cac792361f0e44cac42c5
> manifest: split manifestdict into high-level and low-level logic
>
> The low-level logic type (_lazymanifest) matches the behavior of the C
> implementation introduced in a5f1bccd. A future patch will use that
> when available.
>
> diff --git a/mercurial/manifest.py b/mercurial/manifest.py
> --- a/mercurial/manifest.py
> +++ b/mercurial/manifest.py
> @@ -9,53 +9,119 @@ from i18n import _
>  import mdiff, parsers, error, revlog, util
>  import array, struct
>
> -# Pure Python fallback
> -def _parsemanifest(mfdict, fdict, lines):
> -    bin = revlog.bin
> -    for l in lines.splitlines():
> -        f, n = l.split('\0')
> -        if len(n) > 40:
> -            fdict[f] = n[40:]
> -            mfdict[f] = bin(n[:40])
> -        else:
> -            mfdict[f] = bin(n)
>
> -def _parse(lines, mfdict, flags):
> -    try:
> -        parsers.parse_manifest(mfdict, flags, lines)
> -    except AttributeError:
> -        _parsemanifest(mfdict, flags, lines)
> -    return mfdict
> +class _lazymanifest(dict):
> +    """This is the pure implementation of lazymanifest.
>
> -class manifestdict(dict):
> -    def __init__(self, data=''):
> -        self._flags = {}
> -        _parse(data, self, self._flags)
> +    It has not been optimized *at all* and is not lazy.
> +    """
> +
> +    def __init__(self, data):
> +        # This init method does a little bit of excessive-looking
> +        # precondition checking. This is so that the behavior of this
> +        # class exactly matches its C counterpart to try and help
> +        # prevent surprise breakage for anyone that develops against
> +        # the pure version.
> +        if data and data[-1] != '\n':
> +            raise ValueError('Manifest did not end in a newline.')
> +        dict.__init__(self)
> +        prev = None
> +        for l in data.splitlines():
> +            if prev is not None and prev > l:
> +                raise ValueError('Manifest lines not in sorted order.')
> +            prev = l
> +            f, n = l.split('\0')
> +            if len(n) > 40:
> +                self[f] = revlog.bin(n[:40]), n[40:]
> +            else:
> +                self[f] = revlog.bin(n), ''
>
>      def __setitem__(self, k, v):
> -        assert v is not None
> -        dict.__setitem__(self, k, v)
> -    def flags(self, f):
> -        return self._flags.get(f, "")
> -    def setflag(self, f, flags):
> -        """Set the flags (symlink, executable) for path f."""
> -        self._flags[f] = flags
> +        node, flag = v
> +        assert node is not None
> +        if len(node) > 21:
> +            node = node[:21] # match c implementation behavior
> +        dict.__setitem__(self, k, (node, flag))
> +
> +    def __iter__(self):
> +        return ((f, e[0], e[1]) for f, e in sorted(self.iteritems()))
>

What's the reason for yielding (f, n, fl) instead of the more natural (f,
(n, fl))? The infamous effect tuples have on Python's GC? Also see
questions further down.

+
>      def copy(self):
> -        copy = manifestdict()
> -        dict.__init__(copy, self)
> -        copy._flags = dict.copy(self._flags)
> -        return copy
> +        c = _lazymanifest('')
> +        c.update(self)
> +        return c
> +
> +    def diff(self, m2, clean=False):
> +        '''Finds changes between the current manifest and m2.'''
> +        diff = {}
> +
> +        for fn, e1 in self.iteritems():
> +            if fn not in m2:
> +                diff[fn] = e1, (None, '')
> +            else:
> +                e2 = m2[fn]
> +                if e1 != e2:
> +                    diff[fn] = e1, e2
> +                elif clean:
> +                    diff[fn] = None
> +
> +        for fn, e2 in m2.iteritems():
> +            if fn not in self:
> +                diff[fn] = (None, ''), e2
> +
> +        return diff
> +
> +    def filtercopy(self, filterfn):
> +        c = _lazymanifest('')
> +        for f, n, fl in self:
> +            if filterfn(f):
> +                c[f] = n, fl
> +        return c
> +
> +    def text(self):
> +        """Get the full data of this manifest as a bytestring."""
> +        fl = sorted(self)
> +
> +        _hex = revlog.hex
> +        # if this is changed to support newlines in filenames,
> +        # be sure to check the templates/ dir again (especially
> *-raw.tmpl)
> +        return ''.join("%s\0%s%s\n" % (
> +            f, _hex(n[:20]), flag) for f, n, flag in fl)
> +
> +class manifestdict(object):
>

So _lazymanifest is not lazy, manifestdict is not a dict, but _lazymanifest
is a dict :-) Just a note, not asking for anything to change.


> +    def __init__(self, data=''):
> +        self._lm = _lazymanifest(data)
> +
> +    def __getitem__(self, key):
> +        return self._lm[key][0]
> +
> +    def __len__(self):
> +        return len(self._lm)
> +
> +    def __setitem__(self, key, node):
> +        self._lm[key] = node, self.flags(key, '')
> +
> +    def __contains__(self, key):
> +        return key in self._lm
> +
> +    def __delitem__(self, key):
> +        del self._lm[key]
> +
> +    def keys(self):
> +        return [x[0] for x in self._lm]
> +
> +    def iterkeys(self):
> +        return iter(self.keys())
> +
>      def intersectfiles(self, files):
> -        '''make a new manifestdict with the intersection of self with
> files
> +        '''make a new lazymanifest with the intersection of self with
> files
>
>          The algorithm assumes that files is much smaller than self.'''
>          ret = manifestdict()
> +        lm = self._lm
>          for fn in files:
> -            if fn in self:
> -                ret[fn] = self[fn]
> -                flags = self._flags.get(fn, None)
> -                if flags:
> -                    ret._flags[fn] = flags
> +            if fn in lm:
> +                ret._lm[fn] = self._lm[fn]
>          return ret
>
>      def filesnotin(self, m2):
> @@ -74,11 +140,9 @@ class manifestdict(dict):
>              (not match.anypats() and util.all(fn in self for fn in
> files))):
>              return self.intersectfiles(files)
>
> -        m = self.copy()
> -        for fn in m.keys():
> -            if not match(fn):
> -                del m[fn]
> -        return m
> +        lm = manifestdict('')
> +        lm._lm = self._lm.filtercopy(match)
> +        return lm
>
>      def diff(self, m2, clean=False):
>          '''Finds changes between the current manifest and m2.
> @@ -95,35 +159,36 @@ class manifestdict(dict):
>          the nodeid will be None and the flags will be the empty
>          string.
>          '''
> -        diff = {}
> +        return self._lm.diff(m2._lm, clean)
>
> -        for fn, n1 in self.iteritems():
> -            fl1 = self._flags.get(fn, '')
> -            n2 = m2.get(fn, None)
> -            fl2 = m2._flags.get(fn, '')
> -            if n2 is None:
> -                fl2 = ''
> -            if n1 != n2 or fl1 != fl2:
> -                diff[fn] = ((n1, fl1), (n2, fl2))
> -            elif clean:
> -                diff[fn] = None
> +    def setflag(self, key, flag):
> +        self._lm[key] = self[key], flag
>
> -        for fn, n2 in m2.iteritems():
> -            if fn not in self:
> -                fl2 = m2._flags.get(fn, '')
> -                diff[fn] = ((None, ''), (n2, fl2))
> +    def get(self, key, default=None):
> +        try:
> +            return self._lm[key][0]
> +        except KeyError:
> +            return default
>
> -        return diff
> +    def flags(self, key, default=''):
> +        try:
> +            return self._lm[key][1]
> +        except KeyError:
> +            return default
> +
> +    def __iter__(self):
> +        return (x[0] for x in self._lm)
>

If the answer to my question above was about GC, then wouldn't this (the
creation of tuples that are immediately thrown away) be an equally big
problem? If the low-level type's __iter__ was left untouched (and
iteritems() would be used where __iter__ is currently used), then there
wouldn't be any tuples involved here.


> +
> +    def copy(self):
> +        c = manifestdict('')
> +        c._lm = self._lm.copy()
> +        return c
> +
> +    def iteritems(self):
> +        return (x[:2] for x in self._lm)
>

Similar comment here: this creates a 3-tuple that gets converted into a
2-tuple. That does seem better than creating 2 2-tuples (f, (n, e)) that
converted into another one (f, n). Even better would be to add a new
filesandnodes() generator that yields exactly the wanted (f, n) tuples, no?
OTOH, is that really what we want in many places? I have a feeling that
most places that user iteritems() would actually prefer all three fields.
Yielding (f, (n, e)) seems cleaner to me, but if GC is a concern, then
perhaps we'd have to yield (f, n, e). What do you think?


>
>      def text(self):
> -        """Get the full data of this manifest as a bytestring."""
> -        fl = sorted(self)
> -        _checkforbidden(fl)
> -
> -        hex, flags = revlog.hex, self.flags
> -        # if this is changed to support newlines in filenames,
> -        # be sure to check the templates/ dir again (especially
> *-raw.tmpl)
> -        return ''.join("%s\0%s%s\n" % (f, hex(self[f]), flags(f)) for f
> in fl)
> +        return self._lm.text()
>
>      def fastdelta(self, base, changes):
>          """Given a base manifest text as an array.array and a list of
> changes
> @@ -143,7 +208,8 @@ class manifestdict(dict):
>              # bs will either be the index of the item or the insert point
>              start, end = _msearch(addbuf, f, start)
>              if not todelete:
> -                l = "%s\0%s%s\n" % (f, revlog.hex(self[f]), self.flags(f))
> +                h, fl = self._lm[f]
> +                l = "%s\0%s%s\n" % (f, revlog.hex(h), fl)
>              else:
>                  if start == end:
>                      # item we want to delete was not found, error out
> @@ -280,12 +346,10 @@ class manifest(revlog.revlog):
>              m = self._mancache[node][0]
>              return m.get(f), m.flags(f)
>          text = self.revision(node)
> -        start, end = _msearch(text, f)
> -        if start == end:
> +        try:
> +            return manifestdict(text)._lm[f]
> +        except KeyError:
>              return None, None
> -        l = text[start:end]
> -        f, n = l.split('\0')
> -        return revlog.bin(n[:40]), n[40:-1]
>
>      def add(self, m, transaction, link, p1, p2, added, removed):
>          if p1 in self._mancache:
> diff --git a/tests/test-manifest.py b/tests/test-manifest.py
> --- a/tests/test-manifest.py
> +++ b/tests/test-manifest.py
> @@ -4,7 +4,7 @@ import itertools
>
>  import silenttestrunner
>
> -from mercurial import parsers
> +from mercurial import manifest as manifestmod
>
>  HASH_1 = '1' * 40
>  HASH_2 = 'f' * 40
> @@ -38,12 +38,12 @@ class testmanifest(unittest.TestCase):
>          self.assert_(thing in container, msg)
>
>      def testEmptyManifest(self):
> -        m = parsers.lazymanifest('')
> +        m = manifestmod._lazymanifest('')
>          self.assertEqual(0, len(m))
>          self.assertEqual([], list(m))
>
>      def testManifest(self):
> -        m = parsers.lazymanifest(A_SHORT_MANIFEST)
> +        m = manifestmod._lazymanifest(A_SHORT_MANIFEST)
>          want = [
>              ('bar/baz/qux.py', binascii.unhexlify(HASH_2), 'l'),
>              ('foo', binascii.unhexlify(HASH_1), ''),
> @@ -58,13 +58,13 @@ class testmanifest(unittest.TestCase):
>      def testSetItem(self):
>          want = binascii.unhexlify(HASH_1), ''
>
> -        m = parsers.lazymanifest('')
> +        m = manifestmod._lazymanifest('')
>          m['a'] = want
>          self.assertIn('a', m)
>          self.assertEqual(want, m['a'])
>          self.assertEqual('a\0' + HASH_1 + '\n', m.text())
>
> -        m = parsers.lazymanifest(A_SHORT_MANIFEST)
> +        m = manifestmod._lazymanifest(A_SHORT_MANIFEST)
>          m['a'] = want
>          self.assertEqual(want, m['a'])
>          self.assertEqual('a\0' + HASH_1 + '\n' + A_SHORT_MANIFEST,
> @@ -76,7 +76,7 @@ class testmanifest(unittest.TestCase):
>      def testCompaction(self):
>          unhex = binascii.unhexlify
>          h1, h2 = unhex(HASH_1), unhex(HASH_2)
> -        m = parsers.lazymanifest(A_SHORT_MANIFEST)
> +        m = manifestmod._lazymanifest(A_SHORT_MANIFEST)
>          m['alpha'] = h1, ''
>          m['beta'] = h2, ''
>          del m['foo']
> @@ -91,8 +91,8 @@ class testmanifest(unittest.TestCase):
>          self.assertEqual(w, list(m))
>
>      def testSetGetNodeSuffix(self):
> -        clean = parsers.lazymanifest(A_SHORT_MANIFEST)
> -        m = parsers.lazymanifest(A_SHORT_MANIFEST)
> +        clean = manifestmod._lazymanifest(A_SHORT_MANIFEST)
> +        m = manifestmod._lazymanifest(A_SHORT_MANIFEST)
>          h, f = m['foo']
>          want = h + 'a', f
>          # Merge code wants to set 21-byte fake hashes at times
> @@ -120,7 +120,7 @@ class testmanifest(unittest.TestCase):
>          self.assertEqual({'foo': ((h, ''), want)}, clean.diff(m))
>
>      def testFilterCopyException(self):
> -        m = parsers.lazymanifest(A_SHORT_MANIFEST)
> +        m = manifestmod._lazymanifest(A_SHORT_MANIFEST)
>          def filt(path):
>              if path == 'foo':
>                  assert False
> @@ -128,7 +128,7 @@ class testmanifest(unittest.TestCase):
>          self.assertRaises(AssertionError, m.filtercopy, filt)
>
>      def testRemoveItem(self):
> -        m = parsers.lazymanifest(A_SHORT_MANIFEST)
> +        m = manifestmod._lazymanifest(A_SHORT_MANIFEST)
>          del m['foo']
>          self.assertRaises(KeyError, lambda : m['foo'])
>          self.assertEqual(1, len(m))
> @@ -138,9 +138,9 @@ class testmanifest(unittest.TestCase):
>          MISSING = (None, '')
>          addl = 'z-only-in-left\0' + HASH_1 + '\n'
>          addr = 'z-only-in-right\0' + HASH_2 + 'x\n'
> -        left = parsers.lazymanifest(
> +        left = manifestmod._lazymanifest(
>              A_SHORT_MANIFEST.replace(HASH_1, HASH_3 + 'x') + addl)
> -        right = parsers.lazymanifest(A_SHORT_MANIFEST + addr)
> +        right = manifestmod._lazymanifest(A_SHORT_MANIFEST + addr)
>          want = {
>              'foo': ((binascii.unhexlify(HASH_3), 'x'),
>                      (binascii.unhexlify(HASH_1), '')),
> @@ -154,14 +154,14 @@ class testmanifest(unittest.TestCase):
>              'foo': (MISSING, (binascii.unhexlify(HASH_3), 'x')),
>              'z-only-in-left': (MISSING, (binascii.unhexlify(HASH_1), '')),
>              }
> -        self.assertEqual(want, parsers.lazymanifest('').diff(left))
> +        self.assertEqual(want, manifestmod._lazymanifest('').diff(left))
>
>          want = {
>              'bar/baz/qux.py': ((binascii.unhexlify(HASH_2), 'l'),
> MISSING),
>              'foo': ((binascii.unhexlify(HASH_3), 'x'), MISSING),
>              'z-only-in-left': ((binascii.unhexlify(HASH_1), ''), MISSING),
>              }
> -        self.assertEqual(want, left.diff(parsers.lazymanifest('')))
> +        self.assertEqual(want, left.diff(manifestmod._lazymanifest('')))
>          copy = right.copy()
>          del copy['z-only-in-right']
>          del right['foo']
> @@ -171,7 +171,7 @@ class testmanifest(unittest.TestCase):
>              }
>          self.assertEqual(want, right.diff(copy))
>
> -        short = parsers.lazymanifest(A_SHORT_MANIFEST)
> +        short = manifestmod._lazymanifest(A_SHORT_MANIFEST)
>          pruned = short.copy()
>          del pruned['foo']
>          want = {
> @@ -192,30 +192,37 @@ class testmanifest(unittest.TestCase):
>          backwards = ''.join(
>              l + '\n' for l in reversed(A_SHORT_MANIFEST.split('\n')) if
> l)
>          try:
> -            parsers.lazymanifest(backwards)
> +            manifestmod._lazymanifest(backwards)
>              self.fail('Should have raised ValueError')
>          except ValueError, v:
>              self.assertIn('Manifest lines not in sorted order.', str(v))
>
>      def testNoTerminalNewline(self):
>          try:
> -            parsers.lazymanifest(A_SHORT_MANIFEST + 'wat')
> +            manifestmod._lazymanifest(A_SHORT_MANIFEST + 'wat')
>              self.fail('Should have raised ValueError')
>          except ValueError, v:
>              self.assertIn('Manifest did not end in a newline.', str(v))
>
>      def testNoNewLineAtAll(self):
>          try:
> -            parsers.lazymanifest('wat')
> +            manifestmod._lazymanifest('wat')
>              self.fail('Should have raised ValueError')
>          except ValueError, v:
>              self.assertIn('Manifest did not end in a newline.', str(v))
>
>      def testHugeManifest(self):
> -        m = parsers.lazymanifest(A_HUGE_MANIFEST)
> +        m = manifestmod._lazymanifest(A_HUGE_MANIFEST)
>          self.assertEqual(HUGE_MANIFEST_ENTRIES, len(m))
>          self.assertEqual(len(m), len(list(m)))
>
> +    def testIntersectFiles(self):
> +        m = manifestmod.manifestdict(A_HUGE_MANIFEST)
> +        m2 = m.intersectfiles(['file1', 'file200', 'file300'])
> +        w = ('file1\0%sx\n'
> +             'file200\0%sl\n'
> +             'file300\0%s\n') % (HASH_2, HASH_1, HASH_1)
> +        self.assertEqual(w, m2.text())
>
>  if __name__ == '__main__':
>      silenttestrunner.main(__name__)
> _______________________________________________
> Mercurial-devel mailing list
> Mercurial-devel at selenic.com
> http://selenic.com/mailman/listinfo/mercurial-devel
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://selenic.com/pipermail/mercurial-devel/attachments/20150308/514c44e2/attachment.html>


More information about the Mercurial-devel mailing list