[PATCH 2 of 3] manifest: API to obtain new nodes in a manifest

Martin von Zweigbergk martinvonz at google.com
Wed Mar 15 14:32:54 EDT 2017


On Tue, Mar 14, 2017 at 12:33 PM, Gregory Szorc <gregory.szorc at gmail.com> wrote:
> # HG changeset patch
> # User Gregory Szorc <gregory.szorc at gmail.com>
> # Date 1489519771 25200
> #      Tue Mar 14 12:29:31 2017 -0700
> # Node ID c130f8c7496a823c92d3d71880e5beb9fb60c0f7
> # Parent  5e02fae5419fcd9462b5be11f7f6d6dc4a04fc92
> manifest: API to obtain new nodes in a manifest
>
> The server.validate config option attempts to verify that all
> new node references seen in an incoming changegroup are present
> in local storage before closing the transaction. This prevents
> buggy (or malicious) clients from adding a manifest that references
> a filelog node that doesn't exist.
>
> Today, server.validate calls manifestctx.readdelta() after
> processing each manifest in the changegroup. readdelta() loads
> the raw diff between the manifest in the revlog and its
> revlog delta parent. For most manifests, this "just works."
> But, when the delta parent is null, the diff is effectively
> a fulltext, and server.validate thinks that *every* file in the
> manifest is new. At the end of changegroup processing, it
> has to iterate through each of those filelogs and verify the
> node exists. You can imagine how slow this is when the manifest
> contains 100,000+ entries and isn't backed by super fast
> storage.
>
> This patch introduces a new API on the manifestctx instance to
> obtain new nodes in a manifest from the perspective of backing
> storage. For the case where a delta is stored in the revlog, it
> effectively does readdelta(). When a fulltext is stored in the
> revlog, it falls back to a fulltext-based diff between the
> revlog parents. This can be slow. But it is faster than having
> server.validate actually open all of the filelogs.
>
> I didn't implement support for tree manifests because I'm not
> sure how. Perhaps someone can enlighten me.

As Durham said on patch 3, you can just call do it in terms of diff.
Luckily, there is already a function that does what you want:
readdelta(). So "return self.readdelta()" should be all you need.

>
> Testing this in .t tests was a bit difficult because these are
> low-level APIs. I gave up and wrote a .py test instead.
>
> diff --git a/mercurial/manifest.py b/mercurial/manifest.py
> --- a/mercurial/manifest.py
> +++ b/mercurial/manifest.py
> @@ -1444,6 +1444,44 @@ class manifestctx(object):
>              return self.readdelta()
>          return self.read()
>
> +    def readstoragenewnodes(self):
> +        """Returns (path, node) pairs of new nodes in this manifest.
> +
> +        In the common case, this looks at the storage delta between this
> +        manifest and its storage parent. In the case where a fulltext is
> +        stored, it computes the delta between its logical parents and uses
> +        that.
> +
> +        The function may emit duplicate entries.
> +
> +        This function is useful for identifying the logically new entries
> +        in a manifest in storage. It is **not** equivalent to diffing a
> +        manifest and should not be used in that capacity.
> +        """
> +        rl = self._revlog()
> +        rev = rl.rev(self._node)
> +        deltaparent = rl.deltaparent(rev)
> +
> +        if deltaparent != revlog.nullrev:
> +            d = mdiff.patchtext(rl.revdiff(deltaparent, rev))
> +            for e in manifestdict(d).iteritems():
> +                yield e
> +
> +            return
> +
> +        # Fulltext in storage. We need to diff against parents and emit
> +        # changes.
> +        data = self.read()
> +
> +        for prev in self.parents:
> +            if prev == revlog.nullid:
> +                continue
> +
> +            base = self._manifestlog[prev].read()
> +            for filename, delta in base.diff(data).iteritems():
> +                if delta[1][0]:
> +                    yield filename, delta[1][0]
> +
>      def readdelta(self, shallow=False):
>          '''Returns a manifest containing just the entries that are present
>          in this manifest, but not in its p1 manifest. This is efficient to read
> diff --git a/tests/test-manifest.py b/tests/test-manifest.py
> --- a/tests/test-manifest.py
> +++ b/tests/test-manifest.py
> @@ -1,13 +1,17 @@
>  from __future__ import absolute_import
>
>  import binascii
> +import collections
>  import itertools
>  import silenttestrunner
>  import unittest
>
> +from mercurial.node import nullid
>  from mercurial import (
> +    hg,
>      manifest as manifestmod,
>      match as matchmod,
> +    ui as uimod,
>  )
>
>  EMTPY_MANIFEST = ''
> @@ -467,5 +471,86 @@ class testtreemanifest(unittest.TestCase
>      def parsemanifest(self, text):
>          return manifestmod.treemanifest('', text)
>
> +class testmanifestctx(unittest.TestCase):
> +    _repocount = 0
> +
> +    def _newrepo(self):
> +        ui = uimod.ui()
> +        repo = hg.repository(ui, path='repo-%d' % self._repocount, create=True)
> +        self._repocount += 1
> +        return repo
> +
> +        return manifestmod.manifestlog(repo.vfs, repo)
> +
> +    def testreadstoragenewnodes(self):
> +        repo = self._newrepo()
> +        rl = manifestmod.manifestrevlog(repo.svfs)
> +
> +        m0 = manifestmod.manifestdict()
> +        for i in range(20):
> +            m0['file%d' % i] = '\x01' * 20
> +
> +        # m1 has a minor change and will be stored as a delta in the revlog.
> +        m1 = m0.copy()
> +        m1['file10'] = '\x02' * 20
> +        m1['file15'] = '\x03' * 20
> +
> +        with repo.transaction('test') as tr:
> +            m0node = rl.add(m0, tr, 0, nullid, nullid, [], [])
> +            m1node = rl.add(m1, tr, 0, m0node, nullid, ['file10', 'file15'],
> +                            [])
> +
> +        # Simple delta works.
> +        self.assertEqual(rl.deltaparent(1), 0)
> +        ml = manifestmod.manifestlog(repo.svfs, repo)
> +        res = list(ml[m1node].readstoragenewnodes())
> +        self.assertEqual(res, [
> +            ('file10', m1['file10']),
> +            ('file15', m1['file15'])])
> +
> +        # Now force a fulltext to be stored in revlog.
> +        rl._maxchainlen = 0
> +        m2 = m0.copy()
> +        m2['file12'] = '\x04' * 20
> +        with repo.transaction('test') as tr:
> +            m2node = rl.add(m2, tr, 0, m1node, nullid, ['file12'], [])
> +
> +        self.assertEqual(rl.deltaparent(2), -1)
> +
> +        ml = manifestmod.manifestlog(repo.svfs, repo)
> +        res = list(ml[m2node].readstoragenewnodes())
> +        self.assertEqual(res, [('file12', m2['file12'])])
> +
> +        # Now force a fulltext with a merge.
> +        m3 = m0.copy()
> +        # m1 (p2) modified file10 and file15. We carry one of those forward
> +        # verbatim and modify another.
> +        m3['file10'] = m1['file10']
> +        m3['file15'] = '\x05' * 20
> +        # We also simulate a change to another random file.
> +        m3['file18'] = '\x06' * 20
> +
> +        with repo.transaction('test') as tr:
> +            m3node = rl.add(m3, tr, 0, m0node, m1node,
> +                            ['file10', 'file15', 'file18'], [])
> +
> +        self.assertEqual(rl.deltaparent(3), -1)
> +
> +        ml = manifestmod.manifestlog(repo.svfs, repo)
> +        # Result has differences from both parents.
> +        # diff(m0, m3) will contribute 10, 15, 18
> +        # diff(m1, m3) will contribute 15, 18
> +
> +        # The output may have dupes. So consolidate.
> +        diffs = collections.defaultdict(set)
> +        for path, node in ml[m3node].readstoragenewnodes():
> +            diffs[path].add(node)
> +
> +        self.assertEqual(diffs, {
> +            'file10': set([m3['file10']]),
> +            'file15': set([m3['file15']]),
> +            'file18': set([m3['file18']]),
> +        })
> +
>  if __name__ == '__main__':
>      silenttestrunner.main(__name__)
> _______________________________________________
> Mercurial-devel mailing list
> Mercurial-devel at mercurial-scm.org
> https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel


More information about the Mercurial-devel mailing list