[PATCH 2 of 3] manifest: API to obtain new nodes in a manifest

Durham Goode durham at fb.com
Wed Mar 15 13:34:36 EDT 2017



On 3/14/17 12:33 PM, Gregory Szorc wrote:
> # HG changeset patch
> # User Gregory Szorc <gregory.szorc at gmail.com>
> # Date 1489519771 25200
> #      Tue Mar 14 12:29:31 2017 -0700
> # Node ID c130f8c7496a823c92d3d71880e5beb9fb60c0f7
> # Parent  5e02fae5419fcd9462b5be11f7f6d6dc4a04fc92
> manifest: API to obtain new nodes in a manifest
>
> The server.validate config option attempts to verify that all
> new node references seen in an incoming changegroup are present
> in local storage before closing the transaction. This prevents
> buggy (or malicious) clients from adding a manifest that references
> a filelog node that doesn't exist.
>
> Today, server.validate calls manifestctx.readdelta() after
> processing each manifest in the changegroup. readdelta() loads
> the raw diff between the manifest in the revlog and its
> revlog delta parent. For most manifests, this "just works."
> But, when the delta parent is null, the diff is effectively
> a fulltext, and server.validate thinks that *every* file in the
> manifest is new. At the end of changegroup processing, it
> has to iterate through each of those filelogs and verify the
> node exists. You can imagine how slow this is when the manifest
> contains 100,000+ entries and isn't backed by super fast
> storage.
>
> This patch introduces a new API on the manifestctx instance to
> obtain new nodes in a manifest from the perspective of backing
> storage. For the case where a delta is stored in the revlog, it
> effectively does readdelta(). When a fulltext is stored in the
> revlog, it falls back to a fulltext-based diff between the
> revlog parents. This can be slow. But it is faster than having
> server.validate actually open all of the filelogs.
>
> I didn't implement support for tree manifests because I'm not
> sure how. Perhaps someone can enlighten me.
>
> Testing this in .t tests was a bit difficult because these are
> low-level APIs. I gave up and wrote a .py test instead.
>
> diff --git a/mercurial/manifest.py b/mercurial/manifest.py
> --- a/mercurial/manifest.py
> +++ b/mercurial/manifest.py
> @@ -1444,6 +1444,44 @@ class manifestctx(object):
>              return self.readdelta()
>          return self.read()
>
> +    def readstoragenewnodes(self):
> +        """Returns (path, node) pairs of new nodes in this manifest.
> +
> +        In the common case, this looks at the storage delta between this
> +        manifest and its storage parent. In the case where a fulltext is
> +        stored, it computes the delta between its logical parents and uses
> +        that.
> +
> +        The function may emit duplicate entries.
> +
> +        This function is useful for identifying the logically new entries
> +        in a manifest in storage. It is **not** equivalent to diffing a
Might be worth making it more explicit about what "logically new 
entries" are.  For instance, if the deltabase is p1, then the results 
are the new nodes in the manifest.  If the deltabase is not p1, then the 
results are the differences between this manifest and that random other 
manifest, but in a way that may be useful for incrementally verifying 
appends to the store.
> +        manifest and should not be used in that capacity.
> +        """
> +        rl = self._revlog()
> +        rev = rl.rev(self._node)
> +        deltaparent = rl.deltaparent(rev)
> +
> +        if deltaparent != revlog.nullrev:

The definition of this function might be cleaner if we say "if 
deltaparent != rl.parentrevs(self._node)[0]:".  That way the result is 
always "new nodes, relative to p1", and it just happens to be fast most 
of the time.

> +            d = mdiff.patchtext(rl.revdiff(deltaparent, rev))
> +            for e in manifestdict(d).iteritems():
> +                yield e
> +
> +            return
> +
> +        # Fulltext in storage. We need to diff against parents and emit
> +        # changes.
> +        data = self.read()
> +
> +        for prev in self.parents:
> +            if prev == revlog.nullid:
> +                continue
> +
> +            base = self._manifestlog[prev].read()
> +            for filename, delta in base.diff(data).iteritems():
> +                if delta[1][0]:
> +                    yield filename, delta[1][0]
> +
>      def readdelta(self, shallow=False):
>          '''Returns a manifest containing just the entries that are present
>          in this manifest, but not in its p1 manifest. This is efficient to read
> diff --git a/tests/test-manifest.py b/tests/test-manifest.py
> --- a/tests/test-manifest.py
> +++ b/tests/test-manifest.py
> @@ -1,13 +1,17 @@
>  from __future__ import absolute_import
>
>  import binascii
> +import collections
>  import itertools
>  import silenttestrunner
>  import unittest
>
> +from mercurial.node import nullid
>  from mercurial import (
> +    hg,
>      manifest as manifestmod,
>      match as matchmod,
> +    ui as uimod,
>  )
>
>  EMTPY_MANIFEST = ''
> @@ -467,5 +471,86 @@ class testtreemanifest(unittest.TestCase
>      def parsemanifest(self, text):
>          return manifestmod.treemanifest('', text)
>
> +class testmanifestctx(unittest.TestCase):
> +    _repocount = 0
> +
> +    def _newrepo(self):
> +        ui = uimod.ui()
> +        repo = hg.repository(ui, path='repo-%d' % self._repocount, create=True)
> +        self._repocount += 1
> +        return repo
> +
> +        return manifestmod.manifestlog(repo.vfs, repo)
> +
> +    def testreadstoragenewnodes(self):
> +        repo = self._newrepo()
> +        rl = manifestmod.manifestrevlog(repo.svfs)
> +
> +        m0 = manifestmod.manifestdict()
> +        for i in range(20):
> +            m0['file%d' % i] = '\x01' * 20
> +
> +        # m1 has a minor change and will be stored as a delta in the revlog.
> +        m1 = m0.copy()
> +        m1['file10'] = '\x02' * 20
> +        m1['file15'] = '\x03' * 20
> +
> +        with repo.transaction('test') as tr:
> +            m0node = rl.add(m0, tr, 0, nullid, nullid, [], [])
> +            m1node = rl.add(m1, tr, 0, m0node, nullid, ['file10', 'file15'],
> +                            [])
> +
> +        # Simple delta works.
> +        self.assertEqual(rl.deltaparent(1), 0)
> +        ml = manifestmod.manifestlog(repo.svfs, repo)
> +        res = list(ml[m1node].readstoragenewnodes())
> +        self.assertEqual(res, [
> +            ('file10', m1['file10']),
> +            ('file15', m1['file15'])])
> +
> +        # Now force a fulltext to be stored in revlog.
> +        rl._maxchainlen = 0
> +        m2 = m0.copy()
> +        m2['file12'] = '\x04' * 20
> +        with repo.transaction('test') as tr:
> +            m2node = rl.add(m2, tr, 0, m1node, nullid, ['file12'], [])
> +
> +        self.assertEqual(rl.deltaparent(2), -1)
> +
> +        ml = manifestmod.manifestlog(repo.svfs, repo)
> +        res = list(ml[m2node].readstoragenewnodes())
> +        self.assertEqual(res, [('file12', m2['file12'])])
> +
> +        # Now force a fulltext with a merge.
> +        m3 = m0.copy()
> +        # m1 (p2) modified file10 and file15. We carry one of those forward
> +        # verbatim and modify another.
> +        m3['file10'] = m1['file10']
> +        m3['file15'] = '\x05' * 20
> +        # We also simulate a change to another random file.
> +        m3['file18'] = '\x06' * 20
> +
> +        with repo.transaction('test') as tr:
> +            m3node = rl.add(m3, tr, 0, m0node, m1node,
> +                            ['file10', 'file15', 'file18'], [])
> +
> +        self.assertEqual(rl.deltaparent(3), -1)
> +
> +        ml = manifestmod.manifestlog(repo.svfs, repo)
> +        # Result has differences from both parents.
> +        # diff(m0, m3) will contribute 10, 15, 18
> +        # diff(m1, m3) will contribute 15, 18
> +
> +        # The output may have dupes. So consolidate.
> +        diffs = collections.defaultdict(set)
> +        for path, node in ml[m3node].readstoragenewnodes():
> +            diffs[path].add(node)
> +
> +        self.assertEqual(diffs, {
> +            'file10': set([m3['file10']]),
> +            'file15': set([m3['file15']]),
> +            'file18': set([m3['file18']]),
> +        })
> +
>  if __name__ == '__main__':
>      silenttestrunner.main(__name__)
>


More information about the Mercurial-devel mailing list