[PATCH 2 of 3] manifest: API to obtain new nodes in a manifest
Martin von Zweigbergk
martinvonz at google.com
Wed Mar 15 14:32:54 EDT 2017
On Tue, Mar 14, 2017 at 12:33 PM, Gregory Szorc <gregory.szorc at gmail.com> wrote:
> # HG changeset patch
> # User Gregory Szorc <gregory.szorc at gmail.com>
> # Date 1489519771 25200
> # Tue Mar 14 12:29:31 2017 -0700
> # Node ID c130f8c7496a823c92d3d71880e5beb9fb60c0f7
> # Parent 5e02fae5419fcd9462b5be11f7f6d6dc4a04fc92
> manifest: API to obtain new nodes in a manifest
>
> The server.validate config option attempts to verify that all
> new node references seen in an incoming changegroup are present
> in local storage before closing the transaction. This prevents
> buggy (or malicious) clients from adding a manifest that references
> a filelog node that doesn't exist.
>
> Today, server.validate calls manifestctx.readdelta() after
> processing each manifest in the changegroup. readdelta() loads
> the raw diff between the manifest in the revlog and its
> revlog delta parent. For most manifests, this "just works."
> But, when the delta parent is null, the diff is effectively
> a fulltext, and server.validate thinks that *every* file in the
> manifest is new. At the end of changegroup processing, it
> has to iterate through each of those filelogs and verify the
> node exists. You can imagine how slow this is when the manifest
> contains 100,000+ entries and isn't backed by super fast
> storage.
>
> This patch introduces a new API on the manifestctx instance to
> obtain new nodes in a manifest from the perspective of backing
> storage. For the case where a delta is stored in the revlog, it
> effectively does readdelta(). When a fulltext is stored in the
> revlog, it falls back to a fulltext-based diff between the
> revlog parents. This can be slow. But it is faster than having
> server.validate actually open all of the filelogs.
>
> I didn't implement support for tree manifests because I'm not
> sure how. Perhaps someone can enlighten me.
As Durham said on patch 3, you can just call do it in terms of diff.
Luckily, there is already a function that does what you want:
readdelta(). So "return self.readdelta()" should be all you need.
>
> Testing this in .t tests was a bit difficult because these are
> low-level APIs. I gave up and wrote a .py test instead.
>
> diff --git a/mercurial/manifest.py b/mercurial/manifest.py
> --- a/mercurial/manifest.py
> +++ b/mercurial/manifest.py
> @@ -1444,6 +1444,44 @@ class manifestctx(object):
> return self.readdelta()
> return self.read()
>
> + def readstoragenewnodes(self):
> + """Returns (path, node) pairs of new nodes in this manifest.
> +
> + In the common case, this looks at the storage delta between this
> + manifest and its storage parent. In the case where a fulltext is
> + stored, it computes the delta between its logical parents and uses
> + that.
> +
> + The function may emit duplicate entries.
> +
> + This function is useful for identifying the logically new entries
> + in a manifest in storage. It is **not** equivalent to diffing a
> + manifest and should not be used in that capacity.
> + """
> + rl = self._revlog()
> + rev = rl.rev(self._node)
> + deltaparent = rl.deltaparent(rev)
> +
> + if deltaparent != revlog.nullrev:
> + d = mdiff.patchtext(rl.revdiff(deltaparent, rev))
> + for e in manifestdict(d).iteritems():
> + yield e
> +
> + return
> +
> + # Fulltext in storage. We need to diff against parents and emit
> + # changes.
> + data = self.read()
> +
> + for prev in self.parents:
> + if prev == revlog.nullid:
> + continue
> +
> + base = self._manifestlog[prev].read()
> + for filename, delta in base.diff(data).iteritems():
> + if delta[1][0]:
> + yield filename, delta[1][0]
> +
> def readdelta(self, shallow=False):
> '''Returns a manifest containing just the entries that are present
> in this manifest, but not in its p1 manifest. This is efficient to read
> diff --git a/tests/test-manifest.py b/tests/test-manifest.py
> --- a/tests/test-manifest.py
> +++ b/tests/test-manifest.py
> @@ -1,13 +1,17 @@
> from __future__ import absolute_import
>
> import binascii
> +import collections
> import itertools
> import silenttestrunner
> import unittest
>
> +from mercurial.node import nullid
> from mercurial import (
> + hg,
> manifest as manifestmod,
> match as matchmod,
> + ui as uimod,
> )
>
> EMTPY_MANIFEST = ''
> @@ -467,5 +471,86 @@ class testtreemanifest(unittest.TestCase
> def parsemanifest(self, text):
> return manifestmod.treemanifest('', text)
>
> +class testmanifestctx(unittest.TestCase):
> + _repocount = 0
> +
> + def _newrepo(self):
> + ui = uimod.ui()
> + repo = hg.repository(ui, path='repo-%d' % self._repocount, create=True)
> + self._repocount += 1
> + return repo
> +
> + return manifestmod.manifestlog(repo.vfs, repo)
> +
> + def testreadstoragenewnodes(self):
> + repo = self._newrepo()
> + rl = manifestmod.manifestrevlog(repo.svfs)
> +
> + m0 = manifestmod.manifestdict()
> + for i in range(20):
> + m0['file%d' % i] = '\x01' * 20
> +
> + # m1 has a minor change and will be stored as a delta in the revlog.
> + m1 = m0.copy()
> + m1['file10'] = '\x02' * 20
> + m1['file15'] = '\x03' * 20
> +
> + with repo.transaction('test') as tr:
> + m0node = rl.add(m0, tr, 0, nullid, nullid, [], [])
> + m1node = rl.add(m1, tr, 0, m0node, nullid, ['file10', 'file15'],
> + [])
> +
> + # Simple delta works.
> + self.assertEqual(rl.deltaparent(1), 0)
> + ml = manifestmod.manifestlog(repo.svfs, repo)
> + res = list(ml[m1node].readstoragenewnodes())
> + self.assertEqual(res, [
> + ('file10', m1['file10']),
> + ('file15', m1['file15'])])
> +
> + # Now force a fulltext to be stored in revlog.
> + rl._maxchainlen = 0
> + m2 = m0.copy()
> + m2['file12'] = '\x04' * 20
> + with repo.transaction('test') as tr:
> + m2node = rl.add(m2, tr, 0, m1node, nullid, ['file12'], [])
> +
> + self.assertEqual(rl.deltaparent(2), -1)
> +
> + ml = manifestmod.manifestlog(repo.svfs, repo)
> + res = list(ml[m2node].readstoragenewnodes())
> + self.assertEqual(res, [('file12', m2['file12'])])
> +
> + # Now force a fulltext with a merge.
> + m3 = m0.copy()
> + # m1 (p2) modified file10 and file15. We carry one of those forward
> + # verbatim and modify another.
> + m3['file10'] = m1['file10']
> + m3['file15'] = '\x05' * 20
> + # We also simulate a change to another random file.
> + m3['file18'] = '\x06' * 20
> +
> + with repo.transaction('test') as tr:
> + m3node = rl.add(m3, tr, 0, m0node, m1node,
> + ['file10', 'file15', 'file18'], [])
> +
> + self.assertEqual(rl.deltaparent(3), -1)
> +
> + ml = manifestmod.manifestlog(repo.svfs, repo)
> + # Result has differences from both parents.
> + # diff(m0, m3) will contribute 10, 15, 18
> + # diff(m1, m3) will contribute 15, 18
> +
> + # The output may have dupes. So consolidate.
> + diffs = collections.defaultdict(set)
> + for path, node in ml[m3node].readstoragenewnodes():
> + diffs[path].add(node)
> +
> + self.assertEqual(diffs, {
> + 'file10': set([m3['file10']]),
> + 'file15': set([m3['file15']]),
> + 'file18': set([m3['file18']]),
> + })
> +
> if __name__ == '__main__':
> silenttestrunner.main(__name__)
> _______________________________________________
> Mercurial-devel mailing list
> Mercurial-devel at mercurial-scm.org
> https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel
More information about the Mercurial-devel
mailing list