[PATCH 2 of 3] manifest: API to obtain new nodes in a manifest

Gregory Szorc gregory.szorc at gmail.com
Tue Mar 14 15:33:07 EDT 2017


# HG changeset patch
# User Gregory Szorc <gregory.szorc at gmail.com>
# Date 1489519771 25200
#      Tue Mar 14 12:29:31 2017 -0700
# Node ID c130f8c7496a823c92d3d71880e5beb9fb60c0f7
# Parent  5e02fae5419fcd9462b5be11f7f6d6dc4a04fc92
manifest: API to obtain new nodes in a manifest

The server.validate config option attempts to verify that all
new node references seen in an incoming changegroup are present
in local storage before closing the transaction. This prevents
buggy (or malicious) clients from adding a manifest that references
a filelog node that doesn't exist.

Today, server.validate calls manifestctx.readdelta() after
processing each manifest in the changegroup. readdelta() loads
the raw diff between the manifest in the revlog and its
revlog delta parent. For most manifests, this "just works."
But, when the delta parent is null, the diff is effectively
a fulltext, and server.validate thinks that *every* file in the
manifest is new. At the end of changegroup processing, it
has to iterate through each of those filelogs and verify the
node exists. You can imagine how slow this is when the manifest
contains 100,000+ entries and isn't backed by super fast
storage.

This patch introduces a new API on the manifestctx instance to
obtain new nodes in a manifest from the perspective of backing
storage. For the case where a delta is stored in the revlog, it
effectively does readdelta(). When a fulltext is stored in the
revlog, it falls back to a fulltext-based diff between the
revlog parents. This can be slow. But it is faster than having
server.validate actually open all of the filelogs.

I didn't implement support for tree manifests because I'm not
sure how. Perhaps someone can enlighten me.

Testing this in .t tests was a bit difficult because these are
low-level APIs. I gave up and wrote a .py test instead.

diff --git a/mercurial/manifest.py b/mercurial/manifest.py
--- a/mercurial/manifest.py
+++ b/mercurial/manifest.py
@@ -1444,6 +1444,44 @@ class manifestctx(object):
             return self.readdelta()
         return self.read()
 
+    def readstoragenewnodes(self):
+        """Returns (path, node) pairs of new nodes in this manifest.
+
+        In the common case, this looks at the storage delta between this
+        manifest and its storage parent. In the case where a fulltext is
+        stored, it computes the delta between its logical parents and uses
+        that.
+
+        The function may emit duplicate entries.
+
+        This function is useful for identifying the logically new entries
+        in a manifest in storage. It is **not** equivalent to diffing a
+        manifest and should not be used in that capacity.
+        """
+        rl = self._revlog()
+        rev = rl.rev(self._node)
+        deltaparent = rl.deltaparent(rev)
+
+        if deltaparent != revlog.nullrev:
+            d = mdiff.patchtext(rl.revdiff(deltaparent, rev))
+            for e in manifestdict(d).iteritems():
+                yield e
+
+            return
+
+        # Fulltext in storage. We need to diff against parents and emit
+        # changes.
+        data = self.read()
+
+        for prev in self.parents:
+            if prev == revlog.nullid:
+                continue
+
+            base = self._manifestlog[prev].read()
+            for filename, delta in base.diff(data).iteritems():
+                if delta[1][0]:
+                    yield filename, delta[1][0]
+
     def readdelta(self, shallow=False):
         '''Returns a manifest containing just the entries that are present
         in this manifest, but not in its p1 manifest. This is efficient to read
diff --git a/tests/test-manifest.py b/tests/test-manifest.py
--- a/tests/test-manifest.py
+++ b/tests/test-manifest.py
@@ -1,13 +1,17 @@
 from __future__ import absolute_import
 
 import binascii
+import collections
 import itertools
 import silenttestrunner
 import unittest
 
+from mercurial.node import nullid
 from mercurial import (
+    hg,
     manifest as manifestmod,
     match as matchmod,
+    ui as uimod,
 )
 
 EMTPY_MANIFEST = ''
@@ -467,5 +471,86 @@ class testtreemanifest(unittest.TestCase
     def parsemanifest(self, text):
         return manifestmod.treemanifest('', text)
 
+class testmanifestctx(unittest.TestCase):
+    _repocount = 0
+
+    def _newrepo(self):
+        ui = uimod.ui()
+        repo = hg.repository(ui, path='repo-%d' % self._repocount, create=True)
+        self._repocount += 1
+        return repo
+
+        return manifestmod.manifestlog(repo.vfs, repo)
+
+    def testreadstoragenewnodes(self):
+        repo = self._newrepo()
+        rl = manifestmod.manifestrevlog(repo.svfs)
+
+        m0 = manifestmod.manifestdict()
+        for i in range(20):
+            m0['file%d' % i] = '\x01' * 20
+
+        # m1 has a minor change and will be stored as a delta in the revlog.
+        m1 = m0.copy()
+        m1['file10'] = '\x02' * 20
+        m1['file15'] = '\x03' * 20
+
+        with repo.transaction('test') as tr:
+            m0node = rl.add(m0, tr, 0, nullid, nullid, [], [])
+            m1node = rl.add(m1, tr, 0, m0node, nullid, ['file10', 'file15'],
+                            [])
+
+        # Simple delta works.
+        self.assertEqual(rl.deltaparent(1), 0)
+        ml = manifestmod.manifestlog(repo.svfs, repo)
+        res = list(ml[m1node].readstoragenewnodes())
+        self.assertEqual(res, [
+            ('file10', m1['file10']),
+            ('file15', m1['file15'])])
+
+        # Now force a fulltext to be stored in revlog.
+        rl._maxchainlen = 0
+        m2 = m0.copy()
+        m2['file12'] = '\x04' * 20
+        with repo.transaction('test') as tr:
+            m2node = rl.add(m2, tr, 0, m1node, nullid, ['file12'], [])
+
+        self.assertEqual(rl.deltaparent(2), -1)
+
+        ml = manifestmod.manifestlog(repo.svfs, repo)
+        res = list(ml[m2node].readstoragenewnodes())
+        self.assertEqual(res, [('file12', m2['file12'])])
+
+        # Now force a fulltext with a merge.
+        m3 = m0.copy()
+        # m1 (p2) modified file10 and file15. We carry one of those forward
+        # verbatim and modify another.
+        m3['file10'] = m1['file10']
+        m3['file15'] = '\x05' * 20
+        # We also simulate a change to another random file.
+        m3['file18'] = '\x06' * 20
+
+        with repo.transaction('test') as tr:
+            m3node = rl.add(m3, tr, 0, m0node, m1node,
+                            ['file10', 'file15', 'file18'], [])
+
+        self.assertEqual(rl.deltaparent(3), -1)
+
+        ml = manifestmod.manifestlog(repo.svfs, repo)
+        # Result has differences from both parents.
+        # diff(m0, m3) will contribute 10, 15, 18
+        # diff(m1, m3) will contribute 15, 18
+
+        # The output may have dupes. So consolidate.
+        diffs = collections.defaultdict(set)
+        for path, node in ml[m3node].readstoragenewnodes():
+            diffs[path].add(node)
+
+        self.assertEqual(diffs, {
+            'file10': set([m3['file10']]),
+            'file15': set([m3['file15']]),
+            'file18': set([m3['file18']]),
+        })
+
 if __name__ == '__main__':
     silenttestrunner.main(__name__)


More information about the Mercurial-devel mailing list