[PATCH 03 of 12] revlog: split classes related to deltas computation in a new module

Boris Feld boris.feld at octobus.net
Sat Aug 18 05:27:18 EDT 2018


# HG changeset patch
# User Boris Feld <boris.feld at octobus.net>
# Date 1534378131 -7200
#      Thu Aug 16 02:08:51 2018 +0200
# Node ID 0f38a0bcba5817146fb3a4e079016abc9a424646
# Parent  4a61a7a37a499387658b6270ea0b2dadfef86156
# EXP-Topic sparse-snapshot
# Available At https://bitbucket.org/octobus/mercurial-devel/
#              hg pull https://bitbucket.org/octobus/mercurial-devel/ -r 0f38a0bcba58
revlog: split classes related to deltas computation in a new module

The revlog module is getting big and this logic is getting more and more
advanced. Moving it to `mercurial.revlogutils.deltas` split about 25% off
revlog.py and will help this logic to become less interleaved with revlog.

The code is simply moved without modification (but for namespace changes).
Refactoring and improvement will be made in later changesets.

diff --git a/mercurial/revlog.py b/mercurial/revlog.py
--- a/mercurial/revlog.py
+++ b/mercurial/revlog.py
@@ -67,6 +67,9 @@ from . import (
     templatefilters,
     util,
 )
+from .revlogutils import (
+    deltas as deltautil,
+)
 from .utils import (
     stringutil,
 )
@@ -609,210 +612,6 @@ def _slicechunktodensity(revlog, revs, d
         yield chunk
 
 @attr.s(slots=True, frozen=True)
-class _deltainfo(object):
-    distance = attr.ib()
-    deltalen = attr.ib()
-    data = attr.ib()
-    base = attr.ib()
-    chainbase = attr.ib()
-    chainlen = attr.ib()
-    compresseddeltalen = attr.ib()
-    snapshotdepth = attr.ib()
-
-class _deltacomputer(object):
-    def __init__(self, revlog):
-        self.revlog = revlog
-
-    def _getcandidaterevs(self, p1, p2, cachedelta):
-        """
-        Provides revisions that present an interest to be diffed against,
-        grouped by level of easiness.
-        """
-        revlog = self.revlog
-        gdelta = revlog._generaldelta
-        curr = len(revlog)
-        prev = curr - 1
-        p1r, p2r = revlog.rev(p1), revlog.rev(p2)
-
-        # should we try to build a delta?
-        if prev != nullrev and revlog.storedeltachains:
-            tested = set()
-            # This condition is true most of the time when processing
-            # changegroup data into a generaldelta repo. The only time it
-            # isn't true is if this is the first revision in a delta chain
-            # or if ``format.generaldelta=true`` disabled ``lazydeltabase``.
-            if cachedelta and gdelta and revlog._lazydeltabase:
-                # Assume what we received from the server is a good choice
-                # build delta will reuse the cache
-                yield (cachedelta[0],)
-                tested.add(cachedelta[0])
-
-            if gdelta:
-                # exclude already lazy tested base if any
-                parents = [p for p in (p1r, p2r)
-                           if p != nullrev and p not in tested]
-
-                if not revlog._deltabothparents and len(parents) == 2:
-                    parents.sort()
-                    # To minimize the chance of having to build a fulltext,
-                    # pick first whichever parent is closest to us (max rev)
-                    yield (parents[1],)
-                    # then the other one (min rev) if the first did not fit
-                    yield (parents[0],)
-                    tested.update(parents)
-                elif len(parents) > 0:
-                    # Test all parents (1 or 2), and keep the best candidate
-                    yield parents
-                    tested.update(parents)
-
-            if prev not in tested:
-                # other approach failed try against prev to hopefully save us a
-                # fulltext.
-                yield (prev,)
-                tested.add(prev)
-
-    def buildtext(self, revinfo, fh):
-        """Builds a fulltext version of a revision
-
-        revinfo: _revisioninfo instance that contains all needed info
-        fh:      file handle to either the .i or the .d revlog file,
-                 depending on whether it is inlined or not
-        """
-        btext = revinfo.btext
-        if btext[0] is not None:
-            return btext[0]
-
-        revlog = self.revlog
-        cachedelta = revinfo.cachedelta
-        flags = revinfo.flags
-        node = revinfo.node
-
-        baserev = cachedelta[0]
-        delta = cachedelta[1]
-        # special case deltas which replace entire base; no need to decode
-        # base revision. this neatly avoids censored bases, which throw when
-        # they're decoded.
-        hlen = struct.calcsize(">lll")
-        if delta[:hlen] == mdiff.replacediffheader(revlog.rawsize(baserev),
-                                                   len(delta) - hlen):
-            btext[0] = delta[hlen:]
-        else:
-            # deltabase is rawtext before changed by flag processors, which is
-            # equivalent to non-raw text
-            basetext = revlog.revision(baserev, _df=fh, raw=False)
-            btext[0] = mdiff.patch(basetext, delta)
-
-        try:
-            res = revlog._processflags(btext[0], flags, 'read', raw=True)
-            btext[0], validatehash = res
-            if validatehash:
-                revlog.checkhash(btext[0], node, p1=revinfo.p1, p2=revinfo.p2)
-            if flags & REVIDX_ISCENSORED:
-                raise RevlogError(_('node %s is not censored') % node)
-        except CensoredNodeError:
-            # must pass the censored index flag to add censored revisions
-            if not flags & REVIDX_ISCENSORED:
-                raise
-        return btext[0]
-
-    def _builddeltadiff(self, base, revinfo, fh):
-        revlog = self.revlog
-        t = self.buildtext(revinfo, fh)
-        if revlog.iscensored(base):
-            # deltas based on a censored revision must replace the
-            # full content in one patch, so delta works everywhere
-            header = mdiff.replacediffheader(revlog.rawsize(base), len(t))
-            delta = header + t
-        else:
-            ptext = revlog.revision(base, _df=fh, raw=True)
-            delta = mdiff.textdiff(ptext, t)
-
-        return delta
-
-    def _builddeltainfo(self, revinfo, base, fh):
-        # can we use the cached delta?
-        if revinfo.cachedelta and revinfo.cachedelta[0] == base:
-            delta = revinfo.cachedelta[1]
-        else:
-            delta = self._builddeltadiff(base, revinfo, fh)
-        revlog = self.revlog
-        header, data = revlog.compress(delta)
-        deltalen = len(header) + len(data)
-        chainbase = revlog.chainbase(base)
-        offset = revlog.end(len(revlog) - 1)
-        dist = deltalen + offset - revlog.start(chainbase)
-        if revlog._generaldelta:
-            deltabase = base
-        else:
-            deltabase = chainbase
-        chainlen, compresseddeltalen = revlog._chaininfo(base)
-        chainlen += 1
-        compresseddeltalen += deltalen
-
-        revlog = self.revlog
-        snapshotdepth = None
-        if deltabase == nullrev:
-            snapshotdepth = 0
-        elif revlog._sparserevlog and revlog.issnapshot(deltabase):
-            # A delta chain should always be one full snapshot,
-            # zero or more semi-snapshots, and zero or more deltas
-            p1, p2 = revlog.rev(revinfo.p1), revlog.rev(revinfo.p2)
-            if deltabase not in (p1, p2) and revlog.issnapshot(deltabase):
-                snapshotdepth = len(revlog._deltachain(deltabase)[0])
-
-        return _deltainfo(dist, deltalen, (header, data), deltabase,
-                          chainbase, chainlen, compresseddeltalen,
-                          snapshotdepth)
-
-    def finddeltainfo(self, revinfo, fh):
-        """Find an acceptable delta against a candidate revision
-
-        revinfo: information about the revision (instance of _revisioninfo)
-        fh:      file handle to either the .i or the .d revlog file,
-                 depending on whether it is inlined or not
-
-        Returns the first acceptable candidate revision, as ordered by
-        _getcandidaterevs
-        """
-        if not revinfo.textlen:
-            return None # empty file do not need delta
-
-        cachedelta = revinfo.cachedelta
-        p1 = revinfo.p1
-        p2 = revinfo.p2
-        revlog = self.revlog
-
-        deltalength = self.revlog.length
-        deltaparent = self.revlog.deltaparent
-
-        deltainfo = None
-        deltas_limit = revinfo.textlen * LIMIT_DELTA2TEXT
-        for candidaterevs in self._getcandidaterevs(p1, p2, cachedelta):
-            # filter out delta base that will never produce good delta
-            candidaterevs = [r for r in candidaterevs
-                             if self.revlog.length(r) <= deltas_limit]
-            nominateddeltas = []
-            for candidaterev in candidaterevs:
-                # skip over empty delta (no need to include them in a chain)
-                while candidaterev != nullrev and not deltalength(candidaterev):
-                    candidaterev = deltaparent(candidaterev)
-                # no need to try a delta against nullid, this will be handled
-                # by fulltext later.
-                if candidaterev == nullrev:
-                    continue
-                # no delta for rawtext-changing revs (see "candelta" for why)
-                if revlog.flags(candidaterev) & REVIDX_RAWTEXT_CHANGING_FLAGS:
-                    continue
-                candidatedelta = self._builddeltainfo(revinfo, candidaterev, fh)
-                if revlog._isgooddeltainfo(candidatedelta, revinfo):
-                    nominateddeltas.append(candidatedelta)
-            if nominateddeltas:
-                deltainfo = min(nominateddeltas, key=lambda x: x.deltalen)
-                break
-
-        return deltainfo
-
- at attr.s(slots=True, frozen=True)
 class _revisioninfo(object):
     """Information about a revision that allows building its fulltext
     node:       expected hash of the revision
@@ -2375,7 +2174,7 @@ class revlog(object):
             computed by default as hash(text, p1, p2), however subclasses might
             use different hashing method (and override checkhash() in such case)
         flags - the known flags to set on the revision
-        deltacomputer - an optional _deltacomputer instance shared between
+        deltacomputer - an optional deltacomputer instance shared between
             multiple calls
         """
         if link == nullrev:
@@ -2634,7 +2433,7 @@ class revlog(object):
             textlen = len(rawtext)
 
         if deltacomputer is None:
-            deltacomputer = _deltacomputer(self)
+            deltacomputer = deltautil.deltacomputer(self)
 
         revinfo = _revisioninfo(node, p1, p2, btext, textlen, cachedelta, flags)
 
@@ -2736,7 +2535,7 @@ class revlog(object):
                 dfh.flush()
             ifh.flush()
         try:
-            deltacomputer = _deltacomputer(self)
+            deltacomputer = deltautil.deltacomputer(self)
             # loop through our set of deltas
             for data in deltas:
                 node, p1, p2, linknode, deltabase, delta, flags = data
@@ -3028,7 +2827,7 @@ class revlog(object):
             populatecachedelta = deltareuse in (self.DELTAREUSEALWAYS,
                                                 self.DELTAREUSESAMEREVS)
 
-            deltacomputer = _deltacomputer(destrevlog)
+            deltacomputer = deltautil.deltacomputer(destrevlog)
             index = self.index
             for rev in self:
                 entry = index[rev]
diff --git a/mercurial/revlogutils/deltas.py b/mercurial/revlogutils/deltas.py
new file mode 100644
--- /dev/null
+++ b/mercurial/revlogutils/deltas.py
@@ -0,0 +1,240 @@
+# revlogdeltas.py - Logic around delta computation for revlog
+#
+# Copyright 2005-2007 Matt Mackall <mpm at selenic.com>
+# Copyright 2018 Octobus <contact at octobus.net>
+#
+# This software may be used and distributed according to the terms of the
+# GNU General Public License version 2 or any later version.
+"""Helper class to compute deltas stored inside revlogs"""
+
+from __future__ import absolute_import
+
+import struct
+
+# import stuff from node for others to import from revlog
+from ..node import (
+    nullrev,
+)
+from ..i18n import _
+
+from .constants import (
+    LIMIT_DELTA2TEXT,
+    REVIDX_ISCENSORED,
+    REVIDX_RAWTEXT_CHANGING_FLAGS,
+)
+
+from ..thirdparty import (
+    attr,
+)
+
+from .. import (
+    error,
+    mdiff,
+)
+
+RevlogError = error.RevlogError
+CensoredNodeError = error.CensoredNodeError
+
+ at attr.s(slots=True, frozen=True)
+class _deltainfo(object):
+    distance = attr.ib()
+    deltalen = attr.ib()
+    data = attr.ib()
+    base = attr.ib()
+    chainbase = attr.ib()
+    chainlen = attr.ib()
+    compresseddeltalen = attr.ib()
+    snapshotdepth = attr.ib()
+
+class deltacomputer(object):
+    def __init__(self, revlog):
+        self.revlog = revlog
+
+    def _getcandidaterevs(self, p1, p2, cachedelta):
+        """
+        Provides revisions that present an interest to be diffed against,
+        grouped by level of easiness.
+        """
+        revlog = self.revlog
+        gdelta = revlog._generaldelta
+        curr = len(revlog)
+        prev = curr - 1
+        p1r, p2r = revlog.rev(p1), revlog.rev(p2)
+
+        # should we try to build a delta?
+        if prev != nullrev and revlog.storedeltachains:
+            tested = set()
+            # This condition is true most of the time when processing
+            # changegroup data into a generaldelta repo. The only time it
+            # isn't true is if this is the first revision in a delta chain
+            # or if ``format.generaldelta=true`` disabled ``lazydeltabase``.
+            if cachedelta and gdelta and revlog._lazydeltabase:
+                # Assume what we received from the server is a good choice
+                # build delta will reuse the cache
+                yield (cachedelta[0],)
+                tested.add(cachedelta[0])
+
+            if gdelta:
+                # exclude already lazy tested base if any
+                parents = [p for p in (p1r, p2r)
+                           if p != nullrev and p not in tested]
+
+                if not revlog._deltabothparents and len(parents) == 2:
+                    parents.sort()
+                    # To minimize the chance of having to build a fulltext,
+                    # pick first whichever parent is closest to us (max rev)
+                    yield (parents[1],)
+                    # then the other one (min rev) if the first did not fit
+                    yield (parents[0],)
+                    tested.update(parents)
+                elif len(parents) > 0:
+                    # Test all parents (1 or 2), and keep the best candidate
+                    yield parents
+                    tested.update(parents)
+
+            if prev not in tested:
+                # other approach failed try against prev to hopefully save us a
+                # fulltext.
+                yield (prev,)
+                tested.add(prev)
+
+    def buildtext(self, revinfo, fh):
+        """Builds a fulltext version of a revision
+
+        revinfo: _revisioninfo instance that contains all needed info
+        fh:      file handle to either the .i or the .d revlog file,
+                 depending on whether it is inlined or not
+        """
+        btext = revinfo.btext
+        if btext[0] is not None:
+            return btext[0]
+
+        revlog = self.revlog
+        cachedelta = revinfo.cachedelta
+        flags = revinfo.flags
+        node = revinfo.node
+
+        baserev = cachedelta[0]
+        delta = cachedelta[1]
+        # special case deltas which replace entire base; no need to decode
+        # base revision. this neatly avoids censored bases, which throw when
+        # they're decoded.
+        hlen = struct.calcsize(">lll")
+        if delta[:hlen] == mdiff.replacediffheader(revlog.rawsize(baserev),
+                                                   len(delta) - hlen):
+            btext[0] = delta[hlen:]
+        else:
+            # deltabase is rawtext before changed by flag processors, which is
+            # equivalent to non-raw text
+            basetext = revlog.revision(baserev, _df=fh, raw=False)
+            btext[0] = mdiff.patch(basetext, delta)
+
+        try:
+            res = revlog._processflags(btext[0], flags, 'read', raw=True)
+            btext[0], validatehash = res
+            if validatehash:
+                revlog.checkhash(btext[0], node, p1=revinfo.p1, p2=revinfo.p2)
+            if flags & REVIDX_ISCENSORED:
+                raise RevlogError(_('node %s is not censored') % node)
+        except CensoredNodeError:
+            # must pass the censored index flag to add censored revisions
+            if not flags & REVIDX_ISCENSORED:
+                raise
+        return btext[0]
+
+    def _builddeltadiff(self, base, revinfo, fh):
+        revlog = self.revlog
+        t = self.buildtext(revinfo, fh)
+        if revlog.iscensored(base):
+            # deltas based on a censored revision must replace the
+            # full content in one patch, so delta works everywhere
+            header = mdiff.replacediffheader(revlog.rawsize(base), len(t))
+            delta = header + t
+        else:
+            ptext = revlog.revision(base, _df=fh, raw=True)
+            delta = mdiff.textdiff(ptext, t)
+
+        return delta
+
+    def _builddeltainfo(self, revinfo, base, fh):
+        # can we use the cached delta?
+        if revinfo.cachedelta and revinfo.cachedelta[0] == base:
+            delta = revinfo.cachedelta[1]
+        else:
+            delta = self._builddeltadiff(base, revinfo, fh)
+        revlog = self.revlog
+        header, data = revlog.compress(delta)
+        deltalen = len(header) + len(data)
+        chainbase = revlog.chainbase(base)
+        offset = revlog.end(len(revlog) - 1)
+        dist = deltalen + offset - revlog.start(chainbase)
+        if revlog._generaldelta:
+            deltabase = base
+        else:
+            deltabase = chainbase
+        chainlen, compresseddeltalen = revlog._chaininfo(base)
+        chainlen += 1
+        compresseddeltalen += deltalen
+
+        revlog = self.revlog
+        snapshotdepth = None
+        if deltabase == nullrev:
+            snapshotdepth = 0
+        elif revlog._sparserevlog and revlog.issnapshot(deltabase):
+            # A delta chain should always be one full snapshot,
+            # zero or more semi-snapshots, and zero or more deltas
+            p1, p2 = revlog.rev(revinfo.p1), revlog.rev(revinfo.p2)
+            if deltabase not in (p1, p2) and revlog.issnapshot(deltabase):
+                snapshotdepth = len(revlog._deltachain(deltabase)[0])
+
+        return _deltainfo(dist, deltalen, (header, data), deltabase,
+                          chainbase, chainlen, compresseddeltalen,
+                          snapshotdepth)
+
+    def finddeltainfo(self, revinfo, fh):
+        """Find an acceptable delta against a candidate revision
+
+        revinfo: information about the revision (instance of _revisioninfo)
+        fh:      file handle to either the .i or the .d revlog file,
+                 depending on whether it is inlined or not
+
+        Returns the first acceptable candidate revision, as ordered by
+        _getcandidaterevs
+        """
+        if not revinfo.textlen:
+            return None # empty file do not need delta
+
+        cachedelta = revinfo.cachedelta
+        p1 = revinfo.p1
+        p2 = revinfo.p2
+        revlog = self.revlog
+
+        deltalength = self.revlog.length
+        deltaparent = self.revlog.deltaparent
+
+        deltainfo = None
+        deltas_limit = revinfo.textlen * LIMIT_DELTA2TEXT
+        for candidaterevs in self._getcandidaterevs(p1, p2, cachedelta):
+            # filter out delta base that will never produce good delta
+            candidaterevs = [r for r in candidaterevs
+                             if self.revlog.length(r) <= deltas_limit]
+            nominateddeltas = []
+            for candidaterev in candidaterevs:
+                # skip over empty delta (no need to include them in a chain)
+                while candidaterev != nullrev and not deltalength(candidaterev):
+                    candidaterev = deltaparent(candidaterev)
+                # no need to try a delta against nullid, this will be handled
+                # by fulltext later.
+                if candidaterev == nullrev:
+                    continue
+                # no delta for rawtext-changing revs (see "candelta" for why)
+                if revlog.flags(candidaterev) & REVIDX_RAWTEXT_CHANGING_FLAGS:
+                    continue
+                candidatedelta = self._builddeltainfo(revinfo, candidaterev, fh)
+                if revlog._isgooddeltainfo(candidatedelta, revinfo):
+                    nominateddeltas.append(candidatedelta)
+            if nominateddeltas:
+                deltainfo = min(nominateddeltas, key=lambda x: x.deltalen)
+                break
+
+        return deltainfo


More information about the Mercurial-devel mailing list