[PATCH 05 of 12] revlog: move the good delta heuristic to revlogutils.deltas

Boris Feld boris.feld at octobus.net
Sat Aug 18 05:27:20 EDT 2018


# HG changeset patch
# User Boris Feld <boris.feld at octobus.net>
# Date 1534380638 -7200
#      Thu Aug 16 02:50:38 2018 +0200
# Node ID 9c7224a86df8f316dbfb006ba8e81fc3ce831303
# Parent  07af069da30de0c6ae0ce8229a0ddd7c22c89313
# EXP-Topic sparse-snapshot
# Available At https://bitbucket.org/octobus/mercurial-devel/
#              hg pull https://bitbucket.org/octobus/mercurial-devel/ -r 9c7224a86df8
revlog: move the good delta heuristic to revlogutils.deltas

This logic is about building delta chains, so we move it next to the related
code. The only user of this logic has already moved into this module.
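
For context, the heuristic being moved rejects a candidate delta once it
exceeds known-good bounds on read span, delta size, cumulated chain payload,
and chain length. A minimal standalone sketch of those bounds follows
(hypothetical names, much simplified: the real isgooddeltainfo() below also
slices the read span for sparse revlogs and bounds intermediate snapshot
sizes):

  # Simplified sketch of the bounds enforced by isgooddeltainfo(); apart
  # from LIMIT_DELTA2TEXT, the names here are hypothetical and not part of
  # this patch.
  LIMIT_DELTA2TEXT = 2  # maximum <delta-chain-data>/<text-length> ratio

  def acceptable_delta(textlen, deltalen, chainpayload, chainlen, distance,
                       maxchainlen=0, maxdeltachainspan=0):
      """Return True if a candidate delta stays within the known-good bounds."""
      # bound the span of data that must be read to restore the revision
      maxdist = maxdeltachainspan
      if not maxdist:
          maxdist = distance  # no configured cap: the span check never fires
      maxdist = max(maxdist, textlen * 4)
      if maxdist < distance:
          return False
      # a delta larger than the text it produces is never worth storing
      if textlen < deltalen:
          return False
      # cap the cumulated payload at LIMIT_DELTA2TEXT times the text length
      if textlen * LIMIT_DELTA2TEXT < chainpayload:
          return False
      # optionally cap the number of deltas in the chain
      if maxchainlen and maxchainlen < chainlen:
          return False
      return True

  # e.g. for a 1000-byte text, a 300-byte delta with an 1800-byte chain
  # payload is accepted, but a 2500-byte payload is rejected (2500 > 2 * 1000)
  assert acceptable_delta(1000, 300, 1800, 4, distance=3000)
  assert not acceptable_delta(1000, 300, 2500, 4, distance=3000)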

diff --git a/mercurial/revlog.py b/mercurial/revlog.py
--- a/mercurial/revlog.py
+++ b/mercurial/revlog.py
@@ -37,7 +37,6 @@ from .i18n import _
 from .revlogutils.constants import (
     FLAG_GENERALDELTA,
     FLAG_INLINE_DATA,
-    LIMIT_DELTA2TEXT,
     REVIDX_DEFAULT_FLAGS,
     REVIDX_ELLIPSIS,
     REVIDX_EXTSTORED,
@@ -1897,96 +1896,6 @@ class revlog(object):
 
         return compressor.decompress(data)
 
-    def _isgooddeltainfo(self, deltainfo, revinfo):
-        """Returns True if the given delta is good. Good means that it is within
-        the disk span, disk size, and chain length bounds that we know to be
-        performant."""
-        if deltainfo is None:
-            return False
-
-        # - 'deltainfo.distance' is the distance from the base revision --
-        #   bounding it limits the amount of I/O we need to do.
-        # - 'deltainfo.compresseddeltalen' is the sum of the total size of
-        #   deltas we need to apply -- bounding it limits the amount of CPU
-        #   we consume.
-
-        if self._sparserevlog:
-            # As sparse-read will be used, we can consider that the distance,
-            # instead of being the span of the whole chunk,
-            # is the span of the largest read chunk
-            base = deltainfo.base
-
-            if base != nullrev:
-                deltachain = self._deltachain(base)[0]
-            else:
-                deltachain = []
-
-            # search for the first non-snapshot revision
-            for idx, r in enumerate(deltachain):
-                if not self.issnapshot(r):
-                    break
-            deltachain = deltachain[idx:]
-            chunks = deltautil.slicechunk(self, deltachain, deltainfo)
-            all_span = [deltautil.segmentspan(self, revs, deltainfo)
-                        for revs in chunks]
-            distance = max(all_span)
-        else:
-            distance = deltainfo.distance
-
-        textlen = revinfo.textlen
-        defaultmax = textlen * 4
-        maxdist = self._maxdeltachainspan
-        if not maxdist:
-            maxdist = distance # ensure the conditional pass
-        maxdist = max(maxdist, defaultmax)
-        if self._sparserevlog and maxdist < self._srmingapsize:
-            # In multiple place, we are ignoring irrelevant data range below a
-            # certain size. Be also apply this tradeoff here and relax span
-            # constraint for small enought content.
-            maxdist = self._srmingapsize
-
-        # Bad delta from read span:
-        #
-        #   If the span of data read is larger than the maximum allowed.
-        if maxdist < distance:
-            return False
-
-        # Bad delta from new delta size:
-        #
-        #   If the delta size is larger than the target text, storing the
-        #   delta will be inefficient.
-        if textlen < deltainfo.deltalen:
-            return False
-
-        # Bad delta from cumulated payload size:
-        #
-        #   If the sum of delta get larger than K * target text length.
-        if textlen * LIMIT_DELTA2TEXT < deltainfo.compresseddeltalen:
-            return False
-
-        # Bad delta from chain length:
-        #
-        #   If the number of delta in the chain gets too high.
-        if self._maxchainlen and  self._maxchainlen < deltainfo.chainlen:
-            return False
-
-        # bad delta from intermediate snapshot size limit
-        #
-        #   If an intermediate snapshot size is higher than the limit.  The
-        #   limit exist to prevent endless chain of intermediate delta to be
-        #   created.
-        if (deltainfo.snapshotdepth is not None and
-                (textlen >> deltainfo.snapshotdepth) < deltainfo.deltalen):
-            return False
-
-        # bad delta if new intermediate snapshot is larger than the previous
-        # snapshot
-        if (deltainfo.snapshotdepth
-                and self.length(deltainfo.base) < deltainfo.deltalen):
-            return False
-
-        return True
-
     def _addrevision(self, node, rawtext, transaction, link, p1, p2, flags,
                      cachedelta, ifh, dfh, alwayscache=False,
                      deltacomputer=None):
diff --git a/mercurial/revlogutils/constants.py b/mercurial/revlogutils/constants.py
--- a/mercurial/revlogutils/constants.py
+++ b/mercurial/revlogutils/constants.py
@@ -41,6 +41,3 @@ REVIDX_FLAGS_ORDER = [
 REVIDX_KNOWN_FLAGS = util.bitsfrom(REVIDX_FLAGS_ORDER)
 # bitmark for flags that could cause rawdata content change
 REVIDX_RAWTEXT_CHANGING_FLAGS = REVIDX_ISCENSORED | REVIDX_EXTSTORED
-
-# maximum <delta-chain-data>/<revision-text-length> ratio
-LIMIT_DELTA2TEXT = 2
diff --git a/mercurial/revlogutils/deltas.py b/mercurial/revlogutils/deltas.py
--- a/mercurial/revlogutils/deltas.py
+++ b/mercurial/revlogutils/deltas.py
@@ -19,7 +19,6 @@ from ..node import (
 from ..i18n import _
 
 from .constants import (
-    LIMIT_DELTA2TEXT,
     REVIDX_ISCENSORED,
     REVIDX_RAWTEXT_CHANGING_FLAGS,
 )
@@ -36,6 +35,9 @@ from .. import (
 RevlogError = error.RevlogError
 CensoredNodeError = error.CensoredNodeError
 
+# maximum <delta-chain-data>/<revision-text-length> ratio
+LIMIT_DELTA2TEXT = 2
+
 class _testrevlog(object):
     """minimalist fake revlog to use in doctests"""
 
@@ -447,6 +449,97 @@ class _deltainfo(object):
     compresseddeltalen = attr.ib()
     snapshotdepth = attr.ib()
 
+def isgooddeltainfo(revlog, deltainfo, revinfo):
+    """Returns True if the given delta is good. Good means that it is within
+    the disk span, disk size, and chain length bounds that we know to be
+    performant."""
+    if deltainfo is None:
+        return False
+
+    # - 'deltainfo.distance' is the distance from the base revision --
+    #   bounding it limits the amount of I/O we need to do.
+    # - 'deltainfo.compresseddeltalen' is the sum of the total size of
+    #   deltas we need to apply -- bounding it limits the amount of CPU
+    #   we consume.
+
+    if revlog._sparserevlog:
+        # Since sparse-read will be used, we can consider the distance to
+        # be the span of the largest read chunk instead of the span of the
+        # whole chunk.
+        base = deltainfo.base
+
+        if base != nullrev:
+            deltachain = revlog._deltachain(base)[0]
+        else:
+            deltachain = []
+
+        # search for the first non-snapshot revision
+        for idx, r in enumerate(deltachain):
+            if not revlog.issnapshot(r):
+                break
+        deltachain = deltachain[idx:]
+        chunks = slicechunk(revlog, deltachain, deltainfo)
+        all_span = [segmentspan(revlog, revs, deltainfo)
+                    for revs in chunks]
+        distance = max(all_span)
+    else:
+        distance = deltainfo.distance
+
+    textlen = revinfo.textlen
+    defaultmax = textlen * 4
+    maxdist = revlog._maxdeltachainspan
+    if not maxdist:
+        maxdist = distance # ensure the conditional passes
+    maxdist = max(maxdist, defaultmax)
+    if revlog._sparserevlog and maxdist < revlog._srmingapsize:
+        # In multiple places, we ignore irrelevant data ranges below a
+        # certain size. We also apply this tradeoff here and relax the
+        # span constraint for small enough content.
+        maxdist = revlog._srmingapsize
+
+    # Bad delta from read span:
+    #
+    #   If the span of data read is larger than the maximum allowed.
+    if maxdist < distance:
+        return False
+
+    # Bad delta from new delta size:
+    #
+    #   If the delta size is larger than the target text, storing the
+    #   delta will be inefficient.
+    if textlen < deltainfo.deltalen:
+        return False
+
+    # Bad delta from cumulated payload size:
+    #
+    #   If the sum of the deltas gets larger than K * the target text length.
+    if textlen * LIMIT_DELTA2TEXT < deltainfo.compresseddeltalen:
+        return False
+
+    # Bad delta from chain length:
+    #
+    #   If the number of deltas in the chain gets too high.
+    if (revlog._maxchainlen
+            and revlog._maxchainlen < deltainfo.chainlen):
+        return False
+
+    # Bad delta from intermediate snapshot size limit:
+    #
+    #   If an intermediate snapshot size is higher than the limit. The
+    #   limit exists to prevent endless chains of intermediate deltas
+    #   from being created.
+    if (deltainfo.snapshotdepth is not None and
+            (textlen >> deltainfo.snapshotdepth) < deltainfo.deltalen):
+        return False
+
+    # Bad delta if the new intermediate snapshot is larger than the
+    # previous snapshot.
+    if (deltainfo.snapshotdepth
+            and revlog.length(deltainfo.base) < deltainfo.deltalen):
+        return False
+
+    return True
+
 class deltacomputer(object):
     def __init__(self, revlog):
         self.revlog = revlog
@@ -632,7 +725,7 @@ class deltacomputer(object):
                 if revlog.flags(candidaterev) & REVIDX_RAWTEXT_CHANGING_FLAGS:
                     continue
                 candidatedelta = self._builddeltainfo(revinfo, candidaterev, fh)
-                if revlog._isgooddeltainfo(candidatedelta, revinfo):
+                if isgooddeltainfo(self.revlog, candidatedelta, revinfo):
                     nominateddeltas.append(candidatedelta)
             if nominateddeltas:
                 deltainfo = min(nominateddeltas, key=lambda x: x.deltalen)

