[PATCH 05 of 12] revlog: move the good delta heuristic in revlogutils.deltas

Gregory Szorc gregory.szorc at gmail.com
Fri Aug 24 13:55:10 EDT 2018


On Sat, Aug 18, 2018 at 2:27 AM Boris Feld <boris.feld at octobus.net> wrote:

> # HG changeset patch
> # User Boris Feld <boris.feld at octobus.net>
> # Date 1534380638 -7200
> #      Thu Aug 16 02:50:38 2018 +0200
> # Node ID 9c7224a86df8f316dbfb006ba8e81fc3ce831303
> # Parent  07af069da30de0c6ae0ce8229a0ddd7c22c89313
> # EXP-Topic sparse-snapshot
> # Available At https://bitbucket.org/octobus/mercurial-devel/
> #              hg pull https://bitbucket.org/octobus/mercurial-devel/ -r
> 9c7224a86df8
> revlog: move the good delta heuristic in revlogutils.deltas
>
> This is logic about building delta chains. We move it with related logic.
> The
> only user of this logic moved in this module already.
>

I'm all for sending the code move patches piecemeal to make things easier
to review. But I think we should squash patches 3-5 on landing so code
history is preserved.


>
> diff --git a/mercurial/revlog.py b/mercurial/revlog.py
> --- a/mercurial/revlog.py
> +++ b/mercurial/revlog.py
> @@ -37,7 +37,6 @@ from .i18n import _
>  from .revlogutils.constants import (
>      FLAG_GENERALDELTA,
>      FLAG_INLINE_DATA,
> -    LIMIT_DELTA2TEXT,
>      REVIDX_DEFAULT_FLAGS,
>      REVIDX_ELLIPSIS,
>      REVIDX_EXTSTORED,
> @@ -1897,96 +1896,6 @@ class revlog(object):
>
>          return compressor.decompress(data)
>
> -    def _isgooddeltainfo(self, deltainfo, revinfo):
> -        """Returns True if the given delta is good. Good means that it is
> within
> -        the disk span, disk size, and chain length bounds that we know to
> be
> -        performant."""
> -        if deltainfo is None:
> -            return False
> -
> -        # - 'deltainfo.distance' is the distance from the base revision --
> -        #   bounding it limits the amount of I/O we need to do.
> -        # - 'deltainfo.compresseddeltalen' is the sum of the total size of
> -        #   deltas we need to apply -- bounding it limits the amount of
> CPU
> -        #   we consume.
> -
> -        if self._sparserevlog:
> -            # As sparse-read will be used, we can consider that the
> distance,
> -            # instead of being the span of the whole chunk,
> -            # is the span of the largest read chunk
> -            base = deltainfo.base
> -
> -            if base != nullrev:
> -                deltachain = self._deltachain(base)[0]
> -            else:
> -                deltachain = []
> -
> -            # search for the first non-snapshot revision
> -            for idx, r in enumerate(deltachain):
> -                if not self.issnapshot(r):
> -                    break
> -            deltachain = deltachain[idx:]
> -            chunks = deltautil.slicechunk(self, deltachain, deltainfo)
> -            all_span = [deltautil.segmentspan(self, revs, deltainfo)
> -                        for revs in chunks]
> -            distance = max(all_span)
> -        else:
> -            distance = deltainfo.distance
> -
> -        textlen = revinfo.textlen
> -        defaultmax = textlen * 4
> -        maxdist = self._maxdeltachainspan
> -        if not maxdist:
> -            maxdist = distance # ensure the conditional pass
> -        maxdist = max(maxdist, defaultmax)
> -        if self._sparserevlog and maxdist < self._srmingapsize:
> -            # In multiple place, we are ignoring irrelevant data range
> below a
> -            # certain size. Be also apply this tradeoff here and relax
> span
> -            # constraint for small enought content.
> -            maxdist = self._srmingapsize
> -
> -        # Bad delta from read span:
> -        #
> -        #   If the span of data read is larger than the maximum allowed.
> -        if maxdist < distance:
> -            return False
> -
> -        # Bad delta from new delta size:
> -        #
> -        #   If the delta size is larger than the target text, storing the
> -        #   delta will be inefficient.
> -        if textlen < deltainfo.deltalen:
> -            return False
> -
> -        # Bad delta from cumulated payload size:
> -        #
> -        #   If the sum of delta get larger than K * target text length.
> -        if textlen * LIMIT_DELTA2TEXT < deltainfo.compresseddeltalen:
> -            return False
> -
> -        # Bad delta from chain length:
> -        #
> -        #   If the number of delta in the chain gets too high.
> -        if self._maxchainlen and  self._maxchainlen < deltainfo.chainlen:
> -            return False
> -
> -        # bad delta from intermediate snapshot size limit
> -        #
> -        #   If an intermediate snapshot size is higher than the limit.
> The
> -        #   limit exist to prevent endless chain of intermediate delta to
> be
> -        #   created.
> -        if (deltainfo.snapshotdepth is not None and
> -                (textlen >> deltainfo.snapshotdepth) <
> deltainfo.deltalen):
> -            return False
> -
> -        # bad delta if new intermediate snapshot is larger than the
> previous
> -        # snapshot
> -        if (deltainfo.snapshotdepth
> -                and self.length(deltainfo.base) < deltainfo.deltalen):
> -            return False
> -
> -        return True
> -
>      def _addrevision(self, node, rawtext, transaction, link, p1, p2,
> flags,
>                       cachedelta, ifh, dfh, alwayscache=False,
>                       deltacomputer=None):
> diff --git a/mercurial/revlogutils/constants.py
> b/mercurial/revlogutils/constants.py
> --- a/mercurial/revlogutils/constants.py
> +++ b/mercurial/revlogutils/constants.py
> @@ -41,6 +41,3 @@ REVIDX_FLAGS_ORDER = [
>  REVIDX_KNOWN_FLAGS = util.bitsfrom(REVIDX_FLAGS_ORDER)
>  # bitmark for flags that could cause rawdata content change
>  REVIDX_RAWTEXT_CHANGING_FLAGS = REVIDX_ISCENSORED | REVIDX_EXTSTORED
> -
> -# maximum <delta-chain-data>/<revision-text-length> ratio
> -LIMIT_DELTA2TEXT = 2
> diff --git a/mercurial/revlogutils/deltas.py
> b/mercurial/revlogutils/deltas.py
> --- a/mercurial/revlogutils/deltas.py
> +++ b/mercurial/revlogutils/deltas.py
> @@ -19,7 +19,6 @@ from ..node import (
>  from ..i18n import _
>
>  from .constants import (
> -    LIMIT_DELTA2TEXT,
>      REVIDX_ISCENSORED,
>      REVIDX_RAWTEXT_CHANGING_FLAGS,
>  )
> @@ -36,6 +35,9 @@ from .. import (
>  RevlogError = error.RevlogError
>  CensoredNodeError = error.CensoredNodeError
>
> +# maximum <delta-chain-data>/<revision-text-length> ratio
> +LIMIT_DELTA2TEXT = 2
> +
>  class _testrevlog(object):
>      """minimalist fake revlog to use in doctests"""
>
> @@ -447,6 +449,97 @@ class _deltainfo(object):
>      compresseddeltalen = attr.ib()
>      snapshotdepth = attr.ib()
>
> +def isgooddeltainfo(revlog, deltainfo, revinfo):
> +    """Returns True if the given delta is good. Good means that it is
> within
> +    the disk span, disk size, and chain length bounds that we know to be
> +    performant."""
> +    if deltainfo is None:
> +        return False
> +
> +    # - 'deltainfo.distance' is the distance from the base revision --
> +    #   bounding it limits the amount of I/O we need to do.
> +    # - 'deltainfo.compresseddeltalen' is the sum of the total size of
> +    #   deltas we need to apply -- bounding it limits the amount of CPU
> +    #   we consume.
> +
> +    if revlog._sparserevlog:
> +        # As sparse-read will be used, we can consider that the distance,
> +        # instead of being the span of the whole chunk,
> +        # is the span of the largest read chunk
> +        base = deltainfo.base
> +
> +        if base != nullrev:
> +            deltachain = revlog._deltachain(base)[0]
> +        else:
> +            deltachain = []
> +
> +        # search for the first non-snapshot revision
> +        for idx, r in enumerate(deltachain):
> +            if not revlog.issnapshot(r):
> +                break
> +        deltachain = deltachain[idx:]
> +        chunks = slicechunk(revlog, deltachain, deltainfo)
> +        all_span = [segmentspan(revlog, revs, deltainfo)
> +                    for revs in chunks]
> +        distance = max(all_span)
> +    else:
> +        distance = deltainfo.distance
> +
> +    textlen = revinfo.textlen
> +    defaultmax = textlen * 4
> +    maxdist = revlog._maxdeltachainspan
> +    if not maxdist:
> +        maxdist = distance # ensure the conditional pass
> +    maxdist = max(maxdist, defaultmax)
> +    if revlog._sparserevlog and maxdist < revlog._srmingapsize:
> +        # In multiple place, we are ignoring irrelevant data range below a
> +        # certain size. Be also apply this tradeoff here and relax span
> +        # constraint for small enought content.
> +        maxdist = revlog._srmingapsize
> +
> +    # Bad delta from read span:
> +    #
> +    #   If the span of data read is larger than the maximum allowed.
> +    if maxdist < distance:
> +        return False
> +
> +    # Bad delta from new delta size:
> +    #
> +    #   If the delta size is larger than the target text, storing the
> +    #   delta will be inefficient.
> +    if textlen < deltainfo.deltalen:
> +        return False
> +
> +    # Bad delta from cumulated payload size:
> +    #
> +    #   If the sum of delta get larger than K * target text length.
> +    if textlen * LIMIT_DELTA2TEXT < deltainfo.compresseddeltalen:
> +        return False
> +
> +    # Bad delta from chain length:
> +    #
> +    #   If the number of delta in the chain gets too high.
> +    if (revlog._maxchainlen
> +            and revlog._maxchainlen < deltainfo.chainlen):
> +        return False
> +
> +    # bad delta from intermediate snapshot size limit
> +    #
> +    #   If an intermediate snapshot size is higher than the limit.  The
> +    #   limit exist to prevent endless chain of intermediate delta to be
> +    #   created.
> +    if (deltainfo.snapshotdepth is not None and
> +            (textlen >> deltainfo.snapshotdepth) < deltainfo.deltalen):
> +        return False
> +
> +    # bad delta if new intermediate snapshot is larger than the previous
> +    # snapshot
> +    if (deltainfo.snapshotdepth
> +            and revlog.length(deltainfo.base) < deltainfo.deltalen):
> +        return False
> +
> +    return True
> +
>  class deltacomputer(object):
>      def __init__(self, revlog):
>          self.revlog = revlog
> @@ -632,7 +725,7 @@ class deltacomputer(object):
>                  if revlog.flags(candidaterev) &
> REVIDX_RAWTEXT_CHANGING_FLAGS:
>                      continue
>                  candidatedelta = self._builddeltainfo(revinfo,
> candidaterev, fh)
> -                if revlog._isgooddeltainfo(candidatedelta, revinfo):
> +                if isgooddeltainfo(self.revlog, candidatedelta, revinfo):
>                      nominateddeltas.append(candidatedelta)
>              if nominateddeltas:
>                  deltainfo = min(nominateddeltas, key=lambda x: x.deltalen)
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://www.mercurial-scm.org/pipermail/mercurial-devel/attachments/20180824/25910df4/attachment.html>


More information about the Mercurial-devel mailing list