[PATCH 05 of 12] revlog: move the good delta heuristic in revlogutils.deltas
Gregory Szorc
gregory.szorc at gmail.com
Fri Aug 24 13:55:10 EDT 2018
On Sat, Aug 18, 2018 at 2:27 AM Boris Feld <boris.feld at octobus.net> wrote:
> # HG changeset patch
> # User Boris Feld <boris.feld at octobus.net>
> # Date 1534380638 -7200
> # Thu Aug 16 02:50:38 2018 +0200
> # Node ID 9c7224a86df8f316dbfb006ba8e81fc3ce831303
> # Parent 07af069da30de0c6ae0ce8229a0ddd7c22c89313
> # EXP-Topic sparse-snapshot
> # Available At https://bitbucket.org/octobus/mercurial-devel/
> # hg pull https://bitbucket.org/octobus/mercurial-devel/ -r
> 9c7224a86df8
> revlog: move the good delta heuristic in revlogutils.deltas
>
> This is logic about building delta chains, so we move it alongside the
> related logic. The only user of this logic has already moved into this
> module.
>
I'm all for sending the code move patches piecemeal to make things easier
to review. But I think we should squash patches 3-5 on landing so code
history is preserved.
>
> diff --git a/mercurial/revlog.py b/mercurial/revlog.py
> --- a/mercurial/revlog.py
> +++ b/mercurial/revlog.py
> @@ -37,7 +37,6 @@ from .i18n import _
> from .revlogutils.constants import (
> FLAG_GENERALDELTA,
> FLAG_INLINE_DATA,
> - LIMIT_DELTA2TEXT,
> REVIDX_DEFAULT_FLAGS,
> REVIDX_ELLIPSIS,
> REVIDX_EXTSTORED,
> @@ -1897,96 +1896,6 @@ class revlog(object):
>
> return compressor.decompress(data)
>
> - def _isgooddeltainfo(self, deltainfo, revinfo):
> - """Returns True if the given delta is good. Good means that it is
> within
> - the disk span, disk size, and chain length bounds that we know to
> be
> - performant."""
> - if deltainfo is None:
> - return False
> -
> - # - 'deltainfo.distance' is the distance from the base revision --
> - # bounding it limits the amount of I/O we need to do.
> - # - 'deltainfo.compresseddeltalen' is the sum of the total size of
> - # deltas we need to apply -- bounding it limits the amount of
> CPU
> - # we consume.
> -
> - if self._sparserevlog:
> - # As sparse-read will be used, we can consider that the
> distance,
> - # instead of being the span of the whole chunk,
> - # is the span of the largest read chunk
> - base = deltainfo.base
> -
> - if base != nullrev:
> - deltachain = self._deltachain(base)[0]
> - else:
> - deltachain = []
> -
> - # search for the first non-snapshot revision
> - for idx, r in enumerate(deltachain):
> - if not self.issnapshot(r):
> - break
> - deltachain = deltachain[idx:]
> - chunks = deltautil.slicechunk(self, deltachain, deltainfo)
> - all_span = [deltautil.segmentspan(self, revs, deltainfo)
> - for revs in chunks]
> - distance = max(all_span)
> - else:
> - distance = deltainfo.distance
> -
> - textlen = revinfo.textlen
> - defaultmax = textlen * 4
> - maxdist = self._maxdeltachainspan
> - if not maxdist:
> - maxdist = distance # ensure the conditional pass
> - maxdist = max(maxdist, defaultmax)
> - if self._sparserevlog and maxdist < self._srmingapsize:
> - # In multiple place, we are ignoring irrelevant data range
> below a
> - # certain size. Be also apply this tradeoff here and relax
> span
> - # constraint for small enought content.
> - maxdist = self._srmingapsize
> -
> - # Bad delta from read span:
> - #
> - # If the span of data read is larger than the maximum allowed.
> - if maxdist < distance:
> - return False
> -
> - # Bad delta from new delta size:
> - #
> - # If the delta size is larger than the target text, storing the
> - # delta will be inefficient.
> - if textlen < deltainfo.deltalen:
> - return False
> -
> - # Bad delta from cumulated payload size:
> - #
> - # If the sum of delta get larger than K * target text length.
> - if textlen * LIMIT_DELTA2TEXT < deltainfo.compresseddeltalen:
> - return False
> -
> - # Bad delta from chain length:
> - #
> - # If the number of delta in the chain gets too high.
> - if self._maxchainlen and self._maxchainlen < deltainfo.chainlen:
> - return False
> -
> - # bad delta from intermediate snapshot size limit
> - #
> - # If an intermediate snapshot size is higher than the limit.
> The
> - # limit exist to prevent endless chain of intermediate delta to
> be
> - # created.
> - if (deltainfo.snapshotdepth is not None and
> - (textlen >> deltainfo.snapshotdepth) <
> deltainfo.deltalen):
> - return False
> -
> - # bad delta if new intermediate snapshot is larger than the
> previous
> - # snapshot
> - if (deltainfo.snapshotdepth
> - and self.length(deltainfo.base) < deltainfo.deltalen):
> - return False
> -
> - return True
> -
> def _addrevision(self, node, rawtext, transaction, link, p1, p2,
> flags,
> cachedelta, ifh, dfh, alwayscache=False,
> deltacomputer=None):
> diff --git a/mercurial/revlogutils/constants.py
> b/mercurial/revlogutils/constants.py
> --- a/mercurial/revlogutils/constants.py
> +++ b/mercurial/revlogutils/constants.py
> @@ -41,6 +41,3 @@ REVIDX_FLAGS_ORDER = [
> REVIDX_KNOWN_FLAGS = util.bitsfrom(REVIDX_FLAGS_ORDER)
> # bitmark for flags that could cause rawdata content change
> REVIDX_RAWTEXT_CHANGING_FLAGS = REVIDX_ISCENSORED | REVIDX_EXTSTORED
> -
> -# maximum <delta-chain-data>/<revision-text-length> ratio
> -LIMIT_DELTA2TEXT = 2
> diff --git a/mercurial/revlogutils/deltas.py
> b/mercurial/revlogutils/deltas.py
> --- a/mercurial/revlogutils/deltas.py
> +++ b/mercurial/revlogutils/deltas.py
> @@ -19,7 +19,6 @@ from ..node import (
> from ..i18n import _
>
> from .constants import (
> - LIMIT_DELTA2TEXT,
> REVIDX_ISCENSORED,
> REVIDX_RAWTEXT_CHANGING_FLAGS,
> )
> @@ -36,6 +35,9 @@ from .. import (
> RevlogError = error.RevlogError
> CensoredNodeError = error.CensoredNodeError
>
> +# maximum <delta-chain-data>/<revision-text-length> ratio
> +LIMIT_DELTA2TEXT = 2
> +
> class _testrevlog(object):
> """minimalist fake revlog to use in doctests"""
>
> @@ -447,6 +449,97 @@ class _deltainfo(object):
> compresseddeltalen = attr.ib()
> snapshotdepth = attr.ib()
>
> +def isgooddeltainfo(revlog, deltainfo, revinfo):
> + """Returns True if the given delta is good. Good means that it is
> within
> + the disk span, disk size, and chain length bounds that we know to be
> + performant."""
> + if deltainfo is None:
> + return False
> +
> + # - 'deltainfo.distance' is the distance from the base revision --
> + # bounding it limits the amount of I/O we need to do.
> + # - 'deltainfo.compresseddeltalen' is the sum of the total size of
> + # deltas we need to apply -- bounding it limits the amount of CPU
> + # we consume.
> +
> + if revlog._sparserevlog:
> + # As sparse-read will be used, we can consider that the distance,
> + # instead of being the span of the whole chunk,
> + # is the span of the largest read chunk
> + base = deltainfo.base
> +
> + if base != nullrev:
> + deltachain = revlog._deltachain(base)[0]
> + else:
> + deltachain = []
> +
> + # search for the first non-snapshot revision
> + for idx, r in enumerate(deltachain):
> + if not revlog.issnapshot(r):
> + break
> + deltachain = deltachain[idx:]
> + chunks = slicechunk(revlog, deltachain, deltainfo)
> + all_span = [segmentspan(revlog, revs, deltainfo)
> + for revs in chunks]
> + distance = max(all_span)
> + else:
> + distance = deltainfo.distance
> +
> + textlen = revinfo.textlen
> + defaultmax = textlen * 4
> + maxdist = revlog._maxdeltachainspan
> + if not maxdist:
> + maxdist = distance # ensure the conditional passes
> + maxdist = max(maxdist, defaultmax)
> + if revlog._sparserevlog and maxdist < revlog._srmingapsize:
> + # In multiple places, we are ignoring irrelevant data ranges
> + # below a certain size. We also apply this tradeoff here and
> + # relax the span constraint for small enough content.
> + maxdist = revlog._srmingapsize
> +
> + # Bad delta from read span:
> + #
> + # If the span of data read is larger than the maximum allowed.
> + if maxdist < distance:
> + return False
> +
> + # Bad delta from new delta size:
> + #
> + # If the delta size is larger than the target text, storing the
> + # delta will be inefficient.
> + if textlen < deltainfo.deltalen:
> + return False
> +
> + # Bad delta from cumulated payload size:
> + #
> + # If the sum of deltas gets larger than K * target text length.
> + if textlen * LIMIT_DELTA2TEXT < deltainfo.compresseddeltalen:
> + return False
> +
> + # Bad delta from chain length:
> + #
> + # If the number of deltas in the chain gets too high.
> + if (revlog._maxchainlen
> + and revlog._maxchainlen < deltainfo.chainlen):
> + return False
> +
> + # bad delta from intermediate snapshot size limit
> + #
> + # If an intermediate snapshot size is higher than the limit. The
> + # limit exists to prevent endless chains of intermediate deltas
> + # from being created.
> + if (deltainfo.snapshotdepth is not None and
> + (textlen >> deltainfo.snapshotdepth) < deltainfo.deltalen):
> + return False
> +
> + # bad delta if new intermediate snapshot is larger than the previous
> + # snapshot
> + if (deltainfo.snapshotdepth
> + and revlog.length(deltainfo.base) < deltainfo.deltalen):
> + return False
> +
> + return True
> +
> class deltacomputer(object):
> def __init__(self, revlog):
> self.revlog = revlog
> @@ -632,7 +725,7 @@ class deltacomputer(object):
> if revlog.flags(candidaterev) &
> REVIDX_RAWTEXT_CHANGING_FLAGS:
> continue
> candidatedelta = self._builddeltainfo(revinfo,
> candidaterev, fh)
> - if revlog._isgooddeltainfo(candidatedelta, revinfo):
> + if isgooddeltainfo(self.revlog, candidatedelta, revinfo):
> nominateddeltas.append(candidatedelta)
> if nominateddeltas:
> deltainfo = min(nominateddeltas, key=lambda x: x.deltalen)
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://www.mercurial-scm.org/pipermail/mercurial-devel/attachments/20180824/25910df4/attachment.html>
More information about the Mercurial-devel
mailing list