[PATCH 03 of 11 V2] revlog: move function related to delta chain slicing in revlogutils.deltas

Boris Feld boris.feld at octobus.net
Mon Aug 27 06:06:30 EDT 2018


# HG changeset patch
# User Boris Feld <boris.feld at octobus.net>
# Date 1534380822 -7200
#      Thu Aug 16 02:53:42 2018 +0200
# Node ID a061be6dce561b65bf8e008333f7737466dbcdf1
# Parent  7ca654074a1bec3ebe28506d71a2157ce026e7ad
# EXP-Topic sparse-snapshot
# Available At https://bitbucket.org/octobus/mercurial-devel/
#              hg pull https://bitbucket.org/octobus/mercurial-devel/ -r a061be6dce56
revlog: move function related to delta chain slicing in revlogutils.deltas

This logic is related to delta chains, we move it in the utility module related
to deltas. This is needed before we can move the "good delta" logic in that
module.

diff --git a/mercurial/debugcommands.py b/mercurial/debugcommands.py
--- a/mercurial/debugcommands.py
+++ b/mercurial/debugcommands.py
@@ -88,6 +88,10 @@ from .utils import (
     stringutil,
 )
 
+from .revlogutils import (
+    deltas as deltautil
+)
+
 release = lockmod.release
 
 command = registrar.command()
@@ -706,7 +710,7 @@ def debugdeltachain(ui, repo, file_=None
             largestblock = 0
             srchunks = 0
 
-            for revschunk in revlog._slicechunk(r, chain):
+            for revschunk in deltautil.slicechunk(r, chain):
                 srchunks += 1
                 blkend = start(revschunk[-1]) + length(revschunk[-1])
                 blksize = blkend - start(revschunk[0])
diff --git a/mercurial/revlog.py b/mercurial/revlog.py
--- a/mercurial/revlog.py
+++ b/mercurial/revlog.py
@@ -17,7 +17,6 @@ import collections
 import contextlib
 import errno
 import hashlib
-import heapq
 import os
 import re
 import struct
@@ -214,406 +213,6 @@ def hash(text, p1, p2):
     s.update(text)
     return s.digest()
 
-class _testrevlog(object):
-    """minimalist fake revlog to use in doctests"""
-
-    def __init__(self, data, density=0.5, mingap=0):
-        """data is an list of revision payload boundaries"""
-        self._data = data
-        self._srdensitythreshold = density
-        self._srmingapsize = mingap
-
-    def start(self, rev):
-        if rev == 0:
-            return 0
-        return self._data[rev - 1]
-
-    def end(self, rev):
-        return self._data[rev]
-
-    def length(self, rev):
-        return self.end(rev) - self.start(rev)
-
-    def __len__(self):
-        return len(self._data)
-
-def _trimchunk(revlog, revs, startidx, endidx=None):
-    """returns revs[startidx:endidx] without empty trailing revs
-
-    Doctest Setup
-    >>> revlog = _testrevlog([
-    ...  5,  #0
-    ...  10, #1
-    ...  12, #2
-    ...  12, #3 (empty)
-    ...  17, #4
-    ...  21, #5
-    ...  21, #6 (empty)
-    ... ])
-
-    Contiguous cases:
-    >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0)
-    [0, 1, 2, 3, 4, 5]
-    >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0, 5)
-    [0, 1, 2, 3, 4]
-    >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0, 4)
-    [0, 1, 2]
-    >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 2, 4)
-    [2]
-    >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 3)
-    [3, 4, 5]
-    >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 3, 5)
-    [3, 4]
-
-    Discontiguous cases:
-    >>> _trimchunk(revlog, [1, 3, 5, 6], 0)
-    [1, 3, 5]
-    >>> _trimchunk(revlog, [1, 3, 5, 6], 0, 2)
-    [1]
-    >>> _trimchunk(revlog, [1, 3, 5, 6], 1, 3)
-    [3, 5]
-    >>> _trimchunk(revlog, [1, 3, 5, 6], 1)
-    [3, 5]
-    """
-    length = revlog.length
-
-    if endidx is None:
-        endidx = len(revs)
-
-    # If we have a non-emtpy delta candidate, there are nothing to trim
-    if revs[endidx - 1] < len(revlog):
-        # Trim empty revs at the end, except the very first revision of a chain
-        while (endidx > 1
-                and endidx > startidx
-                and length(revs[endidx - 1]) == 0):
-            endidx -= 1
-
-    return revs[startidx:endidx]
-
-def _segmentspan(revlog, revs, deltainfo=None):
-    """Get the byte span of a segment of revisions
-
-    revs is a sorted array of revision numbers
-
-    >>> revlog = _testrevlog([
-    ...  5,  #0
-    ...  10, #1
-    ...  12, #2
-    ...  12, #3 (empty)
-    ...  17, #4
-    ... ])
-
-    >>> _segmentspan(revlog, [0, 1, 2, 3, 4])
-    17
-    >>> _segmentspan(revlog, [0, 4])
-    17
-    >>> _segmentspan(revlog, [3, 4])
-    5
-    >>> _segmentspan(revlog, [1, 2, 3,])
-    7
-    >>> _segmentspan(revlog, [1, 3])
-    7
-    """
-    if not revs:
-        return 0
-    if deltainfo is not None and len(revlog) <= revs[-1]:
-        if len(revs) == 1:
-            return deltainfo.deltalen
-        offset = revlog.end(len(revlog) - 1)
-        end = deltainfo.deltalen + offset
-    else:
-        end = revlog.end(revs[-1])
-    return end - revlog.start(revs[0])
-
-def _slicechunk(revlog, revs, deltainfo=None, targetsize=None):
-    """slice revs to reduce the amount of unrelated data to be read from disk.
-
-    ``revs`` is sliced into groups that should be read in one time.
-    Assume that revs are sorted.
-
-    The initial chunk is sliced until the overall density (payload/chunks-span
-    ratio) is above `revlog._srdensitythreshold`. No gap smaller than
-    `revlog._srmingapsize` is skipped.
-
-    If `targetsize` is set, no chunk larger than `targetsize` will be yield.
-    For consistency with other slicing choice, this limit won't go lower than
-    `revlog._srmingapsize`.
-
-    If individual revisions chunk are larger than this limit, they will still
-    be raised individually.
-
-    >>> revlog = _testrevlog([
-    ...  5,  #00 (5)
-    ...  10, #01 (5)
-    ...  12, #02 (2)
-    ...  12, #03 (empty)
-    ...  27, #04 (15)
-    ...  31, #05 (4)
-    ...  31, #06 (empty)
-    ...  42, #07 (11)
-    ...  47, #08 (5)
-    ...  47, #09 (empty)
-    ...  48, #10 (1)
-    ...  51, #11 (3)
-    ...  74, #12 (23)
-    ...  85, #13 (11)
-    ...  86, #14 (1)
-    ...  91, #15 (5)
-    ... ])
-
-    >>> list(_slicechunk(revlog, list(range(16))))
-    [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]
-    >>> list(_slicechunk(revlog, [0, 15]))
-    [[0], [15]]
-    >>> list(_slicechunk(revlog, [0, 11, 15]))
-    [[0], [11], [15]]
-    >>> list(_slicechunk(revlog, [0, 11, 13, 15]))
-    [[0], [11, 13, 15]]
-    >>> list(_slicechunk(revlog, [1, 2, 3, 5, 8, 10, 11, 14]))
-    [[1, 2], [5, 8, 10, 11], [14]]
-
-    Slicing with a maximum chunk size
-    >>> list(_slicechunk(revlog, [0, 11, 13, 15], targetsize=15))
-    [[0], [11], [13], [15]]
-    >>> list(_slicechunk(revlog, [0, 11, 13, 15], targetsize=20))
-    [[0], [11], [13, 15]]
-    """
-    if targetsize is not None:
-        targetsize = max(targetsize, revlog._srmingapsize)
-    # targetsize should not be specified when evaluating delta candidates:
-    # * targetsize is used to ensure we stay within specification when reading,
-    # * deltainfo is used to pick are good delta chain when writing.
-    if not (deltainfo is None or targetsize is None):
-        msg = 'cannot use `targetsize` with a `deltainfo`'
-        raise error.ProgrammingError(msg)
-    for chunk in _slicechunktodensity(revlog, revs,
-                                      deltainfo,
-                                      revlog._srdensitythreshold,
-                                      revlog._srmingapsize):
-        for subchunk in _slicechunktosize(revlog, chunk, targetsize):
-            yield subchunk
-
-def _slicechunktosize(revlog, revs, targetsize=None):
-    """slice revs to match the target size
-
-    This is intended to be used on chunk that density slicing selected by that
-    are still too large compared to the read garantee of revlog. This might
-    happens when "minimal gap size" interrupted the slicing or when chain are
-    built in a way that create large blocks next to each other.
-
-    >>> revlog = _testrevlog([
-    ...  3,  #0 (3)
-    ...  5,  #1 (2)
-    ...  6,  #2 (1)
-    ...  8,  #3 (2)
-    ...  8,  #4 (empty)
-    ...  11, #5 (3)
-    ...  12, #6 (1)
-    ...  13, #7 (1)
-    ...  14, #8 (1)
-    ... ])
-
-    Cases where chunk is already small enough
-    >>> list(_slicechunktosize(revlog, [0], 3))
-    [[0]]
-    >>> list(_slicechunktosize(revlog, [6, 7], 3))
-    [[6, 7]]
-    >>> list(_slicechunktosize(revlog, [0], None))
-    [[0]]
-    >>> list(_slicechunktosize(revlog, [6, 7], None))
-    [[6, 7]]
-
-    cases where we need actual slicing
-    >>> list(_slicechunktosize(revlog, [0, 1], 3))
-    [[0], [1]]
-    >>> list(_slicechunktosize(revlog, [1, 3], 3))
-    [[1], [3]]
-    >>> list(_slicechunktosize(revlog, [1, 2, 3], 3))
-    [[1, 2], [3]]
-    >>> list(_slicechunktosize(revlog, [3, 5], 3))
-    [[3], [5]]
-    >>> list(_slicechunktosize(revlog, [3, 4, 5], 3))
-    [[3], [5]]
-    >>> list(_slicechunktosize(revlog, [5, 6, 7, 8], 3))
-    [[5], [6, 7, 8]]
-    >>> list(_slicechunktosize(revlog, [0, 1, 2, 3, 4, 5, 6, 7, 8], 3))
-    [[0], [1, 2], [3], [5], [6, 7, 8]]
-
-    Case with too large individual chunk (must return valid chunk)
-    >>> list(_slicechunktosize(revlog, [0, 1], 2))
-    [[0], [1]]
-    >>> list(_slicechunktosize(revlog, [1, 3], 1))
-    [[1], [3]]
-    >>> list(_slicechunktosize(revlog, [3, 4, 5], 2))
-    [[3], [5]]
-    """
-    assert targetsize is None or 0 <= targetsize
-    if targetsize is None or _segmentspan(revlog, revs) <= targetsize:
-        yield revs
-        return
-
-    startrevidx = 0
-    startdata = revlog.start(revs[0])
-    endrevidx = 0
-    iterrevs = enumerate(revs)
-    next(iterrevs) # skip first rev.
-    for idx, r in iterrevs:
-        span = revlog.end(r) - startdata
-        if span <= targetsize:
-            endrevidx = idx
-        else:
-            chunk = _trimchunk(revlog, revs, startrevidx, endrevidx + 1)
-            if chunk:
-                yield chunk
-            startrevidx = idx
-            startdata = revlog.start(r)
-            endrevidx = idx
-    yield _trimchunk(revlog, revs, startrevidx)
-
-def _slicechunktodensity(revlog, revs, deltainfo=None, targetdensity=0.5,
-                         mingapsize=0):
-    """slice revs to reduce the amount of unrelated data to be read from disk.
-
-    ``revs`` is sliced into groups that should be read in one time.
-    Assume that revs are sorted.
-
-    ``deltainfo`` is a _deltainfo instance of a revision that we would append
-    to the top of the revlog.
-
-    The initial chunk is sliced until the overall density (payload/chunks-span
-    ratio) is above `targetdensity`. No gap smaller than `mingapsize` is
-    skipped.
-
-    >>> revlog = _testrevlog([
-    ...  5,  #00 (5)
-    ...  10, #01 (5)
-    ...  12, #02 (2)
-    ...  12, #03 (empty)
-    ...  27, #04 (15)
-    ...  31, #05 (4)
-    ...  31, #06 (empty)
-    ...  42, #07 (11)
-    ...  47, #08 (5)
-    ...  47, #09 (empty)
-    ...  48, #10 (1)
-    ...  51, #11 (3)
-    ...  74, #12 (23)
-    ...  85, #13 (11)
-    ...  86, #14 (1)
-    ...  91, #15 (5)
-    ... ])
-
-    >>> list(_slicechunktodensity(revlog, list(range(16))))
-    [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]
-    >>> list(_slicechunktodensity(revlog, [0, 15]))
-    [[0], [15]]
-    >>> list(_slicechunktodensity(revlog, [0, 11, 15]))
-    [[0], [11], [15]]
-    >>> list(_slicechunktodensity(revlog, [0, 11, 13, 15]))
-    [[0], [11, 13, 15]]
-    >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14]))
-    [[1, 2], [5, 8, 10, 11], [14]]
-    >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14],
-    ...                           mingapsize=20))
-    [[1, 2, 3, 5, 8, 10, 11], [14]]
-    >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14],
-    ...                           targetdensity=0.95))
-    [[1, 2], [5], [8, 10, 11], [14]]
-    >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14],
-    ...                           targetdensity=0.95, mingapsize=12))
-    [[1, 2], [5, 8, 10, 11], [14]]
-    """
-    start = revlog.start
-    length = revlog.length
-
-    if len(revs) <= 1:
-        yield revs
-        return
-
-    nextrev = len(revlog)
-    nextoffset = revlog.end(nextrev - 1)
-
-    if deltainfo is None:
-        deltachainspan = _segmentspan(revlog, revs)
-        chainpayload = sum(length(r) for r in revs)
-    else:
-        deltachainspan = deltainfo.distance
-        chainpayload = deltainfo.compresseddeltalen
-
-    if deltachainspan < mingapsize:
-        yield revs
-        return
-
-    readdata = deltachainspan
-
-    if deltachainspan:
-        density = chainpayload / float(deltachainspan)
-    else:
-        density = 1.0
-
-    if density >= targetdensity:
-        yield revs
-        return
-
-    if deltainfo is not None and deltainfo.deltalen:
-        revs = list(revs)
-        revs.append(nextrev)
-
-    # Store the gaps in a heap to have them sorted by decreasing size
-    gapsheap = []
-    heapq.heapify(gapsheap)
-    prevend = None
-    for i, rev in enumerate(revs):
-        if rev < nextrev:
-            revstart = start(rev)
-            revlen = length(rev)
-        else:
-            revstart = nextoffset
-            revlen = deltainfo.deltalen
-
-        # Skip empty revisions to form larger holes
-        if revlen == 0:
-            continue
-
-        if prevend is not None:
-            gapsize = revstart - prevend
-            # only consider holes that are large enough
-            if gapsize > mingapsize:
-                heapq.heappush(gapsheap, (-gapsize, i))
-
-        prevend = revstart + revlen
-
-    # Collect the indices of the largest holes until the density is acceptable
-    indicesheap = []
-    heapq.heapify(indicesheap)
-    while gapsheap and density < targetdensity:
-        oppgapsize, gapidx = heapq.heappop(gapsheap)
-
-        heapq.heappush(indicesheap, gapidx)
-
-        # the gap sizes are stored as negatives to be sorted decreasingly
-        # by the heap
-        readdata -= (-oppgapsize)
-        if readdata > 0:
-            density = chainpayload / float(readdata)
-        else:
-            density = 1.0
-
-    # Cut the revs at collected indices
-    previdx = 0
-    while indicesheap:
-        idx = heapq.heappop(indicesheap)
-
-        chunk = _trimchunk(revlog, revs, previdx, idx)
-        if chunk:
-            yield chunk
-
-        previdx = idx
-
-    chunk = _trimchunk(revlog, revs, previdx)
-    if chunk:
-        yield chunk
-
 @attr.s(slots=True, frozen=True)
 class _revisioninfo(object):
     """Information about a revision that allows building its fulltext
@@ -1894,7 +1493,8 @@ class revlog(object):
         if not self._withsparseread:
             slicedchunks = (revs,)
         else:
-            slicedchunks = _slicechunk(self, revs, targetsize=targetsize)
+            slicedchunks = deltautil.slicechunk(self, revs,
+                                                targetsize=targetsize)
 
         for revschunk in slicedchunks:
             firstrev = revschunk[0]
@@ -2344,8 +1944,9 @@ class revlog(object):
                 if not self.issnapshot(r):
                     break
             deltachain = deltachain[idx:]
-            chunks = _slicechunk(self, deltachain, deltainfo)
-            all_span = [_segmentspan(self, revs, deltainfo) for revs in chunks]
+            chunks = deltautil.slicechunk(self, deltachain, deltainfo)
+            all_span = [deltautil.segmentspan(self, revs, deltainfo)
+                        for revs in chunks]
             distance = max(all_span)
         else:
             distance = deltainfo.distance
diff --git a/mercurial/revlogutils/deltas.py b/mercurial/revlogutils/deltas.py
--- a/mercurial/revlogutils/deltas.py
+++ b/mercurial/revlogutils/deltas.py
@@ -9,6 +9,7 @@
 
 from __future__ import absolute_import
 
+import heapq
 import struct
 
 # import stuff from node for others to import from revlog
@@ -35,6 +36,406 @@ from .. import (
 RevlogError = error.RevlogError
 CensoredNodeError = error.CensoredNodeError
 
+class _testrevlog(object):
+    """minimalist fake revlog to use in doctests"""
+
+    def __init__(self, data, density=0.5, mingap=0):
+        """data is an list of revision payload boundaries"""
+        self._data = data
+        self._srdensitythreshold = density
+        self._srmingapsize = mingap
+
+    def start(self, rev):
+        if rev == 0:
+            return 0
+        return self._data[rev - 1]
+
+    def end(self, rev):
+        return self._data[rev]
+
+    def length(self, rev):
+        return self.end(rev) - self.start(rev)
+
+    def __len__(self):
+        return len(self._data)
+
+def slicechunk(revlog, revs, deltainfo=None, targetsize=None):
+    """slice revs to reduce the amount of unrelated data to be read from disk.
+
+    ``revs`` is sliced into groups that should be read in one time.
+    Assume that revs are sorted.
+
+    The initial chunk is sliced until the overall density (payload/chunks-span
+    ratio) is above `revlog._srdensitythreshold`. No gap smaller than
+    `revlog._srmingapsize` is skipped.
+
+    If `targetsize` is set, no chunk larger than `targetsize` will be yield.
+    For consistency with other slicing choice, this limit won't go lower than
+    `revlog._srmingapsize`.
+
+    If individual revisions chunk are larger than this limit, they will still
+    be raised individually.
+
+    >>> revlog = _testrevlog([
+    ...  5,  #00 (5)
+    ...  10, #01 (5)
+    ...  12, #02 (2)
+    ...  12, #03 (empty)
+    ...  27, #04 (15)
+    ...  31, #05 (4)
+    ...  31, #06 (empty)
+    ...  42, #07 (11)
+    ...  47, #08 (5)
+    ...  47, #09 (empty)
+    ...  48, #10 (1)
+    ...  51, #11 (3)
+    ...  74, #12 (23)
+    ...  85, #13 (11)
+    ...  86, #14 (1)
+    ...  91, #15 (5)
+    ... ])
+
+    >>> list(slicechunk(revlog, list(range(16))))
+    [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]
+    >>> list(slicechunk(revlog, [0, 15]))
+    [[0], [15]]
+    >>> list(slicechunk(revlog, [0, 11, 15]))
+    [[0], [11], [15]]
+    >>> list(slicechunk(revlog, [0, 11, 13, 15]))
+    [[0], [11, 13, 15]]
+    >>> list(slicechunk(revlog, [1, 2, 3, 5, 8, 10, 11, 14]))
+    [[1, 2], [5, 8, 10, 11], [14]]
+
+    Slicing with a maximum chunk size
+    >>> list(slicechunk(revlog, [0, 11, 13, 15], targetsize=15))
+    [[0], [11], [13], [15]]
+    >>> list(slicechunk(revlog, [0, 11, 13, 15], targetsize=20))
+    [[0], [11], [13, 15]]
+    """
+    if targetsize is not None:
+        targetsize = max(targetsize, revlog._srmingapsize)
+    # targetsize should not be specified when evaluating delta candidates:
+    # * targetsize is used to ensure we stay within specification when reading,
+    # * deltainfo is used to pick are good delta chain when writing.
+    if not (deltainfo is None or targetsize is None):
+        msg = 'cannot use `targetsize` with a `deltainfo`'
+        raise error.ProgrammingError(msg)
+    for chunk in _slicechunktodensity(revlog, revs,
+                                      deltainfo,
+                                      revlog._srdensitythreshold,
+                                      revlog._srmingapsize):
+        for subchunk in _slicechunktosize(revlog, chunk, targetsize):
+            yield subchunk
+
+def _slicechunktosize(revlog, revs, targetsize=None):
+    """slice revs to match the target size
+
+    This is intended to be used on chunk that density slicing selected by that
+    are still too large compared to the read garantee of revlog. This might
+    happens when "minimal gap size" interrupted the slicing or when chain are
+    built in a way that create large blocks next to each other.
+
+    >>> revlog = _testrevlog([
+    ...  3,  #0 (3)
+    ...  5,  #1 (2)
+    ...  6,  #2 (1)
+    ...  8,  #3 (2)
+    ...  8,  #4 (empty)
+    ...  11, #5 (3)
+    ...  12, #6 (1)
+    ...  13, #7 (1)
+    ...  14, #8 (1)
+    ... ])
+
+    Cases where chunk is already small enough
+    >>> list(_slicechunktosize(revlog, [0], 3))
+    [[0]]
+    >>> list(_slicechunktosize(revlog, [6, 7], 3))
+    [[6, 7]]
+    >>> list(_slicechunktosize(revlog, [0], None))
+    [[0]]
+    >>> list(_slicechunktosize(revlog, [6, 7], None))
+    [[6, 7]]
+
+    cases where we need actual slicing
+    >>> list(_slicechunktosize(revlog, [0, 1], 3))
+    [[0], [1]]
+    >>> list(_slicechunktosize(revlog, [1, 3], 3))
+    [[1], [3]]
+    >>> list(_slicechunktosize(revlog, [1, 2, 3], 3))
+    [[1, 2], [3]]
+    >>> list(_slicechunktosize(revlog, [3, 5], 3))
+    [[3], [5]]
+    >>> list(_slicechunktosize(revlog, [3, 4, 5], 3))
+    [[3], [5]]
+    >>> list(_slicechunktosize(revlog, [5, 6, 7, 8], 3))
+    [[5], [6, 7, 8]]
+    >>> list(_slicechunktosize(revlog, [0, 1, 2, 3, 4, 5, 6, 7, 8], 3))
+    [[0], [1, 2], [3], [5], [6, 7, 8]]
+
+    Case with too large individual chunk (must return valid chunk)
+    >>> list(_slicechunktosize(revlog, [0, 1], 2))
+    [[0], [1]]
+    >>> list(_slicechunktosize(revlog, [1, 3], 1))
+    [[1], [3]]
+    >>> list(_slicechunktosize(revlog, [3, 4, 5], 2))
+    [[3], [5]]
+    """
+    assert targetsize is None or 0 <= targetsize
+    if targetsize is None or segmentspan(revlog, revs) <= targetsize:
+        yield revs
+        return
+
+    startrevidx = 0
+    startdata = revlog.start(revs[0])
+    endrevidx = 0
+    iterrevs = enumerate(revs)
+    next(iterrevs) # skip first rev.
+    for idx, r in iterrevs:
+        span = revlog.end(r) - startdata
+        if span <= targetsize:
+            endrevidx = idx
+        else:
+            chunk = _trimchunk(revlog, revs, startrevidx, endrevidx + 1)
+            if chunk:
+                yield chunk
+            startrevidx = idx
+            startdata = revlog.start(r)
+            endrevidx = idx
+    yield _trimchunk(revlog, revs, startrevidx)
+
+def _slicechunktodensity(revlog, revs, deltainfo=None, targetdensity=0.5,
+                         mingapsize=0):
+    """slice revs to reduce the amount of unrelated data to be read from disk.
+
+    ``revs`` is sliced into groups that should be read in one time.
+    Assume that revs are sorted.
+
+    ``deltainfo`` is a _deltainfo instance of a revision that we would append
+    to the top of the revlog.
+
+    The initial chunk is sliced until the overall density (payload/chunks-span
+    ratio) is above `targetdensity`. No gap smaller than `mingapsize` is
+    skipped.
+
+    >>> revlog = _testrevlog([
+    ...  5,  #00 (5)
+    ...  10, #01 (5)
+    ...  12, #02 (2)
+    ...  12, #03 (empty)
+    ...  27, #04 (15)
+    ...  31, #05 (4)
+    ...  31, #06 (empty)
+    ...  42, #07 (11)
+    ...  47, #08 (5)
+    ...  47, #09 (empty)
+    ...  48, #10 (1)
+    ...  51, #11 (3)
+    ...  74, #12 (23)
+    ...  85, #13 (11)
+    ...  86, #14 (1)
+    ...  91, #15 (5)
+    ... ])
+
+    >>> list(_slicechunktodensity(revlog, list(range(16))))
+    [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]
+    >>> list(_slicechunktodensity(revlog, [0, 15]))
+    [[0], [15]]
+    >>> list(_slicechunktodensity(revlog, [0, 11, 15]))
+    [[0], [11], [15]]
+    >>> list(_slicechunktodensity(revlog, [0, 11, 13, 15]))
+    [[0], [11, 13, 15]]
+    >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14]))
+    [[1, 2], [5, 8, 10, 11], [14]]
+    >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14],
+    ...                           mingapsize=20))
+    [[1, 2, 3, 5, 8, 10, 11], [14]]
+    >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14],
+    ...                           targetdensity=0.95))
+    [[1, 2], [5], [8, 10, 11], [14]]
+    >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14],
+    ...                           targetdensity=0.95, mingapsize=12))
+    [[1, 2], [5, 8, 10, 11], [14]]
+    """
+    start = revlog.start
+    length = revlog.length
+
+    if len(revs) <= 1:
+        yield revs
+        return
+
+    nextrev = len(revlog)
+    nextoffset = revlog.end(nextrev - 1)
+
+    if deltainfo is None:
+        deltachainspan = segmentspan(revlog, revs)
+        chainpayload = sum(length(r) for r in revs)
+    else:
+        deltachainspan = deltainfo.distance
+        chainpayload = deltainfo.compresseddeltalen
+
+    if deltachainspan < mingapsize:
+        yield revs
+        return
+
+    readdata = deltachainspan
+
+    if deltachainspan:
+        density = chainpayload / float(deltachainspan)
+    else:
+        density = 1.0
+
+    if density >= targetdensity:
+        yield revs
+        return
+
+    if deltainfo is not None and deltainfo.deltalen:
+        revs = list(revs)
+        revs.append(nextrev)
+
+    # Store the gaps in a heap to have them sorted by decreasing size
+    gapsheap = []
+    heapq.heapify(gapsheap)
+    prevend = None
+    for i, rev in enumerate(revs):
+        if rev < nextrev:
+            revstart = start(rev)
+            revlen = length(rev)
+        else:
+            revstart = nextoffset
+            revlen = deltainfo.deltalen
+
+        # Skip empty revisions to form larger holes
+        if revlen == 0:
+            continue
+
+        if prevend is not None:
+            gapsize = revstart - prevend
+            # only consider holes that are large enough
+            if gapsize > mingapsize:
+                heapq.heappush(gapsheap, (-gapsize, i))
+
+        prevend = revstart + revlen
+
+    # Collect the indices of the largest holes until the density is acceptable
+    indicesheap = []
+    heapq.heapify(indicesheap)
+    while gapsheap and density < targetdensity:
+        oppgapsize, gapidx = heapq.heappop(gapsheap)
+
+        heapq.heappush(indicesheap, gapidx)
+
+        # the gap sizes are stored as negatives to be sorted decreasingly
+        # by the heap
+        readdata -= (-oppgapsize)
+        if readdata > 0:
+            density = chainpayload / float(readdata)
+        else:
+            density = 1.0
+
+    # Cut the revs at collected indices
+    previdx = 0
+    while indicesheap:
+        idx = heapq.heappop(indicesheap)
+
+        chunk = _trimchunk(revlog, revs, previdx, idx)
+        if chunk:
+            yield chunk
+
+        previdx = idx
+
+    chunk = _trimchunk(revlog, revs, previdx)
+    if chunk:
+        yield chunk
+
+def _trimchunk(revlog, revs, startidx, endidx=None):
+    """returns revs[startidx:endidx] without empty trailing revs
+
+    Doctest Setup
+    >>> revlog = _testrevlog([
+    ...  5,  #0
+    ...  10, #1
+    ...  12, #2
+    ...  12, #3 (empty)
+    ...  17, #4
+    ...  21, #5
+    ...  21, #6 (empty)
+    ... ])
+
+    Contiguous cases:
+    >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0)
+    [0, 1, 2, 3, 4, 5]
+    >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0, 5)
+    [0, 1, 2, 3, 4]
+    >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0, 4)
+    [0, 1, 2]
+    >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 2, 4)
+    [2]
+    >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 3)
+    [3, 4, 5]
+    >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 3, 5)
+    [3, 4]
+
+    Discontiguous cases:
+    >>> _trimchunk(revlog, [1, 3, 5, 6], 0)
+    [1, 3, 5]
+    >>> _trimchunk(revlog, [1, 3, 5, 6], 0, 2)
+    [1]
+    >>> _trimchunk(revlog, [1, 3, 5, 6], 1, 3)
+    [3, 5]
+    >>> _trimchunk(revlog, [1, 3, 5, 6], 1)
+    [3, 5]
+    """
+    length = revlog.length
+
+    if endidx is None:
+        endidx = len(revs)
+
+    # If we have a non-emtpy delta candidate, there are nothing to trim
+    if revs[endidx - 1] < len(revlog):
+        # Trim empty revs at the end, except the very first revision of a chain
+        while (endidx > 1
+                and endidx > startidx
+                and length(revs[endidx - 1]) == 0):
+            endidx -= 1
+
+    return revs[startidx:endidx]
+
+def segmentspan(revlog, revs, deltainfo=None):
+    """Get the byte span of a segment of revisions
+
+    revs is a sorted array of revision numbers
+
+    >>> revlog = _testrevlog([
+    ...  5,  #0
+    ...  10, #1
+    ...  12, #2
+    ...  12, #3 (empty)
+    ...  17, #4
+    ... ])
+
+    >>> segmentspan(revlog, [0, 1, 2, 3, 4])
+    17
+    >>> segmentspan(revlog, [0, 4])
+    17
+    >>> segmentspan(revlog, [3, 4])
+    5
+    >>> segmentspan(revlog, [1, 2, 3,])
+    7
+    >>> segmentspan(revlog, [1, 3])
+    7
+    """
+    if not revs:
+        return 0
+    if deltainfo is not None and len(revlog) <= revs[-1]:
+        if len(revs) == 1:
+            return deltainfo.deltalen
+        offset = revlog.end(len(revlog) - 1)
+        end = deltainfo.deltalen + offset
+    else:
+        end = revlog.end(revs[-1])
+    return end - revlog.start(revs[0])
+
 @attr.s(slots=True, frozen=True)
 class _deltainfo(object):
     distance = attr.ib()


More information about the Mercurial-devel mailing list