[PATCH 03 of 12] revlog: split classes related to deltas computation in a new module

Gregory Szorc gregory.szorc at gmail.com
Fri Aug 24 13:44:46 EDT 2018


On Sat, Aug 18, 2018 at 2:27 AM Boris Feld <boris.feld at octobus.net> wrote:

> # HG changeset patch
> # User Boris Feld <boris.feld at octobus.net>
> # Date 1534378131 -7200
> #      Thu Aug 16 02:08:51 2018 +0200
> # Node ID 0f38a0bcba5817146fb3a4e079016abc9a424646
> # Parent  4a61a7a37a499387658b6270ea0b2dadfef86156
> # EXP-Topic sparse-snapshot
> # Available At https://bitbucket.org/octobus/mercurial-devel/
> #              hg pull https://bitbucket.org/octobus/mercurial-devel/ -r
> 0f38a0bcba58
> revlog: split classes related to deltas computation in a new module
>

This patch failed to import properly. Probably due to a change in revlog.py.

Could you please resend or give me an upstream repo to pull from with an
updated version?

Also, I think you should `hg cp` the file so history of the moved code is
preserved. Delta logic is complicated and we shouldn't be making code
archeology more difficult by not preserving the copy annotation.

Also, bonus points if you resend this with "revlogutils" renamed to
"storage" or something similar. (Also, I'm unsure we need a sub-package for
all this stuff quite yet. Although I do like the idea of having all storage
code in one place.)


>
> The revlog module is getting big and this logic is getting more and more
> advanced. Moving it to `mercurial.revlogutils.deltas` split about 25% off
> revlog.py and will help this logic to become less interleaved with revlog.
>
> The code is simply moved without modification (but for namespace changes).
> Refactoring and improvement will be made in later changesets.
>
> diff --git a/mercurial/revlog.py b/mercurial/revlog.py
> --- a/mercurial/revlog.py
> +++ b/mercurial/revlog.py
> @@ -67,6 +67,9 @@ from . import (
>      templatefilters,
>      util,
>  )
> +from .revlogutils import (
> +    deltas as deltautil,
> +)
>  from .utils import (
>      stringutil,
>  )
> @@ -609,210 +612,6 @@ def _slicechunktodensity(revlog, revs, d
>          yield chunk
>
>  @attr.s(slots=True, frozen=True)
> -class _deltainfo(object):
> -    distance = attr.ib()
> -    deltalen = attr.ib()
> -    data = attr.ib()
> -    base = attr.ib()
> -    chainbase = attr.ib()
> -    chainlen = attr.ib()
> -    compresseddeltalen = attr.ib()
> -    snapshotdepth = attr.ib()
> -
> -class _deltacomputer(object):
> -    def __init__(self, revlog):
> -        self.revlog = revlog
> -
> -    def _getcandidaterevs(self, p1, p2, cachedelta):
> -        """
> -        Provides revisions that present an interest to be diffed against,
> -        grouped by level of easiness.
> -        """
> -        revlog = self.revlog
> -        gdelta = revlog._generaldelta
> -        curr = len(revlog)
> -        prev = curr - 1
> -        p1r, p2r = revlog.rev(p1), revlog.rev(p2)
> -
> -        # should we try to build a delta?
> -        if prev != nullrev and revlog.storedeltachains:
> -            tested = set()
> -            # This condition is true most of the time when processing
> -            # changegroup data into a generaldelta repo. The only time it
> -            # isn't true is if this is the first revision in a delta chain
> -            # or if ``format.generaldelta=true`` disabled
> ``lazydeltabase``.
> -            if cachedelta and gdelta and revlog._lazydeltabase:
> -                # Assume what we received from the server is a good choice
> -                # build delta will reuse the cache
> -                yield (cachedelta[0],)
> -                tested.add(cachedelta[0])
> -
> -            if gdelta:
> -                # exclude already lazy tested base if any
> -                parents = [p for p in (p1r, p2r)
> -                           if p != nullrev and p not in tested]
> -
> -                if not revlog._deltabothparents and len(parents) == 2:
> -                    parents.sort()
> -                    # To minimize the chance of having to build a
> fulltext,
> -                    # pick first whichever parent is closest to us (max
> rev)
> -                    yield (parents[1],)
> -                    # then the other one (min rev) if the first did not
> fit
> -                    yield (parents[0],)
> -                    tested.update(parents)
> -                elif len(parents) > 0:
> -                    # Test all parents (1 or 2), and keep the best
> candidate
> -                    yield parents
> -                    tested.update(parents)
> -
> -            if prev not in tested:
> -                # other approach failed try against prev to hopefully
> save us a
> -                # fulltext.
> -                yield (prev,)
> -                tested.add(prev)
> -
> -    def buildtext(self, revinfo, fh):
> -        """Builds a fulltext version of a revision
> -
> -        revinfo: _revisioninfo instance that contains all needed info
> -        fh:      file handle to either the .i or the .d revlog file,
> -                 depending on whether it is inlined or not
> -        """
> -        btext = revinfo.btext
> -        if btext[0] is not None:
> -            return btext[0]
> -
> -        revlog = self.revlog
> -        cachedelta = revinfo.cachedelta
> -        flags = revinfo.flags
> -        node = revinfo.node
> -
> -        baserev = cachedelta[0]
> -        delta = cachedelta[1]
> -        # special case deltas which replace entire base; no need to decode
> -        # base revision. this neatly avoids censored bases, which throw
> when
> -        # they're decoded.
> -        hlen = struct.calcsize(">lll")
> -        if delta[:hlen] ==
> mdiff.replacediffheader(revlog.rawsize(baserev),
> -                                                   len(delta) - hlen):
> -            btext[0] = delta[hlen:]
> -        else:
> -            # deltabase is rawtext before changed by flag processors,
> which is
> -            # equivalent to non-raw text
> -            basetext = revlog.revision(baserev, _df=fh, raw=False)
> -            btext[0] = mdiff.patch(basetext, delta)
> -
> -        try:
> -            res = revlog._processflags(btext[0], flags, 'read', raw=True)
> -            btext[0], validatehash = res
> -            if validatehash:
> -                revlog.checkhash(btext[0], node, p1=revinfo.p1,
> p2=revinfo.p2)
> -            if flags & REVIDX_ISCENSORED:
> -                raise RevlogError(_('node %s is not censored') % node)
> -        except CensoredNodeError:
> -            # must pass the censored index flag to add censored revisions
> -            if not flags & REVIDX_ISCENSORED:
> -                raise
> -        return btext[0]
> -
> -    def _builddeltadiff(self, base, revinfo, fh):
> -        revlog = self.revlog
> -        t = self.buildtext(revinfo, fh)
> -        if revlog.iscensored(base):
> -            # deltas based on a censored revision must replace the
> -            # full content in one patch, so delta works everywhere
> -            header = mdiff.replacediffheader(revlog.rawsize(base), len(t))
> -            delta = header + t
> -        else:
> -            ptext = revlog.revision(base, _df=fh, raw=True)
> -            delta = mdiff.textdiff(ptext, t)
> -
> -        return delta
> -
> -    def _builddeltainfo(self, revinfo, base, fh):
> -        # can we use the cached delta?
> -        if revinfo.cachedelta and revinfo.cachedelta[0] == base:
> -            delta = revinfo.cachedelta[1]
> -        else:
> -            delta = self._builddeltadiff(base, revinfo, fh)
> -        revlog = self.revlog
> -        header, data = revlog.compress(delta)
> -        deltalen = len(header) + len(data)
> -        chainbase = revlog.chainbase(base)
> -        offset = revlog.end(len(revlog) - 1)
> -        dist = deltalen + offset - revlog.start(chainbase)
> -        if revlog._generaldelta:
> -            deltabase = base
> -        else:
> -            deltabase = chainbase
> -        chainlen, compresseddeltalen = revlog._chaininfo(base)
> -        chainlen += 1
> -        compresseddeltalen += deltalen
> -
> -        revlog = self.revlog
> -        snapshotdepth = None
> -        if deltabase == nullrev:
> -            snapshotdepth = 0
> -        elif revlog._sparserevlog and revlog.issnapshot(deltabase):
> -            # A delta chain should always be one full snapshot,
> -            # zero or more semi-snapshots, and zero or more deltas
> -            p1, p2 = revlog.rev(revinfo.p1), revlog.rev(revinfo.p2)
> -            if deltabase not in (p1, p2) and revlog.issnapshot(deltabase):
> -                snapshotdepth = len(revlog._deltachain(deltabase)[0])
> -
> -        return _deltainfo(dist, deltalen, (header, data), deltabase,
> -                          chainbase, chainlen, compresseddeltalen,
> -                          snapshotdepth)
> -
> -    def finddeltainfo(self, revinfo, fh):
> -        """Find an acceptable delta against a candidate revision
> -
> -        revinfo: information about the revision (instance of
> _revisioninfo)
> -        fh:      file handle to either the .i or the .d revlog file,
> -                 depending on whether it is inlined or not
> -
> -        Returns the first acceptable candidate revision, as ordered by
> -        _getcandidaterevs
> -        """
> -        if not revinfo.textlen:
> -            return None # empty file do not need delta
> -
> -        cachedelta = revinfo.cachedelta
> -        p1 = revinfo.p1
> -        p2 = revinfo.p2
> -        revlog = self.revlog
> -
> -        deltalength = self.revlog.length
> -        deltaparent = self.revlog.deltaparent
> -
> -        deltainfo = None
> -        deltas_limit = revinfo.textlen * LIMIT_DELTA2TEXT
> -        for candidaterevs in self._getcandidaterevs(p1, p2, cachedelta):
> -            # filter out delta base that will never produce good delta
> -            candidaterevs = [r for r in candidaterevs
> -                             if self.revlog.length(r) <= deltas_limit]
> -            nominateddeltas = []
> -            for candidaterev in candidaterevs:
> -                # skip over empty delta (no need to include them in a
> chain)
> -                while candidaterev != nullrev and not
> deltalength(candidaterev):
> -                    candidaterev = deltaparent(candidaterev)
> -                # no need to try a delta against nullid, this will be
> handled
> -                # by fulltext later.
> -                if candidaterev == nullrev:
> -                    continue
> -                # no delta for rawtext-changing revs (see "candelta" for
> why)
> -                if revlog.flags(candidaterev) &
> REVIDX_RAWTEXT_CHANGING_FLAGS:
> -                    continue
> -                candidatedelta = self._builddeltainfo(revinfo,
> candidaterev, fh)
> -                if revlog._isgooddeltainfo(candidatedelta, revinfo):
> -                    nominateddeltas.append(candidatedelta)
> -            if nominateddeltas:
> -                deltainfo = min(nominateddeltas, key=lambda x: x.deltalen)
> -                break
> -
> -        return deltainfo
> -
> - at attr.s(slots=True, frozen=True)
>  class _revisioninfo(object):
>      """Information about a revision that allows building its fulltext
>      node:       expected hash of the revision
> @@ -2375,7 +2174,7 @@ class revlog(object):
>              computed by default as hash(text, p1, p2), however subclasses
> might
>              use different hashing method (and override checkhash() in
> such case)
>          flags - the known flags to set on the revision
> -        deltacomputer - an optional _deltacomputer instance shared between
> +        deltacomputer - an optional deltacomputer instance shared between
>              multiple calls
>          """
>          if link == nullrev:
> @@ -2634,7 +2433,7 @@ class revlog(object):
>              textlen = len(rawtext)
>
>          if deltacomputer is None:
> -            deltacomputer = _deltacomputer(self)
> +            deltacomputer = deltautil.deltacomputer(self)
>
>          revinfo = _revisioninfo(node, p1, p2, btext, textlen, cachedelta,
> flags)
>
> @@ -2736,7 +2535,7 @@ class revlog(object):
>                  dfh.flush()
>              ifh.flush()
>          try:
> -            deltacomputer = _deltacomputer(self)
> +            deltacomputer = deltautil.deltacomputer(self)
>              # loop through our set of deltas
>              for data in deltas:
>                  node, p1, p2, linknode, deltabase, delta, flags = data
> @@ -3028,7 +2827,7 @@ class revlog(object):
>              populatecachedelta = deltareuse in (self.DELTAREUSEALWAYS,
>                                                  self.DELTAREUSESAMEREVS)
>
> -            deltacomputer = _deltacomputer(destrevlog)
> +            deltacomputer = deltautil.deltacomputer(destrevlog)
>              index = self.index
>              for rev in self:
>                  entry = index[rev]
> diff --git a/mercurial/revlogutils/deltas.py
> b/mercurial/revlogutils/deltas.py
> new file mode 100644
> --- /dev/null
> +++ b/mercurial/revlogutils/deltas.py
> @@ -0,0 +1,240 @@
> +# revlogdeltas.py - Logic around delta computation for revlog
> +#
> +# Copyright 2005-2007 Matt Mackall <mpm at selenic.com>
> +# Copyright 2018 Octobus <contact at octobus.net>
> +#
> +# This software may be used and distributed according to the terms of the
> +# GNU General Public License version 2 or any later version.
> +"""Helper class to compute deltas stored inside revlogs"""
> +
> +from __future__ import absolute_import
> +
> +import struct
> +
> +# import stuff from node for others to import from revlog
> +from ..node import (
> +    nullrev,
> +)
> +from ..i18n import _
> +
> +from .constants import (
> +    LIMIT_DELTA2TEXT,
> +    REVIDX_ISCENSORED,
> +    REVIDX_RAWTEXT_CHANGING_FLAGS,
> +)
> +
> +from ..thirdparty import (
> +    attr,
> +)
> +
> +from .. import (
> +    error,
> +    mdiff,
> +)
> +
> +RevlogError = error.RevlogError
> +CensoredNodeError = error.CensoredNodeError
> +
> + at attr.s(slots=True, frozen=True)
> +class _deltainfo(object):
> +    distance = attr.ib()
> +    deltalen = attr.ib()
> +    data = attr.ib()
> +    base = attr.ib()
> +    chainbase = attr.ib()
> +    chainlen = attr.ib()
> +    compresseddeltalen = attr.ib()
> +    snapshotdepth = attr.ib()
> +
> +class deltacomputer(object):
> +    def __init__(self, revlog):
> +        self.revlog = revlog
> +
> +    def _getcandidaterevs(self, p1, p2, cachedelta):
> +        """
> +        Provides revisions that present an interest to be diffed against,
> +        grouped by level of easiness.
> +        """
> +        revlog = self.revlog
> +        gdelta = revlog._generaldelta
> +        curr = len(revlog)
> +        prev = curr - 1
> +        p1r, p2r = revlog.rev(p1), revlog.rev(p2)
> +
> +        # should we try to build a delta?
> +        if prev != nullrev and revlog.storedeltachains:
> +            tested = set()
> +            # This condition is true most of the time when processing
> +            # changegroup data into a generaldelta repo. The only time it
> +            # isn't true is if this is the first revision in a delta chain
> +            # or if ``format.generaldelta=true`` disabled
> ``lazydeltabase``.
> +            if cachedelta and gdelta and revlog._lazydeltabase:
> +                # Assume what we received from the server is a good choice
> +                # build delta will reuse the cache
> +                yield (cachedelta[0],)
> +                tested.add(cachedelta[0])
> +
> +            if gdelta:
> +                # exclude already lazy tested base if any
> +                parents = [p for p in (p1r, p2r)
> +                           if p != nullrev and p not in tested]
> +
> +                if not revlog._deltabothparents and len(parents) == 2:
> +                    parents.sort()
> +                    # To minimize the chance of having to build a
> fulltext,
> +                    # pick first whichever parent is closest to us (max
> rev)
> +                    yield (parents[1],)
> +                    # then the other one (min rev) if the first did not
> fit
> +                    yield (parents[0],)
> +                    tested.update(parents)
> +                elif len(parents) > 0:
> +                    # Test all parents (1 or 2), and keep the best
> candidate
> +                    yield parents
> +                    tested.update(parents)
> +
> +            if prev not in tested:
> +                # other approach failed try against prev to hopefully
> save us a
> +                # fulltext.
> +                yield (prev,)
> +                tested.add(prev)
> +
> +    def buildtext(self, revinfo, fh):
> +        """Builds a fulltext version of a revision
> +
> +        revinfo: _revisioninfo instance that contains all needed info
> +        fh:      file handle to either the .i or the .d revlog file,
> +                 depending on whether it is inlined or not
> +        """
> +        btext = revinfo.btext
> +        if btext[0] is not None:
> +            return btext[0]
> +
> +        revlog = self.revlog
> +        cachedelta = revinfo.cachedelta
> +        flags = revinfo.flags
> +        node = revinfo.node
> +
> +        baserev = cachedelta[0]
> +        delta = cachedelta[1]
> +        # special case deltas which replace entire base; no need to decode
> +        # base revision. this neatly avoids censored bases, which throw
> when
> +        # they're decoded.
> +        hlen = struct.calcsize(">lll")
> +        if delta[:hlen] ==
> mdiff.replacediffheader(revlog.rawsize(baserev),
> +                                                   len(delta) - hlen):
> +            btext[0] = delta[hlen:]
> +        else:
> +            # deltabase is rawtext before changed by flag processors,
> which is
> +            # equivalent to non-raw text
> +            basetext = revlog.revision(baserev, _df=fh, raw=False)
> +            btext[0] = mdiff.patch(basetext, delta)
> +
> +        try:
> +            res = revlog._processflags(btext[0], flags, 'read', raw=True)
> +            btext[0], validatehash = res
> +            if validatehash:
> +                revlog.checkhash(btext[0], node, p1=revinfo.p1,
> p2=revinfo.p2)
> +            if flags & REVIDX_ISCENSORED:
> +                raise RevlogError(_('node %s is not censored') % node)
> +        except CensoredNodeError:
> +            # must pass the censored index flag to add censored revisions
> +            if not flags & REVIDX_ISCENSORED:
> +                raise
> +        return btext[0]
> +
> +    def _builddeltadiff(self, base, revinfo, fh):
> +        revlog = self.revlog
> +        t = self.buildtext(revinfo, fh)
> +        if revlog.iscensored(base):
> +            # deltas based on a censored revision must replace the
> +            # full content in one patch, so delta works everywhere
> +            header = mdiff.replacediffheader(revlog.rawsize(base), len(t))
> +            delta = header + t
> +        else:
> +            ptext = revlog.revision(base, _df=fh, raw=True)
> +            delta = mdiff.textdiff(ptext, t)
> +
> +        return delta
> +
> +    def _builddeltainfo(self, revinfo, base, fh):
> +        # can we use the cached delta?
> +        if revinfo.cachedelta and revinfo.cachedelta[0] == base:
> +            delta = revinfo.cachedelta[1]
> +        else:
> +            delta = self._builddeltadiff(base, revinfo, fh)
> +        revlog = self.revlog
> +        header, data = revlog.compress(delta)
> +        deltalen = len(header) + len(data)
> +        chainbase = revlog.chainbase(base)
> +        offset = revlog.end(len(revlog) - 1)
> +        dist = deltalen + offset - revlog.start(chainbase)
> +        if revlog._generaldelta:
> +            deltabase = base
> +        else:
> +            deltabase = chainbase
> +        chainlen, compresseddeltalen = revlog._chaininfo(base)
> +        chainlen += 1
> +        compresseddeltalen += deltalen
> +
> +        revlog = self.revlog
> +        snapshotdepth = None
> +        if deltabase == nullrev:
> +            snapshotdepth = 0
> +        elif revlog._sparserevlog and revlog.issnapshot(deltabase):
> +            # A delta chain should always be one full snapshot,
> +            # zero or more semi-snapshots, and zero or more deltas
> +            p1, p2 = revlog.rev(revinfo.p1), revlog.rev(revinfo.p2)
> +            if deltabase not in (p1, p2) and revlog.issnapshot(deltabase):
> +                snapshotdepth = len(revlog._deltachain(deltabase)[0])
> +
> +        return _deltainfo(dist, deltalen, (header, data), deltabase,
> +                          chainbase, chainlen, compresseddeltalen,
> +                          snapshotdepth)
> +
> +    def finddeltainfo(self, revinfo, fh):
> +        """Find an acceptable delta against a candidate revision
> +
> +        revinfo: information about the revision (instance of
> _revisioninfo)
> +        fh:      file handle to either the .i or the .d revlog file,
> +                 depending on whether it is inlined or not
> +
> +        Returns the first acceptable candidate revision, as ordered by
> +        _getcandidaterevs
> +        """
> +        if not revinfo.textlen:
> +            return None # empty file do not need delta
> +
> +        cachedelta = revinfo.cachedelta
> +        p1 = revinfo.p1
> +        p2 = revinfo.p2
> +        revlog = self.revlog
> +
> +        deltalength = self.revlog.length
> +        deltaparent = self.revlog.deltaparent
> +
> +        deltainfo = None
> +        deltas_limit = revinfo.textlen * LIMIT_DELTA2TEXT
> +        for candidaterevs in self._getcandidaterevs(p1, p2, cachedelta):
> +            # filter out delta base that will never produce good delta
> +            candidaterevs = [r for r in candidaterevs
> +                             if self.revlog.length(r) <= deltas_limit]
> +            nominateddeltas = []
> +            for candidaterev in candidaterevs:
> +                # skip over empty delta (no need to include them in a
> chain)
> +                while candidaterev != nullrev and not
> deltalength(candidaterev):
> +                    candidaterev = deltaparent(candidaterev)
> +                # no need to try a delta against nullid, this will be
> handled
> +                # by fulltext later.
> +                if candidaterev == nullrev:
> +                    continue
> +                # no delta for rawtext-changing revs (see "candelta" for
> why)
> +                if revlog.flags(candidaterev) &
> REVIDX_RAWTEXT_CHANGING_FLAGS:
> +                    continue
> +                candidatedelta = self._builddeltainfo(revinfo,
> candidaterev, fh)
> +                if revlog._isgooddeltainfo(candidatedelta, revinfo):
> +                    nominateddeltas.append(candidatedelta)
> +            if nominateddeltas:
> +                deltainfo = min(nominateddeltas, key=lambda x: x.deltalen)
> +                break
> +
> +        return deltainfo
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://www.mercurial-scm.org/pipermail/mercurial-devel/attachments/20180824/435a52f7/attachment.html>


More information about the Mercurial-devel mailing list