[PATCH 2 of 2 V2] revlog: group delta computation methods under _deltacomputer object

Augie Fackler raf at durin42.com
Wed Jan 17 17:26:03 EST 2018



> On Jan 17, 2018, at 06:26, Paul Morelle <paul.morelle at octobus.net> wrote:
> 
> # HG changeset patch
> # User Paul Morelle <paul.morelle at octobus.net>
> # Date 1515961692 -3600
> #      Sun Jan 14 21:28:12 2018 +0100
> # Node ID 2eca3d9c1e76197e946adc4f15c2ede5a4c251dc
> # Parent  800aa38d15a621c7beecea286a1f0ab9750783ee
> # EXP-Topic refactor-revlog
> # Available At https://bitbucket.org/octobus/mercurial-devel/
> #              hg pull https://bitbucket.org/octobus/mercurial-devel/ -r 2eca3d9c1e76
> revlog: group delta computation methods under _deltacomputer object

These patches look superficially fine, but only describe the "what" of the change. What's the goal? Can we get some "why" in the commit messages?

> 
> diff -r 800aa38d15a6 -r 2eca3d9c1e76 mercurial/revlog.py
> --- a/mercurial/revlog.py	Sun Jan 14 14:36:22 2018 +0100
> +++ b/mercurial/revlog.py	Sun Jan 14 21:28:12 2018 +0100
> @@ -264,6 +264,155 @@
>     chainlen = attr.ib()
>     compresseddeltalen = attr.ib()
> 
> +class _deltacomputer(object):
> +    def __init__(self, revlog):
> +        self.revlog = revlog
> +
> +    def _getcandidaterevs(self, p1, p2, cachedelta):
> +        """
> +        Provides revisions that present an interest to be diffed against,
> +        grouped by level of easiness.
> +        """
> +        revlog = self.revlog
> +        curr = len(revlog)
> +        prev = curr - 1
> +        p1r, p2r = revlog.rev(p1), revlog.rev(p2)
> +
> +        # should we try to build a delta?
> +        if prev != nullrev and revlog.storedeltachains:
> +            tested = set()
> +            # This condition is true most of the time when processing
> +            # changegroup data into a generaldelta repo. The only time it
> +            # isn't true is if this is the first revision in a delta chain
> +            # or if ``format.generaldelta=true`` disabled ``lazydeltabase``.
> +            if cachedelta and revlog._generaldelta and revlog._lazydeltabase:
> +                # Assume what we received from the server is a good choice
> +                # build delta will reuse the cache
> +                yield (cachedelta[0],)
> +                tested.add(cachedelta[0])
> +
> +            if revlog._generaldelta:
> +                # exclude already lazy tested base if any
> +                parents = [p for p in (p1r, p2r)
> +                           if p != nullrev and p not in tested]
> +                if parents and not revlog._aggressivemergedeltas:
> +                    # Pick whichever parent is closer to us (to minimize the
> +                    # chance of having to build a fulltext).
> +                    parents = [max(parents)]
> +                tested.update(parents)
> +                yield parents
> +
> +            if prev not in tested:
> +                # other approach failed try against prev to hopefully save us a
> +                # fulltext.
> +                yield (prev,)
> +
> +    def buildtext(self, revinfo, fh):
> +        """Builds a fulltext version of a revision
> +
> +        revinfo: _revisioninfo instance that contains all needed info
> +        fh:      file handle to either the .i or the .d revlog file,
> +                 depending on whether it is inlined or not
> +        """
> +        btext = revinfo.btext
> +        if btext[0] is not None:
> +            return btext[0]
> +
> +        revlog = self.revlog
> +        cachedelta = revinfo.cachedelta
> +        flags = revinfo.flags
> +        node = revinfo.node
> +
> +        baserev = cachedelta[0]
> +        delta = cachedelta[1]
> +        # special case deltas which replace entire base; no need to decode
> +        # base revision. this neatly avoids censored bases, which throw when
> +        # they're decoded.
> +        hlen = struct.calcsize(">lll")
> +        if delta[:hlen] == mdiff.replacediffheader(revlog.rawsize(baserev),
> +                                                   len(delta) - hlen):
> +            btext[0] = delta[hlen:]
> +        else:
> +            basetext = revlog.revision(baserev, _df=fh, raw=True)
> +            btext[0] = mdiff.patch(basetext, delta)
> +
> +        try:
> +            res = revlog._processflags(btext[0], flags, 'read', raw=True)
> +            btext[0], validatehash = res
> +            if validatehash:
> +                revlog.checkhash(btext[0], node, p1=revinfo.p1, p2=revinfo.p2)
> +            if flags & REVIDX_ISCENSORED:
> +                raise RevlogError(_('node %s is not censored') % node)
> +        except CensoredNodeError:
> +            # must pass the censored index flag to add censored revisions
> +            if not flags & REVIDX_ISCENSORED:
> +                raise
> +        return btext[0]
> +
> +    def _builddeltadiff(self, base, revinfo, fh):
> +        revlog = self.revlog
> +        t = self.buildtext(revinfo, fh)
> +        if revlog.iscensored(base):
> +            # deltas based on a censored revision must replace the
> +            # full content in one patch, so delta works everywhere
> +            header = mdiff.replacediffheader(revlog.rawsize(base), len(t))
> +            delta = header + t
> +        else:
> +            ptext = revlog.revision(base, _df=fh, raw=True)
> +            delta = mdiff.textdiff(ptext, t)
> +
> +        return delta
> +
> +    def _builddeltainfo(self, revinfo, base, fh):
> +        # can we use the cached delta?
> +        if revinfo.cachedelta and revinfo.cachedelta[0] == base:
> +            delta = revinfo.cachedelta[1]
> +        else:
> +            delta = self._builddeltadiff(base, revinfo, fh)
> +        revlog = self.revlog
> +        header, data = revlog.compress(delta)
> +        deltalen = len(header) + len(data)
> +        chainbase = revlog.chainbase(base)
> +        offset = revlog.end(len(revlog) - 1)
> +        dist = deltalen + offset - revlog.start(chainbase)
> +        if revlog._generaldelta:
> +            deltabase = base
> +        else:
> +            deltabase = chainbase
> +        chainlen, compresseddeltalen = revlog._chaininfo(base)
> +        chainlen += 1
> +        compresseddeltalen += deltalen
> +        return _deltainfo(dist, deltalen, (header, data), deltabase,
> +                         chainbase, chainlen, compresseddeltalen)
> +
> +    def finddeltainfo(self, revinfo, fh):
> +        """Find an acceptable delta against a candidate revision
> +
> +        revinfo: information about the revision (instance of _revisioninfo)
> +        fh:      file handle to either the .i or the .d revlog file,
> +                 depending on whether it is inlined or not
> +
> +        Returns the first acceptable candidate revision, as ordered by
> +        _getcandidaterevs
> +        """
> +        cachedelta = revinfo.cachedelta
> +        p1 = revinfo.p1
> +        p2 = revinfo.p2
> +        revlog = self.revlog
> +
> +        deltainfo = None
> +        for candidaterevs in self._getcandidaterevs(p1, p2, cachedelta):
> +            nominateddeltas = []
> +            for candidaterev in candidaterevs:
> +                candidatedelta = self._builddeltainfo(revinfo, candidaterev, fh)
> +                if revlog._isgooddeltainfo(candidatedelta, revinfo.textlen):
> +                    nominateddeltas.append(candidatedelta)
> +            if nominateddeltas:
> +                deltainfo = min(nominateddeltas, key=lambda x: x.deltalen)
> +                break
> +
> +        return deltainfo
> +
> @attr.s(slots=True, frozen=True)
> class _revisioninfo(object):
>     """Information about a revision that allows building its fulltext
> @@ -1721,7 +1870,7 @@
>         self._chunkclear()
> 
>     def addrevision(self, text, transaction, link, p1, p2, cachedelta=None,
> -                    node=None, flags=REVIDX_DEFAULT_FLAGS):
> +                    node=None, flags=REVIDX_DEFAULT_FLAGS, deltacomputer=None):
>         """add a revision to the log
> 
>         text - the revision data to add
> @@ -1733,6 +1882,8 @@
>             computed by default as hash(text, p1, p2), however subclasses might
>             use different hashing method (and override checkhash() in such case)
>         flags - the known flags to set on the revision
> +        deltacomputer - an optional _deltacomputer instance shared between
> +            multiple calls
>         """
>         if link == nullrev:
>             raise RevlogError(_("attempted to add linkrev -1 to %s")
> @@ -1761,10 +1912,11 @@
>             self.checkhash(rawtext, node, p1=p1, p2=p2)
> 
>         return self.addrawrevision(rawtext, transaction, link, p1, p2, node,
> -                                   flags, cachedelta=cachedelta)
> +                                   flags, cachedelta=cachedelta,
> +                                   deltacomputer=deltacomputer)
> 
>     def addrawrevision(self, rawtext, transaction, link, p1, p2, node, flags,
> -                       cachedelta=None):
> +                       cachedelta=None, deltacomputer=None):
>         """add a raw revision with known flags, node and parents
>         useful when reusing a revision not stored in this revlog (ex: received
>         over wire, or read from an external bundle).
> @@ -1775,7 +1927,8 @@
>         ifh = self.opener(self.indexfile, "a+", checkambig=self._checkambig)
>         try:
>             return self._addrevision(node, rawtext, transaction, link, p1, p2,
> -                                     flags, cachedelta, ifh, dfh)
> +                                     flags, cachedelta, ifh, dfh,
> +                                     deltacomputer=deltacomputer)
>         finally:
>             if dfh:
>                 dfh.close()
> @@ -1875,154 +2028,18 @@
> 
>         return True
> 
> -    def _getcandidaterevs(self, p1, p2, cachedelta):
> -        """
> -        Provides revisions that present an interest to be diffed against,
> -        grouped by level of easiness.
> -        """
> -        curr = len(self)
> -        prev = curr - 1
> -        p1r, p2r = self.rev(p1), self.rev(p2)
> -
> -        # should we try to build a delta?
> -        if prev != nullrev and self.storedeltachains:
> -            tested = set()
> -            # This condition is true most of the time when processing
> -            # changegroup data into a generaldelta repo. The only time it
> -            # isn't true is if this is the first revision in a delta chain
> -            # or if ``format.generaldelta=true`` disabled ``lazydeltabase``.
> -            if cachedelta and self._generaldelta and self._lazydeltabase:
> -                # Assume what we received from the server is a good choice
> -                # build delta will reuse the cache
> -                yield (cachedelta[0],)
> -                tested.add(cachedelta[0])
> -
> -            if self._generaldelta:
> -                # exclude already lazy tested base if any
> -                parents = [p for p in (p1r, p2r)
> -                           if p != nullrev and p not in tested]
> -                if parents and not self._aggressivemergedeltas:
> -                    # Pick whichever parent is closer to us (to minimize the
> -                    # chance of having to build a fulltext).
> -                    parents = [max(parents)]
> -                tested.update(parents)
> -                yield parents
> -
> -            if prev not in tested:
> -                # other approach failed try against prev to hopefully save us a
> -                # fulltext.
> -                yield (prev,)
> -
> -    def _buildtext(self, revinfo, fh):
> -        """Builds a fulltext version of a revision
> -
> -        revinfo: _revisioninfo instance that contains all needed info
> -        fh:      file handle to either the .i or the .d revlog file,
> -                 depending on whether it is inlined or not
> -        """
> -        btext = revinfo.btext
> -        if btext[0] is not None:
> -            return btext[0]
> -
> -        cachedelta = revinfo.cachedelta
> -        flags = revinfo.flags
> -        node = revinfo.node
> -
> -        baserev = cachedelta[0]
> -        delta = cachedelta[1]
> -        # special case deltas which replace entire base; no need to decode
> -        # base revision. this neatly avoids censored bases, which throw when
> -        # they're decoded.
> -        hlen = struct.calcsize(">lll")
> -        if delta[:hlen] == mdiff.replacediffheader(self.rawsize(baserev),
> -                                                   len(delta) - hlen):
> -            btext[0] = delta[hlen:]
> -        else:
> -            basetext = self.revision(baserev, _df=fh, raw=True)
> -            btext[0] = mdiff.patch(basetext, delta)
> -
> -        try:
> -            res = self._processflags(btext[0], flags, 'read', raw=True)
> -            btext[0], validatehash = res
> -            if validatehash:
> -                self.checkhash(btext[0], node, p1=revinfo.p1, p2=revinfo.p2)
> -            if flags & REVIDX_ISCENSORED:
> -                raise RevlogError(_('node %s is not censored') % node)
> -        except CensoredNodeError:
> -            # must pass the censored index flag to add censored revisions
> -            if not flags & REVIDX_ISCENSORED:
> -                raise
> -        return btext[0]
> -
> -    def _builddeltadiff(self, base, revinfo, fh):
> -        t = self._buildtext(revinfo, fh)
> -        if self.iscensored(base):
> -            # deltas based on a censored revision must replace the
> -            # full content in one patch, so delta works everywhere
> -            header = mdiff.replacediffheader(self.rawsize(base), len(t))
> -            delta = header + t
> -        else:
> -            ptext = self.revision(base, _df=fh, raw=True)
> -            delta = mdiff.textdiff(ptext, t)
> -
> -        return delta
> -
> -    def _builddeltainfo(self, revinfo, base, fh):
> -        # can we use the cached delta?
> -        if revinfo.cachedelta and revinfo.cachedelta[0] == base:
> -            delta = revinfo.cachedelta[1]
> -        else:
> -            delta = self._builddeltadiff(base, revinfo, fh)
> -        header, data = self.compress(delta)
> -        deltalen = len(header) + len(data)
> -        chainbase = self.chainbase(base)
> -        offset = self.end(len(self) - 1)
> -        dist = deltalen + offset - self.start(chainbase)
> -        if self._generaldelta:
> -            deltabase = base
> -        else:
> -            deltabase = chainbase
> -        chainlen, compresseddeltalen = self._chaininfo(base)
> -        chainlen += 1
> -        compresseddeltalen += deltalen
> -        return _deltainfo(dist, deltalen, (header, data), deltabase,
> -                         chainbase, chainlen, compresseddeltalen)
> -
> -    def _finddeltainfo(self, revinfo, fh):
> -        """Find an acceptable delta against a candidate revision
> -
> -        revinfo: information about the revision (instance of _revisioninfo)
> -        fh:      file handle to either the .i or the .d revlog file,
> -                 depending on whether it is inlined or not
> -
> -        Returns the first acceptable candidate revision, as ordered by
> -        _getcandidaterevs
> -        """
> -        cachedelta = revinfo.cachedelta
> -        p1 = revinfo.p1
> -        p2 = revinfo.p2
> -
> -        deltainfo = None
> -        for candidaterevs in self._getcandidaterevs(p1, p2, cachedelta):
> -            nominateddeltas = []
> -            for candidaterev in candidaterevs:
> -                candidatedelta = self._builddeltainfo(revinfo, candidaterev, fh)
> -                if self._isgooddeltainfo(candidatedelta, revinfo.textlen):
> -                    nominateddeltas.append(candidatedelta)
> -            if nominateddeltas:
> -                deltainfo = min(nominateddeltas, key=lambda x: x.deltalen)
> -                break
> -
> -        return deltainfo
> -
>     def _addrevision(self, node, rawtext, transaction, link, p1, p2, flags,
> -                     cachedelta, ifh, dfh, alwayscache=False):
> +                     cachedelta, ifh, dfh, alwayscache=False,
> +                     deltacomputer=None):
>         """internal function to add revisions to the log
> 
>         see addrevision for argument descriptions.
> 
>         note: "addrevision" takes non-raw text, "_addrevision" takes raw text.
> 
> +        if "deltacomputer" is not provided or None, a defaultdeltacomputer will
> +        be used.
> +
>         invariants:
>         - rawtext is optional (can be None); if not set, cachedelta must be set.
>           if both are set, they must correspond to each other.
> @@ -2054,8 +2071,11 @@
>         else:
>             textlen = len(rawtext)
> 
> +        if deltacomputer is None:
> +            deltacomputer = _deltacomputer(self)
> +
>         revinfo = _revisioninfo(node, p1, p2, btext, textlen, cachedelta, flags)
> -        deltainfo = self._finddeltainfo(revinfo, fh)
> +        deltainfo = deltacomputer.finddeltainfo(revinfo, fh)
> 
>         if deltainfo is not None:
>             base = deltainfo.base
> @@ -2063,7 +2083,7 @@
>             data = deltainfo.data
>             l = deltainfo.deltalen
>         else:
> -            rawtext = self._buildtext(revinfo, fh)
> +            rawtext = deltacomputer.buildtext(revinfo, fh)
>             data = self.compress(rawtext)
>             l = len(data[1]) + len(data[0])
>             base = chainbase = curr
> @@ -2077,7 +2097,7 @@
>         self._writeentry(transaction, ifh, dfh, entry, data, link, offset)
> 
>         if alwayscache and rawtext is None:
> -            rawtext = self._buildtext(revinfo, fh)
> +            rawtext = deltacomputer._buildtext(revinfo, fh)
> 
>         if type(rawtext) == str: # only accept immutable objects
>             self._cache = (node, curr, rawtext)
> @@ -2147,6 +2167,7 @@
>                 dfh.flush()
>             ifh.flush()
>         try:
> +            deltacomputer = _deltacomputer(self)
>             # loop through our set of deltas
>             for data in deltas:
>                 node, p1, p2, linknode, deltabase, delta, flags = data
> @@ -2193,7 +2214,8 @@
>                 self._addrevision(node, None, transaction, link,
>                                   p1, p2, flags, (baserev, delta),
>                                   ifh, dfh,
> -                                  alwayscache=bool(addrevisioncb))
> +                                  alwayscache=bool(addrevisioncb),
> +                                  deltacomputer=deltacomputer)
> 
>                 if addrevisioncb:
>                     addrevisioncb(self, node)
> @@ -2416,6 +2438,7 @@
>             populatecachedelta = deltareuse in (self.DELTAREUSEALWAYS,
>                                                 self.DELTAREUSESAMEREVS)
> 
> +            deltacomputer = _deltacomputer(destrevlog)
>             index = self.index
>             for rev in self:
>                 entry = index[rev]
> @@ -2444,7 +2467,8 @@
>                 if deltareuse == self.DELTAREUSEFULLADD:
>                     destrevlog.addrevision(rawtext, tr, linkrev, p1, p2,
>                                            cachedelta=cachedelta,
> -                                           node=node, flags=flags)
> +                                           node=node, flags=flags,
> +                                           deltacomputer=deltacomputer)
>                 else:
>                     ifh = destrevlog.opener(destrevlog.indexfile, 'a+',
>                                             checkambig=False)
> @@ -2453,7 +2477,8 @@
>                         dfh = destrevlog.opener(destrevlog.datafile, 'a+')
>                     try:
>                         destrevlog._addrevision(node, rawtext, tr, linkrev, p1,
> -                                                p2, flags, cachedelta, ifh, dfh)
> +                                                p2, flags, cachedelta, ifh, dfh,
> +                                                deltacomputer=deltacomputer)
>                     finally:
>                         if dfh:
>                             dfh.close()
> _______________________________________________
> Mercurial-devel mailing list
> Mercurial-devel at mercurial-scm.org
> https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel



More information about the Mercurial-devel mailing list