[PATCH 1 of 2 V2] perf: introduce a perfrevlogwrite command

Wed Jan 16 09:33:03 UTC 2019

On Fri, Oct 19, 2018 at 02:25:13PM +0200, Boris Feld wrote:
> # HG changeset patch
> # User Boris Feld <boris.feld at octobus.net>
> # Date 1538556809 -7200
> #      Wed Oct 03 10:53:29 2018 +0200
> # Node ID 78ffc55c0d6034972ea3d49b8c565c563ef27121
> # Parent  824b687ff6af49622a18dfea760cf41a7abe5aa7
> # EXP-Topic revlog-perf
> # Available At https://bitbucket.org/octobus/mercurial-devel/
> #              hg pull https://bitbucket.org/octobus/mercurial-devel/ -r 78ffc55c0d60
> perf: introduce a perfrevlogwrite command

Unless you can give an /extremely/ compelling reason, I'm going to
punt this to 4.9.

>
> The command record times taken by adding many revisions to a revlog. Timing
> each addition, individually. The "added revision" are recreations of the
> original ones.
>
> To time each addition individually, we have to handle the timing and the
> reporting ourselves.
>
> This command is introduced to track the impact of sparse-revlog format on
> delta computations at initial storage time. It starts with the full text, a
> situation similar to the "commit". Additions from an existing delta are better
> timed with bundles.
>
> The complaints from `check-perf-code.py` are not relevant. We are accessing
> and "revlog" opener, not a repository opener.
>
> diff --git a/contrib/perf.py b/contrib/perf.py
> --- a/contrib/perf.py
> +++ b/contrib/perf.py
> @@ -24,8 +24,10 @@ import functools
>  import gc
>  import os
>  import random
> +import shutil
>  import struct
>  import sys
> +import tempfile
>  import threading
>  import time
>  from mercurial import (
> @@ -1565,6 +1567,149 @@ def perfrevlogrevisions(ui, repo, file_=
>      timer(d)
>      fm.end()
>
> + at command(b'perfrevlogwrite', revlogopts + formatteropts +
> +         [(b's', b'startrev', 1000, b'revision to start writing at'),
> +          (b'', b'stoprev', -1, b'last revision to write'),
> +          (b'', b'count', 3, b'last revision to write'),
> +         ],
> +         b'-c|-m|FILE')
> +def perfrevlogwrite(ui, repo, file_=None, startrev=0, stoprev=-1, **opts):
> +    """Benchmark writing a series of revisions to a revlog.
> +    """
> +    opts = _byteskwargs(opts)
> +
> +    rl = cmdutil.openrevlog(repo, b'perfrevlogwrite', file_, opts)
> +    rllen = getlen(ui)(rl)
> +    if startrev < 0:
> +        startrev = rllen + startrev
> +    if stoprev < 0:
> +        stoprev = rllen + stoprev
> +
> +    ### actually gather results
> +    count = opts['count']
> +    if count <= 0:
> +        raise error.Abort('invalide run count: %d' % count)

typo: invalid (spurious e on the end in yours)

> +    allresults = []
> +    for c in range(count):
> +        allresults.append(_timeonewrite(ui, rl, startrev, stoprev, c + 1))
> +
> +    ### consolidate the results in a single list
> +    results = []
> +    for idx, (rev, t) in enumerate(allresults[0]):
> +        ts = [t]
> +        for other in allresults[1:]:
> +            orev, ot = other[idx]
> +            assert orev == rev
> +            ts.append(ot)
> +        results.append((rev, ts))
> +    resultcount = len(results)
> +
> +    ### Compute and display relevant statistics
> +
> +    # get a formatter
> +    fm = ui.formatter(b'perf', opts)
> +    displayall = ui.configbool(b"perf", b"all-timing", False)
> +
> +    # sorts results by median time
> +    results.sort(key=lambda x: sorted(x[1])[len(x[1]) // 2])
> +    # list of (name, index) to display)
> +    relevants = [
> +        ("min", 0),
> +        ("10%", resultcount * 10 // 100),
> +        ("25%", resultcount * 25 // 100),
> +        ("50%", resultcount * 70 // 100),
> +        ("75%", resultcount * 75 // 100),
> +        ("90%", resultcount * 90 // 100),
> +        ("95%", resultcount * 95 // 100),
> +        ("99%", resultcount * 99 // 100),
> +        ("max", -1),
> +    ]

Clarifying what this block does: you sort results by median time
(median of what? I'm a little lost) and then use integer division to
jump into the result set at the right position for emulating the
various %ile latencies, right?

Some sample output in the commit message would be nice.

> +    for name, idx in relevants:
> +        data = results[idx]
> +        title = '%s of %d, rev %d' % (name, resultcount, data[0])
> +        formatone(fm, data[1], title=title, displayall=displayall)
> +
> +    # XXX summing that many float will not be very precise, we ignore this fact
> +    # for now

Make this a TODO, or just a NOTE? it seems extremely unlikely we'll
ever do it, so maybe just make it "note that summing this many flaots
will ..." and make peace with that?

> +    totaltime = []
> +    for item in allresults:
> +        totaltime.append((sum(x[1][0] for x in item),
> +                          sum(x[1][1] for x in item),
> +                          sum(x[1][2] for x in item),)
> +        )
> +    formatone(fm, totaltime, title="total time (%d revs)" % resultcount,
> +              displayall=displayall)
> +    fm.end()
> +
> +def _timeonewrite(ui, orig, startrev, stoprev, runidx=None):
> +    timings = []
> +    tr = type('faketr', (object,) , {'add': lambda s, x, y, z=None:None})()
> +    with _temprevlog(ui, orig, startrev) as dest:
> +        revs = list(orig.revs(startrev, stoprev))
> +        total = len(revs)
> +        topic = 'adding'
> +        if runidx is not None:
> +            topic += ' (run #%d)' % runidx
> +        for idx, rev in enumerate(revs):
> +            ui.progress(topic, idx, unit='revs', total=total)
> +            text = orig.revision(rev)
> +            linkrev = orig.linkrev(rev)
> +            node = orig.node(rev)
> +            p1, p2 = orig.parents(node)
> +            with timeone() as r:
> +                dest.addrevision(text, tr, linkrev, p1, p2, node=node)
> +            timings.append((rev, r[0]))
> +        ui.progress(topic, total, unit='revs', total=total)
> +        ui.progress(topic, None, unit='revs', total=total)
> +    return timings
> +
> + at contextlib.contextmanager
> +def _temprevlog(ui, orig, truncaterev):
> +    from mercurial import vfs as vfsmod
> +
> +    if orig._inline:
> +        raise error.Abort('not supporting inline revlog (yet)')
> +
> +    origindexpath = orig.opener.join(orig.indexfile)
> +    origdatapath = orig.opener.join(orig.datafile)
> +    indexname = 'revlog.i'
> +    dataname = 'revlog.d'
> +
> +    tmpdir = tempfile.mkdtemp(prefix='tmp-hgperf-')
> +    try:
> +        # copy the data file in a temporary directory
> +        ui.debug('copying data in %s\n' % tmpdir)
> +        destindexpath = os.path.join(tmpdir, 'revlog.i')
> +        destdatapath = os.path.join(tmpdir, 'revlog.d')
> +        shutil.copyfile(origindexpath, destindexpath)
> +        shutil.copyfile(origdatapath, destdatapath)
> +
> +        # remove the data we want to add again
> +        ui.debug('truncating data to be rewritten\n')
> +        with open(destindexpath, 'ab') as index:
> +            index.seek(0)
> +            index.truncate(truncaterev * orig._io.size)
> +        with open(destdatapath, 'ab') as data:
> +            data.seek(0)
> +            data.truncate(orig.start(truncaterev))
> +
> +        # instantiate a new revlog from the temporary copy
> +        ui.debug('truncating adding to be rewritten\n')
> +        vfs = vfsmod.vfs(tmpdir)
> +        vfs.options = getattr(orig.opener, 'options', None)
> +
> +        dest = revlog.revlog(vfs,
> +                             indexfile=indexname,
> +                             datafile=dataname)
> +        if dest._inline:
> +            raise error.Abort('not supporting inline revlog (yet)')
> +        # make sure internals are initialized
> +        dest.revision(len(dest) - 1)
> +        yield dest
> +        del dest, vfs
> +    finally:
> +        shutil.rmtree(tmpdir, True)
> +
>  @command(b'perfrevlogchunks', revlogopts + formatteropts +
>           [(b'e', b'engines', b'', b'compression engines to use'),
>            (b's', b'startrev', 0, b'revision to start at')],
> diff --git a/tests/test-contrib-perf.t b/tests/test-contrib-perf.t
> --- a/tests/test-contrib-perf.t
> +++ b/tests/test-contrib-perf.t
> @@ -114,6 +114,8 @@ perfstatus
>                   Benchmark obtaining a revlog revision.
>     perfrevlogrevisions
>                   Benchmark reading a series of revisions from a revlog.
> +   perfrevlogwrite
> +                 Benchmark writing a series of revisions to a revlog.
>     perfrevrange  (no help text available)
>     perfrevset    benchmark the execution time of a revset
>     perfstartup   (no help text available)
> @@ -265,4 +267,16 @@ Check perf.py for historical portability
>    contrib/perf.py:\d+: (re)
>     >     from mercurial import (
>     import newer module separately in try clause for early Mercurial
> +  contrib/perf.py:\d+: (re)
> +   >     origindexpath = orig.opener.join(orig.indexfile)
> +   use getvfs()/getsvfs() for early Mercurial
> +  contrib/perf.py:\d+: (re)
> +   >     origdatapath = orig.opener.join(orig.datafile)
> +   use getvfs()/getsvfs() for early Mercurial
> +  contrib/perf.py:\d+: (re)
> +   >         vfs = vfsmod.vfs(tmpdir)
> +   use getvfs()/getsvfs() for early Mercurial
> +  contrib/perf.py:\d+: (re)
> +   >         vfs.options = getattr(orig.opener, 'options', None)
> +   use getvfs()/getsvfs() for early Mercurial
>    [1]
> _______________________________________________
> Mercurial-devel mailing list
> Mercurial-devel at mercurial-scm.org
> https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel