[PATCH 5 of 5 RFC] RFC: hgwebproxy: an extension for running a caching hgweb proxy

Augie Fackler raf at durin42.com
Wed Nov 6 12:11:53 CST 2013


On Thu, Oct 10, 2013 at 04:35:53AM +0200, Mads Kiilerich wrote:
> # HG changeset patch
> # User Mads Kiilerich <madski at unity3d.com>
> # Date 1381372470 -7200
> #      Thu Oct 10 04:34:30 2013 +0200
> # Node ID 7ae5a3498414653b0aa6404b7adfaf3bae63f752
> # Parent  aaa67a78a1a9e116671082f79342b4f360cd8d2e
> RFC: hgwebproxy: an extension for running a caching hgweb proxy

Interesting, but I think we should talk about the relative merits of
this and lookaside clones, since the latter means you could use
something like AWS to store precomputed most-of-a-repo bundles.

>
> This proxy can serve as an "accelerator" or "concentrator" that might reduce
> the network traffic and improve the user experience where the bandwidth is
> limited and the same data is fetched multiple times.
>
> diff --git a/hgext/hgwebproxy.py b/hgext/hgwebproxy.py
> new file mode 100644
> --- /dev/null
> +++ b/hgext/hgwebproxy.py
> @@ -0,0 +1,237 @@
> +# caching HTTP proxy for hgweb hosting
> +#
> +# Copyright Unity Technologies, Mads Kiilerich <madski at unity3d.com>
> +# Copyright Matt Mackall <mpm at selenic.com> and others
> +#
> +# This software may be used and distributed according to the terms of the
> +# GNU General Public License version 2 or any later version.
> +
> +'''Caching HTTP proxy for hgweb hosting
> +
> +This proxy can serve as an "accelerator" or "concentrator" that might reduce
> +the network traffic and improve the user experience where the bandwidth is
> +limited and the same data are fetched multiple times.
> +
> +Enable the extension with::
> +
> +  [extensions]
> +  hgwebproxy =
> +
> +Run the proxy as::
> +
> +  hg proxy --port 1234 http://servername/ /var/cache/hgrepos
> +
> +Instead of pointing Mercurial clients at::
> +
> +  http://servername/repos/name
> +
> +point them at the proxy::
> +
> +  http://proxyname:1234/repos/name
> +
> +The proxy will forward most of the protocol commands directly to the server.
> +(Because the proxy doesn't have state it will actually have to make twice as
> +many round-trips to the server as the client would have needed.)
> +
> +Some protocol commands return a lot of data and are handled specially:
> +fetching bundles (for cloning and pulling) and fetching largefiles.
> +The proxy will serve these commands from the local cache.
> +Before doing that it will verify that it can serve it locally.
> +If something is missing it will first fetch it from the server and store it
> +locally so it also can be used next time.
> +
> +The proxy will by default assume that the server uses HTTP basic
> +authentication and it will forward authentication requests and credentials as
> +they are received. Credentials will not be checked for requests that are
> +served from the local cache. Someone with the full hash of a revision or
> +largefile will thus be able to retrieve it unauthenticated. The proxy should
> +thus only be used on networks and for servers where that is acceptable.
> +'''
> +
> +import os.path
> +import urllib2, posixpath
> +from mercurial import cmdutil, util, commands, hg, wireproto
> +from mercurial.hgweb import protocol
> +from mercurial.hgweb.common import ErrorResponse, HTTP_UNAUTHORIZED, HTTP_OK,\
> +    HTTP_BAD_REQUEST
> +from mercurial.hgweb.request import wsgirequest
> +from mercurial.i18n import _
> +from mercurial.node import short
> +from hgext.largefiles import lfutil, basestore
> +
> +cmdtable = {}
> +command = cmdutil.command(cmdtable)
> +testedwith = 'internal'
> +
> +commands.norepo += " proxy"
> +
> +
> +class proxyserver(object):
> +    def __init__(self, ui, serverurl, cachepath, anonymous=False):
> +        self.ui = ui
> +        self.serverurl = serverurl
> +        self.cachepath = cachepath
> +        self.anonymous = anonymous
> +
> +    def __call__(self, env, respond):
> +        req = wsgirequest(env, respond)
> +        return self.run_wsgi(req)
> +
> +    def run_wsgi(self, req):
> +        proto = protocol.webproto(req, self.ui)
> +
> +        u = util.url(self.serverurl)
> +        u.path = posixpath.join(u.path or '', req.env['PATH_INFO'])
> +
> +        # Simple path validation - probably only sufficient on Linux
> +        path = req.env['PATH_INFO'].replace('\\', '/').strip('/')
> +        if ':' in path or path.startswith('.') or '/.' in path:
> +            self.ui.debug('bad request path %r\n' % path)
> +            req.respond(HTTP_BAD_REQUEST, protocol.HGTYPE)
> +            return []
> +
> +        # MIME and HTTP allows multiple headers by the same name - we only
> +        # use and care about one
> +        args = dict((k, v[0]) for k, v in proto._args().items())
> +        cmd = args.pop('cmd')
> +        self.ui.debug("request for path: %s cmd: %s args: %s\n" %
> +                      (path, cmd,
> +                       ' '.join('%s=%s' for k, v in sorted(args.items()))
> +                       ))
> +
> +        # Forward HTTP authorization headers through the layers
> +        authheader = req.env.get('HTTP_AUTHORIZATION')
> +        if authheader and authheader.lower().startswith('basic '):
> +            userpasswd = authheader[6:].decode('base64')
> +            if ':' in userpasswd:
> +                u.user, u.passwd = userpasswd.split(':', 1)
> +        url = str(u)
> +
> +        if not (self.anonymous or u.user and u.passwd):
> +            self.ui.debug('authentication is mandatory and missing\n')
> +            er = ErrorResponse(HTTP_UNAUTHORIZED,
> +                'Authentication is mandatory',
> +                [('WWW-Authenticate',
> +                  'Basic realm="Mercurial Proxy Authentication"')])
> +            req.respond(er, protocol.HGTYPE)
> +            return ['HTTP authentication required']
> +
> +        try:
> +            # Forward most commands to the remote server
> +            if cmd not in ['getbundle', 'getlfile']:
> +                data = None
> +                if cmd == 'unbundle':
> +                    size = req.env['CONTENT_LENGTH']
> +                    self.ui.debug('reading bundle with size %s\n' % size)
> +                    data = req.read(int(size))
> +
> +                r = hg.peer(self.ui, {}, url)._call(cmd, data=data, **args)
> +                req.respond(HTTP_OK, protocol.HGTYPE)
> +                return [r]
> +
> +            # Special handling of commands that are worth caching
> +            repopath = os.path.join(self.cachepath, path)
> +            repo = hg.repository(self.ui, path=repopath,
> +                                 create=not os.path.exists(repopath))
> +
> +            # Fetch what we might be missing
> +            if cmd == 'getbundle':
> +                heads = [h for h in wireproto.decodelist(args['heads'])
> +                         if not h in repo]
> +                if heads:
> +                    self.ui.debug('fetching requested heads remotely: '
> +                                  '%s\n' %
> +                                  ' '.join(short(h) for h in heads))
> +                    r = repo.pull(hg.peer(self.ui, {}, url), heads=heads)
> +                    self.ui.debug('pull result: %r\n' % r)
> +                else:
> +                    self.ui.debug('serving bundle locally\n')
> +
> +            elif cmd == 'getlfile':
> +                sha = args['sha']
> +                filename = lfutil.findfile(repo, sha)
> +                if not filename:
> +                    self.ui.debug('fetching %s remotely\n' % sha)
> +                    store = basestore._openstore(repo,
> +                                                 hg.peer(self.ui, {}, url),
> +                                                 False)
> +                    gotit = store._gethash(sha, sha)
> +                    if not gotit:
> +                        self.ui.error(_('failed to get %s for %s\n') %
> +                                      (sha, path))
> +                else:
> +                    self.ui.debug('serving %s locally\n' % sha)
> +
> +            # Now serve it locally
> +            return protocol.call(repo, req, cmd)
> +
> +        except urllib2.HTTPError, inst:
> +            self.ui.error('HTTPError %s' % inst)
> +            req.respond(inst.code, protocol.HGTYPE)
> +            return ['HTTP error']
> +        except util.Abort, e: # hg.peer will abort when it gets 401
> +            if e.message != 'http authorization required':
> +                raise
> +            self.ui.debug('server requires authentication\n')
> +            er = ErrorResponse(HTTP_UNAUTHORIZED,
> +                'Authentication is mandatory',
> +                [('WWW-Authenticate',
> +                  'Basic realm="Mercurial Proxy Authentication"')])
> +            req.respond(er, protocol.HGTYPE)
> +            return ['HTTP authentication required']
> +
> +
> +@command('^proxy',
> +    [('A', 'accesslog', '', _('name of access log file to write to'),
> +     _('FILE')),
> +    ('d', 'daemon', None, _('run server in background')),
> +    ('', 'daemon-pipefds', '', _('used internally by daemon mode'), _('NUM')),
> +    ('E', 'errorlog', '', _('name of error log file to write to'), _('FILE')),
> +    # use string type, then we can check if something was passed
> +    ('p', 'port', '', _('port to listen on (default: 8000)'), _('PORT')),
> +    ('a', 'address', '', _('address to listen on (default: all interfaces)'),
> +     _('ADDR')),
> +    ('', 'prefix', '', _('prefix path to serve from (default: server root)'),
> +     _('PREFIX')),
> +    ('', 'pid-file', '', _('name of file to write process ID to'), _('FILE')),
> +    ('6', 'ipv6', None, _('use IPv6 in addition to IPv4')),
> +    ('', 'certificate', '', _('SSL certificate file'), _('FILE')),
> +    ('', 'anonymous', None, _("authentication is not mandatory"))],
> +    _('[OPTIONS]... SERVERURL CACHEPATH'))
> +def proxy(ui, serverurl, cachepath, **opts):
> +    """start stand-alone caching hgweb proxy
> +
> +    Start a local HTTP server that acts as a caching proxy for a remote
> +    server SERVERURL. Fetched data will be stored locally in the directory
> +    CACHEPATH and reused for future requests for the same data.
> +
> +    Please note that the proxy server will serve valid requests for cached
> +    data without checking credentials. It should thus only be used on networks
> +    and for servers and repositories where that is acceptable.
> +
> +    By default the proxy will request credentials for all requests before
> +    forwarding the requests to the server. Use --anonymous to disable that.
> +
> +    By default, the server logs accesses to stdout and errors to
> +    stderr. Use the -A/--accesslog and -E/--errorlog options to log to
> +    files.
> +
> +    To have the server choose a free port number to listen on, specify
> +    a port number of 0; in this case, the server will print the port
> +    number it uses.
> +
> +    Returns 0 on success.
> +    """
> +    if opts.get('port'):
> +        opts['port'] = util.getport(opts.get('port'))
> +
> +    u = util.url(serverurl)
> +    if u.scheme not in ['http', 'https', 'ssh']:
> +        raise util.Abort(_('invalid scheme in server url %s') % serverurl)
> +
> +    if not os.path.isdir(cachepath):
> +        raise util.Abort(_('cache path is not a directory'))
> +
> +    app = proxyserver(ui, serverurl, cachepath or '.', opts.get('anonymous'))
> +    service = commands.httpservice(ui, app, opts)
> +    cmdutil.service(opts, initfn=service.init, runfn=service.run)
> _______________________________________________
> Mercurial-devel mailing list
> Mercurial-devel at selenic.com
> http://selenic.com/mailman/listinfo/mercurial-devel


More information about the Mercurial-devel mailing list