[PATCH 5 of 5 RFC] RFC: hgwebproxy: an extension for running a caching hgweb proxy
Augie Fackler
raf at durin42.com
Wed Nov 6 12:11:53 CST 2013
On Thu, Oct 10, 2013 at 04:35:53AM +0200, Mads Kiilerich wrote:
> # HG changeset patch
> # User Mads Kiilerich <madski at unity3d.com>
> # Date 1381372470 -7200
> # Thu Oct 10 04:34:30 2013 +0200
> # Node ID 7ae5a3498414653b0aa6404b7adfaf3bae63f752
> # Parent aaa67a78a1a9e116671082f79342b4f360cd8d2e
> RFC: hgwebproxy: an extension for running a caching hgweb proxy
Interesting, but I think we should talk about the relative merits of
this and lookaside clones, since the latter means you could use
something like AWS to store precomputed most-of-a-repo bundles.
>
> This proxy can serve as an "accelerator" or "concentrator" that might reduce
> the network traffic and improve the user experience where the bandwidth is
> limited and the same data is fetched multiple times.
>
> diff --git a/hgext/hgwebproxy.py b/hgext/hgwebproxy.py
> new file mode 100644
> --- /dev/null
> +++ b/hgext/hgwebproxy.py
> @@ -0,0 +1,237 @@
> +# caching HTTP proxy for hgweb hosting
> +#
> +# Copyright Unity Technologies, Mads Kiilerich <madski at unity3d.com>
> +# Copyright Matt Mackall <mpm at selenic.com> and others
> +#
> +# This software may be used and distributed according to the terms of the
> +# GNU General Public License version 2 or any later version.
> +
> +'''Caching HTTP proxy for hgweb hosting
> +
> +This proxy can serve as an "accelerator" or "concentrator" that might reduce
> +the network traffic and improve the user experience where the bandwidth is
> +limited and the same data are fetched multiple times.
> +
> +Enable the extension with::
> +
> + [extensions]
> + hgwebproxy =
> +
> +Run the proxy as::
> +
> + hg proxy --port 1234 http://servername/ /var/cache/hgrepos
> +
> +Instead of pointing Mercurial clients at::
> +
> + http://servername/repos/name
> +
> +point them at the proxy::
> +
> + http://proxyname:1234/repos/name
> +
> +The proxy will forward most of the protocol commands directly to the server.
> +(Because the proxy doesn't have state it will actually have to make twice as
> +many round-trips to the server as the client would have needed.)
> +
> +Some protocol commands return a lot of data and are handled specially:
> +fetching bundles (for cloning and pulling) and fetching largefiles.
> +The proxy will serve these commands from the local cache.
> +Before doing that it will verify that it can serve it locally.
> +If something is missing it will first fetch it from the server and store it
> +locally so it also can be used next time.
> +
> +The proxy will by default assume that the server uses HTTP basic
> +authentication and it will forward authentication requests and credentials as
> +they are received. Credentials will not be checked for requests that are
> +served from the local cache. Someone with the full hash of a revision or
> +largefile will thus be able to retrieve it unauthenticated. The proxy should
> +thus only be used on networks and for servers where that is acceptable.
> +'''
> +
> +import os.path
> +import urllib2, posixpath
> +from mercurial import cmdutil, util, commands, hg, wireproto
> +from mercurial.hgweb import protocol
> +from mercurial.hgweb.common import ErrorResponse, HTTP_UNAUTHORIZED, HTTP_OK,\
> + HTTP_BAD_REQUEST
> +from mercurial.hgweb.request import wsgirequest
> +from mercurial.i18n import _
> +from mercurial.node import short
> +from hgext.largefiles import lfutil, basestore
> +
> +cmdtable = {}
> +command = cmdutil.command(cmdtable)
> +testedwith = 'internal'
> +
> +commands.norepo += " proxy"
> +
> +
> +class proxyserver(object):
> + def __init__(self, ui, serverurl, cachepath, anonymous=False):
> + self.ui = ui
> + self.serverurl = serverurl
> + self.cachepath = cachepath
> + self.anonymous = anonymous
> +
> + def __call__(self, env, respond):
> + req = wsgirequest(env, respond)
> + return self.run_wsgi(req)
> +
> + def run_wsgi(self, req):
> + proto = protocol.webproto(req, self.ui)
> +
> + u = util.url(self.serverurl)
> + u.path = posixpath.join(u.path or '', req.env['PATH_INFO'])
> +
> + # Simple path validation - probably only sufficient on Linux
> + path = req.env['PATH_INFO'].replace('\\', '/').strip('/')
> + if ':' in path or path.startswith('.') or '/.' in path:
> + self.ui.debug('bad request path %r\n' % path)
> + req.respond(HTTP_BAD_REQUEST, protocol.HGTYPE)
> + return []
> +
> + # MIME and HTTP allow multiple headers by the same name - we only
> + # use and care about one
> + args = dict((k, v[0]) for k, v in proto._args().items())
> + cmd = args.pop('cmd')
> + self.ui.debug("request for path: %s cmd: %s args: %s\n" %
> + (path, cmd,
> + ' '.join('%s=%s' for k, v in sorted(args.items()))
> + ))
> +
> + # Forward HTTP authorization headers through the layers
> + authheader = req.env.get('HTTP_AUTHORIZATION')
> + if authheader and authheader.lower().startswith('basic '):
> + userpasswd = authheader[6:].decode('base64')
> + if ':' in userpasswd:
> + u.user, u.passwd = userpasswd.split(':', 1)
> + url = str(u)
> +
> + if not (self.anonymous or u.user and u.passwd):
> + self.ui.debug('authentication is mandatory and missing\n')
> + er = ErrorResponse(HTTP_UNAUTHORIZED,
> + 'Authentication is mandatory',
> + [('WWW-Authenticate',
> + 'Basic realm="Mercurial Proxy Authentication"')])
> + req.respond(er, protocol.HGTYPE)
> + return ['HTTP authentication required']
> +
> + try:
> + # Forward most commands to the remote server
> + if cmd not in ['getbundle', 'getlfile']:
> + data = None
> + if cmd == 'unbundle':
> + size = req.env['CONTENT_LENGTH']
> + self.ui.debug('reading bundle with size %s\n' % size)
> + data = req.read(int(size))
> +
> + r = hg.peer(self.ui, {}, url)._call(cmd, data=data, **args)
> + req.respond(HTTP_OK, protocol.HGTYPE)
> + return [r]
> +
> + # Special handling of commands that are worth caching
> + repopath = os.path.join(self.cachepath, path)
> + repo = hg.repository(self.ui, path=repopath,
> + create=not os.path.exists(repopath))
> +
> + # Fetch what we might be missing
> + if cmd == 'getbundle':
> + heads = [h for h in wireproto.decodelist(args['heads'])
> + if not h in repo]
> + if heads:
> + self.ui.debug('fetching requested heads remotely: '
> + '%s\n' %
> + ' '.join(short(h) for h in heads))
> + r = repo.pull(hg.peer(self.ui, {}, url), heads=heads)
> + self.ui.debug('pull result: %r\n' % r)
> + else:
> + self.ui.debug('serving bundle locally\n')
> +
> + elif cmd == 'getlfile':
> + sha = args['sha']
> + filename = lfutil.findfile(repo, sha)
> + if not filename:
> + self.ui.debug('fetching %s remotely\n' % sha)
> + store = basestore._openstore(repo,
> + hg.peer(self.ui, {}, url),
> + False)
> + gotit = store._gethash(sha, sha)
> + if not gotit:
> + self.ui.error(_('failed to get %s for %s\n') %
> + (sha, path))
> + else:
> + self.ui.debug('serving %s locally\n' % sha)
> +
> + # Now serve it locally
> + return protocol.call(repo, req, cmd)
> +
> + except urllib2.HTTPError, inst:
> + self.ui.error('HTTPError %s' % inst)
> + req.respond(inst.code, protocol.HGTYPE)
> + return ['HTTP error']
> + except util.Abort, e: # hg.peer will abort when it gets 401
> + if e.message != 'http authorization required':
> + raise
> + self.ui.debug('server requires authentication\n')
> + er = ErrorResponse(HTTP_UNAUTHORIZED,
> + 'Authentication is mandatory',
> + [('WWW-Authenticate',
> + 'Basic realm="Mercurial Proxy Authentication"')])
> + req.respond(er, protocol.HGTYPE)
> + return ['HTTP authentication required']
> +
> +
> +@command('^proxy',
> + [('A', 'accesslog', '', _('name of access log file to write to'),
> + _('FILE')),
> + ('d', 'daemon', None, _('run server in background')),
> + ('', 'daemon-pipefds', '', _('used internally by daemon mode'), _('NUM')),
> + ('E', 'errorlog', '', _('name of error log file to write to'), _('FILE')),
> + # use string type, then we can check if something was passed
> + ('p', 'port', '', _('port to listen on (default: 8000)'), _('PORT')),
> + ('a', 'address', '', _('address to listen on (default: all interfaces)'),
> + _('ADDR')),
> + ('', 'prefix', '', _('prefix path to serve from (default: server root)'),
> + _('PREFIX')),
> + ('', 'pid-file', '', _('name of file to write process ID to'), _('FILE')),
> + ('6', 'ipv6', None, _('use IPv6 in addition to IPv4')),
> + ('', 'certificate', '', _('SSL certificate file'), _('FILE')),
> + ('', 'anonymous', None, _("authentication is not mandatory"))],
> + _('[OPTIONS]... SERVERURL CACHEPATH'))
> +def proxy(ui, serverurl, cachepath, **opts):
> + """start stand-alone caching hgweb proxy
> +
> + Start a local HTTP server that acts as a caching proxy for a remote
> + server SERVERURL. Fetched data will be stored locally in the directory
> + CACHEPATH and reused for future requests for the same data.
> +
> + Please note that the proxy server will serve valid requests for cached
> + data without checking credentials. It should thus only be used on networks
> + and for servers and repositories where that is acceptable.
> +
> + By default the proxy will request credentials for all requests before
> + forwarding the requests to the server. Use --anonymous to disable that.
> +
> + By default, the server logs accesses to stdout and errors to
> + stderr. Use the -A/--accesslog and -E/--errorlog options to log to
> + files.
> +
> + To have the server choose a free port number to listen on, specify
> + a port number of 0; in this case, the server will print the port
> + number it uses.
> +
> + Returns 0 on success.
> + """
> + if opts.get('port'):
> + opts['port'] = util.getport(opts.get('port'))
> +
> + u = util.url(serverurl)
> + if u.scheme not in ['http', 'https', 'ssh']:
> + raise util.Abort(_('invalid scheme in server url %s') % serverurl)
> +
> + if not os.path.isdir(cachepath):
> + raise util.Abort(_('cache path is not a directory'))
> +
> + app = proxyserver(ui, serverurl, cachepath or '.', opts.get('anonymous'))
> + service = commands.httpservice(ui, app, opts)
> + cmdutil.service(opts, initfn=service.init, runfn=service.run)
> _______________________________________________
> Mercurial-devel mailing list
> Mercurial-devel at selenic.com
> http://selenic.com/mailman/listinfo/mercurial-devel
More information about the Mercurial-devel
mailing list