[PATCH 5 of 5 RFC] RFC: hgwebproxy: an extension for running a caching hgweb proxy

Mads Kiilerich mads at kiilerich.com
Wed Oct 9 21:35:53 CDT 2013


# HG changeset patch
# User Mads Kiilerich <madski at unity3d.com>
# Date 1381372470 -7200
#      Thu Oct 10 04:34:30 2013 +0200
# Node ID 7ae5a3498414653b0aa6404b7adfaf3bae63f752
# Parent  aaa67a78a1a9e116671082f79342b4f360cd8d2e
RFC: hgwebproxy: an extension for running a caching hgweb proxy

This proxy can serve as an "accelerator" or "concentrator" that might reduce
the network traffic and improve the user experience where the bandwidth is
limited and the same data is fetched multiple times.

diff --git a/hgext/hgwebproxy.py b/hgext/hgwebproxy.py
new file mode 100644
--- /dev/null
+++ b/hgext/hgwebproxy.py
@@ -0,0 +1,237 @@
+# caching HTTP proxy for hgweb hosting
+#
+# Copyright Unity Technologies, Mads Kiilerich <madski at unity3d.com>
+# Copyright Matt Mackall <mpm at selenic.com> and others
+#
+# This software may be used and distributed according to the terms of the
+# GNU General Public License version 2 or any later version.
+
+'''Caching HTTP proxy for hgweb hosting
+
+This proxy can serve as an "accelerator" or "concentrator" that might reduce
+the network traffic and improve the user experience where the bandwidth is
+limited and the same data are fetched multiple times.
+
+Enable the extension with::
+
+  [extensions]
+  hgwebproxy =
+
+Run the proxy as::
+
+  hg proxy --port 1234 http://servername/ /var/cache/hgrepos
+
+Instead of pointing Mercurial clients at::
+
+  http://servername/repos/name
+
+point them at the proxy::
+
+  http://proxyname:1234/repos/name
+
+The proxy will forward most of the protocol commands directly to the server.
+(Because the proxy doesn't have state it will actually have to make twice as
+many round-trips to the server as the client would have needed.)
+
+Some protocol commands return a lot of data and are handled specially:
+fetching bundles (for cloning and pulling) and fetching largefiles.
+The proxy will serve these commands from the local cache.
+Before doing that it will verify that it can serve it locally.
+If something is missing it will first fetch it from the server and store it
+locally so it also can be used next time.
+
+The proxy will by default assume that the server uses HTTP basic
+authentication and it will forward authentication requests and credentials as
+they are received. Credentials will not be checked for requests that are
+served from the local cache. Someone with the full hash of a revision or
+largefile will thus be able to retrieve it unauthenticated. The proxy should
+thus only be used on networks and for servers where that is acceptable.
+'''
+
+import os.path
+import urllib2, posixpath
+from mercurial import cmdutil, util, commands, hg, wireproto
+from mercurial.hgweb import protocol
+from mercurial.hgweb.common import ErrorResponse, HTTP_UNAUTHORIZED, HTTP_OK,\
+    HTTP_BAD_REQUEST
+from mercurial.hgweb.request import wsgirequest
+from mercurial.i18n import _
+from mercurial.node import short
+from hgext.largefiles import lfutil, basestore
+
+cmdtable = {}
+command = cmdutil.command(cmdtable)
+testedwith = 'internal'
+# proxy takes no repo argument and must thus be listed in norepo
+commands.norepo += " proxy"
+
+
+class proxyserver(object):
+    def __init__(self, ui, serverurl, cachepath, anonymous=False):
+        self.ui = ui
+        self.serverurl = serverurl
+        self.cachepath = cachepath
+        self.anonymous = anonymous
+
+    def __call__(self, env, respond):
+        req = wsgirequest(env, respond)
+        return self.run_wsgi(req)
+
+    def run_wsgi(self, req):
+        proto = protocol.webproto(req, self.ui)
+
+        u = util.url(self.serverurl)
+        u.path = posixpath.join(u.path or '', req.env['PATH_INFO'])
+
+        # Simple path validation - probably only sufficient on Linux
+        path = req.env['PATH_INFO'].replace('\\', '/').strip('/')
+        if ':' in path or path.startswith('.') or '/.' in path:
+            self.ui.debug('bad request path %r\n' % path)
+            req.respond(HTTP_BAD_REQUEST, protocol.HGTYPE)
+            return []
+
+        # MIME and HTTP allows multiple headers by the same name - we only
+        # use and care about one
+        args = dict((k, v[0]) for k, v in proto._args().items())
+        cmd = args.pop('cmd')
+        self.ui.debug("request for path: %s cmd: %s args: %s\n" %
+                      (path, cmd,
+                       ' '.join('%s=%s' for k, v in sorted(args.items()))
+                       ))
+
+        # Forward HTTP authorization headers through the layers
+        authheader = req.env.get('HTTP_AUTHORIZATION')
+        if authheader and authheader.lower().startswith('basic '):
+            userpasswd = authheader[6:].decode('base64')
+            if ':' in userpasswd:
+                u.user, u.passwd = userpasswd.split(':', 1)
+        url = str(u)
+
+        if not (self.anonymous or u.user and u.passwd):
+            self.ui.debug('authentication is mandatory and missing\n')
+            er = ErrorResponse(HTTP_UNAUTHORIZED,
+                'Authentication is mandatory',
+                [('WWW-Authenticate',
+                  'Basic realm="Mercurial Proxy Authentication"')])
+            req.respond(er, protocol.HGTYPE)
+            return ['HTTP authentication required']
+
+        try:
+            # Forward most commands to the remote server
+            if cmd not in ['getbundle', 'getlfile']:
+                data = None
+                if cmd == 'unbundle':
+                    size = req.env['CONTENT_LENGTH']
+                    self.ui.debug('reading bundle with size %s\n' % size)
+                    data = req.read(int(size))
+
+                r = hg.peer(self.ui, {}, url)._call(cmd, data=data, **args)
+                req.respond(HTTP_OK, protocol.HGTYPE)
+                return [r]
+
+            # Special handling of commands that are worth caching
+            repopath = os.path.join(self.cachepath, path)
+            repo = hg.repository(self.ui, path=repopath,
+                                 create=not os.path.exists(repopath))
+
+            # Fetch what we might be missing
+            if cmd == 'getbundle':
+                heads = [h for h in wireproto.decodelist(args['heads'])
+                         if not h in repo]
+                if heads:
+                    self.ui.debug('fetching requested heads remotely: '
+                                  '%s\n' %
+                                  ' '.join(short(h) for h in heads))
+                    r = repo.pull(hg.peer(self.ui, {}, url), heads=heads)
+                    self.ui.debug('pull result: %r\n' % r)
+                else:
+                    self.ui.debug('serving bundle locally\n')
+
+            elif cmd == 'getlfile':
+                sha = args['sha']
+                filename = lfutil.findfile(repo, sha)
+                if not filename:
+                    self.ui.debug('fetching %s remotely\n' % sha)
+                    store = basestore._openstore(repo,
+                                                 hg.peer(self.ui, {}, url),
+                                                 False)
+                    gotit = store._gethash(sha, sha)
+                    if not gotit:
+                        self.ui.error(_('failed to get %s for %s\n') %
+                                      (sha, path))
+                else:
+                    self.ui.debug('serving %s locally\n' % sha)
+
+            # Now serve it locally
+            return protocol.call(repo, req, cmd)
+
+        except urllib2.HTTPError, inst:
+            self.ui.error('HTTPError %s' % inst)
+            req.respond(inst.code, protocol.HGTYPE)
+            return ['HTTP error']
+        except util.Abort, e: # hg.peer will abort when it gets 401
+            if e.message != 'http authorization required':
+                raise
+            self.ui.debug('server requires authentication\n')
+            er = ErrorResponse(HTTP_UNAUTHORIZED,
+                'Authentication is mandatory',
+                [('WWW-Authenticate',
+                  'Basic realm="Mercurial Proxy Authentication"')])
+            req.respond(er, protocol.HGTYPE)
+            return ['HTTP authentication required']
+
+
+ at command('^proxy',
+    [('A', 'accesslog', '', _('name of access log file to write to'),
+     _('FILE')),
+    ('d', 'daemon', None, _('run server in background')),
+    ('', 'daemon-pipefds', '', _('used internally by daemon mode'), _('NUM')),
+    ('E', 'errorlog', '', _('name of error log file to write to'), _('FILE')),
+    # use string type, then we can check if something was passed
+    ('p', 'port', '', _('port to listen on (default: 8000)'), _('PORT')),
+    ('a', 'address', '', _('address to listen on (default: all interfaces)'),
+     _('ADDR')),
+    ('', 'prefix', '', _('prefix path to serve from (default: server root)'),
+     _('PREFIX')),
+    ('', 'pid-file', '', _('name of file to write process ID to'), _('FILE')),
+    ('6', 'ipv6', None, _('use IPv6 in addition to IPv4')),
+    ('', 'certificate', '', _('SSL certificate file'), _('FILE')),
+    ('', 'anonymous', None, _("authentication is not mandatory"))],
+    _('[OPTIONS]... SERVERURL CACHEPATH'))
+def proxy(ui, serverurl, cachepath, **opts):
+    """start stand-alone caching hgweb proxy
+
+    Start a local HTTP server that acts as a caching proxy for a remote
+    server SERVERURL. Fetched data will be stored locally in the directory
+    CACHEPATH and reused for future requests for the same data.
+
+    Please note that the proxy server will serve valid requests for cached
+    data without checking credentials. It should thus only be used on networks
+    and for servers and repositories where that is acceptable.
+
+    By default the proxy will request credentials for all requests before
+    forwarding the requests to the server. Use --anonymous to disable that.
+
+    By default, the server logs accesses to stdout and errors to
+    stderr. Use the -A/--accesslog and -E/--errorlog options to log to
+    files.
+
+    To have the server choose a free port number to listen on, specify
+    a port number of 0; in this case, the server will print the port
+    number it uses.
+
+    Returns 0 on success.
+    """
+    if opts.get('port'):
+        opts['port'] = util.getport(opts.get('port'))
+
+    u = util.url(serverurl)
+    if u.scheme not in ['http', 'https', 'ssh']:
+        raise util.Abort(_('invalid scheme in server url %s') % serverurl)
+
+    if not os.path.isdir(cachepath):
+        raise util.Abort(_('cache path is not a directory'))
+
+    app = proxyserver(ui, serverurl, cachepath or '.', opts.get('anonymous'))
+    service = commands.httpservice(ui, app, opts)
+    cmdutil.service(opts, initfn=service.init, runfn=service.run)


More information about the Mercurial-devel mailing list