D2822: hgweb: support constructing URLs from an alternate base URL

indygreg (Gregory Szorc) phabricator at mercurial-scm.org
Mon Mar 12 21:16:51 UTC 2018


indygreg created this revision.
Herald added a subscriber: mercurial-devel.
Herald added a reviewer: hg-reviewers.

REVISION SUMMARY
  The web.baseurl config option allows server operators to define a
  custom URL for hosted content.
  
  The way it works today is that hgwebdir parses this config
  option into URL components then updates the appropriate
  WSGI environment variables so the request "lies" about its
  details. For example, SERVER_NAME is updated to reflect the
  alternate base URL's hostname.
  
  The WSGI environment should not be modified because WSGI
  applications may want to know the original request details (for
  debugging, etc).
  
  This commit teaches our request parser about the existence of
  an alternate base URL. If defined, the advertised URL and other
  self-reflected paths will take the alternate base URL into account.
  
  The hgweb WSGI application didn't use web.baseurl. But hgwebdir
  did. We update hgwebdir to alter the environment parsing
  accordingly. The old code around environment manipulation
  has been removed.
  
  With this change, parserequestfromenv() has grown to a bit
  unwieldy. Now that practically everyone is using it, it is
  obvious that there is some unused features that can be trimmed.
  So look for this in follow-up commits.

REPOSITORY
  rHG Mercurial

REVISION DETAIL
  https://phab.mercurial-scm.org/D2822

AFFECTED FILES
  mercurial/hgweb/hgwebdir_mod.py
  mercurial/hgweb/request.py
  tests/test-wsgirequest.py

CHANGE DETAILS

diff --git a/tests/test-wsgirequest.py b/tests/test-wsgirequest.py
--- a/tests/test-wsgirequest.py
+++ b/tests/test-wsgirequest.py
@@ -23,11 +23,12 @@
     r'wsgi.run_once': False,
 }
 
-def parse(env, bodyfh=None, reponame=None, extra=None):
+def parse(env, bodyfh=None, reponame=None, altbaseurl=None, extra=None):
     env = dict(env)
     env.update(extra or {})
 
-    return requestmod.parserequestfromenv(env, bodyfh, reponame=reponame)
+    return requestmod.parserequestfromenv(env, bodyfh, reponame=reponame,
+                                          altbaseurl=altbaseurl)
 
 class ParseRequestTests(unittest.TestCase):
     def testdefault(self):
@@ -242,6 +243,174 @@
         self.assertEqual(r.dispatchpath, b'path1/path2')
         self.assertEqual(r.reponame, b'prefix/repo')
 
+    def testaltbaseurl(self):
+        # Simple hostname remap.
+        r = parse(DEFAULT_ENV, altbaseurl='http://altserver')
+
+        self.assertEqual(r.url, b'http://testserver')
+        self.assertEqual(r.baseurl, b'http://testserver')
+        self.assertEqual(r.advertisedurl, b'http://altserver')
+        self.assertEqual(r.advertisedbaseurl, b'http://altserver')
+        self.assertEqual(r.urlscheme, b'http')
+        self.assertEqual(r.apppath, b'')
+        self.assertEqual(r.dispatchparts, [])
+        self.assertIsNone(r.dispatchpath)
+        self.assertIsNone(r.reponame)
+
+        # With a custom port.
+        r = parse(DEFAULT_ENV, altbaseurl='http://altserver:8000')
+        self.assertEqual(r.url, b'http://testserver')
+        self.assertEqual(r.baseurl, b'http://testserver')
+        self.assertEqual(r.advertisedurl, b'http://altserver:8000')
+        self.assertEqual(r.advertisedbaseurl, b'http://altserver:8000')
+        self.assertEqual(r.urlscheme, b'http')
+        self.assertEqual(r.apppath, b'')
+        self.assertEqual(r.dispatchparts, [])
+        self.assertIsNone(r.dispatchpath)
+        self.assertIsNone(r.reponame)
+
+        # With a changed protocol.
+        r = parse(DEFAULT_ENV, altbaseurl='https://altserver')
+        self.assertEqual(r.url, b'http://testserver')
+        self.assertEqual(r.baseurl, b'http://testserver')
+        self.assertEqual(r.advertisedurl, b'https://altserver')
+        self.assertEqual(r.advertisedbaseurl, b'https://altserver')
+        # URL scheme is defined as the actual scheme, not advertised.
+        self.assertEqual(r.urlscheme, b'http')
+        self.assertEqual(r.apppath, b'')
+        self.assertEqual(r.dispatchparts, [])
+        self.assertIsNone(r.dispatchpath)
+        self.assertIsNone(r.reponame)
+
+        # Need to specify explicit port number for proper https:// alt URLs.
+        r = parse(DEFAULT_ENV, altbaseurl='https://altserver:443')
+        self.assertEqual(r.url, b'http://testserver')
+        self.assertEqual(r.baseurl, b'http://testserver')
+        self.assertEqual(r.advertisedurl, b'https://altserver')
+        self.assertEqual(r.advertisedbaseurl, b'https://altserver')
+        self.assertEqual(r.urlscheme, b'http')
+        self.assertEqual(r.apppath, b'')
+        self.assertEqual(r.dispatchparts, [])
+        self.assertIsNone(r.dispatchpath)
+        self.assertIsNone(r.reponame)
+
+        # With only PATH_INFO defined.
+        r = parse(DEFAULT_ENV, altbaseurl='http://altserver', extra={
+            r'PATH_INFO': r'/path1/path2',
+        })
+        self.assertEqual(r.url, b'http://testserver/path1/path2')
+        self.assertEqual(r.baseurl, b'http://testserver')
+        self.assertEqual(r.advertisedurl, b'http://altserver/path1/path2')
+        self.assertEqual(r.advertisedbaseurl, b'http://altserver')
+        self.assertEqual(r.urlscheme, b'http')
+        self.assertEqual(r.apppath, b'')
+        self.assertEqual(r.dispatchparts, [b'path1', b'path2'])
+        self.assertEqual(r.dispatchpath, b'path1/path2')
+        self.assertIsNone(r.reponame)
+
+        # Path on alt URL.
+        r = parse(DEFAULT_ENV, altbaseurl='http://altserver/altpath')
+        self.assertEqual(r.url, b'http://testserver')
+        self.assertEqual(r.baseurl, b'http://testserver')
+        self.assertEqual(r.advertisedurl, b'http://altserver/altpath')
+        self.assertEqual(r.advertisedbaseurl, b'http://altserver')
+        self.assertEqual(r.urlscheme, b'http')
+        self.assertEqual(r.apppath, b'/altpath')
+        self.assertEqual(r.dispatchparts, [])
+        self.assertIsNone(r.dispatchpath)
+        self.assertIsNone(r.reponame)
+
+        # With a trailing slash.
+        r = parse(DEFAULT_ENV, altbaseurl='http://altserver/altpath/')
+        self.assertEqual(r.url, b'http://testserver')
+        self.assertEqual(r.baseurl, b'http://testserver')
+        self.assertEqual(r.advertisedurl, b'http://altserver/altpath/')
+        self.assertEqual(r.advertisedbaseurl, b'http://altserver')
+        self.assertEqual(r.urlscheme, b'http')
+        self.assertEqual(r.apppath, b'/altpath/')
+        self.assertEqual(r.dispatchparts, [])
+        self.assertIsNone(r.dispatchpath)
+        self.assertIsNone(r.reponame)
+
+        # PATH_INFO + path on alt URL.
+        r = parse(DEFAULT_ENV, altbaseurl='http://altserver/altpath', extra={
+            r'PATH_INFO': r'/path1/path2',
+        })
+        self.assertEqual(r.url, b'http://testserver/path1/path2')
+        self.assertEqual(r.baseurl, b'http://testserver')
+        self.assertEqual(r.advertisedurl,
+                         b'http://altserver/altpath/path1/path2')
+        self.assertEqual(r.advertisedbaseurl, b'http://altserver')
+        self.assertEqual(r.urlscheme, b'http')
+        self.assertEqual(r.apppath, b'/altpath')
+        self.assertEqual(r.dispatchparts, [b'path1', b'path2'])
+        self.assertEqual(r.dispatchpath, b'path1/path2')
+        self.assertIsNone(r.reponame)
+
+        # PATH_INFO + path on alt URL with trailing slash.
+        r = parse(DEFAULT_ENV, altbaseurl='http://altserver/altpath/', extra={
+            r'PATH_INFO': r'/path1/path2',
+        })
+        self.assertEqual(r.url, b'http://testserver/path1/path2')
+        self.assertEqual(r.baseurl, b'http://testserver')
+        self.assertEqual(r.advertisedurl,
+                         b'http://altserver/altpath//path1/path2')
+        self.assertEqual(r.advertisedbaseurl, b'http://altserver')
+        self.assertEqual(r.urlscheme, b'http')
+        self.assertEqual(r.apppath, b'/altpath/')
+        self.assertEqual(r.dispatchparts, [b'path1', b'path2'])
+        self.assertEqual(r.dispatchpath, b'path1/path2')
+        self.assertIsNone(r.reponame)
+
+        # Local SCRIPT_NAME is ignored.
+        r = parse(DEFAULT_ENV, altbaseurl='http://altserver', extra={
+            r'SCRIPT_NAME': r'/script',
+            r'PATH_INFO': r'/path1/path2',
+        })
+        self.assertEqual(r.url, b'http://testserver/script/path1/path2')
+        self.assertEqual(r.baseurl, b'http://testserver')
+        self.assertEqual(r.advertisedurl, b'http://altserver/path1/path2')
+        self.assertEqual(r.advertisedbaseurl, b'http://altserver')
+        self.assertEqual(r.urlscheme, b'http')
+        self.assertEqual(r.apppath, b'')
+        self.assertEqual(r.dispatchparts, [b'path1', b'path2'])
+        self.assertEqual(r.dispatchpath, b'path1/path2')
+        self.assertIsNone(r.reponame)
+
+        # Use remote's path for script name, app path
+        r = parse(DEFAULT_ENV, altbaseurl='http://altserver/altroot', extra={
+            r'SCRIPT_NAME': r'/script',
+            r'PATH_INFO': r'/path1/path2',
+        })
+        self.assertEqual(r.url, b'http://testserver/script/path1/path2')
+        self.assertEqual(r.baseurl, b'http://testserver')
+        self.assertEqual(r.advertisedurl,
+                         b'http://altserver/altroot/path1/path2')
+        self.assertEqual(r.advertisedbaseurl, b'http://altserver')
+        self.assertEqual(r.urlscheme, b'http')
+        self.assertEqual(r.apppath, b'/altroot')
+        self.assertEqual(r.dispatchparts, [b'path1', b'path2'])
+        self.assertEqual(r.dispatchpath, b'path1/path2')
+        self.assertIsNone(r.reponame)
+
+        # reponame is factored in properly.
+        r = parse(DEFAULT_ENV, reponame=b'repo',
+                  altbaseurl='http://altserver/altroot',
+                  extra={
+                r'SCRIPT_NAME': r'/script',
+                r'PATH_INFO': r'/repo/path1/path2',
+            })
+
+        self.assertEqual(r.url, b'http://testserver/script/repo/path1/path2')
+        self.assertEqual(r.baseurl, b'http://testserver')
+        self.assertEqual(r.advertisedurl,
+                         b'http://altserver/altroot/repo/path1/path2')
+        self.assertEqual(r.advertisedbaseurl, b'http://altserver')
+        self.assertEqual(r.apppath, b'/altroot/repo')
+        self.assertEqual(r.dispatchparts, [b'path1', b'path2'])
+        self.assertEqual(r.dispatchpath, b'path1/path2')
+        self.assertEqual(r.reponame, b'repo')
+
 if __name__ == '__main__':
     import silenttestrunner
     silenttestrunner.main(__name__)
diff --git a/mercurial/hgweb/request.py b/mercurial/hgweb/request.py
--- a/mercurial/hgweb/request.py
+++ b/mercurial/hgweb/request.py
@@ -157,7 +157,7 @@
     # Request body input stream.
     bodyfh = attr.ib()
 
-def parserequestfromenv(env, bodyfh, reponame=None):
+def parserequestfromenv(env, bodyfh, reponame=None, altbaseurl=None):
     """Parse URL components from environment variables.
 
     WSGI defines request attributes via environment variables. This function
@@ -167,8 +167,18 @@
     string are effectively shifted from ``PATH_INFO`` to ``SCRIPT_NAME``.
     This simulates the world view of a WSGI application that processes
     requests from the base URL of a repo.
+
+    If ``altbaseurl`` (typically comes from ``web.baseurl`` config option)
+    is defined, it is used - instead of the WSGI environment variables - for
+    constructing URL components up to and including the WSGI application path.
+    For example, if the current WSGI application is at ``/repo`` and a request
+    is made to ``/rev/@`` with this argument set to
+    ``http://myserver:9000/prefix``, the URL and path components will resolve as
+    if the request were to ``http://myserver:9000/prefix/rev/@``. In other
+    words, ``wsgi.url_scheme``, ``SERVER_NAME``, ``SERVER_PORT``, and
+    ``SCRIPT_NAME`` are all effectively replaced by components from this URL.
     """
-    # PEP-0333 defines the WSGI spec and is a useful reference for this code.
+    # PEP 3333 defines the WSGI spec and is a useful reference for this code.
 
     # We first validate that the incoming object conforms with the WSGI spec.
     # We only want to be dealing with spec-conforming WSGI implementations.
@@ -184,38 +194,67 @@
         env = {k: v.encode('latin-1') if isinstance(v, str) else v
                for k, v in env.iteritems()}
 
+    if altbaseurl:
+        altbaseurl = util.url(altbaseurl)
+
     # https://www.python.org/dev/peps/pep-0333/#environ-variables defines
     # the environment variables.
     # https://www.python.org/dev/peps/pep-0333/#url-reconstruction defines
     # how URLs are reconstructed.
     fullurl = env['wsgi.url_scheme'] + '://'
-    advertisedfullurl = fullurl
+
+    if altbaseurl and altbaseurl.scheme:
+        advertisedfullurl = altbaseurl.scheme + '://'
+    else:
+        advertisedfullurl = fullurl
 
-    def addport(s):
-        if env['wsgi.url_scheme'] == 'https':
-            if env['SERVER_PORT'] != '443':
-                s += ':' + env['SERVER_PORT']
+    def addport(s, port):
+        if s.startswith('https://'):
+            if port != '443':
+                s += ':' + port
         else:
-            if env['SERVER_PORT'] != '80':
-                s += ':' + env['SERVER_PORT']
+            if port != '80':
+                s += ':' + port
 
         return s
 
     if env.get('HTTP_HOST'):
         fullurl += env['HTTP_HOST']
     else:
         fullurl += env['SERVER_NAME']
-        fullurl = addport(fullurl)
+        fullurl = addport(fullurl, env['SERVER_PORT'])
+
+    if altbaseurl and altbaseurl.host:
+        advertisedfullurl += altbaseurl.host
 
-    advertisedfullurl += env['SERVER_NAME']
-    advertisedfullurl = addport(advertisedfullurl)
+        if altbaseurl.port:
+            port = altbaseurl.port
+        elif altbaseurl.scheme == 'http' and not altbaseurl.port:
+            port = '80'
+        elif altbaseurl.scheme == 'https' and not altbaseurl.port:
+            port = '443'
+        else:
+            port = env['SERVER_PORT']
+
+        advertisedfullurl = addport(advertisedfullurl, port)
+    else:
+        advertisedfullurl += env['SERVER_NAME']
+        advertisedfullurl = addport(advertisedfullurl, env['SERVER_PORT'])
 
     baseurl = fullurl
     advertisedbaseurl = advertisedfullurl
 
     fullurl += util.urlreq.quote(env.get('SCRIPT_NAME', ''))
-    advertisedfullurl += util.urlreq.quote(env.get('SCRIPT_NAME', ''))
     fullurl += util.urlreq.quote(env.get('PATH_INFO', ''))
+
+    if altbaseurl:
+        path = altbaseurl.path or ''
+        if path and not path.startswith('/'):
+            path = '/' + path
+        advertisedfullurl += util.urlreq.quote(path)
+    else:
+        advertisedfullurl += util.urlreq.quote(env.get('SCRIPT_NAME', ''))
+
     advertisedfullurl += util.urlreq.quote(env.get('PATH_INFO', ''))
 
     if env.get('QUERY_STRING'):
@@ -226,7 +265,12 @@
     # that represents the repository being dispatched to. When computing
     # the dispatch info, we ignore these leading path components.
 
-    apppath = env.get('SCRIPT_NAME', '')
+    if altbaseurl:
+        apppath = altbaseurl.path or ''
+        if apppath and not apppath.startswith('/'):
+            apppath = '/' + apppath
+    else:
+        apppath = env.get('SCRIPT_NAME', '')
 
     if reponame:
         repoprefix = '/' + reponame.strip('/')
@@ -545,7 +589,7 @@
     instantiate instances of this class, which provides higher-level APIs
     for obtaining request parameters, writing HTTP output, etc.
     """
-    def __init__(self, wsgienv, start_response):
+    def __init__(self, wsgienv, start_response, altbaseurl=None):
         version = wsgienv[r'wsgi.version']
         if (version < (1, 0)) or (version >= (2, 0)):
             raise RuntimeError("Unknown and unsupported WSGI version %d.%d"
@@ -563,7 +607,7 @@
         self.multiprocess = wsgienv[r'wsgi.multiprocess']
         self.run_once = wsgienv[r'wsgi.run_once']
         self.env = wsgienv
-        self.req = parserequestfromenv(wsgienv, inp)
+        self.req = parserequestfromenv(wsgienv, inp, altbaseurl=altbaseurl)
         self.res = wsgiresponse(self.req, start_response)
         self._start_response = start_response
         self.server_write = None
diff --git a/mercurial/hgweb/hgwebdir_mod.py b/mercurial/hgweb/hgwebdir_mod.py
--- a/mercurial/hgweb/hgwebdir_mod.py
+++ b/mercurial/hgweb/hgwebdir_mod.py
@@ -33,7 +33,6 @@
     error,
     hg,
     profiling,
-    pycompat,
     scmutil,
     templater,
     ui as uimod,
@@ -83,33 +82,6 @@
         yield (prefix + '/' +
                util.pconvert(path[len(roothead):]).lstrip('/')).strip('/'), path
 
-def geturlcgivars(baseurl, port):
-    """
-    Extract CGI variables from baseurl
-
-    >>> geturlcgivars(b"http://host.org/base", b"80")
-    ('host.org', '80', '/base')
-    >>> geturlcgivars(b"http://host.org:8000/base", b"80")
-    ('host.org', '8000', '/base')
-    >>> geturlcgivars(b'/base', 8000)
-    ('', '8000', '/base')
-    >>> geturlcgivars(b"base", b'8000')
-    ('', '8000', '/base')
-    >>> geturlcgivars(b"http://host", b'8000')
-    ('host', '8000', '/')
-    >>> geturlcgivars(b"http://host/", b'8000')
-    ('host', '8000', '/')
-    """
-    u = util.url(baseurl)
-    name = u.host or ''
-    if u.port:
-        port = u.port
-    path = u.path or ""
-    if not path.startswith('/'):
-        path = '/' + path
-
-    return name, pycompat.bytestr(port), path
-
 def readallowed(ui, req):
     """Check allow_read and deny_read config options of a repo's ui object
     to determine user permissions.  By default, with neither option set (or
@@ -359,7 +331,6 @@
         self.stripecount = self.ui.config('web', 'stripes')
         if self.stripecount:
             self.stripecount = int(self.stripecount)
-        self._baseurl = self.ui.config('web', 'baseurl')
         prefix = self.ui.config('web', 'prefix')
         if prefix.startswith('/'):
             prefix = prefix[1:]
@@ -376,7 +347,8 @@
         wsgicgi.launch(self)
 
     def __call__(self, env, respond):
-        wsgireq = requestmod.wsgirequest(env, respond)
+        baseurl = self.ui.config('web', 'baseurl')
+        wsgireq = requestmod.wsgirequest(env, respond, altbaseurl=baseurl)
         return self.run_wsgi(wsgireq)
 
     def run_wsgi(self, wsgireq):
@@ -455,7 +427,8 @@
                     # Re-parse the WSGI environment to take into account our
                     # repository path component.
                     wsgireq.req = requestmod.parserequestfromenv(
-                        wsgireq.env, wsgireq.req.bodyfh, reponame=virtualrepo)
+                        wsgireq.env, wsgireq.req.bodyfh, reponame=virtualrepo,
+                        altbaseurl=self.ui.config('web', 'baseurl'))
                     try:
                         # ensure caller gets private copy of ui
                         repo = hg.repository(self.ui.copy(), real)
@@ -502,7 +475,6 @@
                 for column in sortable]
 
         self.refresh()
-        self.updatereqenv(wsgireq.env)
 
         entries = indexentries(self.ui, self.repos, wsgireq, req,
                                self.stripecount, sortcolumn=sortcolumn,
@@ -524,8 +496,6 @@
         def config(section, name, default=uimod._unset, untrusted=True):
             return self.ui.config(section, name, default, untrusted)
 
-        self.updatereqenv(wsgireq.env)
-
         url = wsgireq.env.get('SCRIPT_NAME', '')
         if not url.endswith('/'):
             url += '/'
@@ -557,10 +527,3 @@
         }
         tmpl = templater.templater.frommapfile(mapfile, defaults=defaults)
         return tmpl
-
-    def updatereqenv(self, env):
-        if self._baseurl is not None:
-            name, port, path = geturlcgivars(self._baseurl, env['SERVER_PORT'])
-            env['SERVER_NAME'] = name
-            env['SERVER_PORT'] = port
-            env['SCRIPT_NAME'] = path



To: indygreg, #hg-reviewers
Cc: mercurial-devel


More information about the Mercurial-devel mailing list