[PATCH 2 of 3 RFC] url: move URL parsing functions into util to improve startup time

Brodie Rao brodie at bitheap.org
Sat Apr 30 09:06:53 CDT 2011


# HG changeset patch
# User Brodie Rao <brodie at bitheap.org>
# Date 1304172015 25200
# Node ID 75e0dc9378549f46352bb68ac431cf1a59bf9518
# Parent  d4fc43e11fad0fcad792320169ae26cfa6297e48
url: move URL parsing functions into util to improve startup time

The introduction of the new URL parsing code has created a startup
time regression. This is mainly due to the use of url.hasscheme() in
the ui class. It ends up importing many libraries that the url module
requires.

This fix helps marginally, but if we can get rid of the urllib import
in the URL parser all together, startup time will go back to normal.

perfstartup time before the URL refactoring (8796fb6af67e):

! wall 0.050692 comb 0.000000 user 0.000000 sys 0.000000 (best of 100)

current startup time (139fb11210bb):

! wall 0.070685 comb 0.000000 user 0.000000 sys 0.000000 (best of 100)

after this change:

! wall 0.064742 comb 0.000000 user 0.000000 sys 0.000000 (best of 100)

diff --git a/hgext/fetch.py b/hgext/fetch.py
--- a/hgext/fetch.py
+++ b/hgext/fetch.py
@@ -9,7 +9,7 @@
 
 from mercurial.i18n import _
 from mercurial.node import nullid, short
-from mercurial import commands, cmdutil, hg, util, url, error
+from mercurial import commands, cmdutil, hg, util, error
 from mercurial.lock import release
 
 def fetch(ui, repo, source='default', **opts):
@@ -66,7 +66,7 @@ def fetch(ui, repo, source='default', **
         other = hg.repository(hg.remoteui(repo, opts),
                               ui.expandpath(source))
         ui.status(_('pulling from %s\n') %
-                  url.hidepassword(ui.expandpath(source)))
+                  util.hidepassword(ui.expandpath(source)))
         revs = None
         if opts['rev']:
             try:
@@ -125,7 +125,7 @@ def fetch(ui, repo, source='default', **
             # we don't translate commit messages
             message = (cmdutil.logmessage(opts) or
                        ('Automated merge with %s' %
-                        url.removeauth(other.url())))
+                        util.removeauth(other.url())))
             editor = cmdutil.commiteditor
             if opts.get('force_editor') or opts.get('edit'):
                 editor = cmdutil.commitforceeditor
diff --git a/hgext/patchbomb.py b/hgext/patchbomb.py
--- a/hgext/patchbomb.py
+++ b/hgext/patchbomb.py
@@ -48,7 +48,7 @@ hgrc(5) for details.
 import os, errno, socket, tempfile, cStringIO, time
 import email.MIMEMultipart, email.MIMEBase
 import email.Utils, email.Encoders, email.Generator
-from mercurial import cmdutil, commands, hg, mail, patch, util, discovery, url
+from mercurial import cmdutil, commands, hg, mail, patch, util, discovery
 from mercurial.i18n import _
 from mercurial.node import bin
 
@@ -241,7 +241,7 @@ def patchbomb(ui, repo, *revs, **opts):
         if revs:
             revs = [repo.lookup(rev) for rev in revs]
         other = hg.repository(hg.remoteui(repo, opts), dest)
-        ui.status(_('comparing with %s\n') % url.hidepassword(dest))
+        ui.status(_('comparing with %s\n') % util.hidepassword(dest))
         o = discovery.findoutgoing(repo, other)
         if not o:
             ui.status(_("no changes found\n"))
diff --git a/hgext/schemes.py b/hgext/schemes.py
--- a/hgext/schemes.py
+++ b/hgext/schemes.py
@@ -41,7 +41,7 @@ same name.
 """
 
 import os, re
-from mercurial import extensions, hg, templater, url as urlmod, util
+from mercurial import extensions, hg, templater, util
 from mercurial.i18n import _
 
 
@@ -95,4 +95,4 @@ def extsetup(ui):
                                'letter %s:\\\n') % (scheme, scheme.upper()))
         hg.schemes[scheme] = ShortRepository(url, scheme, t)
 
-    extensions.wrapfunction(urlmod, 'hasdriveletter', hasdriveletter)
+    extensions.wrapfunction(util, 'hasdriveletter', hasdriveletter)
diff --git a/mercurial/bundlerepo.py b/mercurial/bundlerepo.py
--- a/mercurial/bundlerepo.py
+++ b/mercurial/bundlerepo.py
@@ -15,7 +15,7 @@ from node import nullid
 from i18n import _
 import os, struct, tempfile, shutil
 import changegroup, util, mdiff, discovery
-import localrepo, changelog, manifest, filelog, revlog, error, url
+import localrepo, changelog, manifest, filelog, revlog, error
 
 class bundlerevlog(revlog.revlog):
     def __init__(self, opener, indexfile, bundle,
@@ -274,7 +274,7 @@ def instance(ui, path, create):
             cwd = os.path.join(cwd,'')
             if parentpath.startswith(cwd):
                 parentpath = parentpath[len(cwd):]
-    u = url.url(path)
+    u = util.url(path)
     path = u.localpath()
     if u.scheme == 'bundle':
         s = path.split("+", 1)
diff --git a/mercurial/commands.py b/mercurial/commands.py
--- a/mercurial/commands.py
+++ b/mercurial/commands.py
@@ -2636,7 +2636,7 @@ def incoming(ui, repo, source="default",
         if 'bookmarks' not in other.listkeys('namespaces'):
             ui.warn(_("remote doesn't support bookmarks\n"))
             return 0
-        ui.status(_('comparing with %s\n') % url.hidepassword(source))
+        ui.status(_('comparing with %s\n') % util.hidepassword(source))
         return bookmarks.diff(ui, repo, other)
 
     ret = hg.incoming(ui, repo, source, opts)
@@ -2923,7 +2923,7 @@ def outgoing(ui, repo, dest=None, **opts
         if 'bookmarks' not in other.listkeys('namespaces'):
             ui.warn(_("remote doesn't support bookmarks\n"))
             return 0
-        ui.status(_('comparing with %s\n') % url.hidepassword(dest))
+        ui.status(_('comparing with %s\n') % util.hidepassword(dest))
         return bookmarks.diff(ui, other, repo)
 
     ret = hg.outgoing(ui, repo, dest, opts)
@@ -2997,13 +2997,13 @@ def paths(ui, repo, search=None):
     if search:
         for name, path in ui.configitems("paths"):
             if name == search:
-                ui.write("%s\n" % url.hidepassword(path))
+                ui.write("%s\n" % util.hidepassword(path))
                 return
         ui.warn(_("not found!\n"))
         return 1
     else:
         for name, path in ui.configitems("paths"):
-            ui.write("%s = %s\n" % (name, url.hidepassword(path)))
+            ui.write("%s = %s\n" % (name, util.hidepassword(path)))
 
 def postincoming(ui, repo, modheads, optupdate, checkout):
     if modheads == 0:
@@ -3046,7 +3046,7 @@ def pull(ui, repo, source="default", **o
     """
     source, branches = hg.parseurl(ui.expandpath(source), opts.get('branch'))
     other = hg.repository(hg.remoteui(repo, opts), source)
-    ui.status(_('pulling from %s\n') % url.hidepassword(source))
+    ui.status(_('pulling from %s\n') % util.hidepassword(source))
     revs, checkout = hg.addbranchrevs(repo, other, branches, opts.get('rev'))
 
     if opts.get('bookmark'):
@@ -3129,7 +3129,7 @@ def push(ui, repo, dest=None, **opts):
 
     dest = ui.expandpath(dest or 'default-push', dest or 'default')
     dest, branches = hg.parseurl(dest, opts.get('branch'))
-    ui.status(_('pushing to %s\n') % url.hidepassword(dest))
+    ui.status(_('pushing to %s\n') % util.hidepassword(dest))
     revs, checkout = hg.addbranchrevs(repo, repo, branches, opts.get('rev'))
     other = hg.repository(hg.remoteui(repo, opts), dest)
     if revs:
@@ -3948,7 +3948,7 @@ def summary(ui, repo, **opts):
         source, branches = hg.parseurl(ui.expandpath('default'))
         other = hg.repository(hg.remoteui(repo, {}), source)
         revs, checkout = hg.addbranchrevs(repo, other, branches, opts.get('rev'))
-        ui.debug('comparing with %s\n' % url.hidepassword(source))
+        ui.debug('comparing with %s\n' % util.hidepassword(source))
         repo.ui.pushbuffer()
         common, incoming, rheads = discovery.findcommonincoming(repo, other)
         repo.ui.popbuffer()
@@ -3958,7 +3958,7 @@ def summary(ui, repo, **opts):
         dest, branches = hg.parseurl(ui.expandpath('default-push', 'default'))
         revs, checkout = hg.addbranchrevs(repo, repo, branches, None)
         other = hg.repository(hg.remoteui(repo, {}), dest)
-        ui.debug('comparing with %s\n' % url.hidepassword(dest))
+        ui.debug('comparing with %s\n' % util.hidepassword(dest))
         repo.ui.pushbuffer()
         o = discovery.findoutgoing(repo, other)
         repo.ui.popbuffer()
diff --git a/mercurial/hg.py b/mercurial/hg.py
--- a/mercurial/hg.py
+++ b/mercurial/hg.py
@@ -11,13 +11,13 @@ from lock import release
 from node import hex, nullid, nullrev, short
 import localrepo, bundlerepo, httprepo, sshrepo, statichttprepo, bookmarks
 import lock, util, extensions, error, encoding, node
-import cmdutil, discovery, url
+import cmdutil, discovery
 import merge as mergemod
 import verify as verifymod
 import errno, os, shutil
 
 def _local(path):
-    path = util.expandpath(url.localpath(path))
+    path = util.expandpath(util.localpath(path))
     return (os.path.isfile(path) and bundlerepo or localrepo)
 
 def addbranchrevs(lrepo, repo, branches, revs):
@@ -54,7 +54,7 @@ def addbranchrevs(lrepo, repo, branches,
 def parseurl(path, branches=None):
     '''parse url#branch, returning (url, (branch, branches))'''
 
-    u = url.url(path)
+    u = util.url(path)
     branch = None
     if u.fragment:
         branch = u.fragment
@@ -71,7 +71,7 @@ schemes = {
 }
 
 def _lookup(path):
-    u = url.url(path)
+    u = util.url(path)
     scheme = u.scheme or 'file'
     thing = schemes.get(scheme) or schemes['file']
     try:
@@ -221,8 +221,8 @@ def clone(ui, source, dest=None, pull=Fa
     else:
         dest = ui.expandpath(dest)
 
-    dest = url.localpath(dest)
-    source = url.localpath(source)
+    dest = util.localpath(dest)
+    source = util.localpath(source)
 
     if os.path.exists(dest):
         if not os.path.isdir(dest):
@@ -248,7 +248,7 @@ def clone(ui, source, dest=None, pull=Fa
         abspath = origsource
         copy = False
         if src_repo.cancopy() and islocal(dest):
-            abspath = os.path.abspath(url.localpath(origsource))
+            abspath = os.path.abspath(util.localpath(origsource))
             copy = not pull and not rev
 
         if copy:
@@ -421,7 +421,7 @@ def _incoming(displaychlist, subreporecu
     """
     source, branches = parseurl(ui.expandpath(source), opts.get('branch'))
     other = repository(remoteui(repo, opts), source)
-    ui.status(_('comparing with %s\n') % url.hidepassword(source))
+    ui.status(_('comparing with %s\n') % util.hidepassword(source))
     revs, checkout = addbranchrevs(repo, other, branches, opts.get('rev'))
 
     if revs:
@@ -482,7 +482,7 @@ def incoming(ui, repo, source, opts):
 def _outgoing(ui, repo, dest, opts):
     dest = ui.expandpath(dest or 'default-push', dest or 'default')
     dest, branches = parseurl(dest, opts.get('branch'))
-    ui.status(_('comparing with %s\n') % url.hidepassword(dest))
+    ui.status(_('comparing with %s\n') % util.hidepassword(dest))
     revs, checkout = addbranchrevs(repo, repo, branches, opts.get('rev'))
     if revs:
         revs = [repo.lookup(rev) for rev in revs]
diff --git a/mercurial/hgweb/hgwebdir_mod.py b/mercurial/hgweb/hgwebdir_mod.py
--- a/mercurial/hgweb/hgwebdir_mod.py
+++ b/mercurial/hgweb/hgwebdir_mod.py
@@ -9,7 +9,7 @@
 import os, re, time
 from mercurial.i18n import _
 from mercurial import ui, hg, scmutil, util, templater
-from mercurial import error, encoding, url
+from mercurial import error, encoding
 from common import ErrorResponse, get_mtime, staticfile, paritygen, \
                    get_contact, HTTP_OK, HTTP_NOT_FOUND, HTTP_SERVER_ERROR
 from hgweb_mod import hgweb
@@ -364,7 +364,7 @@ class hgwebdir(object):
 
     def updatereqenv(self, env):
         if self._baseurl is not None:
-            u = url.url(self._baseurl)
+            u = util.url(self._baseurl)
             env['SERVER_NAME'] = u.host
             if u.port:
                 env['SERVER_PORT'] = u.port
diff --git a/mercurial/httprepo.py b/mercurial/httprepo.py
--- a/mercurial/httprepo.py
+++ b/mercurial/httprepo.py
@@ -28,7 +28,7 @@ class httprepository(wireproto.wirerepos
         self.path = path
         self.caps = None
         self.handler = None
-        u = url.url(path)
+        u = util.url(path)
         if u.query or u.fragment:
             raise util.Abort(_('unsupported URL component: "%s"') %
                              (u.query or u.fragment))
@@ -111,12 +111,12 @@ class httprepository(wireproto.wirerepos
         except AttributeError:
             proto = resp.headers['content-type']
 
-        safeurl = url.hidepassword(self._url)
+        safeurl = util.hidepassword(self._url)
         # accept old "text/plain" and "application/hg-changegroup" for now
         if not (proto.startswith('application/mercurial-') or
                 proto.startswith('text/plain') or
                 proto.startswith('application/hg-changegroup')):
-            self.ui.debug("requested URL: '%s'\n" % url.hidepassword(cu))
+            self.ui.debug("requested URL: '%s'\n" % util.hidepassword(cu))
             raise error.RepoError(
                 _("'%s' does not appear to be an hg repository:\n"
                   "---%%<--- (%s)\n%s\n---%%<---\n")
diff --git a/mercurial/localrepo.py b/mercurial/localrepo.py
--- a/mercurial/localrepo.py
+++ b/mercurial/localrepo.py
@@ -14,7 +14,6 @@ import scmutil, util, extensions, hook, 
 import match as matchmod
 import merge as mergemod
 import tags as tagsmod
-import url as urlmod
 from lock import release
 import weakref, errno, os, time, inspect
 propertycache = util.propertycache
@@ -1692,7 +1691,7 @@ class localrepository(repo.repository):
         cl.delayupdate()
         oldheads = cl.heads()
 
-        tr = self.transaction("\n".join([srctype, urlmod.hidepassword(url)]))
+        tr = self.transaction("\n".join([srctype, util.hidepassword(url)]))
         try:
             trp = weakref.proxy(tr)
             # pull off the changeset group
@@ -1934,7 +1933,7 @@ def aftertrans(files):
     return a
 
 def instance(ui, path, create):
-    return localrepository(ui, urlmod.localpath(path), create)
+    return localrepository(ui, util.localpath(path), create)
 
 def islocal(path):
     return True
diff --git a/mercurial/sshrepo.py b/mercurial/sshrepo.py
--- a/mercurial/sshrepo.py
+++ b/mercurial/sshrepo.py
@@ -6,7 +6,7 @@
 # GNU General Public License version 2 or any later version.
 
 from i18n import _
-import util, error, wireproto, url
+import util, error, wireproto
 
 class remotelock(object):
     def __init__(self, repo):
@@ -23,7 +23,7 @@ class sshrepository(wireproto.wirereposi
         self._url = path
         self.ui = ui
 
-        u = url.url(path, parsequery=False, parsefragment=False)
+        u = util.url(path, parsequery=False, parsefragment=False)
         if u.scheme != 'ssh' or not u.host or u.path is None:
             self._abort(error.RepoError(_("couldn't parse location %s") % path))
 
diff --git a/mercurial/statichttprepo.py b/mercurial/statichttprepo.py
--- a/mercurial/statichttprepo.py
+++ b/mercurial/statichttprepo.py
@@ -85,7 +85,7 @@ class statichttprepository(localrepo.loc
         self.ui = ui
 
         self.root = path
-        u = url.url(path.rstrip('/') + "/.hg")
+        u = util.url(path.rstrip('/') + "/.hg")
         self.path, authinfo = u.authinfo()
 
         opener = build_opener(ui, authinfo)
diff --git a/mercurial/subrepo.py b/mercurial/subrepo.py
--- a/mercurial/subrepo.py
+++ b/mercurial/subrepo.py
@@ -8,7 +8,7 @@
 import errno, os, re, xml.dom.minidom, shutil, posixpath
 import stat, subprocess, tarfile
 from i18n import _
-import config, scmutil, util, node, error, cmdutil, url, bookmarks
+import config, scmutil, util, node, error, cmdutil, bookmarks
 hg = None
 propertycache = util.propertycache
 
@@ -194,13 +194,13 @@ def _abssource(repo, push=False, abort=T
     """return pull/push path of repo - either based on parent repo .hgsub info
     or on the top repo config. Abort or return None if no source found."""
     if hasattr(repo, '_subparent'):
-        source = url.url(repo._subsource)
+        source = util.url(repo._subsource)
         source.path = posixpath.normpath(source.path)
         if posixpath.isabs(source.path) or source.scheme:
             return str(source)
         parent = _abssource(repo._subparent, push, abort=False)
         if parent:
-            parent = url.url(parent)
+            parent = util.url(parent)
             parent.path = posixpath.join(parent.path, source.path)
             parent.path = posixpath.normpath(parent.path)
             return str(parent)
diff --git a/mercurial/ui.py b/mercurial/ui.py
--- a/mercurial/ui.py
+++ b/mercurial/ui.py
@@ -7,7 +7,7 @@
 
 from i18n import _
 import errno, getpass, os, socket, sys, tempfile, traceback
-import config, scmutil, util, error, url
+import config, scmutil, util, error
 
 class ui(object):
     def __init__(self, src=None):
@@ -111,7 +111,7 @@ class ui(object):
                                   % (n, p, self.configsource('paths', n)))
                         p = p.replace('%%', '%')
                     p = util.expandpath(p)
-                    if not url.hasscheme(p) and not os.path.isabs(p):
+                    if not util.hasscheme(p) and not os.path.isabs(p):
                         p = os.path.normpath(os.path.join(root, p))
                     c.set("paths", n, p)
 
@@ -332,7 +332,7 @@ class ui(object):
 
     def expandpath(self, loc, default=None):
         """Return repository location relative to cwd or from [paths]"""
-        if url.hasscheme(loc) or os.path.isdir(os.path.join(loc, '.hg')):
+        if util.hasscheme(loc) or os.path.isdir(os.path.join(loc, '.hg')):
             return loc
 
         path = self.config('paths', loc)
diff --git a/mercurial/url.py b/mercurial/url.py
--- a/mercurial/url.py
+++ b/mercurial/url.py
@@ -7,305 +7,11 @@
 # This software may be used and distributed according to the terms of the
 # GNU General Public License version 2 or any later version.
 
-import urllib, urllib2, httplib, os, socket, cStringIO, re
+import urllib, urllib2, httplib, os, socket, cStringIO
 import __builtin__
 from i18n import _
 import keepalive, util
 
-class url(object):
-    """Reliable URL parser.
-
-    This parses URLs and provides attributes for the following
-    components:
-
-    <scheme>://<user>:<passwd>@<host>:<port>/<path>?<query>#<fragment>
-
-    Missing components are set to None. The only exception is
-    fragment, which is set to '' if present but empty.
-
-    If parsefragment is False, fragment is included in query. If
-    parsequery is False, query is included in path. If both are
-    False, both fragment and query are included in path.
-
-    See http://www.ietf.org/rfc/rfc2396.txt for more information.
-
-    Note that for backward compatibility reasons, bundle URLs do not
-    take host names. That means 'bundle://../' has a path of '../'.
-
-    Examples:
-
-    >>> url('http://www.ietf.org/rfc/rfc2396.txt')
-    <url scheme: 'http', host: 'www.ietf.org', path: 'rfc/rfc2396.txt'>
-    >>> url('ssh://[::1]:2200//home/joe/repo')
-    <url scheme: 'ssh', host: '[::1]', port: '2200', path: '/home/joe/repo'>
-    >>> url('file:///home/joe/repo')
-    <url scheme: 'file', path: '/home/joe/repo'>
-    >>> url('bundle:foo')
-    <url scheme: 'bundle', path: 'foo'>
-    >>> url('bundle://../foo')
-    <url scheme: 'bundle', path: '../foo'>
-    >>> url('c:\\\\foo\\\\bar')
-    <url path: 'c:\\\\foo\\\\bar'>
-
-    Authentication credentials:
-
-    >>> url('ssh://joe:xyz@x/repo')
-    <url scheme: 'ssh', user: 'joe', passwd: 'xyz', host: 'x', path: 'repo'>
-    >>> url('ssh://joe@x/repo')
-    <url scheme: 'ssh', user: 'joe', host: 'x', path: 'repo'>
-
-    Query strings and fragments:
-
-    >>> url('http://host/a?b#c')
-    <url scheme: 'http', host: 'host', path: 'a', query: 'b', fragment: 'c'>
-    >>> url('http://host/a?b#c', parsequery=False, parsefragment=False)
-    <url scheme: 'http', host: 'host', path: 'a?b#c'>
-    """
-
-    _safechars = "!~*'()+"
-    _safepchars = "/!~*'()+"
-    _matchscheme = re.compile(r'^[a-zA-Z0-9+.\-]+:').match
-
-    def __init__(self, path, parsequery=True, parsefragment=True):
-        # We slowly chomp away at path until we have only the path left
-        self.scheme = self.user = self.passwd = self.host = None
-        self.port = self.path = self.query = self.fragment = None
-        self._localpath = True
-        self._hostport = ''
-        self._origpath = path
-
-        # special case for Windows drive letters
-        if hasdriveletter(path):
-            self.path = path
-            return
-
-        # For compatibility reasons, we can't handle bundle paths as
-        # normal URLS
-        if path.startswith('bundle:'):
-            self.scheme = 'bundle'
-            path = path[7:]
-            if path.startswith('//'):
-                path = path[2:]
-            self.path = path
-            return
-
-        if self._matchscheme(path):
-            parts = path.split(':', 1)
-            if parts[0]:
-                self.scheme, path = parts
-                self._localpath = False
-
-        if not path:
-            path = None
-            if self._localpath:
-                self.path = ''
-                return
-        else:
-            if parsefragment and '#' in path:
-                path, self.fragment = path.split('#', 1)
-                if not path:
-                    path = None
-            if self._localpath:
-                self.path = path
-                return
-
-            if parsequery and '?' in path:
-                path, self.query = path.split('?', 1)
-                if not path:
-                    path = None
-                if not self.query:
-                    self.query = None
-
-            # // is required to specify a host/authority
-            if path and path.startswith('//'):
-                parts = path[2:].split('/', 1)
-                if len(parts) > 1:
-                    self.host, path = parts
-                    path = path
-                else:
-                    self.host = parts[0]
-                    path = None
-                if not self.host:
-                    self.host = None
-                    if path:
-                        path = '/' + path
-
-            if self.host and '@' in self.host:
-                self.user, self.host = self.host.rsplit('@', 1)
-                if ':' in self.user:
-                    self.user, self.passwd = self.user.split(':', 1)
-                if not self.host:
-                    self.host = None
-
-            # Don't split on colons in IPv6 addresses without ports
-            if (self.host and ':' in self.host and
-                not (self.host.startswith('[') and self.host.endswith(']'))):
-                self._hostport = self.host
-                self.host, self.port = self.host.rsplit(':', 1)
-                if not self.host:
-                    self.host = None
-
-            if (self.host and self.scheme == 'file' and
-                self.host not in ('localhost', '127.0.0.1', '[::1]')):
-                raise util.Abort(_('file:// URLs can only refer to localhost'))
-
-        self.path = path
-
-        for a in ('user', 'passwd', 'host', 'port',
-                  'path', 'query', 'fragment'):
-            v = getattr(self, a)
-            if v is not None:
-                setattr(self, a, urllib.unquote(v))
-
-    def __repr__(self):
-        attrs = []
-        for a in ('scheme', 'user', 'passwd', 'host', 'port', 'path',
-                  'query', 'fragment'):
-            v = getattr(self, a)
-            if v is not None:
-                attrs.append('%s: %r' % (a, v))
-        return '<url %s>' % ', '.join(attrs)
-
-    def __str__(self):
-        """Join the URL's components back into a URL string.
-
-        Examples:
-
-        >>> str(url('http://user:pw@host:80/?foo#bar'))
-        'http://user:pw@host:80/?foo#bar'
-        >>> str(url('ssh://user:pw@[::1]:2200//home/joe#'))
-        'ssh://user:pw@[::1]:2200//home/joe#'
-        >>> str(url('http://localhost:80//'))
-        'http://localhost:80//'
-        >>> str(url('http://localhost:80/'))
-        'http://localhost:80/'
-        >>> str(url('http://localhost:80'))
-        'http://localhost:80/'
-        >>> str(url('bundle:foo'))
-        'bundle:foo'
-        >>> str(url('bundle://../foo'))
-        'bundle:../foo'
-        >>> str(url('path'))
-        'path'
-        """
-        if self._localpath:
-            s = self.path
-            if self.scheme == 'bundle':
-                s = 'bundle:' + s
-            if self.fragment:
-                s += '#' + self.fragment
-            return s
-
-        s = self.scheme + ':'
-        if (self.user or self.passwd or self.host or
-            self.scheme and not self.path):
-            s += '//'
-        if self.user:
-            s += urllib.quote(self.user, safe=self._safechars)
-        if self.passwd:
-            s += ':' + urllib.quote(self.passwd, safe=self._safechars)
-        if self.user or self.passwd:
-            s += '@'
-        if self.host:
-            if not (self.host.startswith('[') and self.host.endswith(']')):
-                s += urllib.quote(self.host)
-            else:
-                s += self.host
-        if self.port:
-            s += ':' + urllib.quote(self.port)
-        if self.host:
-            s += '/'
-        if self.path:
-            s += urllib.quote(self.path, safe=self._safepchars)
-        if self.query:
-            s += '?' + urllib.quote(self.query, safe=self._safepchars)
-        if self.fragment is not None:
-            s += '#' + urllib.quote(self.fragment, safe=self._safepchars)
-        return s
-
-    def authinfo(self):
-        user, passwd = self.user, self.passwd
-        try:
-            self.user, self.passwd = None, None
-            s = str(self)
-        finally:
-            self.user, self.passwd = user, passwd
-        if not self.user:
-            return (s, None)
-        return (s, (None, (str(self), self.host),
-                    self.user, self.passwd or ''))
-
-    def localpath(self):
-        if self.scheme == 'file' or self.scheme == 'bundle':
-            path = self.path or '/'
-            # For Windows, we need to promote hosts containing drive
-            # letters to paths with drive letters.
-            if hasdriveletter(self._hostport):
-                path = self._hostport + '/' + self.path
-            elif self.host is not None and self.path:
-                path = '/' + path
-            # We also need to handle the case of file:///C:/, which
-            # should return C:/, not /C:/.
-            elif hasdriveletter(path):
-                # Strip leading slash from paths with drive names
-                return path[1:]
-            return path
-        return self._origpath
-
-def hasscheme(path):
-    return bool(url(path).scheme)
-
-def hasdriveletter(path):
-    return path[1:2] == ':' and path[0:1].isalpha()
-
-def localpath(path):
-    return url(path, parsequery=False, parsefragment=False).localpath()
-
-def hidepassword(u):
-    '''hide user credential in a url string'''
-    u = url(u)
-    if u.passwd:
-        u.passwd = '***'
-    return str(u)
-
-def removeauth(u):
-    '''remove all authentication information from a url string'''
-    u = url(u)
-    u.user = u.passwd = None
-    return str(u)
-
-def readauthforuri(ui, uri):
-    # Read configuration
-    config = dict()
-    for key, val in ui.configitems('auth'):
-        if '.' not in key:
-            ui.warn(_("ignoring invalid [auth] key '%s'\n") % key)
-            continue
-        group, setting = key.rsplit('.', 1)
-        gdict = config.setdefault(group, dict())
-        if setting in ('username', 'cert', 'key'):
-            val = util.expandpath(val)
-        gdict[setting] = val
-
-    # Find the best match
-    scheme, hostpath = uri.split('://', 1)
-    bestlen = 0
-    bestauth = None
-    for group, auth in config.iteritems():
-        prefix = auth.get('prefix')
-        if not prefix:
-            continue
-        p = prefix.split('://', 1)
-        if len(p) > 1:
-            schemes, prefix = [p[0]], p[1]
-        else:
-            schemes = (auth.get('schemes') or 'https').split()
-        if (prefix == '*' or hostpath.startswith(prefix)) and \
-            len(prefix) > bestlen and scheme in schemes:
-            bestlen = len(prefix)
-            bestauth = group, auth
-    return bestauth
-
 class passwordmgr(urllib2.HTTPPasswordMgrWithDefaultRealm):
     def __init__(self, ui):
         urllib2.HTTPPasswordMgrWithDefaultRealm.__init__(self)
@@ -320,7 +26,7 @@ class passwordmgr(urllib2.HTTPPasswordMg
             return (user, passwd)
 
         if not user:
-            res = readauthforuri(self.ui, authuri)
+            res = util.readauthforuri(self.ui, authuri)
             if res:
                 group, auth = res
                 user, passwd = auth.get('username'), auth.get('password')
@@ -357,7 +63,7 @@ class proxyhandler(urllib2.ProxyHandler)
             if not (proxyurl.startswith('http:') or
                     proxyurl.startswith('https:')):
                 proxyurl = 'http://' + proxyurl + '/'
-            proxy = url(proxyurl)
+            proxy = util.url(proxyurl)
             if not proxy.user:
                 proxy.user = ui.config("http_proxy", "user")
                 proxy.passwd = ui.config("http_proxy", "passwd")
@@ -545,7 +251,7 @@ def _generic_start_transaction(handler, 
         new_tunnel = False
 
     if new_tunnel or tunnel_host == req.get_full_url(): # has proxy
-        u = url(tunnel_host)
+        u = util.url(tunnel_host)
         if new_tunnel or u.scheme == 'https': # only use CONNECT for HTTPS
             h.realhostport = ':'.join([u.host, (u.port or '443')])
             h.headers = req.headers.copy()
@@ -769,7 +475,7 @@ if has_https:
             return keepalive.KeepAliveHandler._start_transaction(self, h, req)
 
         def https_open(self, req):
-            res = readauthforuri(self.ui, req.get_full_url())
+            res = util.readauthforuri(self.ui, req.get_full_url())
             if res:
                 group, auth = res
                 self.auth = auth
@@ -876,7 +582,7 @@ def opener(ui, authinfo=None):
     return opener
 
 def open(ui, url_, data=None):
-    u = url(url_)
+    u = util.url(url_)
     if u.scheme:
         u.scheme = u.scheme.lower()
         url_, authinfo = u.authinfo()
diff --git a/mercurial/util.py b/mercurial/util.py
--- a/mercurial/util.py
+++ b/mercurial/util.py
@@ -17,7 +17,7 @@ from i18n import _
 import error, osutil, encoding
 import errno, re, shutil, sys, tempfile, traceback
 import os, time, calendar, textwrap, unicodedata, signal
-import imp, socket
+import imp, socket, urllib
 
 # Python compatibility
 
@@ -1284,3 +1284,297 @@ def parsebool(s):
     If s is not a valid boolean, returns None.
     """
     return _booleans.get(s.lower(), None)
+
+class url(object):
+    """Reliable URL parser.
+
+    This parses URLs and provides attributes for the following
+    components:
+
+    <scheme>://<user>:<passwd>@<host>:<port>/<path>?<query>#<fragment>
+
+    Missing components are set to None. The only exception is
+    fragment, which is set to '' if present but empty.
+
+    If parsefragment is False, fragment is included in query. If
+    parsequery is False, query is included in path. If both are
+    False, both fragment and query are included in path.
+
+    See http://www.ietf.org/rfc/rfc2396.txt for more information.
+
+    Note that for backward compatibility reasons, bundle URLs do not
+    take host names. That means 'bundle://../' has a path of '../'.
+
+    Examples:
+
+    >>> url('http://www.ietf.org/rfc/rfc2396.txt')
+    <url scheme: 'http', host: 'www.ietf.org', path: 'rfc/rfc2396.txt'>
+    >>> url('ssh://[::1]:2200//home/joe/repo')
+    <url scheme: 'ssh', host: '[::1]', port: '2200', path: '/home/joe/repo'>
+    >>> url('file:///home/joe/repo')
+    <url scheme: 'file', path: '/home/joe/repo'>
+    >>> url('bundle:foo')
+    <url scheme: 'bundle', path: 'foo'>
+    >>> url('bundle://../foo')
+    <url scheme: 'bundle', path: '../foo'>
+    >>> url('c:\\\\foo\\\\bar')
+    <url path: 'c:\\\\foo\\\\bar'>
+
+    Authentication credentials:
+
+    >>> url('ssh://joe:xyz@x/repo')
+    <url scheme: 'ssh', user: 'joe', passwd: 'xyz', host: 'x', path: 'repo'>
+    >>> url('ssh://joe@x/repo')
+    <url scheme: 'ssh', user: 'joe', host: 'x', path: 'repo'>
+
+    Query strings and fragments:
+
+    >>> url('http://host/a?b#c')
+    <url scheme: 'http', host: 'host', path: 'a', query: 'b', fragment: 'c'>
+    >>> url('http://host/a?b#c', parsequery=False, parsefragment=False)
+    <url scheme: 'http', host: 'host', path: 'a?b#c'>
+    """
+
+    _safechars = "!~*'()+"
+    _safepchars = "/!~*'()+"
+    _matchscheme = re.compile(r'^[a-zA-Z0-9+.\-]+:').match
+
+    def __init__(self, path, parsequery=True, parsefragment=True):
+        # We slowly chomp away at path until we have only the path left
+        self.scheme = self.user = self.passwd = self.host = None
+        self.port = self.path = self.query = self.fragment = None
+        self._localpath = True
+        self._hostport = ''
+        self._origpath = path
+
+        # special case for Windows drive letters
+        if hasdriveletter(path):
+            self.path = path
+            return
+
+        # For compatibility reasons, we can't handle bundle paths as
+        # normal URLS
+        if path.startswith('bundle:'):
+            self.scheme = 'bundle'
+            path = path[7:]
+            if path.startswith('//'):
+                path = path[2:]
+            self.path = path
+            return
+
+        if self._matchscheme(path):
+            parts = path.split(':', 1)
+            if parts[0]:
+                self.scheme, path = parts
+                self._localpath = False
+
+        if not path:
+            path = None
+            if self._localpath:
+                self.path = ''
+                return
+        else:
+            if parsefragment and '#' in path:
+                path, self.fragment = path.split('#', 1)
+                if not path:
+                    path = None
+            if self._localpath:
+                self.path = path
+                return
+
+            if parsequery and '?' in path:
+                path, self.query = path.split('?', 1)
+                if not path:
+                    path = None
+                if not self.query:
+                    self.query = None
+
+            # // is required to specify a host/authority
+            if path and path.startswith('//'):
+                parts = path[2:].split('/', 1)
+                if len(parts) > 1:
+                    self.host, path = parts
+                    path = path
+                else:
+                    self.host = parts[0]
+                    path = None
+                if not self.host:
+                    self.host = None
+                    if path:
+                        path = '/' + path
+
+            if self.host and '@' in self.host:
+                self.user, self.host = self.host.rsplit('@', 1)
+                if ':' in self.user:
+                    self.user, self.passwd = self.user.split(':', 1)
+                if not self.host:
+                    self.host = None
+
+            # Don't split on colons in IPv6 addresses without ports
+            if (self.host and ':' in self.host and
+                not (self.host.startswith('[') and self.host.endswith(']'))):
+                self._hostport = self.host
+                self.host, self.port = self.host.rsplit(':', 1)
+                if not self.host:
+                    self.host = None
+
+            if (self.host and self.scheme == 'file' and
+                self.host not in ('localhost', '127.0.0.1', '[::1]')):
+                raise Abort(_('file:// URLs can only refer to localhost'))
+
+        self.path = path
+
+        for a in ('user', 'passwd', 'host', 'port',
+                  'path', 'query', 'fragment'):
+            v = getattr(self, a)
+            if v is not None:
+                setattr(self, a, urllib.unquote(v))
+
+    def __repr__(self):
+        attrs = []
+        for a in ('scheme', 'user', 'passwd', 'host', 'port', 'path',
+                  'query', 'fragment'):
+            v = getattr(self, a)
+            if v is not None:
+                attrs.append('%s: %r' % (a, v))
+        return '<url %s>' % ', '.join(attrs)
+
+    def __str__(self):
+        """Join the URL's components back into a URL string.
+
+        Examples:
+
+        >>> str(url('http://user:pw@host:80/?foo#bar'))
+        'http://user:pw@host:80/?foo#bar'
+        >>> str(url('ssh://user:pw@[::1]:2200//home/joe#'))
+        'ssh://user:pw@[::1]:2200//home/joe#'
+        >>> str(url('http://localhost:80//'))
+        'http://localhost:80//'
+        >>> str(url('http://localhost:80/'))
+        'http://localhost:80/'
+        >>> str(url('http://localhost:80'))
+        'http://localhost:80/'
+        >>> str(url('bundle:foo'))
+        'bundle:foo'
+        >>> str(url('bundle://../foo'))
+        'bundle:../foo'
+        >>> str(url('path'))
+        'path'
+        """
+        if self._localpath:
+            s = self.path
+            if self.scheme == 'bundle':
+                s = 'bundle:' + s
+            if self.fragment:
+                s += '#' + self.fragment
+            return s
+
+        s = self.scheme + ':'
+        if (self.user or self.passwd or self.host or
+            self.scheme and not self.path):
+            s += '//'
+        if self.user:
+            s += urllib.quote(self.user, safe=self._safechars)
+        if self.passwd:
+            s += ':' + urllib.quote(self.passwd, safe=self._safechars)
+        if self.user or self.passwd:
+            s += '@'
+        if self.host:
+            if not (self.host.startswith('[') and self.host.endswith(']')):
+                s += urllib.quote(self.host)
+            else:
+                s += self.host
+        if self.port:
+            s += ':' + urllib.quote(self.port)
+        if self.host:
+            s += '/'
+        if self.path:
+            s += urllib.quote(self.path, safe=self._safepchars)
+        if self.query:
+            s += '?' + urllib.quote(self.query, safe=self._safepchars)
+        if self.fragment is not None:
+            s += '#' + urllib.quote(self.fragment, safe=self._safepchars)
+        return s
+
+    def authinfo(self):
+        user, passwd = self.user, self.passwd
+        try:
+            self.user, self.passwd = None, None
+            s = str(self)
+        finally:
+            self.user, self.passwd = user, passwd
+        if not self.user:
+            return (s, None)
+        return (s, (None, (str(self), self.host),
+                    self.user, self.passwd or ''))
+
+    def localpath(self):
+        if self.scheme == 'file' or self.scheme == 'bundle':
+            path = self.path or '/'
+            # For Windows, we need to promote hosts containing drive
+            # letters to paths with drive letters.
+            if hasdriveletter(self._hostport):
+                path = self._hostport + '/' + self.path
+            elif self.host is not None and self.path:
+                path = '/' + path
+            # We also need to handle the case of file:///C:/, which
+            # should return C:/, not /C:/.
+            elif hasdriveletter(path):
+                # Strip leading slash from paths with drive names
+                return path[1:]
+            return path
+        return self._origpath
+
+def hasscheme(path):
+    return bool(url(path).scheme)
+
+def hasdriveletter(path):
+    return path[1:2] == ':' and path[0:1].isalpha()
+
+def localpath(path):
+    return url(path, parsequery=False, parsefragment=False).localpath()
+
+def hidepassword(u):
+    '''hide user credential in a url string'''
+    u = url(u)
+    if u.passwd:
+        u.passwd = '***'
+    return str(u)
+
+def removeauth(u):
+    '''remove all authentication information from a url string'''
+    u = url(u)
+    u.user = u.passwd = None
+    return str(u)
+
+def readauthforuri(ui, uri):
+    # Read configuration
+    config = dict()
+    for key, val in ui.configitems('auth'):
+        if '.' not in key:
+            ui.warn(_("ignoring invalid [auth] key '%s'\n") % key)
+            continue
+        group, setting = key.rsplit('.', 1)
+        gdict = config.setdefault(group, dict())
+        if setting in ('username', 'cert', 'key'):
+            val = expandpath(val)
+        gdict[setting] = val
+
+    # Find the best match
+    scheme, hostpath = uri.split('://', 1)
+    bestlen = 0
+    bestauth = None
+    for group, auth in config.iteritems():
+        prefix = auth.get('prefix')
+        if not prefix:
+            continue
+        p = prefix.split('://', 1)
+        if len(p) > 1:
+            schemes, prefix = [p[0]], p[1]
+        else:
+            schemes = (auth.get('schemes') or 'https').split()
+        if (prefix == '*' or hostpath.startswith(prefix)) and \
+            len(prefix) > bestlen and scheme in schemes:
+            bestlen = len(prefix)
+            bestauth = group, auth
+    return bestauth
diff --git a/tests/test-url.py b/tests/test-url.py
--- a/tests/test-url.py
+++ b/tests/test-url.py
@@ -53,7 +53,7 @@ import doctest
 
 def test_url():
     """
-    >>> from mercurial.url import url
+    >>> from mercurial.util import url
 
     This tests for edge cases in url.URL's parsing algorithm. Most of
     these aren't useful for documentation purposes, so they aren't


More information about the Mercurial-devel mailing list