[PATCH] introduce upath repositories (issue793)

Adrian Buehlmann adrian at cadifra.com
Fri Jun 27 10:32:52 CDT 2008


On 27.06.2008 02:48, Jesse Glick wrote:
> Adrian Buehlmann wrote:
>> upath encodes repository paths by prepending an underbar to every
>> path component inside .hg/store/data.
>>
>> This encoding ensures that repositories containing tracked files with
>> path components consisting of Windows reserved file names (e.g.
>> 'nul', 'aux', 'lpt', etc.) can be pulled to Windows.
> 
> Unfortunately this will also have the effect of making repo path names 
> even longer - which can turn a repository which is currently usable on 
> Windows into one which is unusable. For the repositories I work with, 
> which already have a few files pushing into the danger zone for Windows 
> users, this patch would add 16 characters to some repo files, which 
> could substantially restrict where the repo could be cloned.

Yep. Taking away another 16 chars in this situation is a no-no.

And since we currently have more or less dropped the option of
using long paths on Windows (http://www.cadifra.com/cgi-bin/repos/hg-longpath/),
I've started hacking on a new proof of concept patch which just encodes
the first char of reserved names:

http://www.cadifra.com/cgi-bin/repos/hg-auxencode/file/tip/auxencode.patch

My current version of this patch is f34e214d7128 (pasted below).

Maybe this encoding could be included in the potential repo layout change
discussed earlier, which would compress paths (the hybrid solution by parren/mpm).


# HG changeset patch
# User Adrian Buehlmann <adrian at cadifra.com>
# Date 1214568479 -7200
# Node ID fc75d4706ebe6e139cce5a1fb25e27e391bb9246
# Parent  3ef6adad4e43bccf7b8cc24263d70b0e3f27f02d
introduce auxencode repositories (issue793)

This change adds a new entry 'auxencode' in the .hg/requires file
for new repositories.

'auxencode' encodes repository path components that
* begin with a Windows reserved filename followed by a period, or
* are equal to a reserved filename

Only the all-lowercase variants of the reserved names (e.g. aux, prn,
lpt1, com1) are encoded, since uppercase chars are already encoded by
the current encoding scheme ('X' -> '_x').

The encoding is done by encoding the first character, extending the current
tilde character encoding scheme to the full range of characters ('a' -> '~61').

A new decoding function util.fulldecode is provided, which can decode the
full range of encoded chars (~00..~FF). 'auxencode' repositories require
this function to be present and used.

Examples:
* 'aux'     -> '~61ux'
* 'foo.aux' -> 'foo.aux'    # doesn't need to be encoded
* 'aux.foo' -> '~61ux.foo'
* '\0aux'   -> '~00aux'     # same as current encoding
* 'Aux'     -> '_aux'       # same as current encoding
* '~aux'    -> '~7eaux'     # same as current encoding
* 'aux/com1/nul/prn/lpt1.txt' ->
     '~61ux/~63om1/~6eul/~70rn/~6cpt1.txt'

This encoding ensures that repositories containing tracked files
with path components consisting of Windows reserved file names
can be pulled to Windows repositories.

Note that revisions containing reserved filenames still cannot be checked
out on Windows.

Older versions of Mercurial accessing an 'auxencode' repository will
abort with "abort: requirement 'auxencode' not supported!". This is
needed because older versions of Mercurial lack util.fulldecode and
therefore can't decode the full range of encoded chars.

diff --git a/mercurial/localrepo.py b/mercurial/localrepo.py
--- a/mercurial/localrepo.py
+++ b/mercurial/localrepo.py
@@ -15,7 +15,7 @@

 class localrepository(repo.repository):
     capabilities = util.set(('lookup', 'changegroupsubset'))
-    supported = ('revlogv1', 'store')
+    supported = ('revlogv1', 'store', 'auxencode')

     def __init__(self, parentui, path=None, create=0):
         repo.repository.__init__(self)
@@ -34,6 +34,7 @@
                 if parentui.configbool('format', 'usestore', True):
                     os.mkdir(os.path.join(self.path, "store"))
                     requirements.append("store")
+                    requirements.append("auxencode")
                     # create an invalid changelog
                     self.opener("00changelog.i", "a").write(
                         '\0\0\0\2' # represents revlogv2
@@ -62,8 +63,12 @@

         # setup store
         if "store" in requirements:
-            self.encodefn = util.encodefilename
-            self.decodefn = util.decodefilename
+            if "auxencode" in requirements:
+                self.encodefn = util.auxencode
+                self.decodefn = util.auxdecode
+            else:
+                self.encodefn = util.encodefilename
+                self.decodefn = util.decodefilename
             self.spath = os.path.join(self.path, "store")
         else:
             self.encodefn = lambda x: x
diff --git a/mercurial/statichttprepo.py b/mercurial/statichttprepo.py
--- a/mercurial/statichttprepo.py
+++ b/mercurial/statichttprepo.py
@@ -55,8 +55,12 @@

         # setup store
         if "store" in requirements:
-            self.encodefn = util.encodefilename
-            self.decodefn = util.decodefilename
+            if "auxencode" in requirements:
+                self.encodefn = util.auxencode
+                self.decodefn = util.auxdecode
+            else:
+                self.encodefn = util.encodefilename
+                self.decodefn = util.decodefilename
             self.spath = self.path + "/store"
         else:
             self.encodefn = lambda x: x
diff --git a/mercurial/util.py b/mercurial/util.py
--- a/mercurial/util.py
+++ b/mercurial/util.py
@@ -15,7 +15,7 @@
 from i18n import _
 import cStringIO, errno, getpass, re, shutil, sys, tempfile
 import os, stat, threading, time, calendar, ConfigParser, locale, glob, osutil
-import imp, urlparse
+import imp, urlparse, string

 # Python compatibility

@@ -1351,9 +1351,10 @@
         return name
     return find_in_path(name, os.environ.get('PATH', ''), default=default)

+_windows_reserved_chars = '\\:*?"<>|'
 def _buildencodefun():
     e = '_'
-    win_reserved = [ord(x) for x in '\\:*?"<>|']
+    win_reserved = [ord(x) for x in _windows_reserved_chars]
     cmap = dict([ (chr(x), chr(x)) for x in xrange(127) ])
     for x in (range(32) + range(126, 256) + win_reserved):
         cmap[chr(x)] = "~%02x" % x
@@ -1378,6 +1379,66 @@
             lambda s: "".join(list(decode(s))))

 encodefilename, decodefilename = _buildencodefun()
+
+def fulldecode(s):
+    # same as decodefilename, but allows the full range for coded chars (~00..~FF)
+    state = 'n'
+    # possible states:
+    #  n  normal
+    #  ~  encoded char
+    #  _  encoded uppercase
+    res = ''
+    for c in s:
+        if (ord(c) < 32) or (ord(c) > 126) or c in _windows_reserved_chars:
+            raise KeyError
+        if state == 'n':      # normal
+            if c == '_':
+                state = '_'
+            elif c == '~':
+                state = '~'
+                digits = ''
+            else:
+                res += c
+        elif state == '_':    # encoded uppercase
+            if c == '_':
+                res += c
+            elif c == '~' or c == '/':
+                raise KeyError
+            else:
+                res += c.upper()
+            state = 'n'
+        elif state == '~':    # encoded char
+            if not c in string.hexdigits:
+                raise KeyError
+            digits += c
+            if len(digits) == 2:
+                res += chr(int(digits, 16))
+                digits = ''
+                state = 'n'
+        else:                 # unknown state
+            raise KeyError
+    if state != 'n':
+        raise KeyError
+    return res
+
+_windows_reserved_filenames = '''con prn aux nul
+    com1 com2 com3 com4 com5 com6 com7 com8 com9
+    lpt1 lpt2 lpt3 lpt4 lpt5 lpt6 lpt7 lpt8 lpt9'''.split()
+def auxencode(path):
+    res = []
+    for n in encodefilename(path).split('/'):
+        if n:
+            base = n.split('.')[0]
+            if base and (base in _windows_reserved_filenames):
+                # we don't mask uppercase variants because we know that
+                # these will be encoded with underbars afterwards
+                # anyway (e.g. "Aux" -> "_aux", "aUx" -> "a_ux")
+                res.append("~%02x" % ord(n[0]) + n[1:]) # encode first char
+                continue
+        res.append(n)
+    return '/'.join(res)
+
+auxdecode = fulldecode

 def encodedopener(openerfn, fn):
     def o(path, *args, **kw):


More information about the Mercurial-devel mailing list