[PATCH] V13 of experiment for a simpler path encoding for hashed paths (for "fncache2")

Adrian Buehlmann adrian at cadifra.com
Sat Sep 29 17:30:33 CDT 2012


On 2012-09-29 11:29, Adrian Buehlmann wrote:
> # HG changeset patch
> # User Adrian Buehlmann <adrian at cadifra.com>
> # Date 1348910761 -7200
> # Node ID ca5bca5255271ae28320c4d107b11ba5c9dab1de
> # Parent  d6c7128e550de7d74aec27e86532ceec75a5c38f
> V13 of experiment for a simpler path encoding for hashed paths (for "fncache2")
> 
> Changes compared to V12:
> 
> - Escapes com0 and lpt0 as well. Explorer of Windows 7 refuses to create these,
>   even though those names are not specified by Microsoft as being reserved.
> 
> Changes in testcases:
> 
>    cutdirs('data/auxx/conx/prnx/nulx/comx/lptx/foo.i')
>            'data/auxx/conx/prnx/nulx/comx/lptx/foo~i'
>    cutdirs('data/com0/com1/com9/lpt0/lpt1/lpt9/foo.i')
>   -        'data/com0/com~/com~/lpt0/lpt~/lpt~/foo~i'
>   +        'data/com~/com~/com~/lpt~/lpt~/lpt~/foo~i'
>   +cutdirs('data/nul.txt/aux.txt/foo.i')
>   +        'data/nul~txt/aux~txt/foo~i'
> 
>    cutdirs('data/common/auxiliary/nulling/console/bla.com/foo.i')
>            'data/common/auxiliar/nulling/console/bla~com/foo~i'
> 
> diff --git a/mercurial/parsers.c b/mercurial/parsers.c
> --- a/mercurial/parsers.c
> +++ b/mercurial/parsers.c
> @@ -1508,6 +1508,7 @@
>  
>  PyObject *encodedir(PyObject *self, PyObject *args);
>  PyObject *pathencode(PyObject *self, PyObject *args);
> +PyObject *cutdirs(PyObject *self, PyObject *args);
>  
>  static PyMethodDef methods[] = {
>  	{"pack_dirstate", pack_dirstate, METH_VARARGS, "pack a dirstate\n"},
> @@ -1516,6 +1517,7 @@
>  	{"parse_index2", parse_index2, METH_VARARGS, "parse a revlog index\n"},
>  	{"encodedir", encodedir, METH_VARARGS, "encodedir a path\n"},
>  	{"pathencode", pathencode, METH_VARARGS, "fncache-encode a path\n"},
> +	{"cutdirs", cutdirs, METH_VARARGS, "fncache-encode a path\n"},
>  	{NULL, NULL}
>  };
>  
> diff --git a/mercurial/pathencode.c b/mercurial/pathencode.c
> --- a/mercurial/pathencode.c
> +++ b/mercurial/pathencode.c
> @@ -481,6 +481,92 @@
>  
>  static const Py_ssize_t maxstorepathlen = 120;
>  
> +static const char encchar[256] =
> +	"~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
> +	"~!~#$%&'()~+,-~~0123456789~;~=~~"
> +	"@abcdefghijklmnopqrstuvwxyz[~]^_"
> +	"`abcdefghijklmnopqrstuvwxyz{~}~~"
> +	"~abcdefghijklmnopqrstuvwxyz{~}~~"
> +	"~!~#$%&'()~+,-~~0123456789~;~=~~"
> +	"@abcdefghijklmnopqrstuvwxyz[~]^_"
> +	"`abcdefghijklmnopqrstuvwxyz{~}~~";
> +
> +/* this encoding folds */
> +static inline char encodechar(char c)
> +{
> +	return encchar[0xff & c];
> +}
> +
> +static Py_ssize_t _cutdirs(char *dest, Py_ssize_t destlen, size_t destsize,
> +                           const char *src, Py_ssize_t len)
> +{
> +	Py_ssize_t i = 0, spaceleft = maxstorepathlen - 40 + 1;
> +	char seg[8];
> +	int seglen = 0;
> +	uint32_t cmp;
> +
> +	while (i < len && spaceleft > 0) {
> +		if (src[i] == '/' || src[i] == '\0') {
> +			if (seglen != 0) {
> +				if (seglen == 3) {
> +					cmp = seg[0] << 16 | seg[1] << 8 | seg[2];
> +					if (   cmp == 0x617578 /* aux */
> +					    || cmp == 0x636f6e /* con */
> +					    || cmp == 0x70726e /* prn */
> +					    || cmp == 0x6e756c /* nul */)
> +						seg[2] = '~';
> +				}
> +				else if (seglen == 4 && seg[3] <= '9'
> +				                     && seg[3] >= '0') {
> +					cmp = seg[0] << 16 | seg[1] << 8 | seg[2];
> +					if (   cmp == 0x636f6d /* com0..9 */
> +					    || cmp == 0x6c7074 /* lpt0..9 */)
> +						seg[3] = '~';
> +				}
> +				memcopy(dest, &destlen, destsize, &seg, seglen);
> +				seglen = 0;
> +			}
> +			charcopy(dest, &destlen, destsize, src[i++]);
> +			spaceleft--;
> +		}
> +		else if (seglen == sizeof(seg)) {
> +			i++;
> +		}
> +		else {
> +			seg[seglen++] = encodechar(src[i++]);
> +			spaceleft--;
> +		}
> +	}
> +
> +	return destlen;
> +}

For reference, here is cutdirs implemented in Python (as a patch against mercurial/store.py):

diff --git a/mercurial/store.py b/mercurial/store.py
--- a/mercurial/store.py
+++ b/mercurial/store.py
@@ -185,6 +185,41 @@
 _dirprefixlen = 8
 _maxshortdirslen = 8 * (_dirprefixlen + 1) - 4

+_encchar = ("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
+            "~!~#$%&'()~+,-~~0123456789~;~=~~"
+            "@abcdefghijklmnopqrstuvwxyz[~]^_"
+            "`abcdefghijklmnopqrstuvwxyz{~}~~"
+            "~abcdefghijklmnopqrstuvwxyz{~}~~"
+            "~!~#$%&'()~+,-~~0123456789~;~=~~"
+            "@abcdefghijklmnopqrstuvwxyz[~]^_"
+            "`abcdefghijklmnopqrstuvwxyz{~}~~")
+
+def _foldencode(f): # preserves size
+    f = ''.join([_encchar[ord(c)] for c in f])
+    l = len(f)
+    if l == 3 and f[:3] in _winres3:
+        f = f[:2] + '~'
+    if (l == 4 and f[3] <= '9' and f[3] >= '0'
+               and f[:3] in _winres4):
+        f = f[:3] + '~'
+    return f
+
+def cutdirs(path):
+    parts = []
+    totallen = 0
+    for s in path.split('/'):
+        if len(s) > 8:
+            s = s[:8]
+        if totallen:
+            newlen = totallen + 1 + len(s)
+        else:
+            newlen = len(s)
+        if newlen > _maxstorepathlen - 40:
+            break
+        parts.append(s)
+        totallen = newlen
+    return '/'.join(map(_foldencode, parts))
+
 def _hashencode(path, dotencode):
     digest = _sha(path).hexdigest()
     le = lowerencode(path).split('/')[1:]


More information about the Mercurial-devel mailing list