[PATCH] ideas/experiment for a simpler path encoding for hashed paths (for "fncache2")

Adrian Buehlmann adrian at cadifra.com
Mon Sep 24 17:03:57 CDT 2012


# HG changeset patch
# User Adrian Buehlmann <adrian at cadifra.com>
# Date 1348524023 -7200
# Node ID 916efbaad41a418205d0eac300a2488c111696b4
# Parent  c713bf1139edad4f1c1e7e50d0b591c59d4e66ca
ideas/experiment for a simpler path encoding for hashed paths (for "fncache2")

Function cutdirs takes up to 8 characters of each directory and encodes
reserved chars, names, dots and periods at begin or end.
Uppercase is folded to lowercase. The encoding is size-preserving, folding and
non-reversible. The result of cutdirs is intended to be prepended in front of
the 40 byte sha1-hash of the path (followed by .i or .d).

  >>> from mercurial.parsers import cutdirs
  >>> cutdirs("helloworld")
  'hellowor'
  >>> cutdirs("Helloworld")
  'hellowor'
  >>> cutdirs("ABCDEFGHIJKLMNOPQRSTUVW")
  'abcdefgh'
  >>> cutdirs("ABCDEFGH/IJKLMNOP/QRSTUVW")
  'abcdefgh/ijklmnop/qrstuvw'
  >>> cutdirs("ABCDEFGH/IJKLMNOP/QRSTUVWX/YZ")
  'abcdefgh/ijklmnop/qrstuvwx/yz'
  >>> cutdirs("?")
  '~'
  >>> cutdirs("'")
  "'"
  >>> cutdirs("|")
  '~'
  >>> cutdirs("\04")
  'd'
  >>> cutdirs("\01")
  'a'
  >>> cutdirs("\31")
  'y'
  >>> cutdirs("ABCDEFGH/IJKLMNOP/QRSTUVWX/YZ")
  'abcdefgh/ijklmnop/qrstuvwx/yz'
  >>> cutdirs("ABCDEFGHkkkkk/IJKLMNOP/QRSTUVWX/YZ")
  'abcdefgh/ijklmnop/qrstuvwx/yz'
  >>> cutdirs(".ABCDEFGHkkkkk/IJKLMNOP/QRSTUVWX/YZ")
  '~abcdefg/ijklmnop/qrstuvwx/yz'
  >>> cutdirs(" ABCDEFGHkkkkk/IJKLMNOP/QRSTUVWX/YZ")
  '~abcdefg/ijklmnop/qrstuvwx/yz'
  >>> cutdirs(" AB..CDEFGHkkkkk/IJKLMNOP/QRSTUVWX/YZ")
  '~ab..cde/ijklmnop/qrstuvwx/yz'
  >>> cutdirs(" AB..CD.FGHkkkkk/IJKLMNOP/QRSTUVWX/YZ")
  '~ab..cd~/ijklmnop/qrstuvwx/yz'
  >>> cutdirs(" AB..CD FGHkkkkk/IJKLMNOP/QRSTUVWX/YZ")
  '~ab..cd~/ijklmnop/qrstuvwx/yz'
  >>> cutdirs(" AB..CD FGHkkkkk/auxIJKLMNOP/QRSTUVWX/YZ")
  '~ab..cd~/au~ijklm/qrstuvwx/yz'
  >>> cutdirs(" AB..CD FGHkkkkk/auxIJKLMNOP/QRSTUVWX/YZ/prn/nul")
  '~ab..cd~/au~ijklm/qrstuvwx/yz/pr~/nu~'
  >>> cutdirs(" AB..CD FGHkkkkk/auxIJKLMNOP/QRSTUVWX/YZ/prnoooo/nul")
  '~ab..cd~/au~ijklm/qrstuvwx/yz/pr~oooo/nu~'
  >>> cutdirs("a/b/c/d/e/f/g/h/i/j/k/l/m/n/o/p/q/r/s/t/u/v/w/x/y/z/0/1/2/3/4/5/6/7/8/9xxxxxxxxxxxxx")
  'a/b/c/d/e/f/g/h/i/j/k/l/m/n/o/p/q/r/s/t/u/v/w/x/y/z/0/1/2/3/4/5/6/7/8/9xxxxxx'
  >>> cutdirs("a/b/c/d/e/f/g/h/i/j/k/l/m/n/o/p/q/r/s/t/u/v/w/x/y/z/0/1/2/3/4/5/6/7/8/9/xxxxxxxxxxxxx")
  'a/b/c/d/e/f/g/h/i/j/k/l/m/n/o/p/q/r/s/t/u/v/w/x/y/z/0/1/2/3/4/5/6/7/8/9/xxxxxx'
  >>> cutdirs("\\?*<>|")
  '~~~~~~'

diff --git a/mercurial/parsers.c b/mercurial/parsers.c
--- a/mercurial/parsers.c
+++ b/mercurial/parsers.c
@@ -1508,6 +1508,7 @@
 
 PyObject *encodedir(PyObject *self, PyObject *args);
 PyObject *pathencode(PyObject *self, PyObject *args);
+PyObject *cutdirs(PyObject *self, PyObject *args);
 
 static PyMethodDef methods[] = {
 	{"pack_dirstate", pack_dirstate, METH_VARARGS, "pack a dirstate\n"},
@@ -1516,6 +1517,7 @@
 	{"parse_index2", parse_index2, METH_VARARGS, "parse a revlog index\n"},
 	{"encodedir", encodedir, METH_VARARGS, "encodedir a path\n"},
 	{"pathencode", pathencode, METH_VARARGS, "fncache-encode a path\n"},
+	{"cutdirs", cutdirs, METH_VARARGS, "fncache-encode a path\n"},
 	{NULL, NULL}
 };
 
diff --git a/mercurial/pathencode.c b/mercurial/pathencode.c
--- a/mercurial/pathencode.c
+++ b/mercurial/pathencode.c
@@ -47,6 +47,13 @@
 	DEFAULT, /* byte of a path component after the first */
 };
 
+/* state machine for hashed paths */
+enum hpath_state {
+	HSTART,    /* first byte of a path component */
+	HDOTSPACE,
+	HDEFAULT,  /* byte of a path component after the first */
+};
+
 /* state machine for dir-encoding */
 enum dir_state {
 	DDOT,
@@ -479,7 +486,118 @@
 		       src, len, 1);
 }
 
+static char encchar[128] = "~abcdefghijklmnopqrstuvwxyz{~}~~"
+                           " !\"#$%&'()~+,-.~0123456789~;~=~~"
+                           "@abcdefghijklmnopqrstuvwxyz[~]^_"
+                           "`abcdefghijklmnopqrstuvwxyz{~}~~";
+
+/* this encoding folds */
+static inline char encodechar(char c)
+{
+	return c ? encchar[0x7f & c] : 0;
+}
+
 static const Py_ssize_t maxstorepathlen = 120;
+static const Py_ssize_t maxdirslen = 8;
+
+static Py_ssize_t _cutdirs(char *dest, Py_ssize_t destlen, size_t destsize,
+                           const char *src, Py_ssize_t len)
+{
+	enum hpath_state state = HSTART;
+	Py_ssize_t i = 0, n;
+	char c, c1, c2, lastc;
+
+	while (i < len && i <= maxstorepathlen - 42) {
+		switch (state) {
+		case HSTART:
+			state = HDEFAULT;
+			n = 0;
+			if (src[i] == '.' || src[i] == ' ') {
+				charcopy(dest, &destlen, destsize, '~');
+				i++; n++;
+			}
+			else if (i + 2 < len) {
+				c = encodechar(src[i]);
+				c1 = encodechar(src[i + 1]);
+				c2 = encodechar(src[i + 2]);
+				if ((c == 'a' && c1 == 'u' && c2 == 'x') ||
+				    (c == 'c' && c1 == 'o' && c2 == 'n') ||
+				    (c == 'p' && c1 == 'r' && c2 == 'n') ||
+				    (c == 'n' && c1 == 'u' && c2 == 'l') ||
+				    (c == 'c' && c1 == 'o' && c2 == 'm') ||
+				    (c == 'l' && c1 == 'p' && c2 == 't')) {
+					charcopy(dest, &destlen, destsize, c);
+					charcopy(dest, &destlen, destsize, c1);
+					charcopy(dest, &destlen, destsize, '~');
+					i += 3;	n += 3;
+				}
+			}
+			break;
+		case HDOTSPACE:
+			if (src[i] == '/' || src[i] == '\0') {
+				state = HSTART;
+				charcopy(dest, &destlen, destsize, '~');
+				charcopy(dest, &destlen, destsize, src[i++]);
+			}
+			else if (n == maxdirslen) {
+				i++;
+			}
+			else {
+				state = HDEFAULT;
+				charcopy(dest, &destlen, destsize, lastc);
+			}
+			break;
+		case HDEFAULT:
+			if (src[i] == '.' || src[i] == ' ') {
+				state = HDOTSPACE;
+				lastc = src[i++];
+				n++;
+			}
+			else if (src[i] == '/') {
+				state = HSTART;
+				charcopy(dest, &destlen, destsize, '/');
+				i++;
+			}
+			else if (src[i] && n == maxdirslen) {
+				i++;
+			}
+			else {
+				c = encodechar(src[i]);
+				charcopy(dest, &destlen, destsize, c);
+				n++; ++i;
+			}
+			break;
+		}
+	}
+
+	return destlen;
+}
+
+PyObject *cutdirs(PyObject *self, PyObject *args)
+{
+	Py_ssize_t len, newlen;
+	PyObject *pathobj, *newobj;
+	char *path;
+
+	if (!PyArg_ParseTuple(args, "O:cutdirs", &pathobj))
+		return NULL;
+
+	if (PyString_AsStringAndSize(pathobj, &path, &len) == -1) {
+		PyErr_SetString(PyExc_TypeError, "expected a string");
+		return NULL;
+	}
+
+	newlen = len ? _cutdirs(NULL, 0, 0, path, len + 1) : 1;
+
+	newobj = PyString_FromStringAndSize(NULL, newlen);
+
+	if (newobj) {
+		PyString_GET_SIZE(newobj)--;
+		_cutdirs(PyString_AS_STRING(newobj), 0, newlen, path, len + 1);
+	}
+
+	return newobj;
+}
 
 /*
  * We currently implement only basic encoding.


More information about the Mercurial-devel mailing list