[PATCH] ideas/experiment for a simpler path encoding for hashed paths (for "fncache2")
Adrian Buehlmann
adrian at cadifra.com
Mon Sep 24 17:03:57 CDT 2012
# HG changeset patch
# User Adrian Buehlmann <adrian at cadifra.com>
# Date 1348524023 -7200
# Node ID 916efbaad41a418205d0eac300a2488c111696b4
# Parent c713bf1139edad4f1c1e7e50d0b591c59d4e66ca
ideas/experiment for a simpler path encoding for hashed paths (for "fncache2")
Function cutdirs takes up to 8 characters of each directory and encodes
reserved chars, names, dots and periods at begin or end.
Uppercase is folded to lowercase. The encoding is size-preserving, folding and
non-reversible. The result of cutdirs is intended to be prepended in front of
the 40 byte sha1-hash of the path (followed by .i or .d).
>>> from mercurial.parsers import cutdirs
>>> cutdirs("helloworld")
'hellowor'
>>> cutdirs("Helloworld")
'hellowor'
>>> cutdirs("ABCDEFGHIJKLMNOPQRSTUVW")
'abcdefgh'
>>> cutdirs("ABCDEFGH/IJKLMNOP/QRSTUVW")
'abcdefgh/ijklmnop/qrstuvw'
>>> cutdirs("ABCDEFGH/IJKLMNOP/QRSTUVWX/YZ")
'abcdefgh/ijklmnop/qrstuvwx/yz'
>>> cutdirs("?")
'~'
>>> cutdirs("'")
"'"
>>> cutdirs("|")
'~'
>>> cutdirs("\04")
'd'
>>> cutdirs("\01")
'a'
>>> cutdirs("\31")
'y'
>>> cutdirs("ABCDEFGH/IJKLMNOP/QRSTUVWX/YZ")
'abcdefgh/ijklmnop/qrstuvwx/yz'
>>> cutdirs("ABCDEFGHkkkkk/IJKLMNOP/QRSTUVWX/YZ")
'abcdefgh/ijklmnop/qrstuvwx/yz'
>>> cutdirs(".ABCDEFGHkkkkk/IJKLMNOP/QRSTUVWX/YZ")
'~abcdefg/ijklmnop/qrstuvwx/yz'
>>> cutdirs(" ABCDEFGHkkkkk/IJKLMNOP/QRSTUVWX/YZ")
'~abcdefg/ijklmnop/qrstuvwx/yz'
>>> cutdirs(" AB..CDEFGHkkkkk/IJKLMNOP/QRSTUVWX/YZ")
'~ab..cde/ijklmnop/qrstuvwx/yz'
>>> cutdirs(" AB..CD.FGHkkkkk/IJKLMNOP/QRSTUVWX/YZ")
'~ab..cd~/ijklmnop/qrstuvwx/yz'
>>> cutdirs(" AB..CD FGHkkkkk/IJKLMNOP/QRSTUVWX/YZ")
'~ab..cd~/ijklmnop/qrstuvwx/yz'
>>> cutdirs(" AB..CD FGHkkkkk/auxIJKLMNOP/QRSTUVWX/YZ")
'~ab..cd~/au~ijklm/qrstuvwx/yz'
>>> cutdirs(" AB..CD FGHkkkkk/auxIJKLMNOP/QRSTUVWX/YZ/prn/nul")
'~ab..cd~/au~ijklm/qrstuvwx/yz/pr~/nu~'
>>> cutdirs(" AB..CD FGHkkkkk/auxIJKLMNOP/QRSTUVWX/YZ/prnoooo/nul")
'~ab..cd~/au~ijklm/qrstuvwx/yz/pr~oooo/nu~'
>>> cutdirs("a/b/c/d/e/f/g/h/i/j/k/l/m/n/o/p/q/r/s/t/u/v/w/x/y/z/0/1/2/3/4/5/6/7/8/9xxxxxxxxxxxxx")
'a/b/c/d/e/f/g/h/i/j/k/l/m/n/o/p/q/r/s/t/u/v/w/x/y/z/0/1/2/3/4/5/6/7/8/9xxxxxx'
>>> cutdirs("a/b/c/d/e/f/g/h/i/j/k/l/m/n/o/p/q/r/s/t/u/v/w/x/y/z/0/1/2/3/4/5/6/7/8/9/xxxxxxxxxxxxx")
'a/b/c/d/e/f/g/h/i/j/k/l/m/n/o/p/q/r/s/t/u/v/w/x/y/z/0/1/2/3/4/5/6/7/8/9/xxxxxx'
>>> cutdirs("\\?*<>|")
'~~~~~~'
diff --git a/mercurial/parsers.c b/mercurial/parsers.c
--- a/mercurial/parsers.c
+++ b/mercurial/parsers.c
@@ -1508,6 +1508,7 @@
PyObject *encodedir(PyObject *self, PyObject *args);
PyObject *pathencode(PyObject *self, PyObject *args);
+PyObject *cutdirs(PyObject *self, PyObject *args);
static PyMethodDef methods[] = {
{"pack_dirstate", pack_dirstate, METH_VARARGS, "pack a dirstate\n"},
@@ -1516,6 +1517,7 @@
{"parse_index2", parse_index2, METH_VARARGS, "parse a revlog index\n"},
{"encodedir", encodedir, METH_VARARGS, "encodedir a path\n"},
{"pathencode", pathencode, METH_VARARGS, "fncache-encode a path\n"},
+ {"cutdirs", cutdirs, METH_VARARGS, "fncache-encode a path\n"},
{NULL, NULL}
};
diff --git a/mercurial/pathencode.c b/mercurial/pathencode.c
--- a/mercurial/pathencode.c
+++ b/mercurial/pathencode.c
@@ -47,6 +47,13 @@
DEFAULT, /* byte of a path component after the first */
};
+/* state machine for hashed paths */
+enum hpath_state {
+ HSTART, /* first byte of a path component */
+ HDOTSPACE,
+ HDEFAULT, /* byte of a path component after the first */
+};
+
/* state machine for dir-encoding */
enum dir_state {
DDOT,
@@ -479,7 +486,118 @@
src, len, 1);
}
+static char encchar[128] = "~abcdefghijklmnopqrstuvwxyz{~}~~"
+ " !\"#$%&'()~+,-.~0123456789~;~=~~"
+ "@abcdefghijklmnopqrstuvwxyz[~]^_"
+ "`abcdefghijklmnopqrstuvwxyz{~}~~";
+
+/* this encoding folds */
+static inline char encodechar(char c)
+{
+ return c ? encchar[0x7f & c] : 0;
+}
+
static const Py_ssize_t maxstorepathlen = 120;
+static const Py_ssize_t maxdirslen = 8;
+
+static Py_ssize_t _cutdirs(char *dest, Py_ssize_t destlen, size_t destsize,
+ const char *src, Py_ssize_t len)
+{
+ enum hpath_state state = HSTART;
+ Py_ssize_t i = 0, n;
+ char c, c1, c2, lastc;
+
+ while (i < len && i <= maxstorepathlen - 42) {
+ switch (state) {
+ case HSTART:
+ state = HDEFAULT;
+ n = 0;
+ if (src[i] == '.' || src[i] == ' ') {
+ charcopy(dest, &destlen, destsize, '~');
+ i++; n++;
+ }
+ else if (i + 2 < len) {
+ c = encodechar(src[i]);
+ c1 = encodechar(src[i + 1]);
+ c2 = encodechar(src[i + 2]);
+ if ((c == 'a' && c1 == 'u' && c2 == 'x') ||
+ (c == 'c' && c1 == 'o' && c2 == 'n') ||
+ (c == 'p' && c1 == 'r' && c2 == 'n') ||
+ (c == 'n' && c1 == 'u' && c2 == 'l') ||
+ (c == 'c' && c1 == 'o' && c2 == 'm') ||
+ (c == 'l' && c1 == 'p' && c2 == 't')) {
+ charcopy(dest, &destlen, destsize, c);
+ charcopy(dest, &destlen, destsize, c1);
+ charcopy(dest, &destlen, destsize, '~');
+ i += 3; n += 3;
+ }
+ }
+ break;
+ case HDOTSPACE:
+ if (src[i] == '/' || src[i] == '\0') {
+ state = HSTART;
+ charcopy(dest, &destlen, destsize, '~');
+ charcopy(dest, &destlen, destsize, src[i++]);
+ }
+ else if (n == maxdirslen) {
+ i++;
+ }
+ else {
+ state = HDEFAULT;
+ charcopy(dest, &destlen, destsize, lastc);
+ }
+ break;
+ case HDEFAULT:
+ if (src[i] == '.' || src[i] == ' ') {
+ state = HDOTSPACE;
+ lastc = src[i++];
+ n++;
+ }
+ else if (src[i] == '/') {
+ state = HSTART;
+ charcopy(dest, &destlen, destsize, '/');
+ i++;
+ }
+ else if (src[i] && n == maxdirslen) {
+ i++;
+ }
+ else {
+ c = encodechar(src[i]);
+ charcopy(dest, &destlen, destsize, c);
+ n++; ++i;
+ }
+ break;
+ }
+ }
+
+ return destlen;
+}
+
+PyObject *cutdirs(PyObject *self, PyObject *args)
+{
+ Py_ssize_t len, newlen;
+ PyObject *pathobj, *newobj;
+ char *path;
+
+ if (!PyArg_ParseTuple(args, "O:cutdirs", &pathobj))
+ return NULL;
+
+ if (PyString_AsStringAndSize(pathobj, &path, &len) == -1) {
+ PyErr_SetString(PyExc_TypeError, "expected a string");
+ return NULL;
+ }
+
+ newlen = len ? _cutdirs(NULL, 0, 0, path, len + 1) : 1;
+
+ newobj = PyString_FromStringAndSize(NULL, newlen);
+
+ if (newobj) {
+ PyString_GET_SIZE(newobj)--;
+ _cutdirs(PyString_AS_STRING(newobj), 0, newlen, path, len + 1);
+ }
+
+ return newobj;
+}
/*
* We currently implement only basic encoding.
More information about the Mercurial-devel
mailing list