[PATCH] V5 of experiment for a simpler path encoding for hashed paths (for "fncache2")
Adrian Buehlmann
adrian at cadifra.com
Wed Sep 26 16:36:05 CDT 2012
# HG changeset patch
# User Adrian Buehlmann <adrian at cadifra.com>
# Date 1348695178 -7200
# Node ID b2b894ef86dd9f4872813c44174372a58799f291
# Parent 68a3d36c2c29172e279fe6ccadc360fa7e7bad42
V5 of experiment for a simpler path encoding for hashed paths (for "fncache2")
Changes since V4:
Converted to state machine.
TODO: Python implementation
diff --git a/mercurial/parsers.c b/mercurial/parsers.c
--- a/mercurial/parsers.c
+++ b/mercurial/parsers.c
@@ -1508,6 +1508,7 @@
PyObject *encodedir(PyObject *self, PyObject *args);
PyObject *pathencode(PyObject *self, PyObject *args);
+PyObject *cutdirs(PyObject *self, PyObject *args);
static PyMethodDef methods[] = {
{"pack_dirstate", pack_dirstate, METH_VARARGS, "pack a dirstate\n"},
@@ -1516,6 +1517,7 @@
{"parse_index2", parse_index2, METH_VARARGS, "parse a revlog index\n"},
{"encodedir", encodedir, METH_VARARGS, "encodedir a path\n"},
{"pathencode", pathencode, METH_VARARGS, "fncache-encode a path\n"},
+ {"cutdirs", cutdirs, METH_VARARGS, "fncache-encode a path\n"},
{NULL, NULL}
};
diff --git a/mercurial/pathencode.c b/mercurial/pathencode.c
--- a/mercurial/pathencode.c
+++ b/mercurial/pathencode.c
@@ -47,6 +47,22 @@
DEFAULT, /* byte of a path component after the first */
};
+/* state machine for hashed paths: records how much of one of the
+ * three-letter names quoted below has been matched at the start of the
+ * current path component, so that the third letter of such a name can be
+ * replaced while encoding (NOTE(review): presumably these are the Windows
+ * reserved device names -- confirm) */
+enum hpath_state {
+ HSTART, /* first byte of a path component */
+ HA, /* "aux": 'a' matched */
+ HAU, /* "au" matched; an 'x' next completes "aux" */
+ HC, /* "con" or "com": 'c' matched */
+ HCO, /* "co" matched; an 'n' or 'm' next completes the name */
+ HP, /* "prn": 'p' matched */
+ HPR, /* "pr" matched; an 'n' next completes "prn" */
+ HN, /* "nul": 'n' matched */
+ HNU, /* "nu" matched; an 'l' next completes "nul" */
+ HL, /* "lpt": 'l' matched */
+ HLP, /* "lp" matched; a 't' next completes "lpt" */
+ HDEFAULT, /* any later byte of a component, once no quoted name can match */
+};
+
/* state machine for dir-encoding */
enum dir_state {
DDOT,
@@ -479,8 +495,171 @@
src, len, 1);
}
+static const char encchar[128] = /* folding table, indexed by (byte & 0x7f); exactly 128 chars, so no NUL is stored */
+ "~abcdefghijklmnopqrstuvwxyz{~}~~" /* 0x00-0x1f: 0x01-0x1a become 'a'-'z', NUL becomes '~' */
+ "~!\"#$%&'()~+,-~~0123456789~;~=~~" /* 0x20-0x3f: ' ' '*' '.' '/' ':' '<' '>' '?' become '~' */
+ "@abcdefghijklmnopqrstuvwxyz[~]^_" /* 0x40-0x5f: 'A'-'Z' become lowercase, backslash becomes '~' */
+ "`abcdefghijklmnopqrstuvwxyz{~}~~"; /* 0x60-0x7f: '|', '~' and DEL become '~' */
+
+/*
+ * Fold one byte through the encchar table. Only the low seven bits of c
+ * select the entry, so several input bytes share the same output byte
+ * (the mapping is not reversible).
+ */
+static inline char encodechar(char c)
+{
+	const int slot = c & 0x7f;
+
+	return encchar[slot];
+}
+
static const Py_ssize_t maxstorepathlen = 120;
+/*
+ * Emit the cut-down, folded form of src[0..len) through charcopy().
+ *
+ * - '/' and NUL bytes are copied unchanged and start a new path component
+ *   (cutdirs() passes len + 1, so the terminating NUL is processed too).
+ * - Every other byte is folded with encodechar().
+ * - Only the first 8 bytes of a component are emitted; the remaining bytes
+ *   are skipped and do not count against the output budget.
+ * - If a component starts (after folding) with "aux", "con", "com", "prn",
+ *   "nul" or "lpt", its third byte is emitted as '~' instead.
+ * - Output stops after maxstorepathlen - 40 + 1 bytes (NOTE(review): the 40
+ *   presumably reserves room for a hex SHA-1 digest -- confirm).
+ *
+ * dest, destlen and destsize are handed to charcopy() unchanged; cutdirs()
+ * calls this once with dest == NULL to obtain the size and once with the
+ * real buffer. Returns destlen as updated by charcopy().
+ */
+static Py_ssize_t _cutdirs(char *dest, Py_ssize_t destlen, size_t destsize,
+			   const char *src, Py_ssize_t len)
+{
+	enum hpath_state state = HSTART;
+	enum hpath_state next;
+	Py_ssize_t i = 0, dirlen = 0, spaceleft = maxstorepathlen - 40 + 1;
+	char c, out;
+
+	while (i < len && spaceleft > 0) {
+		if (src[i] == '/' || src[i] == '\0') {
+			/* separator: copy as-is and begin a new component */
+			state = HSTART;
+			charcopy(dest, &destlen, destsize, src[i++]);
+			spaceleft--;
+			dirlen = 0;
+			continue;
+		}
+		if (dirlen == 8) {
+			/* component already has 8 bytes: drop the rest of it */
+			i++;
+			continue;
+		}
+
+		c = encodechar(src[i++]);
+		out = c;		/* byte to emit unless a name completes */
+		next = HDEFAULT;	/* state unless a name prefix continues */
+
+		switch (state) {
+		case HSTART:
+			switch (c) {
+			case 'a':
+				next = HA;
+				break;
+			case 'c':
+				next = HC;
+				break;
+			case 'p':
+				next = HP;
+				break;
+			case 'n':
+				next = HN;
+				break;
+			case 'l':
+				next = HL;
+				break;
+			default:
+				break;
+			}
+			break;
+		case HA:
+			if (c == 'u')
+				next = HAU;
+			break;
+		case HAU:
+			if (c == 'x')
+				out = '~';
+			break;
+		case HC:
+			if (c == 'o')
+				next = HCO;
+			break;
+		case HCO:
+			if (c == 'n' || c == 'm')
+				out = '~';
+			break;
+		case HP:
+			if (c == 'r')
+				next = HPR;
+			break;
+		case HPR:
+			if (c == 'n')
+				out = '~';
+			break;
+		case HN:
+			if (c == 'u')
+				next = HNU;
+			break;
+		case HNU:
+			if (c == 'l')
+				out = '~';
+			break;
+		case HL:
+			if (c == 'p')
+				next = HLP;
+			break;
+		case HLP:
+			if (c == 't')
+				out = '~';
+			break;
+		case HDEFAULT:
+			break;
+		}
+
+		state = next;
+		charcopy(dest, &destlen, destsize, out);
+		spaceleft--;
+		dirlen++;
+	}
+
+	return destlen;
+}
+
+/*
+ * Python entry point: cutdirs(path) -> str.
+ *
+ * Takes one string argument and returns a new string holding the output
+ * of _cutdirs() for it. Returns NULL with an exception set if the
+ * arguments cannot be parsed, if the argument is not a string
+ * (TypeError, "expected a string"), or if the result cannot be allocated.
+ */
+PyObject *cutdirs(PyObject *self, PyObject *args)
+{
+	Py_ssize_t len, newlen;
+	PyObject *pathobj, *newobj;
+	char *path, *dest;
+
+	if (!PyArg_ParseTuple(args, "O:cutdirs", &pathobj))
+		return NULL;
+
+	if (PyString_AsStringAndSize(pathobj, &path, &len) == -1) {
+		PyErr_SetString(PyExc_TypeError, "expected a string");
+		return NULL;
+	}
+
+	/* sizing pass; len + 1 makes _cutdirs see the terminating NUL */
+	newlen = len ? _cutdirs(NULL, 0, 0, path, len + 1) : 1;
+
+	newobj = PyString_FromStringAndSize(NULL, newlen);
+	if (newobj == NULL)
+		return NULL;
+
+	dest = PyString_AS_STRING(newobj);
+	/* the last byte produced is not part of the visible string */
+	PyString_GET_SIZE(newobj)--;
+	_cutdirs(dest, 0, newlen, path, len + 1);
+	/*
+	 * When _cutdirs stops early because its length budget is used up,
+	 * the last byte written is a data byte rather than the input's
+	 * NUL. Write the terminator explicitly so the buffer is always
+	 * NUL-terminated at the string's final length.
+	 */
+	dest[newlen - 1] = '\0';
+
+	return newobj;
+}
+
/*
* We currently implement only basic encoding.
*
diff --git a/tests/test-hybridencode.py b/tests/test-hybridencode.py
--- a/tests/test-hybridencode.py
+++ b/tests/test-hybridencode.py
@@ -455,3 +455,54 @@
'VWXYZ-1234567890-xxxxxxxxx-xxxxxxxxx-xxxxxxxx-xxxx'
'xxxxx-wwwwwwwww-wwwwwwwww-wwwwwwwww-wwwwwwwww-wwww'
'wwwww-wwwwwwwww-wwwwwwwww-wwwwwwwww-wwwwwwwww')
+
+from mercurial.parsers import cutdirs
+
+def cd(s):
+ print "cutdirs('%s')" % s.encode("string_escape")
+ print " '%s'" % cutdirs(s).encode("string_escape")
+
+cd('data/a/c/p/n/c/l/foo.i')
+cd('data/au/co/pr/nu/co/lp/foo.i')
+cd('data/aux/con/prn/nul/com/lpt/foo.i')
+cd('data/xaux/xcon/xprn/xnul/xcom/xlpt/foo.i')
+cd('data/auxx/conx/prnx/nulx/comx/lptx/foo.i')
+cd('data/com0/com1/com9/lpt0/lpt1/lpt9/foo.i')
+print
+cd('data/.hello/ hello/hello./hello /foo.i')
+cd('data/..hello/ hello/hello../hello /foo.i')
+cd('data/x.hello/x hello/hello.x/hello x/foo.i')
+cd('data/.hel..lo/ hel lo/hel..lo./hel lo /foo.i')
+print
+cd('data/abcdefgh/ijklmnop/qrstuvwx/yz012345/789/foo.i')
+cd('data/xabcdefgh/xijklmnop/xqrstuvwx/xyz012345/789/foo.i')
+cd('data/abcdefghx/ijklmnopx/qrstuvwxx/yz012345x/789/foo.i')
+cd('data/abcdefg.x/ijklmno x/qrstuvwxx/yz012345x/789/foo.i')
+print
+cd('data/ABCDEFGH/IJKLMNOP/QRSTUVWX/YZ012345/789/foo.i')
+print
+cd("data/01234567/89 !\"#%&/'()+,-.;/=[]^`{}/foo.i")
+print
+print "Windows reserved characters"
+cd('data/\\sl\\ash\\/:co:lon:/*st*ar*/?que?st?/foo.i')
+cd('data/"dqu"ot"/<le<ft</>rig>ht>/|pi|pe|/foo.i')
+print
+cd('data/01234567/01234567/01234567/01234567/01234567'
+ '/01234567/01234567/012345/fo/aux/bla.i')
+cd('data/01234567/01234567/01234567/01234567/01234567'
+ '/01234567/01234567/01234/fo/aux/bla.i')
+cd('data/01234567x/01234567x/01234567x/01234567x/01234567x'
+ '/01234567x/01234567x/01234567x/foo.i')
+print
+cd('data/the quick brown fox jumps over the lazy dog.i')
+print
+print "characters in ASCII code range 1..31"
+cd('data/\x01\x02\x03\x04\x05\x06\x07\x08'
+ '/\x09\x0a\x0b\x0c\x0d\x0e\x0f\x10/foo.i')
+cd('data/\x11\x12\x13\x14\x15\x16\x17\x18'
+ '/\x19\x1a\x1b\x1c\x1d\x1e\x1f/foo.i')
+print
+cd('data/\x78\x79\x7a\x7b\x7c\x7d\x7e\x7f/\x80\x81\x82/'
+ '\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff/a.i')
+print
+cd('data/\xae_\xae_\xae/bla.i')
diff --git a/tests/test-hybridencode.py.out b/tests/test-hybridencode.py.out
--- a/tests/test-hybridencode.py.out
+++ b/tests/test-hybridencode.py.out
@@ -486,3 +486,67 @@
A = 'data/12345678/12345678/12345678/12345678/12345678/12345678/12345678/12345/-xxxxxxxxx-xxxxxxxxx-xxxxxxxxx-123456789-12.3456789-12345-ABCDEFGHIJKLMNOPRSTUVWXYZ-abcdefghjiklmnopqrstuvwxyz-ABCDEFGHIJKLMNOPRSTUVWXYZ-1234567890-xxxxxxxxx-xxxxxxxxx-xxxxxxxx-xxxxxxxxx-wwwwwwwww-wwwwwwwww-wwwwwwwww-wwwwwwwww-wwwwwwwww-wwwwwwwww-wwwwwwwww-wwwwwwwww-wwwwwwwww'
B = 'dh/12345678/12345678/12345678/12345678/12345678/12345678/12345678/12345/28de8651e30eeb95f4b97edb7d12b281d3fb3ce0.3456789-12345-abcdefghijklmnoprstuvwxyz-abcdefghjiklmnopqrstuvwxyz-abcdefghijklmnoprstuvwxyz-1234567890-xxxxxxxxx-xxxxxxxxx-xxxxxxxx-xxxxxxxxx-wwwwwwwww-wwwwwwwww-wwwwwwwww-wwwwwwwww-wwwwwwwww-wwwwwwwww-wwwwwwwww-wwwwwwwww-wwwwwwwww'
+cutdirs('data/a/c/p/n/c/l/foo.i')
+ 'data/a/c/p/n/c/l/foo~i'
+cutdirs('data/au/co/pr/nu/co/lp/foo.i')
+ 'data/au/co/pr/nu/co/lp/foo~i'
+cutdirs('data/aux/con/prn/nul/com/lpt/foo.i')
+ 'data/au~/co~/pr~/nu~/co~/lp~/foo~i'
+cutdirs('data/xaux/xcon/xprn/xnul/xcom/xlpt/foo.i')
+ 'data/xaux/xcon/xprn/xnul/xcom/xlpt/foo~i'
+cutdirs('data/auxx/conx/prnx/nulx/comx/lptx/foo.i')
+ 'data/au~x/co~x/pr~x/nu~x/co~x/lp~x/foo~i'
+cutdirs('data/com0/com1/com9/lpt0/lpt1/lpt9/foo.i')
+ 'data/co~0/co~1/co~9/lp~0/lp~1/lp~9/foo~i'
+
+cutdirs('data/.hello/ hello/hello./hello /foo.i')
+ 'data/~hello/~hello/hello~/hello~/foo~i'
+cutdirs('data/..hello/  hello/hello../hello  /foo.i')
+ 'data/~~hello/~~hello/hello~~/hello~~/foo~i'
+cutdirs('data/x.hello/x hello/hello.x/hello x/foo.i')
+ 'data/x~hello/x~hello/hello~x/hello~x/foo~i'
+cutdirs('data/.hel..lo/ hel  lo/hel..lo./hel  lo /foo.i')
+ 'data/~hel~~lo/~hel~~lo/hel~~lo~/hel~~lo~/foo~i'
+
+cutdirs('data/abcdefgh/ijklmnop/qrstuvwx/yz012345/789/foo.i')
+ 'data/abcdefgh/ijklmnop/qrstuvwx/yz012345/789/foo~i'
+cutdirs('data/xabcdefgh/xijklmnop/xqrstuvwx/xyz012345/789/foo.i')
+ 'data/xabcdefg/xijklmno/xqrstuvw/xyz01234/789/foo~i'
+cutdirs('data/abcdefghx/ijklmnopx/qrstuvwxx/yz012345x/789/foo.i')
+ 'data/abcdefgh/ijklmnop/qrstuvwx/yz012345/789/foo~i'
+cutdirs('data/abcdefg.x/ijklmno x/qrstuvwxx/yz012345x/789/foo.i')
+ 'data/abcdefg~/ijklmno~/qrstuvwx/yz012345/789/foo~i'
+
+cutdirs('data/ABCDEFGH/IJKLMNOP/QRSTUVWX/YZ012345/789/foo.i')
+ 'data/abcdefgh/ijklmnop/qrstuvwx/yz012345/789/foo~i'
+
+cutdirs('data/01234567/89 !"#%&/\'()+,-.;/=[]^`{}/foo.i')
+ 'data/01234567/89~!"#%&/\'()+,-~;/=[]^`{}/foo~i'
+
+Windows reserved characters
+cutdirs('data/\\sl\\ash\\/:co:lon:/*st*ar*/?que?st?/foo.i')
+ 'data/~sl~ash~/~co~lon~/~st~ar~/~que~st~/foo~i'
+cutdirs('data/"dqu"ot"/<le<ft</>rig>ht>/|pi|pe|/foo.i')
+ 'data/"dqu"ot"/~le~ft~/~rig~ht~/~pi~pe~/foo~i'
+
+cutdirs('data/01234567/01234567/01234567/01234567/01234567/01234567/01234567/012345/fo/aux/bla.i')
+ 'data/01234567/01234567/01234567/01234567/01234567/01234567/01234567/012345/fo/au'
+cutdirs('data/01234567/01234567/01234567/01234567/01234567/01234567/01234567/01234/fo/aux/bla.i')
+ 'data/01234567/01234567/01234567/01234567/01234567/01234567/01234567/01234/fo/au~'
+cutdirs('data/01234567x/01234567x/01234567x/01234567x/01234567x/01234567x/01234567x/01234567x/foo.i')
+ 'data/01234567/01234567/01234567/01234567/01234567/01234567/01234567/01234567/foo'
+
+cutdirs('data/the quick brown fox jumps over the lazy dog.i')
+ 'data/the~quic'
+
+characters in ASCII code range 1..31
+cutdirs('data/\x01\x02\x03\x04\x05\x06\x07\x08/\t\n\x0b\x0c\r\x0e\x0f\x10/foo.i')
+ 'data/abcdefgh/ijklmnop/foo~i'
+cutdirs('data/\x11\x12\x13\x14\x15\x16\x17\x18/\x19\x1a\x1b\x1c\x1d\x1e\x1f/foo.i')
+ 'data/qrstuvwx/yz{~}~~/foo~i'
+
+cutdirs('data/xyz{|}~\x7f/\x80\x81\x82/\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff/a.i')
+ 'data/xyz{~}~~/~ab/xyz{~}~~/a~i'
+
+cutdirs('data/\xae_\xae_\xae/bla.i')
+ 'data/~_~_~/bla~i'
More information about the Mercurial-devel
mailing list