[PATCH] V11 of experiment for a simpler path encoding for hashed paths (for "fncache2")

Adrian Buehlmann adrian at cadifra.com
Fri Sep 28 06:26:59 CDT 2012


# HG changeset patch
# User Adrian Buehlmann <adrian at cadifra.com>
# Date 1348831411 -7200
# Node ID c740b5c4df2d5c13836ceb36317fbd097e99a37a
# Parent  1e98f31c318dbee8a9b641cc783a07a86d321524
V11 of experiment for a simpler path encoding for hashed paths (for "fncache2")

Changes compared to V10:

- Now assembles the current segment into local "char seg[8]". This causes the
  last incomplete segment to be abandoned and dropped *as a whole* if the total
  length limit has already been reached. Which is the same logic as Bryan's
  recent Python RFC patch. [1]

- We thus now can conveniently inspect and tweak the segment as a whole. We can
  now check for the exact reserved strings directly, grouping them by seglen,
  thus making the escaping less greedy (e.g. "command" was escaped to "co~mand"
  with V10, now it remains unchanged).
  If the segment doesn't have length 3 or 4, we don't even have to do any
  reserved names comparisons.
  Note that the char-encoding done still encodes all periods and spaces to ~,
  so we don't need to worry about embedded periods, e.g. reserved "aux.foo"
  can't happen, as it has already been encoded to "aux~foo", which is not
  reserved (we can already infer from the length that "aux~foo" can't be
  reserved).

- Updated the expected test output accordingly and added a few more cases.

[1] http://selenic.com/pipermail/mercurial-devel/2012-September/044535.html

diff --git a/mercurial/parsers.c b/mercurial/parsers.c
--- a/mercurial/parsers.c
+++ b/mercurial/parsers.c
@@ -1508,6 +1508,7 @@
 
 PyObject *encodedir(PyObject *self, PyObject *args);
 PyObject *pathencode(PyObject *self, PyObject *args);
+PyObject *cutdirs(PyObject *self, PyObject *args);
 
 static PyMethodDef methods[] = {
 	{"pack_dirstate", pack_dirstate, METH_VARARGS, "pack a dirstate\n"},
@@ -1516,6 +1517,7 @@
 	{"parse_index2", parse_index2, METH_VARARGS, "parse a revlog index\n"},
 	{"encodedir", encodedir, METH_VARARGS, "encodedir a path\n"},
 	{"pathencode", pathencode, METH_VARARGS, "fncache-encode a path\n"},
+	{"cutdirs", cutdirs, METH_VARARGS, "fncache-encode a path\n"},
 	{NULL, NULL}
 };
 
diff --git a/mercurial/pathencode.c b/mercurial/pathencode.c
--- a/mercurial/pathencode.c
+++ b/mercurial/pathencode.c
@@ -481,6 +481,92 @@
 
 static const Py_ssize_t maxstorepathlen = 120;
 
+static const char encchar[256] = /* replacement char per byte value, 32 entries per row */
+	"~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" /* 0x00-0x1f */
+	"~!~#$%&'()~+,-~~0123456789~;~=~~" /* 0x20-0x3f */
+	"@abcdefghijklmnopqrstuvwxyz[~]^_" /* 0x40-0x5f: 'A'-'Z' become lower case */
+	"`abcdefghijklmnopqrstuvwxyz{~}~~" /* 0x60-0x7f */
+	"~abcdefghijklmnopqrstuvwxyz{~}~~" /* 0x80-0x9f */
+	"~!~#$%&'()~+,-~~0123456789~;~=~~" /* 0xa0-0xbf */
+	"@abcdefghijklmnopqrstuvwxyz[~]^_" /* 0xc0-0xdf */
+	"`abcdefghijklmnopqrstuvwxyz{~}~~"; /* 0xe0-0xff */
+
+/* this encoding folds: the table is many-to-one, e.g. 'A' and 'a' both give 'a' */
+static inline char encodechar(char c)
+{
+	return encchar[0xff & c]; /* mask so a negative char still indexes 0..255 */
+}
+
+static Py_ssize_t _cutdirs(char *dest, Py_ssize_t destlen, size_t destsize,
+			   const char *src, Py_ssize_t len)
+{ /* encode src segment by segment into dest; returns the final destlen */
+	Py_ssize_t i = 0, spaceleft = maxstorepathlen - 40 + 1; /* char budget; 40 is presumably the hash length - TODO confirm */
+	char seg[8]; /* encoded chars of the current segment (at most 8 are kept) */
+	int seglen = 0; /* number of valid chars in seg */
+	uint32_t cmp; /* first three chars of seg packed into one integer */
+
+	while (i < len && spaceleft > 0) { /* stop at end of input or when the budget is used up */
+		if (src[i] == '/' || src[i] == '\0') { /* segment boundary */
+			if (seglen != 0) {
+				if (seglen == 3) { /* exact reserved names only */
+					cmp = seg[0] << 16 | seg[1] << 8 | seg[2];
+					if (   cmp == 0x617578 /* aux */
+					    || cmp == 0x636f6e /* con */
+					    || cmp == 0x70726e /* prn */
+					    || cmp == 0x6e756c /* nul */)
+						seg[2] = '~'; /* replace the last char so the name no longer matches */
+				}
+				else if (seglen == 4) { /* com1..com9 and lpt1..lpt9 */
+					cmp = seg[0] << 16 | seg[1] << 8 | seg[2];
+					if ((   cmp == 0x636f6d /* com */
+					     || cmp == 0x6c7074 /* lpt */
+					     ) && seg[3] <= '9' && seg[3] >= '1')
+						seg[3] = '~'; /* replace the digit */
+				}
+				memcopy(dest, &destlen, destsize, &seg, seglen); /* flush the finished segment; memcopy is defined elsewhere, presumably appends and advances destlen */
+				seglen = 0;
+			}
+			charcopy(dest, &destlen, destsize, src[i++]); /* the separator itself is passed on unencoded */
+			spaceleft--;
+		}
+		else if (seglen == 8) { /* seg is full: skip the rest of this segment */
+			i++; /* skipped chars are not charged against spaceleft */
+		}
+		else {
+			seg[seglen++] = encodechar(src[i++]);
+			spaceleft--;
+		}
+	}
+
+	return destlen; /* a segment still pending in seg here was never flushed, i.e. it is dropped as a whole */
+}
+
+PyObject *cutdirs(PyObject *self, PyObject *args)
+{ /* Python entry point: takes one string argument, returns a new encoded string */
+	Py_ssize_t len, newlen;
+	PyObject *pathobj, *newobj;
+	char *path;
+
+	if (!PyArg_ParseTuple(args, "O:cutdirs", &pathobj))
+		return NULL;
+
+	if (PyString_AsStringAndSize(pathobj, &path, &len) == -1) {
+		PyErr_SetString(PyExc_TypeError, "expected a string");
+		return NULL;
+	}
+
+	newlen = len ? _cutdirs(NULL, 0, 0, path, len + 1) : 1; /* sizing pass with dest == NULL; len + 1 so the byte after the path is processed too */
+
+	newobj = PyString_FromStringAndSize(NULL, newlen); /* NULL on allocation failure, returned as-is below */
+
+	if (newobj) {
+		PyString_GET_SIZE(newobj)--; /* presumably newlen counts one trailing separator/NUL that is not part of the result - NOTE(review): confirm */
+		_cutdirs(PyString_AS_STRING(newobj), 0, newlen, path, len + 1); /* fill pass into the new string's buffer */
+	}
+
+	return newobj;
+}
+
 /*
  * We currently implement only basic encoding.
  *
diff --git a/tests/test-hybridencode.py b/tests/test-hybridencode.py
--- a/tests/test-hybridencode.py
+++ b/tests/test-hybridencode.py
@@ -455,3 +455,61 @@
           'VWXYZ-1234567890-xxxxxxxxx-xxxxxxxxx-xxxxxxxx-xxxx'
           'xxxxx-wwwwwwwww-wwwwwwwww-wwwwwwwww-wwwwwwwww-wwww'
           'wwwww-wwwwwwwww-wwwwwwwww-wwwwwwwww-wwwwwwwww')
+
+from mercurial.parsers import cutdirs
+
+def cd(s):  # print the input and its cutdirs() result, both string_escape-encoded
+    print "cutdirs('%s')" % s.encode("string_escape")
+    print "        '%s'" % cutdirs(s).encode("string_escape")
+
+cd('data/a/c/p/n/c/l/foo.i')
+cd('data/au/co/pr/nu/co/lp/foo.i')
+cd('data/aux/con/prn/nul/com/lpt/foo.i')
+cd('data/xaux/xcon/xprn/xnul/xcom/xlpt/foo.i')
+cd('data/auxx/conx/prnx/nulx/comx/lptx/foo.i')
+cd('data/com0/com1/com9/lpt0/lpt1/lpt9/foo.i')
+print
+cd('data/common/auxiliary/nulling/console/bla.com/foo.i')
+print
+cd('data/.hello/ hello/hello./hello /foo.i')
+cd('data/..hello/  hello/hello../hello  /foo.i')
+cd('data/x.hello/x hello/hello.x/hello x/foo.i')
+cd('data/.hel..lo/ hel  lo/hel..lo./hel  lo /foo.i')
+print
+cd('data/abcdefgh/ijklmnop/qrstuvwx/yz012345/789/foo.i')
+cd('data/xabcdefgh/xijklmnop/xqrstuvwx/xyz012345/789/foo.i')
+cd('data/abcdefghx/ijklmnopx/qrstuvwxx/yz012345x/789/foo.i')
+cd('data/abcdefg.x/ijklmno x/qrstuvwxx/yz012345x/789/foo.i')
+print
+cd('data/ABCDEFGH/IJKLMNOP/QRSTUVWX/YZ012345/789/foo.i')
+print
+cd("data/01234567/89 !\"#%&/'()+,-.;/=[]^`{}/foo.i")
+print
+print "Windows reserved characters"
+cd('data/\\sl\\ash\\/:co:lon:/*st*ar*/?que?st?/foo.i')
+cd('data/"dqu"ot"/<le<ft</>rig>ht>/|pi|pe|/foo.i')
+print
+cd('data/01234567/01234567/01234567/01234567/01234567'
+   '/01234567/01234567/012345/fo/aux/bla.i')
+cd('data/01234567/01234567/01234567/01234567/01234567'
+   '/01234567/01234567/01234/fo/aux/bla.i')
+cd('data/01234567x/01234567x/01234567x/01234567x/01234567x'
+  '/01234567x/01234567x/01234567x/foo.i')
+print
+cd('data/a/b/c/d/e/f/g/h/i/j/k/l/m/n/o/p/q/r/s/t/u/v/w/x/y/z'
+   '/0/1/2/3/4/5/6/7/8/9/aux/bla.i')
+cd('data/a/b/c/d/e/f/g/h/i/j/k/l/m/n/o/p/q/r/s/t/u/v/w/x/y/z'
+   '/0/1/2/3/4/5/6/7/8/9/-/aux/bla.i')
+print
+cd('data/the quick brown fox jumps over the lazy dog.i')
+print
+print "characters in ASCII code range 1..31"
+cd('data/\x01\x02\x03\x04\x05\x06\x07\x08'
+   '/\x09\x0a\x0b\x0c\x0d\x0e\x0f\x10/foo.i')
+cd('data/\x11\x12\x13\x14\x15\x16\x17\x18'
+   '/\x19\x1a\x1b\x1c\x1d\x1e\x1f/foo.i')
+print
+cd('data/\x78\x79\x7a\x7b\x7c\x7d\x7e\x7f/\x80\x81\x82/'
+    '\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff/a.i')
+print
+cd('data/\xae_\xae_\xae/bla.i')
diff --git a/tests/test-hybridencode.py.out b/tests/test-hybridencode.py.out
--- a/tests/test-hybridencode.py.out
+++ b/tests/test-hybridencode.py.out
@@ -486,3 +486,75 @@
 A = 'data/12345678/12345678/12345678/12345678/12345678/12345678/12345678/12345/-xxxxxxxxx-xxxxxxxxx-xxxxxxxxx-123456789-12.3456789-12345-ABCDEFGHIJKLMNOPRSTUVWXYZ-abcdefghjiklmnopqrstuvwxyz-ABCDEFGHIJKLMNOPRSTUVWXYZ-1234567890-xxxxxxxxx-xxxxxxxxx-xxxxxxxx-xxxxxxxxx-wwwwwwwww-wwwwwwwww-wwwwwwwww-wwwwwwwww-wwwwwwwww-wwwwwwwww-wwwwwwwww-wwwwwwwww-wwwwwwwww'
 B = 'dh/12345678/12345678/12345678/12345678/12345678/12345678/12345678/12345/28de8651e30eeb95f4b97edb7d12b281d3fb3ce0.3456789-12345-abcdefghijklmnoprstuvwxyz-abcdefghjiklmnopqrstuvwxyz-abcdefghijklmnoprstuvwxyz-1234567890-xxxxxxxxx-xxxxxxxxx-xxxxxxxx-xxxxxxxxx-wwwwwwwww-wwwwwwwww-wwwwwwwww-wwwwwwwww-wwwwwwwww-wwwwwwwww-wwwwwwwww-wwwwwwwww-wwwwwwwww'
 
+cutdirs('data/a/c/p/n/c/l/foo.i')
+        'data/a/c/p/n/c/l/foo~i'
+cutdirs('data/au/co/pr/nu/co/lp/foo.i')
+        'data/au/co/pr/nu/co/lp/foo~i'
+cutdirs('data/aux/con/prn/nul/com/lpt/foo.i')
+        'data/au~/co~/pr~/nu~/com/lpt/foo~i'
+cutdirs('data/xaux/xcon/xprn/xnul/xcom/xlpt/foo.i')
+        'data/xaux/xcon/xprn/xnul/xcom/xlpt/foo~i'
+cutdirs('data/auxx/conx/prnx/nulx/comx/lptx/foo.i')
+        'data/auxx/conx/prnx/nulx/comx/lptx/foo~i'
+cutdirs('data/com0/com1/com9/lpt0/lpt1/lpt9/foo.i')
+        'data/com0/com~/com~/lpt0/lpt~/lpt~/foo~i'
+
+cutdirs('data/common/auxiliary/nulling/console/bla.com/foo.i')
+        'data/common/auxiliar/nulling/console/bla~com/foo~i'
+
+cutdirs('data/.hello/ hello/hello./hello /foo.i')
+        'data/~hello/~hello/hello~/hello~/foo~i'
+cutdirs('data/..hello/  hello/hello../hello  /foo.i')
+        'data/~~hello/~~hello/hello~~/hello~~/foo~i'
+cutdirs('data/x.hello/x hello/hello.x/hello x/foo.i')
+        'data/x~hello/x~hello/hello~x/hello~x/foo~i'
+cutdirs('data/.hel..lo/ hel  lo/hel..lo./hel  lo /foo.i')
+        'data/~hel~~lo/~hel~~lo/hel~~lo~/hel~~lo~/foo~i'
+
+cutdirs('data/abcdefgh/ijklmnop/qrstuvwx/yz012345/789/foo.i')
+        'data/abcdefgh/ijklmnop/qrstuvwx/yz012345/789/foo~i'
+cutdirs('data/xabcdefgh/xijklmnop/xqrstuvwx/xyz012345/789/foo.i')
+        'data/xabcdefg/xijklmno/xqrstuvw/xyz01234/789/foo~i'
+cutdirs('data/abcdefghx/ijklmnopx/qrstuvwxx/yz012345x/789/foo.i')
+        'data/abcdefgh/ijklmnop/qrstuvwx/yz012345/789/foo~i'
+cutdirs('data/abcdefg.x/ijklmno x/qrstuvwxx/yz012345x/789/foo.i')
+        'data/abcdefg~/ijklmno~/qrstuvwx/yz012345/789/foo~i'
+
+cutdirs('data/ABCDEFGH/IJKLMNOP/QRSTUVWX/YZ012345/789/foo.i')
+        'data/abcdefgh/ijklmnop/qrstuvwx/yz012345/789/foo~i'
+
+cutdirs('data/01234567/89 !"#%&/\'()+,-.;/=[]^`{}/foo.i')
+        'data/01234567/89~!~#%&/\'()+,-~;/=[]^`{}/foo~i'
+
+Windows reserved characters
+cutdirs('data/\\sl\\ash\\/:co:lon:/*st*ar*/?que?st?/foo.i')
+        'data/~sl~ash~/~co~lon~/~st~ar~/~que~st~/foo~i'
+cutdirs('data/"dqu"ot"/<le<ft</>rig>ht>/|pi|pe|/foo.i')
+        'data/~dqu~ot~/~le~ft~/~rig~ht~/~pi~pe~/foo~i'
+
+cutdirs('data/01234567/01234567/01234567/01234567/01234567/01234567/01234567/012345/fo/aux/bla.i')
+        'data/01234567/01234567/01234567/01234567/01234567/01234567/01234567/012345/fo'
+cutdirs('data/01234567/01234567/01234567/01234567/01234567/01234567/01234567/01234/fo/aux/bla.i')
+        'data/01234567/01234567/01234567/01234567/01234567/01234567/01234567/01234/fo/au~'
+cutdirs('data/01234567x/01234567x/01234567x/01234567x/01234567x/01234567x/01234567x/01234567x/foo.i')
+        'data/01234567/01234567/01234567/01234567/01234567/01234567/01234567/01234567'
+
+cutdirs('data/a/b/c/d/e/f/g/h/i/j/k/l/m/n/o/p/q/r/s/t/u/v/w/x/y/z/0/1/2/3/4/5/6/7/8/9/aux/bla.i')
+        'data/a/b/c/d/e/f/g/h/i/j/k/l/m/n/o/p/q/r/s/t/u/v/w/x/y/z/0/1/2/3/4/5/6/7/8/9/au~'
+cutdirs('data/a/b/c/d/e/f/g/h/i/j/k/l/m/n/o/p/q/r/s/t/u/v/w/x/y/z/0/1/2/3/4/5/6/7/8/9/-/aux/bla.i')
+        'data/a/b/c/d/e/f/g/h/i/j/k/l/m/n/o/p/q/r/s/t/u/v/w/x/y/z/0/1/2/3/4/5/6/7/8/9/-'
+
+cutdirs('data/the quick brown fox jumps over the lazy dog.i')
+        'data/the~quic'
+
+characters in ASCII code range 1..31
+cutdirs('data/\x01\x02\x03\x04\x05\x06\x07\x08/\t\n\x0b\x0c\r\x0e\x0f\x10/foo.i')
+        'data/~~~~~~~~/~~~~~~~~/foo~i'
+cutdirs('data/\x11\x12\x13\x14\x15\x16\x17\x18/\x19\x1a\x1b\x1c\x1d\x1e\x1f/foo.i')
+        'data/~~~~~~~~/~~~~~~~/foo~i'
+
+cutdirs('data/xyz{|}~\x7f/\x80\x81\x82/\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff/a.i')
+        'data/xyz{~}~~/~ab/xyz{~}~~/a~i'
+
+cutdirs('data/\xae_\xae_\xae/bla.i')
+        'data/~_~_~/bla~i'


More information about the Mercurial-devel mailing list