[PATCH V3 abuehl] store: rewrite fncache path mangling code in C

Adrian Buehlmann adrian at cadifra.com
Thu Aug 30 14:00:15 CDT 2012


# HG changeset patch
# User Bryan O'Sullivan <bryano at fb.com>
# Date 1346175814 25200
# Node ID 38c525eb4a5038733b9e9141878638d34bde1d8c
# Parent  1e104d8198d7cf8ab7151da0e764676ea01cc115
store: rewrite fncache path mangling code in C

The Python path mangling code used by fncache

Adapted for MS C compiler by Adrian Buehlmann

diff --git a/mercurial/parsers.c b/mercurial/parsers.c
--- a/mercurial/parsers.c
+++ b/mercurial/parsers.c
@@ -1506,11 +1506,14 @@
 
 static char parsers_doc[] = "Efficient content parsing.";
 
+PyObject *pathencode(PyObject *self, PyObject *args);
+
 static PyMethodDef methods[] = {
 	{"pack_dirstate", pack_dirstate, METH_VARARGS, "pack a dirstate\n"},
 	{"parse_manifest", parse_manifest, METH_VARARGS, "parse a manifest\n"},
 	{"parse_dirstate", parse_dirstate, METH_VARARGS, "parse a dirstate\n"},
 	{"parse_index2", parse_index2, METH_VARARGS, "parse a revlog index\n"},
+	{"pathencode", pathencode, METH_VARARGS, "fncache-encode a path\n"},
 	{NULL, NULL}
 };
 
diff --git a/mercurial/pathencode.c b/mercurial/pathencode.c
new file mode 100644
--- /dev/null
+++ b/mercurial/pathencode.c
@@ -0,0 +1,677 @@
+/*
+ pathencode.c - efficient path name encoding
+
+ Copyright 2012 Facebook
+
+ This software may be used and distributed according to the terms of
+ the GNU General Public License, incorporated herein by reference.
+*/
+
+/*
+ * An implementation of the name encoding scheme used by the fncache
+ * store.  The common case is of a path < 120 bytes long, which is
+ * handled either in a single pass with no allocations or two passes
+ * with a single allocation.  For longer paths, multiple passes are
+ * required.
+ */
+
+#include <Python.h>
+#include <assert.h>
+#include <ctype.h>
+#include <stdlib.h>
+#include <string.h>
+#include <malloc.h>
+
+#ifndef _MSC_VER
+#include <stdint.h>
+#else
+#define inline __inline
+typedef unsigned char uint8_t;
+typedef unsigned __int32 uint32_t;
+#endif
+
+/* state machine for the fast path */
+enum path_state {
+	START,   /* first byte of a path component */
+	A,       /* "AUX" */
+	AU,
+	THIRD,   /* third of a 3-byte sequence, e.g. "AUX", "NUL" */
+	C,       /* "CON" or "COMn" */
+	CO,
+	COMLPT,  /* "COM" or "LPT" */
+	COMLPTn,
+	L,
+	LP,
+	N,
+	NU,
+	P,       /* "PRN" */
+	PR,
+	DOT,
+	H,       /* ".h" */
+	HGDI,    /* ".hg", ".d", or ".i" */
+	SPACE,
+	DEFAULT, /* byte of a path component after the first */
+};
+
+/* state machine for dir-encoding */
+enum dir_state {
+	DSTART,
+	DDOT,
+	DH,
+	DHGDI,
+	DDEFAULT,
+};
+
+static inline int isset(const uint32_t bitset[], char c)
+{
+	return bitset[c >> 5] & (1 << (c & 31));
+}
+
+static inline void charcopy(char *dest, Py_ssize_t *destlen, size_t destsize,
+			    char c)
+{
+	if (dest) {
+		assert(*destlen < destsize);
+		dest[*destlen] = c;
+	}
+	(*destlen)++;
+}
+
+static inline void memcopy(char *dest, Py_ssize_t *destlen, size_t destsize,
+			   const void *src, Py_ssize_t len)
+{
+	if (dest) {
+		assert(*destlen + len < destsize);
+		memcpy((void *)&dest[*destlen], src, len);
+	}
+	*destlen += len;
+}
+
+static inline void hexencode(char *dest, Py_ssize_t *destlen, size_t destsize,
+			     uint8_t c)
+{
+	static const char hexdigit[] = "0123456789abcdef";
+
+	charcopy(dest, destlen, destsize, hexdigit[c >> 4]);
+	charcopy(dest, destlen, destsize, hexdigit[c & 15]);
+}
+
+/* 3-byte escape: tilde followed by two hex digits */
+static inline void escape3(char *dest, Py_ssize_t *destlen, size_t destsize,
+			   char c)
+{
+	charcopy(dest, destlen, destsize, '~');
+	hexencode(dest, destlen, destsize, c);
+}
+
+static Py_ssize_t _encode(const uint32_t twobytes[8],
+			  const uint32_t onebyte[8],
+			  const uint32_t fastpath[8],
+			  char *dest, Py_ssize_t destlen, size_t destsize,
+			  const char *src, Py_ssize_t len)
+{
+	enum path_state state = START;
+	Py_ssize_t i = 0;
+
+	/*
+	 * Python strings end with a zero byte, which we use as a
+	 * terminal token as they are not valid inside path names.
+	 */
+
+	while (i < len) {
+		switch (state) {
+		case START:
+			switch (src[i]) {
+			case '/':
+				charcopy(dest, &destlen, destsize, src[i++]);
+				break;
+			case '.':
+			case ' ':
+				state = DEFAULT;
+				escape3(dest, &destlen, destsize, src[i++]);
+				break;
+			case 'a':
+				state = A;
+				charcopy(dest, &destlen, destsize, src[i++]);
+				break;
+			case 'c':
+				state = C;
+				charcopy(dest, &destlen, destsize, src[i++]);
+				break;
+			case 'l':
+				state = L;
+				charcopy(dest, &destlen, destsize, src[i++]);
+				break;
+			case 'n':
+				state = N;
+				charcopy(dest, &destlen, destsize, src[i++]);
+				break;
+			case 'p':
+				state = P;
+				charcopy(dest, &destlen, destsize, src[i++]);
+				break;
+			default:
+				state = DEFAULT;
+				break;
+			}
+			break;
+		case A:
+			if (src[i] == 'u') {
+				state = AU;
+				charcopy(dest, &destlen, destsize, src[i++]);
+			}
+			else state = DEFAULT;
+			break;
+		case AU:
+			if (src[i] == 'x') {
+				state = THIRD;
+				i++;
+			}
+			else state = DEFAULT;
+			break;
+		case THIRD:
+			state = DEFAULT;
+			switch (src[i]) {
+			case '.':
+			case '/':
+			case '\0':
+				escape3(dest, &destlen, destsize, src[i - 1]);
+				break;
+			default:
+				i--;
+				break;
+			}
+			break;
+		case C:
+			if (src[i] == 'o') {
+				state = CO;
+				charcopy(dest, &destlen, destsize, src[i++]);
+			}
+			else state = DEFAULT;
+			break;
+		case CO:
+			if (src[i] == 'm') {
+				state = COMLPT;
+				i++;
+			}
+			else if (src[i] == 'n') {
+				state = THIRD;
+				i++;
+			}
+			else state = DEFAULT;
+			break;
+		case COMLPT:
+			switch (src[i]) {
+			case '1': case '2': case '3': case '4': case '5':
+			case '6': case '7': case '8': case '9':
+				state = COMLPTn;
+				i++;
+				break;
+			default:
+				state = DEFAULT;
+				charcopy(dest, &destlen, destsize, src[i - 1]);
+				break;
+			}
+			break;
+		case COMLPTn:
+			state = DEFAULT;
+			switch (src[i]) {
+			case '.':
+			case '/':
+			case '\0':
+				escape3(dest, &destlen, destsize, src[i - 2]);
+				charcopy(dest, &destlen, destsize, src[i - 1]);
+				break;
+			default:
+				memcopy(dest, &destlen, destsize,
+					&src[i - 2], 2);
+				break;
+			}
+			break;
+		case L:
+			if (src[i] == 'p') {
+				state = LP;
+				charcopy(dest, &destlen, destsize, src[i++]);
+			}
+			else state = DEFAULT;
+			break;
+		case LP:
+			if (src[i] == 't') {
+				state = COMLPT;
+				i++;
+			}
+			else state = DEFAULT;
+			break;
+		case N:
+			if (src[i] == 'u') {
+				state = NU;
+				charcopy(dest, &destlen, destsize, src[i++]);
+			}
+			else state = DEFAULT;
+			break;
+		case NU:
+			if (src[i] == 'l') {
+				state = THIRD;
+				i++;
+			}
+			else state = DEFAULT;
+			break;
+		case P:
+			if (src[i] == 'r') {
+				state = PR;
+				charcopy(dest, &destlen, destsize, src[i++]);
+			}
+			else state = DEFAULT;
+			break;
+		case PR:
+			if (src[i] == 'n') {
+				state = THIRD;
+				i++;
+			}
+			else state = DEFAULT;
+			break;
+		case DOT:
+			switch (src[i]) {
+			case '/':
+			case '\0':
+				state = START;
+				memcopy(dest, &destlen, destsize, "~2e", 3);
+				charcopy(dest, &destlen, destsize, src[i++]);
+				break;
+			case 'd':
+			case 'i':
+				state = HGDI;
+				charcopy(dest, &destlen, destsize, '.');
+				charcopy(dest, &destlen, destsize, src[i++]);
+				break;
+			case 'h':
+				state = H;
+				charcopy(dest, &destlen, destsize, '.');
+				charcopy(dest, &destlen, destsize, src[i++]);
+				break;
+			default:
+				state = DEFAULT;
+				charcopy(dest, &destlen, destsize, '.');
+				break;
+			}
+			break;
+		case H:
+			if (src[i] == 'g') {
+				state = HGDI;
+				charcopy(dest, &destlen, destsize, src[i++]);
+			}
+			else state = DEFAULT;
+			break;
+		case HGDI:
+			if (src[i] == '/') {
+				state = START;
+				memcopy(dest, &destlen, destsize, ".hg", 3);
+				charcopy(dest, &destlen, destsize, src[i++]);
+			}
+			else state = DEFAULT;
+			break;
+		case SPACE:
+			switch (src[i]) {
+			case '/':
+			case '\0':
+				state = START;
+				memcopy(dest, &destlen, destsize, "~20", 3);
+				charcopy(dest, &destlen, destsize, src[i++]);
+				break;
+			default:
+				state = DEFAULT;
+				charcopy(dest, &destlen, destsize, ' ');
+				break;
+			}
+			break;
+		case DEFAULT:
+			while (i < len && isset(fastpath, src[i]))
+				charcopy(dest, &destlen, destsize, src[i++]);
+			if (i == len)
+				goto done;
+			switch (src[i]) {
+			case '.':
+				state = DOT;
+				i++;
+				break;
+			case ' ':
+				state = SPACE;
+				i++;
+				break;
+			case '/':
+				state = START;
+				charcopy(dest, &destlen, destsize, '/');
+				i++;
+				break;
+			default:
+				if (isset(onebyte, src[i])) {
+					do {
+						charcopy(dest, &destlen,
+							 destsize, src[i++]);
+					} while (i < len &&
+						 isset(fastpath, src[i]));
+				}
+				else if (isset(twobytes, src[i])) {
+					char c = src[i++];
+					charcopy(dest, &destlen, destsize, '_');
+					charcopy(dest, &destlen, destsize,
+						 c == '_' ? '_' : c + 32);
+				}
+				else
+					escape3(dest, &destlen, destsize,
+						src[i++]);
+				break;
+			}
+			break;
+		}
+	}
+done:
+	return destlen;
+}
+
+static Py_ssize_t basicencode(char *dest, size_t destsize,
+			      const char *src, Py_ssize_t len)
+{
+	static const uint32_t twobytes[8] = { 0, 0, 0x87fffffe };
+
+	static const uint32_t onebyte[8] = {
+		1, 0x2bfffbfa, 0x68000001, 0x2fffffff,
+	};
+
+	static const uint32_t fastpath[8] = {
+		1, 0x2bff3bfa, 0x68000001, 0x2fffffff,
+	};
+
+	Py_ssize_t destlen = 0;
+
+	if (len < 5 || memcmp(src, "data/", 5) != 0) {
+		memcopy(dest, &destlen, destsize, src, len);
+		return destlen;
+	}
+
+	memcopy(dest, &destlen, destsize, "data/", 5);
+
+	return _encode(twobytes, onebyte, fastpath, dest, destlen, destsize,
+		       src + 5, len - 5);
+}
+
+static Py_ssize_t direncode(char *dest, size_t destsize,
+			    const char *src, Py_ssize_t len)
+{
+	enum dir_state state = DSTART;
+	Py_ssize_t i = 5, destlen = 0;
+
+	memcopy(dest, &destlen, destsize, "data/", 5);
+
+	while (i < len) {
+		switch (state) {
+		case DSTART:
+			state = DDEFAULT;
+			charcopy(dest, &destlen, destsize, src[i++]);
+			break;
+		case DDOT:
+			switch (src[i]) {
+			case 'd':
+			case 'i':
+				state = DHGDI;
+				charcopy(dest, &destlen, destsize, '.');
+				charcopy(dest, &destlen, destsize, src[i++]);
+				break;
+			case 'h':
+				state = DH;
+				charcopy(dest, &destlen, destsize, '.');
+				charcopy(dest, &destlen, destsize, src[i++]);
+				break;
+			default:
+				state = DDEFAULT;
+				charcopy(dest, &destlen, destsize, '.');
+				break;
+			}
+		case DH:
+			if (src[i] == 'g') {
+				state = DHGDI;
+				charcopy(dest, &destlen, destsize, src[i++]);
+			}
+			else state = DDEFAULT;
+			break;
+		case DHGDI:
+			if (src[i] == '/') {
+				state = DSTART;
+				memcopy(dest, &destlen, destsize, ".hg", 3);
+				charcopy(dest, &destlen, destsize, src[i++]);
+			}
+			else state = DDEFAULT;
+			break;
+		case DDEFAULT:
+			switch (src[i]) {
+			case '.':
+				state = DDOT;
+				i++;
+				break;
+			case '/':
+				state = DSTART;
+			default:
+				charcopy(dest, &destlen, destsize, src[i++]);
+				break;
+			}
+			break;
+		}
+	}
+
+	return destlen;
+}
+
+static Py_ssize_t auxencode(char *dest, size_t destsize,
+			    const char *src, Py_ssize_t len)
+{
+	static const uint32_t twobytes[8];
+
+	static const uint32_t onebyte[8] = {
+		~0, 0xffffbffe, ~0, ~0, ~0, ~0, ~0, ~0,
+	};
+
+	return _encode(twobytes, onebyte, onebyte, dest, 0, destsize,
+		       src, len);
+}
+
+static const Py_ssize_t maxstorepathlen = 120;
+
+static Py_ssize_t lowerencode(char *dest, size_t destsize,
+			      const char *src, Py_ssize_t len)
+{
+	static const uint32_t onebyte[8] = {
+		1, 0x2bfffbfb, 0xe8000001, 0x2fffffff
+	};
+
+	static const uint32_t lower[8] = { 0, 0, 0x7fffffe };
+
+	Py_ssize_t i = 0, destlen = 0;
+
+	while (i < len) {
+		if (isset(onebyte, src[i]))
+			charcopy(dest, &destlen, destsize, src[i++]);
+		else if (isset(lower, src[i]))
+			charcopy(dest, &destlen, destsize, src[i++] + 32);
+		else
+			escape3(dest, &destlen, destsize, src[i++]);
+	}
+
+	return destlen;
+}
+
+static Py_ssize_t hashmangle(char *dest, size_t destsize, const char *src,
+			     Py_ssize_t len, const char sha[20])
+{
+	static const Py_ssize_t dirprefixlen = 8;
+	static const Py_ssize_t maxshortdirslen = 68;
+
+	Py_ssize_t i, d, p, lastslash = len - 1, lastdot = -1;
+	Py_ssize_t destlen = 0, slop;
+
+	while (lastslash >= 0 && src[lastslash] != '/') {
+		if (src[lastslash] == '.' && lastdot == -1)
+			lastdot = lastslash;
+		lastslash--;
+	}
+
+	for (i = d = p = 0; i < lastslash; i++, p++) {
+		if (src[i] == '/') {
+			if (destlen > maxshortdirslen)
+				break;
+			charcopy(dest, &destlen, destsize, src[i]);
+			p = -1;
+		}
+		else if (p < dirprefixlen)
+			charcopy(dest, &destlen, destsize, src[i]);
+	}
+
+	if (destlen > maxshortdirslen)
+		do {
+			destlen--;
+		} while (destlen > 0 && dest[destlen] != '/');
+
+	if (destlen > 3)
+		charcopy(dest, &destlen, destsize, '/');
+
+	slop = maxstorepathlen - destlen - 3 - (40 + len - lastdot - 1);
+	if (slop > len - lastslash - 2)
+		slop = len - lastslash - 2;
+
+	if (slop > 0)
+		memcopy(dest, &destlen, destsize, &src[lastslash + 1], slop);
+
+	for (i = 0; i < 20; i++)
+		hexencode(dest, &destlen, destsize, sha[i]);
+
+	if (lastdot >= 0)
+		memcopy(dest, &destlen, destsize, &src[lastdot],
+			len - lastdot);
+	else
+		charcopy(dest, &destlen, destsize, 0);
+
+	return destlen;
+}
+
+/*
+ * Avoiding a trip through Python would improve performance by 50%,
+ * but we don't encounter enough long names to be worth the code.
+ */
+static int sha1hash(char hash[20], const char *str, Py_ssize_t len)
+{
+	static PyObject *shafunc;
+	PyObject *shaobj, *hashobj;
+
+	if (shafunc == NULL) {
+		PyObject *util, *name = PyString_FromString("mercurial.util");
+
+		if (name == NULL)
+			return -1;
+
+		util = PyImport_Import(name);
+		Py_DECREF(name);
+
+		if (util == NULL) {
+			PyErr_SetString(PyExc_ImportError, "mercurial.util");
+			return -1;
+		}
+		shafunc = PyObject_GetAttrString(util, "sha1");
+		Py_DECREF(util);
+
+		if (shafunc == NULL) {
+			PyErr_SetString(PyExc_AttributeError,
+					"module 'mercurial.util' has no "
+					"attribute 'sha1'");
+			return -1;
+		}
+	}
+
+	shaobj = PyObject_CallFunction(shafunc, "s#", str, len);
+
+	if (shaobj == NULL)
+		return -1;
+
+	hashobj = PyObject_CallMethod(shaobj, "digest", "");
+	Py_DECREF(shaobj);
+
+	if (!PyString_Check(hashobj) || PyString_GET_SIZE(hashobj) != 20) {
+		PyErr_SetString(PyExc_TypeError,
+				"result of digest is not a 20-byte hash");
+		Py_DECREF(hashobj);
+		return -1;
+	}
+
+	memcpy(hash, PyString_AS_STRING(hashobj), 20);
+	Py_DECREF(hashobj);
+	return 0;
+}
+
+static Py_ssize_t hashencode(char *dest, size_t destsize, const char *src,
+			     Py_ssize_t len)
+{
+	const Py_ssize_t baselen = (len - 5) * 3;
+	char *dired = alloca(baselen);
+	char *lowered = alloca(baselen);
+	char *auxed = alloca(baselen);
+	char *mangled = alloca(baselen);
+	Py_ssize_t dirlen, lowerlen, auxlen, destlen;
+	char sha[20];
+
+	dirlen = direncode(dired, baselen, src, len);
+	if (sha1hash(sha, dired, dirlen - 1) == -1)
+		return -1;
+	lowerlen = lowerencode(lowered, baselen, src + 5, len - 5);
+	auxlen = auxencode(auxed, baselen, lowered, lowerlen);
+	memcpy(mangled, "dh/", 3);
+	destlen = hashmangle(mangled + 3, baselen - 3, auxed, auxlen, sha) + 2;
+	if (dest) {
+		assert(destlen <= destsize);
+		memcpy(dest, mangled, destlen);
+	}
+
+	return destlen;
+}
+
+PyObject *pathencode(PyObject *self, PyObject *args)
+{
+	Py_ssize_t len, newlen;
+	PyObject *pathobj, *newobj;
+	char *path;
+
+	if (!PyArg_ParseTuple(args, "O:pathencode", &pathobj))
+		return NULL;
+
+	if (PyString_AsStringAndSize(pathobj, &path, &len) == -1) {
+		PyErr_SetString(PyExc_TypeError, "expected a string");
+		return NULL;
+	}
+
+	newlen = len ? basicencode(NULL, 0, path, len + 1) : 1;
+
+	if (newlen == len + 1) {
+		Py_INCREF(pathobj);
+		return pathobj;
+	}
+
+	if (newlen <= maxstorepathlen + 1) {
+		newobj = PyString_FromStringAndSize(NULL, newlen);
+		PyString_GET_SIZE(newobj)--;
+
+		if (newobj)
+			basicencode(PyString_AS_STRING(newobj), newlen, path,
+				    len + 1);
+	} else {
+		newlen = 170;
+		newobj = PyString_FromStringAndSize(NULL, 170);
+
+		if (newobj) {
+			newlen = hashencode(PyString_AS_STRING(newobj),
+					    newlen, path, len + 1);
+			if (newlen == -1)
+				Py_CLEAR(newobj);
+			else
+				PyString_GET_SIZE(newobj) = newlen;
+		}
+	}
+
+	return newobj;
+}
diff --git a/mercurial/store.py b/mercurial/store.py
--- a/mercurial/store.py
+++ b/mercurial/store.py
@@ -6,7 +6,7 @@
 # GNU General Public License version 2 or any later version.
 
 from i18n import _
-import osutil, scmutil, util
+import osutil, scmutil, util, parsers
 import os, stat, errno
 
 _sha = util.sha1
@@ -425,8 +425,11 @@
 def store(requirements, path, openertype):
     if 'store' in requirements:
         if 'fncache' in requirements:
-            auxencode = lambda f: _auxencode(f, 'dotencode' in requirements)
-            encode = lambda f: _hybridencode(f, auxencode)
+            dotreq = 'dotencode' in requirements
+            encode = dotreq and getattr(parsers, 'pathencode')
+            if not encode:
+                auxencode = lambda f: _auxencode(f, dotreq)
+                encode = lambda f: _hybridencode(f, auxencode)
             return fncachestore(path, openertype, encode)
         return encodedstore(path, openertype)
     return basicstore(path, openertype)
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -421,7 +421,8 @@
     Extension('mercurial.bdiff', ['mercurial/bdiff.c']),
     Extension('mercurial.diffhelpers', ['mercurial/diffhelpers.c']),
     Extension('mercurial.mpatch', ['mercurial/mpatch.c']),
-    Extension('mercurial.parsers', ['mercurial/parsers.c']),
+    Extension('mercurial.parsers', ['mercurial/parsers.c',
+                                    'mercurial/pathencode.c']),
     ]
 
 osutil_ldflags = []


More information about the Mercurial-devel mailing list