[PATCH v2] store: rewrite fncache path mangling code in C
Bryan O'Sullivan
bos at serpentine.com
Tue Aug 28 12:43:43 CDT 2012
# HG changeset patch
# User Bryan O'Sullivan <bryano at fb.com>
# Date 1346175814 25200
# Node ID 91f70954e9d681a35130aa24f66aaa7148d8ee1b
# Parent 99a2a4ae35e2180b7f825ef2677c36d538eac4ba
store: rewrite fncache path mangling code in C
The Python path mangling code used by fncache
diff --git a/mercurial/parsers.c b/mercurial/parsers.c
--- a/mercurial/parsers.c
+++ b/mercurial/parsers.c
@@ -1506,11 +1506,14 @@
static char parsers_doc[] = "Efficient content parsing.";
+PyObject *pathencode(PyObject *self, PyObject *args);
+
static PyMethodDef methods[] = {
{"pack_dirstate", pack_dirstate, METH_VARARGS, "pack a dirstate\n"},
{"parse_manifest", parse_manifest, METH_VARARGS, "parse a manifest\n"},
{"parse_dirstate", parse_dirstate, METH_VARARGS, "parse a dirstate\n"},
{"parse_index2", parse_index2, METH_VARARGS, "parse a revlog index\n"},
+ {"pathencode", pathencode, METH_VARARGS, "fncache-encode a path\n"},
{NULL, NULL}
};
diff --git a/mercurial/pathencode.c b/mercurial/pathencode.c
new file mode 100644
--- /dev/null
+++ b/mercurial/pathencode.c
@@ -0,0 +1,669 @@
+/*
+ pathencode.c - efficient path name encoding
+
+ Copyright 2012 Facebook
+
+ This software may be used and distributed according to the terms of
+ the GNU General Public License, incorporated herein by reference.
+*/
+
+/*
+ * An implementation of the name encoding scheme used by the fncache
+ * store. The common case is of a path < 120 bytes long, which is
+ * handled either in a single pass with no allocations or two passes
+ * with a single allocation. For longer paths, multiple passes are
+ * required.
+ */
+
+#include <Python.h>
+#include <assert.h>
+#include <ctype.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* state machine for the fast path */
+enum path_state {
+ START, /* first byte of a path component */
+ A, /* "AUX" */
+ AU,
+ THIRD, /* third of a 3-byte sequence, e.g. "AUX", "NUL" */
+ C, /* "CON" or "COMn" */
+ CO,
+ COMLPT, /* "COM" or "LPT" */
+ COMLPTn,
+ L,
+ LP,
+ N,
+ NU,
+ P, /* "PRN" */
+ PR,
+ DOT,
+ H, /* ".h" */
+ HGDI, /* ".hg", ".d", or ".i" */
+ SPACE,
+ DEFAULT, /* byte of a path component after the first */
+};
+
+/* state machine for dir-encoding */
+enum dir_state {
+ DSTART,
+ DDOT,
+ DH,
+ DHGDI,
+ DDEFAULT,
+};
+
+static inline int isset(const uint32_t bitset[], char c)
+{
+ return bitset[c >> 5] & (1 << (c & 31));
+}
+
+static inline void charcopy(char *dest, Py_ssize_t *destlen, size_t destsize,
+ char c)
+{
+ if (dest) {
+ assert(*destlen < destsize);
+ dest[*destlen] = c;
+ }
+ (*destlen)++;
+}
+
+static inline void memcopy(char *dest, Py_ssize_t *destlen, size_t destsize,
+ const void *src, Py_ssize_t len)
+{
+ if (dest) {
+ assert(*destlen + len < destsize);
+ memcpy((void *)&dest[*destlen], src, len);
+ }
+ *destlen += len;
+}
+
+static inline void hexencode(char *dest, Py_ssize_t *destlen, size_t destsize,
+ uint8_t c)
+{
+ static const char hexdigit[] = "0123456789abcdef";
+
+ charcopy(dest, destlen, destsize, hexdigit[c >> 4]);
+ charcopy(dest, destlen, destsize, hexdigit[c & 15]);
+}
+
+/* 3-byte escape: tilde followed by two hex digits */
+static inline void escape3(char *dest, Py_ssize_t *destlen, size_t destsize,
+ char c)
+{
+ charcopy(dest, destlen, destsize, '~');
+ hexencode(dest, destlen, destsize, c);
+}
+
+static Py_ssize_t _encode(const uint32_t twobytes[8],
+ const uint32_t onebyte[8],
+ const uint32_t fastpath[8],
+ char *dest, Py_ssize_t destlen, size_t destsize,
+ const char *src, Py_ssize_t len)
+{
+ enum path_state state = START;
+ Py_ssize_t i = 0;
+
+ /*
+ * Python strings end with a zero byte, which we use as a
+ * terminal token as they are not valid inside path names.
+ */
+
+ while (i < len) {
+ switch (state) {
+ case START:
+ switch (src[i]) {
+ case '/':
+ charcopy(dest, &destlen, destsize, src[i++]);
+ break;
+ case '.':
+ case ' ':
+ state = DEFAULT;
+ escape3(dest, &destlen, destsize, src[i++]);
+ break;
+ case 'a':
+ state = A;
+ charcopy(dest, &destlen, destsize, src[i++]);
+ break;
+ case 'c':
+ state = C;
+ charcopy(dest, &destlen, destsize, src[i++]);
+ break;
+ case 'l':
+ state = L;
+ charcopy(dest, &destlen, destsize, src[i++]);
+ break;
+ case 'n':
+ state = N;
+ charcopy(dest, &destlen, destsize, src[i++]);
+ break;
+ case 'p':
+ state = P;
+ charcopy(dest, &destlen, destsize, src[i++]);
+ break;
+ default:
+ state = DEFAULT;
+ break;
+ }
+ break;
+ case A:
+ if (src[i] == 'u') {
+ state = AU;
+ charcopy(dest, &destlen, destsize, src[i++]);
+ }
+ else state = DEFAULT;
+ break;
+ case AU:
+ if (src[i] == 'x') {
+ state = THIRD;
+ i++;
+ }
+ else state = DEFAULT;
+ break;
+ case THIRD:
+ state = DEFAULT;
+ switch (src[i]) {
+ case '.':
+ case '/':
+ case '\0':
+ escape3(dest, &destlen, destsize, src[i - 1]);
+ break;
+ default:
+ i--;
+ break;
+ }
+ break;
+ case C:
+ if (src[i] == 'o') {
+ state = CO;
+ charcopy(dest, &destlen, destsize, src[i++]);
+ }
+ else state = DEFAULT;
+ break;
+ case CO:
+ if (src[i] == 'm') {
+ state = COMLPT;
+ i++;
+ }
+ else if (src[i] == 'n') {
+ state = THIRD;
+ i++;
+ }
+ else state = DEFAULT;
+ break;
+ case COMLPT:
+ switch (src[i]) {
+ case '1': case '2': case '3': case '4': case '5':
+ case '6': case '7': case '8': case '9':
+ state = COMLPTn;
+ i++;
+ break;
+ default:
+ state = DEFAULT;
+ charcopy(dest, &destlen, destsize, src[i - 1]);
+ break;
+ }
+ break;
+ case COMLPTn:
+ state = DEFAULT;
+ switch (src[i]) {
+ case '.':
+ case '/':
+ case '\0':
+ escape3(dest, &destlen, destsize, src[i - 2]);
+ charcopy(dest, &destlen, destsize, src[i - 1]);
+ break;
+ default:
+ memcopy(dest, &destlen, destsize,
+ &src[i - 2], 2);
+ break;
+ }
+ break;
+ case L:
+ if (src[i] == 'p') {
+ state = LP;
+ charcopy(dest, &destlen, destsize, src[i++]);
+ }
+ else state = DEFAULT;
+ break;
+ case LP:
+ if (src[i] == 't') {
+ state = COMLPT;
+ i++;
+ }
+ else state = DEFAULT;
+ break;
+ case N:
+ if (src[i] == 'u') {
+ state = NU;
+ charcopy(dest, &destlen, destsize, src[i++]);
+ }
+ else state = DEFAULT;
+ break;
+ case NU:
+ if (src[i] == 'l') {
+ state = THIRD;
+ i++;
+ }
+ else state = DEFAULT;
+ break;
+ case P:
+ if (src[i] == 'r') {
+ state = PR;
+ charcopy(dest, &destlen, destsize, src[i++]);
+ }
+ else state = DEFAULT;
+ break;
+ case PR:
+ if (src[i] == 'n') {
+ state = THIRD;
+ i++;
+ }
+ else state = DEFAULT;
+ break;
+ case DOT:
+ switch (src[i]) {
+ case '/':
+ case '\0':
+ state = START;
+ memcopy(dest, &destlen, destsize, "~2e", 3);
+ charcopy(dest, &destlen, destsize, src[i++]);
+ break;
+ case 'd':
+ case 'i':
+ state = HGDI;
+ charcopy(dest, &destlen, destsize, '.');
+ charcopy(dest, &destlen, destsize, src[i++]);
+ break;
+ case 'h':
+ state = H;
+ charcopy(dest, &destlen, destsize, '.');
+ charcopy(dest, &destlen, destsize, src[i++]);
+ break;
+ default:
+ state = DEFAULT;
+ charcopy(dest, &destlen, destsize, '.');
+ break;
+ }
+ break;
+ case H:
+ if (src[i] == 'g') {
+ state = HGDI;
+ charcopy(dest, &destlen, destsize, src[i++]);
+ }
+ else state = DEFAULT;
+ break;
+ case HGDI:
+ if (src[i] == '/') {
+ state = START;
+ memcopy(dest, &destlen, destsize, ".hg", 3);
+ charcopy(dest, &destlen, destsize, src[i++]);
+ }
+ else state = DEFAULT;
+ break;
+ case SPACE:
+ switch (src[i]) {
+ case '/':
+ case '\0':
+ state = START;
+ memcopy(dest, &destlen, destsize, "~20", 3);
+ charcopy(dest, &destlen, destsize, src[i++]);
+ break;
+ default:
+ state = DEFAULT;
+ charcopy(dest, &destlen, destsize, ' ');
+ break;
+ }
+ break;
+ case DEFAULT:
+ while (i < len && isset(fastpath, src[i]))
+ charcopy(dest, &destlen, destsize, src[i++]);
+ if (i == len)
+ goto done;
+ switch (src[i]) {
+ case '.':
+ state = DOT;
+ i++;
+ break;
+ case ' ':
+ state = SPACE;
+ i++;
+ break;
+ case '/':
+ state = START;
+ charcopy(dest, &destlen, destsize, '/');
+ i++;
+ break;
+ default:
+ if (isset(onebyte, src[i])) {
+ do {
+ charcopy(dest, &destlen,
+ destsize, src[i++]);
+ } while (i < len &&
+ isset(fastpath, src[i]));
+ }
+ else if (isset(twobytes, src[i])) {
+ char c = src[i++];
+ charcopy(dest, &destlen, destsize, '_');
+ charcopy(dest, &destlen, destsize,
+ c == '_' ? '_' : c + 32);
+ }
+ else
+ escape3(dest, &destlen, destsize,
+ src[i++]);
+ break;
+ }
+ break;
+ }
+ }
+done:
+ return destlen;
+}
+
+static Py_ssize_t basicencode(char *dest, size_t destsize,
+ const char *src, Py_ssize_t len)
+{
+ static const uint32_t twobytes[8] = { 0, 0, 0x87fffffe };
+
+ static const uint32_t onebyte[8] = {
+ 1, 0x2bfffbfa, 0x68000001, 0x2fffffff,
+ };
+
+ static const uint32_t fastpath[8] = {
+ 1, 0x2bff3bfa, 0x68000001, 0x2fffffff,
+ };
+
+ Py_ssize_t destlen = 0;
+
+ if (len < 5 || memcmp(src, "data/", 5) != 0) {
+ memcopy(dest, &destlen, destsize, src, len);
+ return destlen;
+ }
+
+ memcopy(dest, &destlen, destsize, "data/", 5);
+
+ return _encode(twobytes, onebyte, fastpath, dest, destlen, destsize,
+ src + 5, len - 5);
+}
+
+static Py_ssize_t direncode(char *dest, size_t destsize,
+ const char *src, Py_ssize_t len)
+{
+ enum dir_state state = DSTART;
+ Py_ssize_t i = 5, destlen = 0;
+
+ memcopy(dest, &destlen, destsize, "data/", 5);
+
+ while (i < len) {
+ switch (state) {
+ case DSTART:
+ state = DDEFAULT;
+ charcopy(dest, &destlen, destsize, src[i++]);
+ break;
+ case DDOT:
+ switch (src[i]) {
+ case 'd':
+ case 'i':
+ state = DHGDI;
+ charcopy(dest, &destlen, destsize, '.');
+ charcopy(dest, &destlen, destsize, src[i++]);
+ break;
+ case 'h':
+ state = DH;
+ charcopy(dest, &destlen, destsize, '.');
+ charcopy(dest, &destlen, destsize, src[i++]);
+ break;
+ default:
+ state = DDEFAULT;
+ charcopy(dest, &destlen, destsize, '.');
+ break;
+ }
+ case DH:
+ if (src[i] == 'g') {
+ state = DHGDI;
+ charcopy(dest, &destlen, destsize, src[i++]);
+ }
+ else state = DDEFAULT;
+ break;
+ case DHGDI:
+ if (src[i] == '/') {
+ state = DSTART;
+ memcopy(dest, &destlen, destsize, ".hg", 3);
+ charcopy(dest, &destlen, destsize, src[i++]);
+ }
+ else state = DDEFAULT;
+ break;
+ case DDEFAULT:
+ switch (src[i]) {
+ case '.':
+ state = DDOT;
+ i++;
+ break;
+ case '/':
+ state = DSTART;
+ default:
+ charcopy(dest, &destlen, destsize, src[i++]);
+ break;
+ }
+ break;
+ }
+ }
+
+ return destlen;
+}
+
+static Py_ssize_t auxencode(char *dest, size_t destsize,
+ const char *src, Py_ssize_t len)
+{
+ static const uint32_t twobytes[8];
+
+ static const uint32_t onebyte[8] = {
+ ~0, 0xffffbffe, ~0, ~0, ~0, ~0, ~0, ~0,
+ };
+
+ return _encode(twobytes, onebyte, onebyte, dest, 0, destsize,
+ src, len);
+}
+
+static const Py_ssize_t maxstorepathlen = 120;
+
+static Py_ssize_t lowerencode(char *dest, size_t destsize,
+ const char *src, Py_ssize_t len)
+{
+ static const uint32_t onebyte[8] = {
+ 1, 0x2bfffbfb, 0xe8000001, 0x2fffffff
+ };
+
+ static const uint32_t lower[8] = { 0, 0, 0x7fffffe };
+
+ Py_ssize_t i = 0, destlen = 0;
+
+ while (i < len) {
+ if (isset(onebyte, src[i]))
+ charcopy(dest, &destlen, destsize, src[i++]);
+ else if (isset(lower, src[i]))
+ charcopy(dest, &destlen, destsize, src[i++] + 32);
+ else
+ escape3(dest, &destlen, destsize, src[i++]);
+ }
+
+ return destlen;
+}
+
+static Py_ssize_t hashmangle(char *dest, size_t destsize, const char *src,
+ Py_ssize_t len, const char sha[20])
+{
+ static const Py_ssize_t dirprefixlen = 8;
+ static const Py_ssize_t maxshortdirslen = 68;
+
+ Py_ssize_t i, d, p, lastslash = len - 1, lastdot = -1;
+ Py_ssize_t destlen = 0, slop;
+
+ while (lastslash >= 0 && src[lastslash] != '/') {
+ if (src[lastslash] == '.' && lastdot == -1)
+ lastdot = lastslash;
+ lastslash--;
+ }
+
+ for (i = d = p = 0; i < lastslash; i++, p++) {
+ if (src[i] == '/') {
+ if (destlen > maxshortdirslen)
+ break;
+ charcopy(dest, &destlen, destsize, src[i]);
+ p = -1;
+ }
+ else if (p < dirprefixlen)
+ charcopy(dest, &destlen, destsize, src[i]);
+ }
+
+ if (destlen > maxshortdirslen)
+ do {
+ destlen--;
+ } while (destlen > 0 && dest[destlen] != '/');
+
+ if (destlen > 3)
+ charcopy(dest, &destlen, destsize, '/');
+
+ slop = maxstorepathlen - destlen - 3 - (40 + len - lastdot - 1);
+ if (slop > len - lastslash - 2)
+ slop = len - lastslash - 2;
+
+ if (slop > 0)
+ memcopy(dest, &destlen, destsize, &src[lastslash + 1], slop);
+
+ for (i = 0; i < 20; i++)
+ hexencode(dest, &destlen, destsize, sha[i]);
+
+ if (lastdot >= 0)
+ memcopy(dest, &destlen, destsize, &src[lastdot],
+ len - lastdot);
+ else
+ charcopy(dest, &destlen, destsize, 0);
+
+ return destlen;
+}
+
+/*
+ * Avoiding a trip through Python would improve performance by 50%,
+ * but we don't encounter enough long names to be worth the code.
+ */
+static int sha1hash(char hash[20], const char *str, Py_ssize_t len)
+{
+ static PyObject *shafunc;
+ PyObject *shaobj, *hashobj;
+
+ if (shafunc == NULL) {
+ PyObject *util, *name = PyString_FromString("mercurial.util");
+
+ if (name == NULL)
+ return -1;
+
+ util = PyImport_Import(name);
+ Py_DECREF(name);
+
+ if (util == NULL) {
+ PyErr_SetString(PyExc_ImportError, "mercurial.util");
+ return -1;
+ }
+ shafunc = PyObject_GetAttrString(util, "sha1");
+ Py_DECREF(util);
+
+ if (shafunc == NULL) {
+ PyErr_SetString(PyExc_AttributeError,
+ "module 'mercurial.util' has no "
+ "attribute 'sha1'");
+ return -1;
+ }
+ }
+
+ shaobj = PyObject_CallFunction(shafunc, "s#", str, len);
+
+ if (shaobj == NULL)
+ return -1;
+
+ hashobj = PyObject_CallMethod(shaobj, "digest", "");
+ Py_DECREF(shaobj);
+
+ if (!PyString_Check(hashobj) || PyString_GET_SIZE(hashobj) != 20) {
+ PyErr_SetString(PyExc_TypeError,
+ "result of digest is not a 20-byte hash");
+ Py_DECREF(hashobj);
+ return -1;
+ }
+
+ memcpy(hash, PyString_AS_STRING(hashobj), 20);
+ Py_DECREF(hashobj);
+ return 0;
+}
+
+static Py_ssize_t hashencode(char *dest, size_t destsize, const char *src,
+ Py_ssize_t len)
+{
+ const Py_ssize_t baselen = (len - 5) * 3;
+ char dired[baselen];
+ char lowered[baselen];
+ char auxed[baselen];
+ char mangled[baselen];
+ Py_ssize_t dirlen, lowerlen, auxlen, destlen;
+ char sha[20];
+
+ dirlen = direncode(dired, baselen, src, len);
+ if (sha1hash(sha, dired, dirlen - 1) == -1)
+ return -1;
+ lowerlen = lowerencode(lowered, baselen, src + 5, len - 5);
+ auxlen = auxencode(auxed, baselen, lowered, lowerlen);
+ memcpy(mangled, "dh/", 3);
+ destlen = hashmangle(mangled + 3, baselen - 3, auxed, auxlen, sha) + 2;
+ if (dest) {
+ assert(destlen <= destsize);
+ memcpy(dest, mangled, destlen);
+ }
+
+ return destlen;
+}
+
+PyObject *pathencode(PyObject *self, PyObject *args)
+{
+ Py_ssize_t len, newlen;
+ PyObject *pathobj, *newobj;
+ char *path;
+
+ if (!PyArg_ParseTuple(args, "O:pathencode", &pathobj))
+ return NULL;
+
+ if (PyString_AsStringAndSize(pathobj, &path, &len) == -1) {
+ PyErr_SetString(PyExc_TypeError, "expected a string");
+ return NULL;
+ }
+
+ newlen = len ? basicencode(NULL, 0, path, len + 1) : 1;
+
+ if (newlen == len + 1) {
+ Py_INCREF(pathobj);
+ return pathobj;
+ }
+
+ if (newlen <= maxstorepathlen + 1) {
+ newobj = PyString_FromStringAndSize(NULL, newlen);
+ PyString_GET_SIZE(newobj)--;
+
+ if (newobj)
+ basicencode(PyString_AS_STRING(newobj), newlen, path,
+ len + 1);
+ } else {
+ newlen = 170;
+ newobj = PyString_FromStringAndSize(NULL, 170);
+
+ if (newobj) {
+ newlen = hashencode(PyString_AS_STRING(newobj),
+ newlen, path, len + 1);
+ if (newlen == -1)
+ Py_CLEAR(newobj);
+ else
+ PyString_GET_SIZE(newobj) = newlen;
+ }
+ }
+
+ return newobj;
+}
diff --git a/mercurial/store.py b/mercurial/store.py
--- a/mercurial/store.py
+++ b/mercurial/store.py
@@ -6,7 +6,7 @@
# GNU General Public License version 2 or any later version.
from i18n import _
-import osutil, scmutil, util
+import osutil, scmutil, util, parsers
import os, stat, errno
_sha = util.sha1
@@ -425,8 +425,11 @@
def store(requirements, path, openertype):
if 'store' in requirements:
if 'fncache' in requirements:
- auxencode = lambda f: _auxencode(f, 'dotencode' in requirements)
- encode = lambda f: _hybridencode(f, auxencode)
+ dotreq = 'dotencode' in requirements
+ encode = dotreq and getattr(parsers, 'pathencode')
+ if not encode:
+ auxencode = lambda f: _auxencode(f, dotreq)
+ encode = lambda f: _hybridencode(f, auxencode)
return fncachestore(path, openertype, encode)
return encodedstore(path, openertype)
return basicstore(path, openertype)
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -421,7 +421,8 @@
Extension('mercurial.bdiff', ['mercurial/bdiff.c']),
Extension('mercurial.diffhelpers', ['mercurial/diffhelpers.c']),
Extension('mercurial.mpatch', ['mercurial/mpatch.c']),
- Extension('mercurial.parsers', ['mercurial/parsers.c']),
+ Extension('mercurial.parsers', ['mercurial/parsers.c',
+ 'mercurial/pathencode.c']),
]
osutil_ldflags = []
More information about the Mercurial-devel
mailing list