[PATCH] store: rewrite fncache path mangling code in C

Tue Aug 28 03:51:45 CDT 2012

On 2012-08-27 23:00, Bryan O'Sullivan wrote:
> # HG changeset patch
> # User Bryan O'Sullivan <bryano at fb.com>
> # Date 1346101191 25200
> # Node ID 54414ba3b42b2cc38e618782c15aa13070e7a6c7
> # Parent  99a2a4ae35e2180b7f825ef2677c36d538eac4ba
> store: rewrite fncache path mangling code in C
> 
> The Python path mangling code used by fncache is complex and (perhaps
> surprisingly) slow enough to negatively affect the overall performance
> of Mercurial.
> 
> The name mangling scheme used by fncache consists of two paths, both of
> which are translated into C in this patch.
> 
> For a short path (< 120 bytes), the Python code can be reduced to a
> fairly tractable state machine that mangles the path in a single pass.
> 
> For longer paths, the mangling scheme is much more complicated, has
> poor data dependencies, and requires a name to be hashed using SHA-1.
> It's almost impossible to see how to implement this without multiple
> copying passes.
> 
> Raw performance: I measured in a repo containing 150,000 files in its tip
> manifest, with a median path name length of 57 bytes, and 95th percentile
> of 96 bytes.
> 
> In this repo, the Python code takes 3.95 seconds to encode all path names,
> while the C code (called from Python) takes 0.15 seconds.
> 
> Across several other large repositories, I've measured the speedup from
> the C code at between 25x and 40x.  For path names above 120 bytes where
> the hashed encoding method must be used, the speedup is about 20x.
> 
> Practical performance:
> 
>   "hg clone -U --uncompressed" of mozilla-central over http
>     Python: 20.5 seconds
>     mixed:  17.5 (server C, client Python)
>     all C:  14.8
> 
>   "hg update" from null to tip in mozilla-central
>     Python: 30.1 seconds
>     C:      28.7

I can't say I like this patch.

The performance improvements you are reporting are certainly nice, but
the correctness of this huge amount of C code is very hard to verify.

Please scroll down to read about one bug (I think) I found already.

> diff --git a/mercurial/parsers.c b/mercurial/parsers.c
> --- a/mercurial/parsers.c
> +++ b/mercurial/parsers.c
> @@ -1506,11 +1506,14 @@
>  
>  static char parsers_doc[] = "Efficient content parsing.";
>  
> +PyObject *pathencode(PyObject *self, PyObject *args);
> +
>  static PyMethodDef methods[] = {
>  	{"pack_dirstate", pack_dirstate, METH_VARARGS, "pack a dirstate\n"},
>  	{"parse_manifest", parse_manifest, METH_VARARGS, "parse a manifest\n"},
>  	{"parse_dirstate", parse_dirstate, METH_VARARGS, "parse a dirstate\n"},
>  	{"parse_index2", parse_index2, METH_VARARGS, "parse a revlog index\n"},
> +	{"pathencode", pathencode, METH_VARARGS, "fncache-encode a path\n"},
>  	{NULL, NULL}
>  };
>  
> diff --git a/mercurial/pathencode.c b/mercurial/pathencode.c
> new file mode 100644
> --- /dev/null
> +++ b/mercurial/pathencode.c
> @@ -0,0 +1,589 @@
> +#include <Python.h>
> +#include <assert.h>
> +#include <ctype.h>
> +#include <stdint.h>
> +#include <stdlib.h>
> +#include <string.h>
> +
> +#include "sha1.h"
> +
> +enum dir_state {
> +	DSTART,
> +	DDOT,
> +	DH,
> +	DHG,
> +	DDEFAULT,
> +};
> +
> +enum path_state {
> +	START,
> +	A,
> +	AU,
> +	THREE,
> +	C,
> +	CO,
> +	COMLPT,
> +	COMLPTN,
> +	L,
> +	LP,
> +	N,
> +	NU,
> +	P,
> +	PR,
> +	DOT,
> +	H,
> +	HG,
> +	SPACE,
> +	DEFAULT,
> +};
> +
> +static inline int check_bit(const uint32_t bitset[], char c)
> +{
> +	return bitset[c >> 5] & (1 << (c & 31));
> +}
> +
> +static inline void charcopy(char *dest, Py_ssize_t *destlen, size_t destsize,
> +			    char c)
> +{
> +	if (dest) {
> +		assert(*destlen < destsize);
> +		dest[*destlen] = c;
> +	}
> +	(*destlen)++;
> +}
> +
> +static inline void memcopy(char *dest, Py_ssize_t *destlen, size_t destsize,
> +			   const void *src, Py_ssize_t len)
> +{
> +	if (dest) {
> +		assert(*destlen + len < destsize);
> +		memcpy((void *)&dest[*destlen], src, len);
> +	}
> +	*destlen += len;
> +}
> +
> +static inline void hexencode(char *dest, Py_ssize_t *destlen, size_t destsize,
> +			     uint8_t c)
> +{
> +	static const char hexdigit[] = "0123456789abcdef";
> +
> +	charcopy(dest, destlen, destsize, hexdigit[c >> 4]);
> +	charcopy(dest, destlen, destsize, hexdigit[c & 15]);
> +}
> +
> +static inline void escape3(char *dest, Py_ssize_t *destlen, size_t destsize,
> +			   char c)
> +{
> +	charcopy(dest, destlen, destsize, '~');
> +	hexencode(dest, destlen, destsize, c);
> +}
> +
> +static Py_ssize_t _encode(const uint32_t twobytes[8],
> +			  const uint32_t onebyte[8],
> +			  const uint32_t fastpath[8],
> +			  char *dest, Py_ssize_t destlen, size_t destsize,
> +			  const char *src, Py_ssize_t len)
> +{
> +	enum path_state state = START;
> +	Py_ssize_t i = 0;
> +
> +	while (i < len) {
> +		switch (state) {
> +		case START:
> +			switch (src[i]) {
> +			case '/':
> +				charcopy(dest, &destlen, destsize, src[i++]);
> +				break;
> +			case '.':
> +			case ' ':
> +				state = DEFAULT;
> +				escape3(dest, &destlen, destsize, src[i++]);
> +				break;
> +			case 'a':
> +				state = A;
> +				charcopy(dest, &destlen, destsize, src[i++]);
> +				break;
> +			case 'c':
> +				state = C;
> +				charcopy(dest, &destlen, destsize, src[i++]);
> +				break;
> +			case 'l':
> +				state = L;
> +				charcopy(dest, &destlen, destsize, src[i++]);
> +				break;
> +			case 'n':
> +				state = N;
> +				charcopy(dest, &destlen, destsize, src[i++]);
> +				break;
> +			case 'p':
> +				state = P;
> +				charcopy(dest, &destlen, destsize, src[i++]);
> +				break;
> +			default:
> +				state = DEFAULT;
> +				break;
> +			}
> +			break;
> +		case A:
> +			if (src[i] == 'u') {
> +				state = AU;
> +				charcopy(dest, &destlen, destsize, src[i++]);
> +			}
> +			else state = DEFAULT;
> +			break;
> +		case AU:
> +			if (src[i] == 'x') {
> +				state = THREE;
> +				i++;
> +			}
> +			else state = DEFAULT;
> +			break;
> +		case THREE:
> +			state = DEFAULT;
> +			switch (src[i]) {
> +			case '.':
> +			case '/':
> +			case '\0':
> +				escape3(dest, &destlen, destsize, src[i - 1]);
> +				break;
> +			default:
> +				i--;
> +				break;
> +			}
> +			break;
> +		case C:
> +			if (src[i] == 'o') {
> +				state = CO;
> +				charcopy(dest, &destlen, destsize, src[i++]);
> +			}
> +			else state = DEFAULT;
> +			break;
> +		case CO:
> +			if (src[i] == 'm') {
> +				state = COMLPT;
> +				i++;
> +			}
> +			else if (src[i] == 'n') {
> +				state = THREE;
> +				i++;
> +			}
> +			else state = DEFAULT;
> +			break;
> +		case COMLPT:
> +			switch (src[i]) {
> +			case '0': case '1': case '2': case '3': case '4':
> +			case '5': case '6': case '7': case '8': case '9':

Here, you have included the case '0', but this isn't present in the
Python reference implementation:

http://selenic.com/repo/hg/file/99a2a4ae35e2/mercurial/store.py#l121

_winreservednames = '''con prn aux nul
    com1 com2 com3 com4 com5 com6 com7 com8 com9
    lpt1 lpt2 lpt3 lpt4 lpt5 lpt6 lpt7 lpt8 lpt9'''.split()

Specifically, com0 and lpt0 are _not_ to be encoded, as these are not
reserved names on Windows.

BTW, this (Python implementation) was fully intentional and it matches
Microsoft's documentation:

http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx

> +				state = COMLPTN;
> +				i++;
> +				break;
> +			default:
> +				state = DEFAULT;
> +				charcopy(dest, &destlen, destsize, src[i - 1]);
> +				break;
> +			}
> +			break;
> +		case COMLPTN:
> +			state = DEFAULT;
> +			switch (src[i]) {
> +			case '.':
> +			case '/':
> +			case '\0':
> +				escape3(dest, &destlen, destsize, src[i - 2]);
> +				charcopy(dest, &destlen, destsize, src[i - 1]);
> +				break;
> +			default:
> +				memcopy(dest, &destlen, destsize,
> +					&src[i - 2], 2);
> +				break;
> +			}
> +			break;

[snipping the rest]